<a href="https://colab.research.google.com/github/abiralchy0987/movie_recommendation_system/blob/main/optimized_CF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np
import spacy
import ast
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Load SpaCy's English model with disabled components for efficiency
# (only stop-word/punctuation flags and lemmas are read from the tokens later).
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Reading the data
movies = pd.read_csv('/content/tmdb_5000_movies.csv')
credits = pd.read_csv('/content/tmdb_5000_credits.csv')

# Merge datasets using movie_id in 'credits' and 'id' in 'movies'
movies_merged = movies.merge(credits, left_on='id', right_on='movie_id')

# Select relevant columns, rename for clarity and drop rows with missing values.
# Built as a single chain so `movies` is a fresh frame rather than an in-place
# mutation. The index is reset because dropna() leaves gaps in the row labels,
# while the similarity matrix built later is addressed by row *position*;
# after the reset, labels and positions coincide.
movies = (
    movies_merged[['movie_id', 'title_x', 'overview', 'genres', 'keywords', 'cast', 'crew']]
    .rename(columns={'title_x': 'title'})
    .dropna()
    .reset_index(drop=True)
)

# Safely convert JSON-like strings to lists
def convert(text):
    """Extract the ``name`` of every record in a stringified list of dicts.

    Args:
        text: String holding a literal such as
            ``'[{"id": 28, "name": "Action"}]'``.

    Returns:
        list: The ``name`` values in their original order. An empty list is
        returned when ``text`` cannot be parsed or does not hold a list of
        records; records without a string ``name`` are skipped.
    """
    try:
        records = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError):
        return []  # Return empty list if parsing fails

    # A literal that parses but is not a sequence (e.g. '5') would otherwise
    # raise while being iterated; treat it like unparseable input.
    if not isinstance(records, (list, tuple)):
        return []

    return [
        record['name']
        for record in records
        if isinstance(record, dict) and isinstance(record.get('name'), str)
    ]

# Both columns hold stringified lists of records; keep only the names.
for name_column in ('genres', 'keywords'):
    movies[name_column] = movies[name_column].apply(convert)

# Convert cast to a list of top 3 actors
def convert_cast(text, top_n=3):
    """Return the names of the first ``top_n`` cast members.

    Args:
        text: String holding a literal list of cast records, each a dict with
            a ``name`` key.
        top_n (int): Maximum number of names to return. Defaults to 3, the
            value that used to be hard-coded.

    Returns:
        list: Up to ``top_n`` names in their original order. Empty when
        ``text`` cannot be parsed or does not hold a list of records; records
        without a string ``name`` are skipped before the cut-off is applied.
    """
    try:
        records = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError):
        return []

    # Guard against literals that parse but are not sequences (dict, number).
    if not isinstance(records, (list, tuple)):
        return []

    names = [
        record['name']
        for record in records
        if isinstance(record, dict) and isinstance(record.get('name'), str)
    ]
    # max(..., 0) keeps a negative top_n from slicing off the end instead.
    return names[:max(top_n, 0)]

# Element-wise: each raw cast string becomes a short list of actor names.
movies['cast'] = movies['cast'].map(convert_cast)

# Fetch director from crew
def fetch_director(text):
    """Return the first crew member whose job is 'Director'.

    Args:
        text: String holding a literal list of crew records, each a dict with
            ``job`` and ``name`` keys.

    Returns:
        list: A one-element list with the director's name, or an empty list
        when ``text`` cannot be parsed, is not a list of records, or contains
        no record with ``job == 'Director'`` and a string ``name``.
    """
    try:
        crew_records = ast.literal_eval(text)
    except (ValueError, SyntaxError, TypeError):
        return []

    if not isinstance(crew_records, (list, tuple)):
        return []

    for member in crew_records:
        # .get() so records lacking 'job' or 'name' are skipped, not fatal.
        if (
            isinstance(member, dict)
            and member.get('job') == 'Director'
            and isinstance(member.get('name'), str)
        ):
            return [member['name']]
    return []

# Element-wise: each raw crew string becomes [director_name] or [].
movies['crew'] = movies['crew'].map(fetch_director)

# Process text: replace spaces with underscores
def process_text(text):
    """Return a new list with the spaces in every item replaced by '_'.

    Args:
        text: Iterable of strings (the cast/crew name lists in this
            notebook), despite the singular parameter name.

    Returns:
        list: One underscored string per input item, in the same order.
    """
    underscored = []
    for entry in text:
        underscored.append(entry.replace(" ", "_"))
    return underscored

# Person names become single tokens ("Sam Worthington" -> "Sam_Worthington")
for people_column in ('cast', 'crew'):
    movies[people_column] = movies[people_column].apply(process_text)

# Convert 'overview' to a list of words
movies['overview'] = movies['overview'].apply(lambda synopsis: synopsis.split())

# Combine all features into 'tags' as lists. Multiplying a list repeats its
# tokens, which is how a feature group is given more weight.
genre_tokens = movies['genres'].apply(lambda names: names * 2)      # Weight genres
keyword_tokens = movies['keywords'].apply(lambda names: names * 2)  # Weight keywords
cast_tokens = movies['cast'].apply(lambda names: names * 3)         # Weight cast
crew_tokens = movies['crew'].apply(lambda names: names * 3)         # Weight crew
tag_tokens = movies['overview'] + genre_tokens + keyword_tokens + cast_tokens + crew_tokens

# Convert 'tags' to a single string
movies['tags'] = tag_tokens.apply(lambda tokens: " ".join(tokens))

# Use SpaCy's nlp.pipe for batch processing
def preprocess_text_batch(texts, batch_size=50):
    """Lemmatize and lowercase texts, dropping stop words and punctuation.

    Args:
        texts: Iterable of strings to process (e.g. the 'tags' column).
        batch_size (int): Number of texts handed to ``nlp.pipe`` per batch.
            Defaults to 50, the value that used to be hard-coded.

    Returns:
        list: One string per document yielded by ``nlp.pipe``, made of the
        lower-cased lemmas of the kept tokens joined by single spaces.
    """
    return [
        " ".join(
            token.lemma_.lower()
            for token in doc
            if not (token.is_stop or token.is_punct)
        )
        for doc in nlp.pipe(texts, batch_size=batch_size)
    ]

# Apply batch preprocessing to 'tags'
movies['tags'] = preprocess_text_batch(movies['tags'])

# TF-IDF Vectorization
tfidf_settings = {
    'max_features': 10000,
    'ngram_range': (1, 3),
    'min_df': 0.001,
    'max_df': 0.8,
    'stop_words': 'english',
}
tfidf = TfidfVectorizer(**tfidf_settings)
vectors = tfidf.fit_transform(movies['tags'])

# Compute cosine similarity (one row/column per row of `movies`, in order)
similarity = cosine_similarity(vectors)

# Recommendation function
def recommend(movie, num_recommendations=5):
    """Return the movies most similar to ``movie``.

    Reads the notebook-level ``movies`` frame and ``similarity`` matrix; row
    ``p`` of ``similarity`` is taken to describe the ``p``-th row of ``movies``.

    Args:
        movie (str): Exact title to look up. When several rows share the
            title, the first one is used.
        num_recommendations (int): Maximum number of rows to return.

    Returns:
        pandas.DataFrame with columns ``title`` and ``similarity`` (formatted
        as a percentage string), ordered from most to least similar; or the
        string ``"Movie '<movie>' not found in database"`` when no row matches.
    """
    # Work with row *positions*, not index labels: the similarity matrix is a
    # plain array, and labels stop matching positions once rows are dropped.
    matches = np.flatnonzero((movies['title'] == movie).to_numpy())
    if len(matches) == 0:
        return f"Movie '{movie}' not found in database"

    position = int(matches[0])
    scores = np.asarray(similarity[position])

    # Stable sort keeps the original row order among ties. The queried movie
    # is removed explicitly instead of assuming it sorts to the top.
    ranking = np.argsort(-scores, kind='stable')
    ranking = ranking[ranking != position][:max(num_recommendations, 0)]

    recommendations = [
        {'title': movies['title'].iloc[i], 'similarity': f"{scores[i] * 100:.1f}%"}
        for i in ranking
    ]

    return pd.DataFrame(recommendations, columns=['title', 'similarity'])

# Example usage
# recommend() returns a DataFrame of matches, or a plain "not found" message
# string when the title is absent; print() renders either form.
print(recommend('Avatar'))

                     title similarity
0                   Aliens      43.2%
1                   Alien³      35.1%
2  Star Trek Into Darkness      31.0%
3                    Alien      30.2%
4           Silent Running      29.4%


In [3]:
def precision_recall_at_k(movie_title, k=5, threshold=0.5):
    """
    Calculate precision@k and recall@k for a given movie.

    Reads the notebook-level ``movies`` frame and ``similarity`` matrix; row
    ``p`` of ``similarity`` is taken to describe the ``p``-th row of ``movies``.

    NOTE(review): an item counts as "relevant" when its similarity to the
    query exceeds ``threshold``, i.e. relevance comes from the same scores
    that produce the ranking, not from external ground truth.

    Args:
        movie_title (str): The movie title to evaluate (first match is used).
        k (int): Number of recommendations to consider. Must be positive.
        threshold (float): Similarity threshold for relevance.

    Returns:
        precision (float): Relevant items in the top k, divided by k.
        recall (float): Relevant items in the top k, divided by all relevant
            items (0 when there are none).
        ``(0, 0)`` is returned when the title is not found.

    Raises:
        ValueError: If ``k`` is not positive.
    """
    if k <= 0:
        raise ValueError(f"k must be a positive integer, got {k!r}")

    # Row position, not index label: the similarity matrix is positional.
    matches = np.flatnonzero((movies['title'] == movie_title).to_numpy())
    if len(matches) == 0:
        return 0, 0  # Movie not found

    position = int(matches[0])

    # Scores against every *other* movie; the query is removed explicitly
    # rather than assuming it ranks first.
    other_scores = np.delete(np.asarray(similarity[position], dtype=float), position)
    top_k_scores = np.sort(other_scores)[::-1][:k]

    # Count relevant recommendations (similarity score > threshold)
    relevant = int(np.sum(top_k_scores > threshold))

    # Precision@k: Proportion of recommended items that are relevant
    precision = relevant / k

    # Recall@k: Proportion of relevant items that are recommended
    total_relevant = int(np.sum(other_scores > threshold))
    recall = relevant / total_relevant if total_relevant > 0 else 0

    return precision, recall

# Example usage
# `precision` and `recall` are left as notebook-level variables here and are
# read by the F1-score example in a later cell.
movie_title = 'Avatar'
precision, recall = precision_recall_at_k(movie_title, k=5, threshold=0.5)
print(f"Precision@5: {precision:.2f}")
print(f"Recall@5: {recall:.2f}")

Precision@5: 0.00
Recall@5: 0.00


In [7]:
def f1_score(precision, recall):
    """
    Harmonic mean of precision and recall.

    Args:
        precision (float): Precision@k.
        recall (float): Recall@k.

    Returns:
        f1 (float): ``2 * precision * recall / (precision + recall)``, or 0
            when precision and recall sum to zero.
    """
    denominator = precision + recall
    if denominator == 0:
        return 0

    numerator = 2 * (precision * recall)
    return numerator / denominator

# Example usage
# Uses the notebook-level `precision` and `recall` produced by the
# precision_recall_at_k example cell, so that cell must have been run first.
f1 = f1_score(precision, recall)
print(f"F1-Score: {f1:.2f}")

F1-Score: 0.00


In [8]:
def coverage(similarity_matrix, k=5):
    """
    Calculate catalogue coverage of the recommendation system.

    For every row of the matrix, the ``k`` most similar *other* items are
    taken as its recommendations; coverage is the share of items that appear
    in at least one such list.

    Args:
        similarity_matrix (numpy array): Square item-by-item similarity matrix.
        k (int): Number of recommendations to consider per item.

    Returns:
        coverage (float): Fraction (0..1) of items that get recommended at
            least once. 0.0 for an empty matrix or a non-positive ``k``.
    """
    scores = np.asarray(similarity_matrix, dtype=float)
    num_movies = scores.shape[0]
    if num_movies == 0 or k <= 0:
        return 0.0

    recommended_items = set()
    for position in range(num_movies):
        # Stable descending order; the item itself is removed explicitly,
        # because with ties or an all-zero row it need not sort first.
        ranking = np.argsort(-scores[position], kind='stable')
        top_k = ranking[ranking != position][:k]
        recommended_items.update(top_k.tolist())

    return len(recommended_items) / num_movies

# Example usage
# coverage() returns a fraction; it is scaled to a percentage only for display.
coverage_score = coverage(similarity, k=5)
print(f"Coverage: {coverage_score * 100:.2f}%")

Coverage: 94.25%


In [9]:
def mean_average_precision(movie_titles, k=5, threshold=0.5):
    """
    Average precision@k over a list of movies.

    NOTE(review): despite the name, this is the plain mean of precision@k as
    returned by ``precision_recall_at_k``; no per-rank averaging is done, so
    it is not MAP in the usual information-retrieval sense.

    Args:
        movie_titles (list): List of movie titles to evaluate.
        k (int): Number of recommendations to consider.
        threshold (float): Similarity threshold for relevance.

    Returns:
        map_score (float): Mean of the per-movie precision@k values, or 0.0
            when ``movie_titles`` is empty.
    """
    precisions = [
        precision_recall_at_k(movie_title, k, threshold)[0]
        for movie_title in movie_titles
    ]

    # np.mean([]) would yield nan plus a RuntimeWarning; report 0.0 instead.
    if not precisions:
        return 0.0

    return float(np.mean(precisions))

# Example usage
# Titles that are not present in `movies` contribute a precision of 0
# (precision_recall_at_k returns (0, 0) for them).
movie_titles = ['Avatar', 'Inception', 'The Dark Knight']
map_score = mean_average_precision(movie_titles, k=5, threshold=0.5)
print(f"Mean Average Precision (MAP): {map_score:.2f}")

Mean Average Precision (MAP): 0.13


In [10]:
def diversity(similarity_matrix, k=5):
    """
    Calculate diversity of recommendations.

    For every row, the ``k`` most similar *other* items form its
    recommendation list; the list's diversity is 1 minus the mean similarity
    over all ordered pairs of distinct items in it. The result is the mean
    over all rows whose list has at least two items.

    Args:
        similarity_matrix (numpy array): Square item-by-item similarity matrix.
        k (int): Number of recommendations to consider per item.

    Returns:
        diversity_score (float): Average pairwise dissimilarity of
            recommendations, or ``nan`` when no list contains a pair
            (e.g. ``k < 2`` or fewer than three items).
    """
    scores = np.asarray(similarity_matrix, dtype=float)
    num_movies = scores.shape[0]
    diversity_scores = []

    for position in range(num_movies):
        # Stable descending order; the item itself is removed explicitly,
        # because with ties or an all-zero row it need not sort first.
        ranking = np.argsort(-scores[position], kind='stable')
        top_k = ranking[ranking != position][:max(k, 0)]
        if len(top_k) < 2:
            continue  # pairwise similarity is undefined for fewer than 2 items

        # Similarities among the recommended items, without the diagonal.
        block = scores[np.ix_(top_k, top_k)]
        off_diagonal = block[~np.eye(len(top_k), dtype=bool)]
        diversity_scores.append(1 - off_diagonal.mean())  # Dissimilarity

    if not diversity_scores:
        return float('nan')
    return float(np.mean(diversity_scores))

# Example usage
# diversity() ranks every row of the similarity matrix, so this call may take
# a while on the full catalogue.
diversity_score = diversity(similarity, k=5)
print(f"Diversity: {diversity_score:.2f}")

Diversity: 0.83


In [11]:
# Evaluate for a list of movies
movie_titles = ['Avatar', 'Inception', 'The Dark Knight']

# Single source of truth for the evaluation settings, so the values passed to
# the metric functions and the "@k" shown in the printed labels cannot drift.
EVAL_K = 5
EVAL_THRESHOLD = 0.5

# Calculate metrics
precision_list = []
recall_list = []
f1_list = []

for movie_title in movie_titles:
    precision, recall = precision_recall_at_k(movie_title, k=EVAL_K, threshold=EVAL_THRESHOLD)
    f1 = f1_score(precision, recall)

    precision_list.append(precision)
    recall_list.append(recall)
    f1_list.append(f1)

# Print results (per-movie metrics are averaged; coverage and diversity are
# computed over the whole similarity matrix)
print(f"Average Precision@{EVAL_K}: {np.mean(precision_list):.2f}")
print(f"Average Recall@{EVAL_K}: {np.mean(recall_list):.2f}")
print(f"Average F1-Score: {np.mean(f1_list):.2f}")
print(f"Coverage: {coverage(similarity, k=EVAL_K) * 100:.2f}%")
print(f"Diversity: {diversity(similarity, k=EVAL_K):.2f}")

Average Precision@5: 0.13
Average Recall@5: 0.33
Average F1-Score: 0.19
Coverage: 94.25%
Diversity: 0.83
