# Content-based Filtering

In [34]:
# Import necessary libraries and submodules
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 

from surprise import Dataset, Reader
from surprise import SVD # implementation of Funk's SVD (gradient descent-based matrix factorization)
from surprise import accuracy # metric
from surprise.model_selection import train_test_split, GridSearchCV #train/test splits, crossval

from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.model_selection import train_test_split

In [None]:
# Load data into dataframes
ratings = pd.read_csv('../data/ratings.csv')
movies = pd.read_csv('../data/movies.csv')
tags = pd.read_csv('../data/tags.csv')

### One-hot encode genres

In [26]:
# One-hot encode genres
movies['genres_split'] = movies['genres'].str.split('|')  # Split genres into lists
mlb = MultiLabelBinarizer()
genre_encoded = mlb.fit_transform(movies['genres_split'])
genre_df = pd.DataFrame(genre_encoded, columns=mlb.classes_)

### Creating a content feature matrix

In [27]:
# Vectorize tags using TF-IDF
tag_vectorizer = TfidfVectorizer(max_features=500)  # Adjust max_features as needed
tag_features = tag_vectorizer.fit_transform(tags['tag'])

# Align indices and combine features
tag_df = pd.DataFrame(tag_features.toarray()).reset_index(drop=True)
content_features = pd.concat([genre_df, tag_df], axis=1)

content_features = content_features.fillna(0)

### Calculating similarity between movies

In [28]:
# Calculate cosine similarity between all movies
similarity_matrix = cosine_similarity(content_features)

### Generating recommendations

Custom function for recommending the 10 most similar movies (according to cosine similarity) to the movie used as input.

In [29]:
def recommend_movies(movie_title, movies_df, similarity_matrix, top_n=10):
    # Get the index of the movie
    idx = movies_df[movies_df['title'] == movie_title].index[0]
    
    # Get similarity scores for the movie
    sim_scores = list(enumerate(similarity_matrix[idx]))
    
    # Sort movies by similarity scores
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    
    # Get indices of top_n most similar movies
    top_movies = sim_scores[1:top_n+1]  # Exclude the movie itself
    top_indices = [i[0] for i in top_movies]
    
    # Return titles of top similar movies
    return movies_df.iloc[top_indices]['title'].values

#### Let's test it out!

In [30]:
recommendations = recommend_movies('Toy Story (1995)', movies, similarity_matrix)
print(recommendations)

['Adventures of Rocky and Bullwinkle, The (2000)' 'Wild, The (2006)'
 'Shrek the Third (2007)' 'Tale of Despereaux, The (2008)'
 'Asterix and the Vikings (Astérix et les Vikings) (2006)' 'Turbo (2013)'
 'The Good Dinosaur (2015)' 'Moana (2016)' 'Antz (1998)'
 'Toy Story 2 (1999)']


Looks alright at least with Toy Story.

### More personalized recommendations

Finding movies that the user has rated highly and using those movies as a basis for finding similar movies.

In [33]:
# Get movies rated highly by a user
user_high_ratings = ratings[ratings['userId'] == 2][ratings['rating'] >= 4.0]

# Find similar movies to those rated highly
similar_movies = []
for movie_id in user_high_ratings['movieId']:
    similar_movies.extend(recommend_movies(movies.loc[movies['movieId'] == movie_id, 'title'].values[0], 
                                           movies, similarity_matrix))

# Deduplicate and sort recommendations
personalized_recommendations = list(set(similar_movies))
print(personalized_recommendations)

['RoboCop 3 (1993)', 'Fear of a Black Hat (1994)', 'Going Places (Valseuses, Les) (1974)', 'Heavy (1995)', 'Warriors of Heaven and Earth (Tian di ying xiong) (2003)', 'Roger & Me (1989)', 'Village, The (2004)', 'Jane Eyre (1996)', "Microcosmos (Microcosmos: Le peuple de l'herbe) (1996)", 'My Crazy Life (Mi vida loca) (1993)', 'Shadow of a Doubt (1943)', 'Three Colors: Red (Trois couleurs: Rouge) (1994)', 'Man on Fire (2004)', 'Pootie Tang (2001)', 'Year of the Horse (1997)', 'Insomnia (2002)', 'Total Eclipse (1995)', 'Paper Moon (1973)', "I'm Gonna Git You Sucka (1988)", 'To End All Wars (2001)', 'John Q (2002)', 'Man in the Iron Mask, The (1998)', 'Source Code (2011)', 'Watchmen (2009)', 'Total Recall (1990)', 'Miami Blues (1990)', 'Witness for the Prosecution (1957)', 'Ready to Wear (Pret-A-Porter) (1994)', 'Guardian, The (2006)', 'X2: X-Men United (2003)', 'Untouchables, The (1987)', 'Ladybird Ladybird (1994)', 'Living in Oblivion (1995)', 'Line King: The Al Hirschfeld Story, The (1

  user_high_ratings = ratings[ratings['userId'] == 2][ratings['rating'] >= 4.0]


## Model evaluation

In [35]:
train_data, test_data = train_test_split(ratings, test_size=0.2, random_state=42)

In [49]:
def get_content_recommendations(user_id, train_data, similarity_matrix, top_n=10):
    # Get movies the user has rated
    user_rated_movies = train_data[train_data['userId'] == user_id]['movieId'].tolist()

    # Map movie IDs to similarity matrix indices
    user_rated_indices = [movie_id_to_index[movie_id] for movie_id in user_rated_movies if movie_id in movie_id_to_index]

    # Aggregate similarity scores for rated movies
    if not user_rated_indices:  # Handle users with no valid ratings
        return []
    
    scores = np.mean(similarity_matrix[user_rated_indices], axis=0)

    # Sort scores and recommend top_n movies
    recommended_movie_indices = np.argsort(scores)[::-1][:top_n]

    return [movies.iloc[idx]['movieId'] for idx in recommended_movie_indices if idx in movie_id_to_index.values()]

In [50]:
def evaluate_precision_recall(user_id, test_data, train_data, similarity_matrix, top_n=10):
    """
    Evaluate Precision@k and Recall@k for a specific user.
    """
    # Ground truth: Movies the user interacted with in the test set
    test_movies = test_data[test_data['userId'] == user_id]['movieId'].tolist()

    # Generate content-based recommendations
    recommended_movies = get_content_recommendations(user_id, train_data, similarity_matrix, top_n)

    # Calculate precision and recall
    relevant_recommendations = len(set(recommended_movies) & set(test_movies))
    precision = relevant_recommendations / top_n
    recall = relevant_recommendations / len(test_movies) if test_movies else 0

    return precision, recall

# Evaluate Precision@k and Recall@k for all users in the test set
precisions = []
recalls = []

for user_id in test_data['userId'].unique():
    precision, recall = evaluate_precision_recall(user_id, test_data, train_data, similarity_matrix, top_n=10)
    precisions.append(precision)
    recalls.append(recall)

# Compute and display average Precision@k and Recall@k
avg_precision = np.mean(precisions)
avg_recall = np.mean(recalls)
print(f"Average Precision@10: {avg_precision:.4f}")
print(f"Average Recall@10: {avg_recall:.4f}")

Average Precision@10: 0.0066
Average Recall@10: 0.0026


In [48]:
# Map movieId to the corresponding row index in similarity_matrix
movie_id_to_index = {movie_id: idx for idx, movie_id in enumerate(movies['movieId'])}

In [51]:
def evaluate_ndcg(user_id, test_data, train_data, similarity_matrix, top_n=10):
    """
    Evaluate NDCG@k for a specific user.
    """
    # Ground truth: Movies the user interacted with in the test set
    test_movies = test_data[test_data['userId'] == user_id]['movieId'].tolist()

    # Generate content-based recommendations
    recommended_movies = get_content_recommendations(user_id, train_data, similarity_matrix, top_n)

    # Calculate DCG (Discounted Cumulative Gain)
    dcg = sum([1 / np.log2(i + 2) for i, movie in enumerate(recommended_movies) if movie in test_movies])

    # Calculate IDCG (Ideal DCG)
    idcg = sum([1 / np.log2(i + 2) for i in range(len(test_movies)) if i < top_n])

    # Compute NDCG
    ndcg = dcg / idcg if idcg > 0 else 0
    return ndcg

# Evaluate NDCG for all users in the test set
ndcgs = []

for user_id in test_data['userId'].unique():
    ndcg = evaluate_ndcg(user_id, test_data, train_data, similarity_matrix, top_n=10)
    ndcgs.append(ndcg)

# Compute and display average NDCG@k
avg_ndcg = np.mean(ndcgs)
print(f"Average NDCG@10: {avg_ndcg:.4f}")

Average NDCG@10: 0.0068


These scores are absolutely horrendous.