# 10.2 Exercise: Recommender System


In [16]:
# Import libraries
import pandas as pd
import warnings

In [17]:
# Load the raw CSVs and derive the tables used by the rest of the notebook.
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('ratings.csv')

# Attach each rating to its movie row; only ratings with a matching movieId are kept.
movie_ratings = pd.merge(ratings, movies, how='inner', on='movieId')

# One row per user, one column per title; cells hold the (mean) rating, NaN where unrated.
user_movie_matrix = movie_ratings.pivot_table(
    values='rating',
    index='userId',
    columns='title',
    aggfunc='mean',
)

# Per-title mean rating and number of ratings.
ratings_by_title = movie_ratings.groupby('title')['rating']
movie_stats = ratings_by_title.agg(['mean', 'count'])

In [18]:
def recommend_movies_corr_filtered(movie_title, ratings_matrix, min_ratings=50, n=10, min_shared=0):
    """Recommend the movies whose ratings correlate most strongly with `movie_title`.

    Item-item similarity: each movie is a column of user ratings, and the
    similarity between two movies is the Pearson correlation of their columns,
    computed over the users who rated both.

    All statistics are derived from `ratings_matrix` itself, so the function
    works with any matrix passed in and does not depend on notebook globals.

    Parameters
    ----------
    movie_title : str
        Column label of the movie to find recommendations for.
    ratings_matrix : pandas.DataFrame
        Users as rows, movies as columns, ratings as values, NaN where a user
        has not rated a movie.
    min_ratings : int, default 50
        Minimum number of non-missing ratings a movie (including the target)
        needs in `ratings_matrix` to be considered.
    n : int, default 10
        Maximum number of recommendations to return.
    min_shared : int, default 0
        Minimum number of users who rated both the target and the candidate.
        The default applies no overlap filter; raise it to discard
        correlations that rest on only a handful of common users.

    Returns
    -------
    pandas.DataFrame or str
        DataFrame indexed by movie with columns ``correlation`` and ``count``
        (number of ratings of that movie in `ratings_matrix`), sorted by
        correlation in descending order and excluding `movie_title`.
        A message string is returned instead when the movie is unknown or does
        not survive the filters.
    """
    # If the requested movie isn't in the matrix, return a message
    if movie_title not in ratings_matrix.columns:
        return f"Movie '{movie_title}' not found."

    # Number of users who rated each movie, taken from the matrix we were given
    rating_counts = ratings_matrix.count().rename('count')

    # Keep only movies with at least `min_ratings` ratings
    candidates = ratings_matrix.loc[:, rating_counts >= min_ratings]

    # Drop movies with zero variance (same rating from everyone) since Pearson
    # correlation is undefined for them; NaN std (a single rating) is dropped too
    candidates = candidates.loc[:, candidates.std(skipna=True) > 0]

    # Check the target movie exists after filtering
    if movie_title not in candidates.columns:
        return f"Movie '{movie_title}' has insufficient data after filtering."

    target_ratings = candidates[movie_title]

    # Pairwise Pearson correlation with every remaining movie. RuntimeWarnings
    # (e.g. "Degrees of freedom <= 0 for slice") come from pairs with too little
    # overlap; those pairs produce NaN and are dropped below.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore", category=RuntimeWarning)
        correlations = candidates.corrwith(target_ratings)

    corr_df = correlations.rename('correlation').to_frame().dropna()
    corr_df = corr_df.join(rating_counts)

    # Optionally require a minimum number of users who rated BOTH movies
    if min_shared > 0:
        shared_counts = candidates.loc[target_ratings.notna()].count()
        corr_df = corr_df[shared_counts.reindex(corr_df.index) >= min_shared]

    ranked = corr_df.sort_values('correlation', ascending=False)

    # Return the top N recommendations, excluding the original movie itself
    return ranked.drop(index=movie_title, errors='ignore').head(n)

In [19]:
# Example: recommendations for a single title, using the default thresholds
recommend_movies_corr_filtered(
    movie_title='Toy Story (1995)',
    ratings_matrix=user_movie_matrix,
)

Unnamed: 0_level_0,correlation,count
title,Unnamed: 1_level_1,Unnamed: 2_level_1
Toy Story 2 (1999),0.699211,97
Arachnophobia (1990),0.652424,53
"Incredibles, The (2004)",0.643301,125
Finding Nemo (2003),0.618701,141
Aladdin (1992),0.611892,183
Erin Brockovich (2000),0.598016,70
Wallace & Gromit: The Wrong Trousers (1993),0.589625,56
Blazing Saddles (1974),0.585892,62
"Wolf of Wall Street, The (2013)",0.578479,54
Toy Story 3 (2010),0.577446,55


In [20]:
# User input
import ipywidgets as widgets
from IPython.display import display

# Offer only titles with at least 100 ratings, listed alphabetically.
has_many_ratings = movie_stats['count'] >= 100
popular_movies = sorted(movie_stats.index[has_many_ratings].tolist())

# Dropdown for choosing a movie, plus an output area for the recommendations.
movie_dropdown = widgets.Dropdown(description='Pick Movie:', options=popular_movies)
output = widgets.Output()

def on_movie_selected(change):
    """Refresh the output area with recommendations for the newly selected movie.

    `change` is the change notification passed by the dropdown's observer;
    its `new` attribute holds the selected title.
    """
    selected_title = change.new
    with output:
        # Remove whatever the previous selection rendered before drawing again
        output.clear_output()
        print(f"Top 10 recommendations for: {selected_title}")
        recommendations = recommend_movies_corr_filtered(selected_title, user_movie_matrix)
        display(recommendations)

# Call the handler whenever the dropdown's selected value changes.
movie_dropdown.observe(handler=on_movie_selected, names='value')

# Render the dropdown followed by the output area.
display(movie_dropdown)
display(output)

Dropdown(description='Pick Movie:', options=('2001: A Space Odyssey (1968)', 'Ace Ventura: Pet Detective (1994…

Output()

## Process Summary

1. **Load Data**: Import ratings and movies data from CSV files.
2. **Preprocess**: Merge ratings with movie titles and create a user-movie ratings matrix.
3. **Correlation Calculation**: Compute the Pearson correlation between the input movie's ratings and every other movie's ratings, using only the users who rated both movies.
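4. **Filtering**: Exclude movies with fewer than 50 total ratings, as well as movies whose ratings have zero variance, to improve recommendation quality. In the code this filter is applied before the correlations are computed.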
5. **Recommendation**: Return top 10 correlated movies excluding the input.

The recommender system is built using item-item similarity via Pearson correlation between movie rating vectors. The idea is that if two movies tend to be rated similarly by the same users, they are likely to appeal to the same person.