# Movie Recommender System

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Ratings table: only the first three CSV columns are read and they are
# relabelled as user_id / movie_id / rating.
ratings_cols = ["user_id", "movie_id", "rating"]
ratings = pd.read_csv(
    "Movie Ratings Dataset/ratings.csv",
    names=ratings_cols,
    usecols=[0, 1, 2],
    skiprows=1,  # skip the first line (presumably the file's own header row)
)
# Coerce the ids to plain integers so they can serve as the merge key later.
ratings["movie_id"] = ratings["movie_id"].map(int)

In [3]:
# Movies table: only the first two CSV columns are read and they are
# relabelled as movie_id / title.
movies_cols = ["movie_id", "title"]
movies = pd.read_csv(
    "Movie Ratings Dataset/movies.csv",
    names=movies_cols,
    usecols=[0, 1],
    skiprows=1,  # skip the first line (presumably the file's own header row)
)
# Drop the year information by keeping only the text before the first " (".
# NOTE(review): this also removes any earlier parenthesised text in a title, and
# titles that differ only by year end up as the same string -- confirm intended.
movies["title"] = movies["title"].map(lambda full_title: full_title.partition(" (")[0])

In [4]:
# Attach a title to every rating. The inner join keeps only movie_ids that
# appear in both tables; the movie columns come first in the result.
ratings_with_titles = movies.merge(ratings, how="inner", on="movie_id")
ratings_with_titles.head()

Unnamed: 0,movie_id,title,user_id,rating
0,1,Toy Story,1,4.0
1,1,Toy Story,5,4.0
2,1,Toy Story,7,4.5
3,1,Toy Story,15,2.5
4,1,Toy Story,17,4.5


In [5]:
# User-by-title matrix: one row per user_id, one column per title, each cell
# holding that user's rating (NaN where the user has not rated the title).
# Several ratings landing in the same cell are averaged (the default aggfunc).
movie_ratings = pd.pivot_table(
    ratings_with_titles,
    values="rating",
    index=["user_id"],
    columns="title",
    aggfunc="mean",
)
movie_ratings.head()

title,'71,'Hellboy': The Seeds of Creation,'Round Midnight,'Salem's Lot,'Til There Was You,'Tis the Season for Love,"'burbs, The",'night Mother,(500) Days of Summer,*batteries not included,...,Zulu,[REC],[REC]²,[REC]³ 3 Génesis,anohana: The Flower We Saw That Day - The Movie,eXistenZ,xXx,xXx: State of the Union,¡Three Amigos!,À nous la liberté
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,4.0,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,


Note that in the `.corr` call below, the `min_periods` parameter sets the "minimum number of observations required per pair of columns." The higher this number, the more likely a user is to be recommended well-known movies with many ratings. Conversely, the lower it is, the more likely a user is to be recommended less popular movies. The value of 80 used here is arbitrary.

In [6]:
# Title-by-title Pearson correlation of the rating columns. A pair of titles
# with fewer than MIN_SHARED_RATINGS users in common is left as NaN.
MIN_SHARED_RATINGS = 80
corr_matrix = movie_ratings.corr(method="pearson", min_periods=MIN_SHARED_RATINGS)
corr_matrix.head()

title,'71,'Hellboy': The Seeds of Creation,'Round Midnight,'Salem's Lot,'Til There Was You,'Tis the Season for Love,"'burbs, The",'night Mother,(500) Days of Summer,*batteries not included,...,Zulu,[REC],[REC]²,[REC]³ 3 Génesis,anohana: The Flower We Saw That Day - The Movie,eXistenZ,xXx,xXx: State of the Union,¡Three Amigos!,À nous la liberté
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'71,,,,,,,,,,,...,,,,,,,,,,
'Hellboy': The Seeds of Creation,,,,,,,,,,,...,,,,,,,,,,
'Round Midnight,,,,,,,,,,,...,,,,,,,,,,
'Salem's Lot,,,,,,,,,,,...,,,,,,,,,,
'Til There Was You,,,,,,,,,,,...,,,,,,,,,,


Suppose a user enjoys sci-fi and superhero movies and dislikes old romantic movies. Our system is then able to make pretty reasonable recommendations.

In [7]:
# A hand-written taste profile as (title, rating) pairs; titles are spelled the
# way they appear in the dataset (e.g. with a trailing ", The").
sample_profile = [
    ("Inception", 5),
    ("X-Men", 5),
    ("Dark Knight, The", 5),
    ("Independence Day", 5),
    ("Lord of the Rings: The Two Towers, The", 4),
    ("Gone with the Wind", 1),
]
some_ratings_df = pd.DataFrame(sample_profile, columns=["title", "rating"]).set_index("title")
# The scoring step works on a title-indexed Series of ratings.
some_ratings = some_ratings_df["rating"]
some_ratings

title
Inception                                 5
X-Men                                     5
Dark Knight, The                          5
Independence Day                          5
Lord of the Rings: The Two Towers, The    4
Gone with the Wind                        1
Name: rating, dtype: int64

In [8]:
# Score every candidate movie. For each title the user rated, take that title's
# correlations with all other movies and weight them by (rating - 3): titles
# rated above 3 pull similar movies up, titles rated below 3 push them down.
NEUTRAL_RATING = 3

weighted_corrs = [
    corr_matrix[title].dropna() * (rating - NEUTRAL_RATING)
    for title, rating in some_ratings.items()
]

# A single pd.concat replaces the Series.append-in-a-loop pattern (Series.append
# no longer exists in pandas 2.0). pd.concat raises on an empty list, hence the
# explicit empty fallback.
if weighted_corrs:
    similar_candidates = pd.concat(weighted_corrs)
else:
    similar_candidates = pd.Series(dtype="float64")

similar_candidates = (
    similar_candidates
    .groupby(level=0).sum()                            # total score per candidate title
    .drop(labels=some_ratings.index, errors="ignore")  # never recommend an already-rated title
    .sort_values(ascending=False)
    .rename_axis(None)                                 # keep the result index unnamed, as before
)

similar_candidates.head(10)

Matrix, The                                           3.002193
Lord of the Rings: The Return of the King, The        2.733098
Lord of the Rings: The Fellowship of the Ring, The    2.623932
Forrest Gump                                          2.051940
Men in Black                                          2.028007
Jurassic Park                                         2.015916
Raiders of the Lost Ark                               2.004781
Silence of the Lambs, The                             1.849120
Shawshank Redemption, The                             1.797040
Memento                                               1.793641
dtype: float64

The same approach works for existing users in our dataset, for example the user with `user_id` 10.

In [9]:
# Take the scoring input from the data this time: the row of the user-by-title
# matrix labelled user_id 10, keeping only the titles that user actually rated.
user_row = movie_ratings.loc[10]
some_ratings = user_row.dropna()

In [10]:
# Score every candidate movie. For each title the user rated, take that title's
# correlations with all other movies and weight them by (rating - 3), which
# penalizes movies similar to titles the user scored low.
NEUTRAL_RATING = 3

weighted_corrs = [
    corr_matrix[title].dropna() * (rating - NEUTRAL_RATING)
    for title, rating in some_ratings.items()
]

# A single pd.concat replaces the Series.append-in-a-loop pattern (Series.append
# no longer exists in pandas 2.0). pd.concat raises on an empty list, hence the
# explicit empty fallback.
if weighted_corrs:
    similar_candidates = pd.concat(weighted_corrs)
else:
    similar_candidates = pd.Series(dtype="float64")

similar_candidates = (
    similar_candidates
    .groupby(level=0).sum()                            # total score per candidate title
    .drop(labels=some_ratings.index, errors="ignore")  # never recommend an already-rated title
    .sort_values(ascending=False)
    .rename_axis(None)                                 # keep the result index unnamed, as before
)

similar_candidates.head(10)

Star Wars: Episode IV - A New Hope                        1.856979
Pirates of the Caribbean: The Curse of the Black Pearl    1.775061
Shrek 2                                                   1.232099
Star Wars: Episode VI - Return of the Jedi                1.000983
Mrs. Doubtfire                                            0.907209
Harry Potter and the Chamber of Secrets                   0.749222
Beauty and the Beast                                      0.738212
Ghost                                                     0.658529
Harry Potter and the Sorcerer's Stone                     0.613875
Mission: Impossible                                       0.548908
dtype: float64