# **Movies Dataset Content-Based and Collaborative Filtering Models**

## **Collaborators** 
- Ashna Sood 
- Urmi Suresh
- Tae Kim 
- Xianglong Wang

## **Imports** 

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import ast
import os 
import pickle
import math

import seaborn as sns
sns.set()
sns.set_context('talk')

import warnings
warnings.filterwarnings('ignore')

import patsy
import statsmodels.api as sm
import scipy.stats as stats

from sklearn.metrics import make_scorer, accuracy_score, plot_confusion_matrix
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold, GridSearchCV
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.manifold import TSNE

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import linear_kernel, cosine_similarity

from nltk.stem.snowball import SnowballStemmer

from sklearn import metrics

## **Content Based Recommender System Using Cleaned Metadata**

In [2]:
# read in cleaned movies metadata csv file
movies_df = pd.read_csv("movies_metadata_cleaned.csv")
movies_df

Unnamed: 0,ID,IMDB ID,Title,Collection,Genres,Language,Spoken Languages,Release Date,Runtime,Revenue,...,Production Countries,Popularity Rating,Vote Count,Vote Average,Keywords,Cast,Director,Writer,Producer,Metadata
0,461257,tt6980792,Queerama,,[],en,['en'],2017-06-09,75.0,,...,['United Kingdom'],0.163015,,,[],[],daisyasquith,,,daisyasquith
1,92323,tt0081758,Willie and Phil,,[],en,[],1980-08-15,115.0,,...,[],0.326500,,,[],"['michaelontkean', 'raysharkey', 'margotkidder']",paulmazursky,paulmazursky,,paulmazursky paulmazursky michaelontkean rays...
2,114838,tt0029949,Brother Rat,,['Comedy'],en,['en'],1938-10-29,87.0,,...,['United States of America'],0.174691,,,['basedonplayormusical'],"['ronaldreagan', 'janewyman', 'priscillalane',...",williamkeighley,jerrywald,,williamkeighley jerrywald Comedy ronaldreagan...
3,264723,tt0070580,Le pélican,,[],en,[],1974-02-06,83.0,,...,[],0.000115,,,[],[],gérardblain,,,gérardblain
4,88061,tt0055459,"So Evil, So Young",,['Drama'],en,['en'],1963-01-01,77.0,,...,[],0.001662,,,"['prison', ""women'sprison""]","['jillireland', 'ellenpollock', 'joanhaythorne...",godfreygrayson,markgrantham,,godfreygrayson markgrantham Drama jillireland...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42273,24428,tt0848228,The Avengers,theavengerscollection,"['Science Fiction', 'Action', 'Adventure']",en,['en'],2012-04-25,143.0,1.519558e+09,...,['United States of America'],89.887648,12000.0,7.4,"['newyork', 'shield', 'marvelcomic', 'superher...","['robertdowneyjr.', 'chrisevans', 'markruffalo...",josswhedon,josswhedon,stanlee,josswhedon josswhedon stanlee theavengerscolle...
42274,19995,tt0499549,Avatar,avatarcollection,"['Action', 'Adventure', 'Fantasy', 'Science Fi...",en,"['en', 'es']",2009-12-10,162.0,2.787965e+09,...,"['United States of America', 'United Kingdom']",185.070892,12114.0,7.2,"['cultureclash', 'future', 'spacewar', 'spacec...","['samworthington', 'zoesaldana', 'sigourneywea...",jamescameron,jamescameron,jamescameron,jamescameron jamescameron jamescameron avatarc...
42275,155,tt0468569,The Dark Knight,thedarkknightcollection,"['Drama', 'Action', 'Crime', 'Thriller']",en,"['en', 'zh']",2008-07-16,152.0,1.004558e+09,...,"['United Kingdom', 'United States of America']",123.167259,12269.0,8.3,"['dccomics', 'crimefighter', 'secretidentity',...","['christianbale', 'michaelcaine', 'heathledger...",christophernolan,christophernolan,charlesroven,christophernolan christophernolan charlesroven...
42276,27205,tt1375666,Inception,,"['Action', 'Thriller', 'Science Fiction', 'Mys...",en,['en'],2010-07-14,148.0,8.255328e+08,...,"['United Kingdom', 'United States of America']",29.108149,14075.0,8.1,"['lossoflover', 'dream', 'kidnapping', 'sleep'...","['leonardodicaprio', 'josephgordon-levitt', 'e...",christophernolan,christophernolan,christophernolan,christophernolan christophernolan christophern...


In [19]:
# vectorize the movies' metadata
count = CountVectorizer(analyzer='word',ngram_range=(1, 2),min_df=0, stop_words='english')
# tokenizes the strings and returns a vector for each string 
count_matrix = count.fit_transform(movies_df["Metadata"])

# calculate cosine similarity between the movies
cosine_sim = cosine_similarity(count_matrix, count_matrix)

# save cosine similarity matrix 
outfile = "metadata_cosineSim"
np.save(outfile, cosine_sim)

In [4]:
# load metadata cosine similarity matrix 
cosine_sim_loaded = np.load("metadata_cosineSim.npy")

In [42]:
movies_df = movies_df.reset_index()
movie_titles = movies_df['Title']
indices = pd.Series(movies_df.index, index=movies_df['Title'])

Content Based recommender method to extract movie recommendations based on input movie title.

In [43]:
# method concept inspired from kaggle notebook 
def get_recommendations(movie_title):
    movie_index = indices[movie_title]
    sim_scores = list(enumerate(cosine_sim_loaded[movie_index]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:20]
    movie_indices = [i[0] for i in sim_scores]
    return movie_titles.iloc[movie_indices]

#### Testing Content Based Recommender 

In [53]:
get_recommendations("Legally Blonde")

39816                Legally Blonde 2: Red, White & Blonde
25443                              The Wendell Baker Story
39528                                     Four Christmases
41123                                                 Wild
39936                                             Penelope
40969                                     Cruel Intentions
4570                                                Mentor
23468                                     The Last Templar
33208                                      Legally Blondes
22225                                               Tenure
22907                                                 Girl
24419    Godspell: A Musical Based on the Gospel Accord...
27956                                        Finding Bliss
28708                                      Life With Mikey
28879                                   Overnight Delivery
29111                                         Dear Eleanor
30400                                       Ordinary Wor

In [73]:
# testing reccommender results
get_recommendations("Kabhi Khushi Kabhie Gham")

36719                    Kal Ho Naa Ho
33342              Student of the Year
8498                     The Matriarch
36169               Kuch Kuch Hota Hai
32928           Kabhi Alvida Naa Kehna
38783                  My Name Is Khan
33824               Ae Dil Hai Mushkil
715           The Trouble with Dee Dee
1937                Elizabeth Ekadashi
2497                   Yedyanchi Jatra
20663                    Chupke Chupke
40663      Dilwale Dulhania Le Jayenge
5643                     Wrong Side Up
3473               The Flying Dutchman
7459     The Living Room of the Nation
10026               Prince of Broadway
34860                     Return to Me
28607           Dark Blue Almost Black
34850                       Veer-Zaara
Name: Title, dtype: object

In [36]:
# testing reccommender results
get_recommendations("The Avengers")

42240                          Avengers: Age of Ultron
42207              Captain America: The Winter Soldier
42216                                          Ant-Man
42206                                   Doctor Strange
42168                             Thor: The Dark World
42248                       Captain America: Civil War
42241                                       Iron Man 2
42262                                         Iron Man
42001                              The Incredible Hulk
42235                                             Thor
36848                         Marvel One-Shot: Item 47
42263                                       Iron Man 3
32729            Marvel Studios: Assembling a Universe
42243               Captain America: The First Avenger
34903    Avengers Confidential: Black Widow & Punisher
27752        Iron Man & Captain America: Heroes United
42272                                         Deadpool
33212                   Iron Man & Hulk: Heroes United
36165     

In [18]:
# testing reccomender results 
get_recommendations("The Dark Knight")

42265                 The Dark Knight Rises
42249                         Batman Begins
40136            Batman: Under the Red Hood
42134                          The Prestige
39268             Batman: Assault on Arkham
39673                             Following
41663                        Batman Returns
26688                 The Holcroft Covenant
42276                             Inception
41942                               Dunkirk
6549                               Shooters
40206              Batman: The Killing Joke
35510          Batman & Mr. Freeze: SubZero
2157               The Last Desperate Hours
36562                             Doodlebug
2143                        Evil Behind You
7085                   Atom Man vs Superman
42245    Batman v Superman: Dawn of Justice
14592                      Chicago Overcoat
Name: Title, dtype: object

In [19]:
# testing reccomender results 
get_recommendations("Harry Potter and the Philosopher's Stone")

42212              Harry Potter and the Chamber of Secrets
42190               Harry Potter and the Half-Blood Prince
42203                  Harry Potter and the Goblet of Fire
42217             Harry Potter and the Prisoner of Azkaban
42198            Harry Potter and the Order of the Phoenix
42201         Harry Potter and the Deathly Hallows: Part 1
42220         Harry Potter and the Deathly Hallows: Part 2
1432                                           Bhoot Unkle
1438                                 Jill And Joy's Winter
7643                           A Journey Through Fairyland
16207                                 Wow! A Talking Fish!
41789    Percy Jackson & the Olympians: The Lightning T...
7099                                Puff, the Magic Dragon
7787                                        All She Wishes
863                           The Wonderful Ice Cream Suit
3761                                     Der Struwwelpeter
40224                                          Hocus Poc

In [20]:
# testing reccomender results 
get_recommendations("Tangled")

39802                                   Tangled Ever After
41418                            The Princess and the Frog
5239                                           Cheburashka
8391                    VeggieTales: Josh and the Big Wall
12697    VeggieTales: Minnesota Cuke and the Search for...
41439                                      Sleeping Beauty
41551                                            Enchanted
41697                                                 Bolt
1351                                      Green Legend Ran
1438                                 Jill And Joy's Winter
6489                                           Dragon Hill
9987                                           Malice@Doll
12124                            Кентервильское привидение
36224                                              Tin Toy
35376                                          Red's Dream
37608                                             Luxo Jr.
15898                                        A Flying Sh

In [21]:
# testing reccomender results 
get_recommendations("Cinderella")

39930     Ever After: A Cinderella Story
35993        Three Wishes for Cinderella
41439                    Sleeping Beauty
31032        The Cave of the Golden Rose
10532                        Aşk Kırmızı
38847    Cinderella III: A Twist in Time
10023          Cirque du Soleil: Varekai
15161                More Than a Miracle
13308        Prince and the Evening Star
39026    Cinderella II: Dreams Come True
836           Jails, Hospitals & Hip-Hop
1076                             Sundome
1432                         Bhoot Unkle
1438               Jill And Joy's Winter
2017                                Joni
2520                    Ill Gotten Gains
3760                             Το γάλα
4304                  Counting Backwards
7643         A Journey Through Fairyland
Name: Title, dtype: object

### **Collaborative Filtering**

In [3]:
# read in movie ratings 
ratings_df = pd.read_csv('Movies Data/ratings.csv')

# rename columns
ratings_df = ratings_df.rename(columns={"userId": "User ID", 
                                        "movieId": "Movie ID", 
                                        "timestamp": "Timestamp"})

In [4]:
ratings_df

Unnamed: 0,User ID,Movie ID,rating,Timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556
...,...,...,...,...
26024284,270896,58559,5.0,1257031564
26024285,270896,60069,5.0,1257032032
26024286,270896,63082,4.5,1257031764
26024287,270896,64957,4.5,1257033990


In [5]:
# check for null values
ratings_df.isnull().any()

User ID      False
Movie ID     False
rating       False
Timestamp    False
dtype: bool

In [23]:
!pip install scikit-surprise
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate
from surprise import dump



In [4]:
reader = Reader()

In [8]:
data = Dataset.load_from_df(ratings_df[['User ID', 'Movie ID', 'rating']], reader)
algo = SVD()
cross_validate(algo, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.7965  0.7962  0.7956  0.7962  0.7960  0.7961  0.0003  
MAE (testset)     0.6023  0.6025  0.6019  0.6023  0.6021  0.6022  0.0002  
Fit time          685.16  694.63  693.42  692.81  695.55  692.31  3.70    
Test time         57.71   58.90   49.77   49.97   46.38   52.54   4.88    


{'test_rmse': array([0.79654272, 0.79619573, 0.79561253, 0.79622099, 0.79598232]),
 'test_mae': array([0.60225976, 0.60245458, 0.60187422, 0.6023217 , 0.6021279 ]),
 'fit_time': (685.1593697071075,
  694.6251404285431,
  693.4244229793549,
  692.8122618198395,
  695.5460493564606),
 'test_time': (57.705753564834595,
  58.895099401474,
  49.7657413482666,
  49.96788167953491,
  46.383798360824585)}

In [9]:
training_data = data.build_full_trainset()
algo.fit(training_data)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fd5a709e520>

In [None]:
algo.predict(1, 302, 3).est

# check if the algorithm outputs 

In [None]:
# Dump algorithm to save 
file_name = "SVD_model_ex"
dump.dump(file_name, algo=algo)

#### Reload Collaboraitve Filtering Model

In [24]:
# reload algorithm
from surprise import dump
file_name = "SVD_model_ex"
_, loaded_algo = dump.load(file_name)

Surprise package predict method takes in: 
- uid = user ID
- iid = item id 
- rui = true rating -- optional

#### Testing Collaborative Filtering Model

In [26]:
pred1 = loaded_algo.predict(1, 302, 3)
rating1 = pred1.est
rating1

4.262429134519651

### Hybrid Recommender combining Content Based and Collaborative Filtering Models

Hybrid Recommender combining content based model and collaborative filtering model. Going to input the User ID and title of the movie and return top 20 movies that are similar based on both the metadata of the input movie and the user's preferences and predicted ratings of those movies.

In [27]:
movies_df

Unnamed: 0,index,ID,IMDB ID,Title,Collection,Genres,Language,Spoken Languages,Release Date,Runtime,...,Production Countries,Popularity Rating,Vote Count,Vote Average,Keywords,Cast,Director,Writer,Producer,Metadata
0,0,461257,tt6980792,Queerama,,[],en,['en'],2017-06-09,75.0,...,['United Kingdom'],0.163015,,,[],[],daisyasquith,,,daisyasquith
1,1,92323,tt0081758,Willie and Phil,,[],en,[],1980-08-15,115.0,...,[],0.326500,,,[],"['michaelontkean', 'raysharkey', 'margotkidder']",paulmazursky,paulmazursky,,paulmazursky paulmazursky michaelontkean rays...
2,2,114838,tt0029949,Brother Rat,,['Comedy'],en,['en'],1938-10-29,87.0,...,['United States of America'],0.174691,,,['basedonplayormusical'],"['ronaldreagan', 'janewyman', 'priscillalane',...",williamkeighley,jerrywald,,williamkeighley jerrywald Comedy ronaldreagan...
3,3,264723,tt0070580,Le pélican,,[],en,[],1974-02-06,83.0,...,[],0.000115,,,[],[],gérardblain,,,gérardblain
4,4,88061,tt0055459,"So Evil, So Young",,['Drama'],en,['en'],1963-01-01,77.0,...,[],0.001662,,,"['prison', ""women'sprison""]","['jillireland', 'ellenpollock', 'joanhaythorne...",godfreygrayson,markgrantham,,godfreygrayson markgrantham Drama jillireland...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42273,42273,24428,tt0848228,The Avengers,theavengerscollection,"['Science Fiction', 'Action', 'Adventure']",en,['en'],2012-04-25,143.0,...,['United States of America'],89.887648,12000.0,7.4,"['newyork', 'shield', 'marvelcomic', 'superher...","['robertdowneyjr.', 'chrisevans', 'markruffalo...",josswhedon,josswhedon,stanlee,josswhedon josswhedon stanlee theavengerscolle...
42274,42274,19995,tt0499549,Avatar,avatarcollection,"['Action', 'Adventure', 'Fantasy', 'Science Fi...",en,"['en', 'es']",2009-12-10,162.0,...,"['United States of America', 'United Kingdom']",185.070892,12114.0,7.2,"['cultureclash', 'future', 'spacewar', 'spacec...","['samworthington', 'zoesaldana', 'sigourneywea...",jamescameron,jamescameron,jamescameron,jamescameron jamescameron jamescameron avatarc...
42275,42275,155,tt0468569,The Dark Knight,thedarkknightcollection,"['Drama', 'Action', 'Crime', 'Thriller']",en,"['en', 'zh']",2008-07-16,152.0,...,"['United Kingdom', 'United States of America']",123.167259,12269.0,8.3,"['dccomics', 'crimefighter', 'secretidentity',...","['christianbale', 'michaelcaine', 'heathledger...",christophernolan,christophernolan,charlesroven,christophernolan christophernolan charlesroven...
42276,42276,27205,tt1375666,Inception,,"['Action', 'Thriller', 'Science Fiction', 'Mys...",en,['en'],2010-07-14,148.0,...,"['United Kingdom', 'United States of America']",29.108149,14075.0,8.1,"['lossoflover', 'dream', 'kidnapping', 'sleep'...","['leonardodicaprio', 'josephgordon-levitt', 'e...",christophernolan,christophernolan,christophernolan,christophernolan christophernolan christophern...


In [28]:
# read in movies ID map csv file
movies_ID_map = pd.read_csv("movies_ID_map.csv")
movies_ID_map

Unnamed: 0,Title,ID,Movie ID
0,Queerama,461257,176279
1,Willie and Phil,92323,112577
2,Brother Rat,114838,112548
3,Le pélican,264723,112510
4,"So Evil, So Young",88061,112467
...,...,...,...
42272,Deadpool,293660,122904
42273,The Avengers,24428,89745
42274,Avatar,19995,72998
42275,The Dark Knight,155,58559


In [78]:
def hybrid_recommender(userID, title):
    index = indices[title]
    tmdbId = movies_ID_map.loc[movies_ID_map["Title"] == title]['ID']
    movie_id = movies_ID_map.loc[movies_ID_map["Title"] == title]['Movie ID']
    
    # take top 25 movies based on similarity scores to calculate the vote of the 60th percentile movie
    # then calculate the weighted rating of each movie using IMDB formula 
    sim_scores = list(enumerate(cosine_sim_loaded[int(index)]))
    sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
    sim_scores = sim_scores[1:21]
    similar_movie_indices = [i[0] for i in sim_scores]
    
    movies = movies_df.iloc[similar_movie_indices][['Title', 'Vote Count', 'Vote Average', 'ID']]
    similar_movies_IDs = list(map(int, movies["ID"].values))

    for ID, index in zip(similar_movies_IDs, similar_movie_indices):
        movieID = int(movies_ID_map.loc[movies_ID_map["ID"] == ID]['Movie ID'])
        est_rating = loaded_algo.predict(userID, movieID).est
        movies.loc[index, "est"] = est_rating
        
    movies = movies.sort_values('est', ascending=False)
    return movies

In [79]:
hybrid_recommender(35, 'Om Shanti Om')

Unnamed: 0,Title,Vote Count,Vote Average,ID,est
40663,Dilwale Dulhania Le Jayenge,661.0,9.1,19404,4.188756
1937,Elizabeth Ekadashi,,,305594,4.035075
2949,Room 314,1.0,4.0,60002,4.017024
1275,All The Days Before Tomorrow,,,19509,3.966125
2222,A perfect match,,,52738,3.92129
30496,Take Care,30.0,5.6,253283,3.883554
715,The Trouble with Dee Dee,,,49218,3.872203
38783,My Name Is Khan,237.0,7.7,26022,3.863465
8498,The Matriarch,3.0,5.3,148077,3.851328
158,Silja - nuorena nukkunut,,,468343,3.826293


In [80]:
hybrid_recommender(10, 'The Avengers')

Unnamed: 0,Title,Vote Count,Vote Average,ID,est
42262,Iron Man,8951.0,7.4,1726,4.358564
36165,Team Thor,93.0,7.5,413279,4.226156
41411,Serenity,1287.0,7.4,16320,4.225701
42248,Captain America: Civil War,7462.0,7.1,271110,4.222177
42272,Deadpool,11444.0,7.4,293660,4.203255
32729,Marvel Studios: Assembling a Universe,44.0,6.6,259910,4.160966
42263,Iron Man 3,8951.0,6.8,68721,4.054722
42206,Doctor Strange,5880.0,7.1,284052,4.051236
42207,Captain America: The Winter Soldier,5881.0,7.6,100402,4.002187
42243,Captain America: The First Avenger,7174.0,6.6,1771,3.992446


In [81]:
hybrid_recommender(35, 'The Avengers')

Unnamed: 0,Title,Vote Count,Vote Average,ID,est
41411,Serenity,1287.0,7.4,16320,4.361905
42206,Doctor Strange,5880.0,7.1,284052,4.153873
42262,Iron Man,8951.0,7.4,1726,4.126833
36165,Team Thor,93.0,7.5,413279,4.112717
32729,Marvel Studios: Assembling a Universe,44.0,6.6,259910,4.099205
42248,Captain America: Civil War,7462.0,7.1,271110,4.091808
42272,Deadpool,11444.0,7.4,293660,4.054944
42216,Ant-Man,6029.0,7.0,102899,4.043048
42207,Captain America: The Winter Soldier,5881.0,7.6,100402,3.992822
42240,Avengers: Age of Ultron,6908.0,7.3,99861,3.966149


In [83]:
hybrid_recommender(10, 'Mean Girls')

Unnamed: 0,Title,Vote Count,Vote Average,ID,est
31692,Puella Magi Madoka Magica the Movie Part III: ...,36.0,7.3,212162,4.394826
14299,Live from New York!,5.0,5.4,334328,3.755567
34656,Just One of the Guys,64.0,6.4,24548,3.75471
38215,Geek Charming,188.0,6.0,81250,3.725091
28333,Screwballs,22.0,4.7,25164,3.665975
35695,Frenemies,83.0,5.2,84105,3.548348
39125,It's a Boy Girl Thing,279.0,6.3,37725,3.443764
37231,Zapped,131.0,5.6,278774,3.432579
41469,The DUFF,1372.0,6.8,272693,3.397726
35992,The Cheetah Girls,90.0,4.9,32293,3.391446


In [84]:
hybrid_recommender(35, 'Mean Girls')

Unnamed: 0,Title,Vote Count,Vote Average,ID,est
31692,Puella Magi Madoka Magica the Movie Part III: ...,36.0,7.3,212162,4.404914
14299,Live from New York!,5.0,5.4,334328,3.952208
35695,Frenemies,83.0,5.2,84105,3.521542
39125,It's a Boy Girl Thing,279.0,6.3,37725,3.472875
34656,Just One of the Guys,64.0,6.4,24548,3.401603
38215,Geek Charming,188.0,6.0,81250,3.387064
35099,How to Build a Better Boy,71.0,5.7,286987,3.364189
28333,Screwballs,22.0,4.7,25164,3.328902
37231,Zapped,131.0,5.6,278774,3.312382
41469,The DUFF,1372.0,6.8,272693,3.311737
