## Imports

In [43]:
import pandas as pd

import surprise
from surprise.prediction_algorithms import *
import pandas as pd
import numpy as np
import datetime as dt

from surprise import Dataset
from surprise import Reader

from surprise.prediction_algorithms import knns
from surprise.similarities import cosine, msd, pearson
from surprise import accuracy
from surprise.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from surprise.model_selection import cross_validate

## EDA

In [44]:
movies = pd.read_csv('../../ml-latest-small/movies.csv')

In [45]:
link = pd.read_csv('../../ml-latest-small/links.csv')

In [46]:
rating = pd.read_csv('../../ml-latest-small/ratings.csv')

In [47]:
tags = pd.read_csv('../../ml-latest-small/tags.csv')

In [48]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [49]:
movies.isna().sum()

movieId    0
title      0
genres     0
dtype: int64

In [50]:
movies.title.value_counts()

Emma (1996)                               2
Eros (2004)                               2
War of the Worlds (2005)                  2
Confessions of a Dangerous Mind (2002)    2
Saturn 3 (1980)                           2
                                         ..
Africa: The Serengeti (1994)              1
All Roads Lead to Rome (2016)             1
Human Traffic (1999)                      1
Beautiful Losers (2008)                   1
Portrait of a Lady, The (1996)            1
Name: title, Length: 9737, dtype: int64

In [51]:
#Check to see if there are any duplicate titles
movies.title.duplicated().sum()

5

In [52]:
#Drop the 5 duplicated movie titles
movies.drop_duplicates(subset='title', inplace=True)

In [53]:
#Sanity check to ensure all duplicates were dropped from title column

movies.title.duplicated().sum()

0

In [54]:
movies[movies.title.duplicated() == True]

Unnamed: 0,movieId,title,genres


In [55]:
#Check length of DataFrame

len(movies)

9737

In [56]:
# Split title column into two new columns: Title and year 

movies['Title'] = movies['title'].str.split('(', n=1, expand=True)[0]

movies['year_released'] = movies['title'].str.split('(', n=1, expand=True)[1]

In [57]:
# Drop original column title

movies.drop(columns='title', inplace=True)

In [58]:
# Sanity Check 
movies

Unnamed: 0,movieId,genres,Title,year_released
0,1,Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995)
1,2,Adventure|Children|Fantasy,Jumanji,1995)
2,3,Comedy|Romance,Grumpier Old Men,1995)
3,4,Comedy|Drama|Romance,Waiting to Exhale,1995)
4,5,Comedy,Father of the Bride Part II,1995)
...,...,...,...,...
9737,193581,Action|Animation|Comedy|Fantasy,Black Butler: Book of the Atlantic,2017)
9738,193583,Animation|Comedy|Fantasy,No Game No Life: Zero,2017)
9739,193585,Drama,Flint,2017)
9740,193587,Action|Animation,Bungo Stray Dogs: Dead Apple,2018)


In [59]:
# Format year_released column

movies['year_released'] = movies.year_released.str.replace(')', '')

In [60]:
# Sanity Check to ensure formatting was completed
movies

Unnamed: 0,movieId,genres,Title,year_released
0,1,Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995
1,2,Adventure|Children|Fantasy,Jumanji,1995
2,3,Comedy|Romance,Grumpier Old Men,1995
3,4,Comedy|Drama|Romance,Waiting to Exhale,1995
4,5,Comedy,Father of the Bride Part II,1995
...,...,...,...,...
9737,193581,Action|Animation|Comedy|Fantasy,Black Butler: Book of the Atlantic,2017
9738,193583,Animation|Comedy|Fantasy,No Game No Life: Zero,2017
9739,193585,Drama,Flint,2017
9740,193587,Action|Animation,Bungo Stray Dogs: Dead Apple,2018


In [61]:
link

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0
...,...,...,...
9737,193581,5476944,432131.0
9738,193583,5914996,445030.0
9739,193585,6397426,479308.0
9740,193587,8391976,483455.0


In [62]:
rating

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [63]:
#converted timestamp to datetime
rating['timestamp'] = pd.to_datetime(rating['timestamp'], unit='s')

In [64]:
rating.rating.value_counts(normalize=True)

4.0    0.265957
3.0    0.198808
5.0    0.131015
3.5    0.130271
4.5    0.084801
2.0    0.074884
2.5    0.055040
1.0    0.027877
1.5    0.017762
0.5    0.013586
Name: rating, dtype: float64

In [65]:
rating.duplicated().sum()

0

In [66]:
rating.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [67]:
tags

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200
...,...,...,...,...
3678,606,7382,for katie,1171234019
3679,606,7936,austere,1173392334
3680,610,3265,gun fu,1493843984
3681,610,3265,heroic bloodshed,1493843978


In [68]:
movie_rating = movies.merge(rating, on='movieId', how='outer')

In [69]:
movie_rating

Unnamed: 0,movieId,genres,Title,year_released,userId,rating,timestamp
0,1,Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995,1.0,4.0,2000-07-30 18:45:03
1,1,Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995,5.0,4.0,1996-11-08 06:36:02
2,1,Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995,7.0,4.5,2005-01-25 06:52:26
3,1,Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995,15.0,2.5,2017-11-13 12:59:30
4,1,Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995,17.0,4.5,2011-05-18 05:28:03
...,...,...,...,...,...,...,...
100849,64997,,,,68.0,2.5,2008-12-28 20:55:15
100850,144606,,,,111.0,4.0,2018-01-31 23:27:37
100851,147002,,,,318.0,4.0,2017-08-08 15:45:52
100852,26958,,,,509.0,3.5,2015-07-04 17:42:33


In [70]:
movie_rating.dropna(inplace=True)

In [71]:
movie_rating['genres']=[row.strip().lower().replace('|',',') for row in movie_rating['genres']]

In [72]:
movie_rating['userId'].nunique()

610

In [73]:
stats = movie_rating.filter(['rating', 'timestamp']).describe()
stats

Unnamed: 0,rating
count,100813.0
mean,3.501557
std,1.042494
min,0.5
25%,3.0
50%,3.5
75%,4.0
max,5.0


In [74]:
# print(dt.datetime.fromtimestamp(stats.loc['min', 'timestamp']))
# print(dt.datetime.fromtimestamp(stats.loc['max', 'timestamp']))

In [75]:
movie_rating['Title'] = movie_rating['Title'].str.split('(', n=1, expand=True)[0]

movie_rating['year_released'] = movie_rating['Title'].str.split('(', n=1, expand=True)[0]

In [76]:
# Drop original column title
#movie_rating.drop('title',axis=1,inplace=True)

In [77]:
# Format year_released column

movie_rating['year_released'] = movie_rating.year_released.str.replace(')', '')

In [78]:
#converted timestamp to datetime
movie_rating['timestamp'] = pd.to_datetime(movie_rating['timestamp'], unit='s')

In [79]:
movie_rating['genres'] = [row.strip().lower().replace('|',',') for row in movie_rating['genres']]

In [80]:
movie_rating['genres'] = movie_rating['genres'].to_list()

In [81]:
movie_rating

Unnamed: 0,movieId,genres,Title,year_released,userId,rating,timestamp
0,1,"adventure,animation,children,comedy,fantasy",Toy Story,Toy Story,1.0,4.0,2000-07-30 18:45:03
1,1,"adventure,animation,children,comedy,fantasy",Toy Story,Toy Story,5.0,4.0,1996-11-08 06:36:02
2,1,"adventure,animation,children,comedy,fantasy",Toy Story,Toy Story,7.0,4.5,2005-01-25 06:52:26
3,1,"adventure,animation,children,comedy,fantasy",Toy Story,Toy Story,15.0,2.5,2017-11-13 12:59:30
4,1,"adventure,animation,children,comedy,fantasy",Toy Story,Toy Story,17.0,4.5,2011-05-18 05:28:03
...,...,...,...,...,...,...,...
100843,193581,"action,animation,comedy,fantasy",Black Butler: Book of the Atlantic,Black Butler: Book of the Atlantic,184.0,4.0,2018-09-16 14:44:42
100844,193583,"animation,comedy,fantasy",No Game No Life: Zero,No Game No Life: Zero,184.0,3.5,2018-09-16 14:52:25
100845,193585,drama,Flint,Flint,184.0,3.5,2018-09-16 14:56:45
100846,193587,"action,animation",Bungo Stray Dogs: Dead Apple,Bungo Stray Dogs: Dead Apple,184.0,3.5,2018-09-16 15:00:21


## Split

In [82]:
train, test = train_test_split(movie_rating)

In [83]:
reader = Reader(rating_scale=(1, 5))
train_data = Dataset.load_from_df(train[['userId', 'movieId', 'rating']], reader)

In [84]:
test_data = Dataset.load_from_df(test[['userId', 'movieId', 'rating']], reader)

## Tuning

In [86]:
svd = SVD(n_factors=125,n_epochs=45,lr_all=0.015,reg_all=0.1)
cross_validate(svd, train_data, measures=['RMSE', 'MAE'], cv=3, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 3 split(s).

                  Fold 1  Fold 2  Fold 3  Mean    Std     
RMSE (testset)    0.8745  0.8817  0.8708  0.8757  0.0045  
MAE (testset)     0.6729  0.6751  0.6699  0.6726  0.0021  
Fit time          6.11    6.04    6.04    6.06    0.03    
Test time         0.15    0.20    0.14    0.16    0.03    


{'test_rmse': array([0.87448027, 0.88171964, 0.87080648]),
 'test_mae': array([0.67287595, 0.67507657, 0.66990601]),
 'fit_time': (6.110288381576538, 6.038872241973877, 6.0396811962127686),
 'test_time': (0.14614510536193848, 0.20148754119873047, 0.14259004592895508)}

In [88]:
svd=SVD(n_factors=125,n_epochs=45,lr_all=0.015,reg_all=0.1)

## Deployment 

https://towardsdatascience.com/how-you-can-build-simple-recommender-systems-with-surprise-b0d32a8e4802 code stolen

In [89]:
data = train_data

In [90]:
trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2963d470ee0>

In [93]:
svd.predict(uid=10, iid=100)

Prediction(uid=10, iid=100, r_ui=None, est=2.976361264678584, details={'was_impossible': False})

In [94]:
Prediction(uid=10, iid=100, r_ui=None, est=4.051206489275292, details={'was_impossible': False})

Prediction(uid=10, iid=100, r_ui=None, est=4.051206489275292, details={'was_impossible': False})

In [102]:
movie_rating

Unnamed: 0,movieId,genres,Title,year_released,userId,rating,timestamp
0,1,"adventure,animation,children,comedy,fantasy",Toy Story,Toy Story,1.0,4.0,2000-07-30 18:45:03
1,1,"adventure,animation,children,comedy,fantasy",Toy Story,Toy Story,5.0,4.0,1996-11-08 06:36:02
2,1,"adventure,animation,children,comedy,fantasy",Toy Story,Toy Story,7.0,4.5,2005-01-25 06:52:26
3,1,"adventure,animation,children,comedy,fantasy",Toy Story,Toy Story,15.0,2.5,2017-11-13 12:59:30
4,1,"adventure,animation,children,comedy,fantasy",Toy Story,Toy Story,17.0,4.5,2011-05-18 05:28:03
...,...,...,...,...,...,...,...
100843,193581,"action,animation,comedy,fantasy",Black Butler: Book of the Atlantic,Black Butler: Book of the Atlantic,184.0,4.0,2018-09-16 14:44:42
100844,193583,"animation,comedy,fantasy",No Game No Life: Zero,No Game No Life: Zero,184.0,3.5,2018-09-16 14:52:25
100845,193585,drama,Flint,Flint,184.0,3.5,2018-09-16 14:56:45
100846,193587,"action,animation",Bungo Stray Dogs: Dead Apple,Bungo Stray Dogs: Dead Apple,184.0,3.5,2018-09-16 15:00:21


In [112]:
import difflib
import random

def get_movie_id(movie_title, metadata):
    
    """
    Gets the movie ID for a movie title based on the closest match in the metadata dataframe.
    """
    
    existing_titles = list(metadata['Title'].values)
    closest_titles = difflib.get_close_matches(movie_title, existing_titles)
    movie_id = metadata[metadata['Title'] == closest_titles[0]]['movieId'].values[0]
    return movie_id

def get_movie_info(movie_id, metadata):
    
    """
    Returns some basic information about a movie given the movie id and the metadata dataframe.
    """
    
    movie_info = metadata[metadata['movieId'] == movie_id][['genres','Title','year_released']]
    return movie_info.to_dict(orient='records')

def predict_review(user_id, movie_title, model, metadata):
    
    """
    Predicts the review (on a scale of 1-5) that a user would assign to a specific movie. 
    """
    
    movie_id = get_movie_id(movie_title, metadata)
    review_prediction = model.predict(uid=user_id, iid=movie_id)
    return review_prediction.est

def generate_recommendation(user_id, model, metadata, thresh=4):
    
    """
    Generates a movie recommendation for a user based on a rating threshold. Only
    movie with a predicted rating at or above the threshold will be recommended
    """
    
    movie_titles = list(metadata['Title'].values)
    random.shuffle(movie_titles)
    
    for movie_title in movie_titles:
        rating = predict_review(user_id, movie_title, model, metadata)
        if rating >= thresh:
            movie_id = get_movie_id(movie_title, metadata)
            return get_movie_info(movie_id, metadata)

In [115]:
user_id = 10
model = svd
metadata = movies
thresh=4

In [121]:
movie_titles = list(metadata['Title'].values)
random.shuffle(movie_titles)
    
for movie_title in movie_titles:
    rating = predict_review(user_id, movie_title, model, metadata)
    if rating >= thresh:
        movie_id = get_movie_id(movie_title, metadata)
        print(get_movie_info(movie_id, metadata))

[{'genres': 'Romance|Sci-Fi', 'Title': 'Jetée, La ', 'year_released': '1962'}]
[{'genres': 'Drama|Romance', 'Title': 'Afterglow ', 'year_released': '1997'}]
[{'genres': 'Drama', 'Title': "Guess Who's Coming to Dinner ", 'year_released': '1967'}]
[{'genres': 'Drama|Horror|Mystery|Sci-Fi|Thriller', 'Title': 'Black Mirror: White Christmas ', 'year_released': '2014'}]
[{'genres': 'Drama|Romance', 'Title': 'Call Me by Your Name ', 'year_released': '2017'}]
[{'genres': 'Comedy', 'Title': 'Dinner Game, The ', 'year_released': 'Dîner de cons, Le (1998'}]
[{'genres': 'Comedy', 'Title': 'The Intern ', 'year_released': '2015'}]
[{'genres': 'Drama', 'Title': 'Streetcar Named Desire, A ', 'year_released': '1951'}]
[{'genres': 'Comedy|Romance', 'Title': 'First Daughter ', 'year_released': '2004'}]
[{'genres': 'Animation|Children|Comedy', 'Title': 'Charlie Brown Christmas, A ', 'year_released': '1965'}]
[{'genres': 'Drama|Romance', 'Title': 'Lady Jane ', 'year_released': '1986'}]
[{'genres': 'Comedy|

KeyboardInterrupt: 

In [123]:
generate_recommendation(user_id, svd, metadata)

[{'genres': 'Action|Adventure|Mystery|Romance|Thriller',
  'Title': 'North by Northwest ',
  'year_released': '1959'}]

In [125]:
predict_review(user_id, 'North by Northwest ', model, metadata)

4.300419744898059