## Imports

In [1]:
import pandas as pd

import surprise
from surprise.prediction_algorithms import *
import pandas as pd
import numpy as np
import datetime as dt

from surprise import Dataset
from surprise import Reader

from surprise.prediction_algorithms import knns
from surprise.similarities import cosine, msd, pearson
from surprise import accuracy
from surprise.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from surprise.model_selection import cross_validate

from nltk.tokenize import RegexpTokenizer

import pickle 

## EDA

In [2]:
movies = pd.read_csv('../../ml-latest-small/movies.csv')

In [3]:
link = pd.read_csv('../../ml-latest-small/links.csv')

In [4]:
rating = pd.read_csv('../../ml-latest-small/ratings.csv')

In [5]:
tags = pd.read_csv('../../ml-latest-small/tags.csv')

In [6]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [7]:
movies.isna().sum()

movieId    0
title      0
genres     0
dtype: int64

In [8]:
movies.title.value_counts()

Saturn 3 (1980)                             2
Emma (1996)                                 2
War of the Worlds (2005)                    2
Eros (2004)                                 2
Confessions of a Dangerous Mind (2002)      2
                                           ..
Defending Your Life (1991)                  1
Big Mommas: Like Father, Like Son (2011)    1
You Are the Apple of My Eye (2011)          1
Amateur (1994)                              1
Unknown (2006)                              1
Name: title, Length: 9737, dtype: int64

In [9]:
#Check to see if there are any duplicate titles
movies.title.duplicated().sum()

5

In [10]:
#Drop the 5 duplicated movie titles
movies.drop_duplicates(subset='title', inplace=True)

In [11]:
#Sanity check to ensure all duplicates were dropped from title column

movies.title.duplicated().sum()

0

In [12]:
movies[movies.title.duplicated() == True]

Unnamed: 0,movieId,title,genres


In [13]:
#Check length of DataFrame

len(movies)

9737

In [14]:
# Split title column into two new columns: Title and year 

movies['Title'] = movies['title'].str.split('(', n=1, expand=True)[0]

movies['year_released'] = movies['title'].str.split('(', n=1, expand=True)[1]

In [15]:
# Drop original column title

movies.drop(columns='title', inplace=True)

In [16]:
# Sanity Check 
movies

Unnamed: 0,movieId,genres,Title,year_released
0,1,Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995)
1,2,Adventure|Children|Fantasy,Jumanji,1995)
2,3,Comedy|Romance,Grumpier Old Men,1995)
3,4,Comedy|Drama|Romance,Waiting to Exhale,1995)
4,5,Comedy,Father of the Bride Part II,1995)
...,...,...,...,...
9737,193581,Action|Animation|Comedy|Fantasy,Black Butler: Book of the Atlantic,2017)
9738,193583,Animation|Comedy|Fantasy,No Game No Life: Zero,2017)
9739,193585,Drama,Flint,2017)
9740,193587,Action|Animation,Bungo Stray Dogs: Dead Apple,2018)


In [17]:
# Format year_released column

movies['year_released'] = movies.year_released.str.replace(')', '')

In [18]:
# Sanity Check to ensure formatting was completed
movies

Unnamed: 0,movieId,genres,Title,year_released
0,1,Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995
1,2,Adventure|Children|Fantasy,Jumanji,1995
2,3,Comedy|Romance,Grumpier Old Men,1995
3,4,Comedy|Drama|Romance,Waiting to Exhale,1995
4,5,Comedy,Father of the Bride Part II,1995
...,...,...,...,...
9737,193581,Action|Animation|Comedy|Fantasy,Black Butler: Book of the Atlantic,2017
9738,193583,Animation|Comedy|Fantasy,No Game No Life: Zero,2017
9739,193585,Drama,Flint,2017
9740,193587,Action|Animation,Bungo Stray Dogs: Dead Apple,2018


In [19]:
link

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0
...,...,...,...
9737,193581,5476944,432131.0
9738,193583,5914996,445030.0
9739,193585,6397426,479308.0
9740,193587,8391976,483455.0


In [20]:
rating

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [21]:
#converted timestamp to datetime
rating['timestamp'] = pd.to_datetime(rating['timestamp'], unit='s')

In [22]:
rating.rating.value_counts(normalize=True)

4.0    0.265957
3.0    0.198808
5.0    0.131015
3.5    0.130271
4.5    0.084801
2.0    0.074884
2.5    0.055040
1.0    0.027877
1.5    0.017762
0.5    0.013586
Name: rating, dtype: float64

In [23]:
rating.duplicated().sum()

0

In [24]:
rating.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [25]:
tags

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200
...,...,...,...,...
3678,606,7382,for katie,1171234019
3679,606,7936,austere,1173392334
3680,610,3265,gun fu,1493843984
3681,610,3265,heroic bloodshed,1493843978


In [26]:
movie_rating = movies.merge(rating, on='movieId', how='outer')

In [27]:
movie_rating

Unnamed: 0,movieId,genres,Title,year_released,userId,rating,timestamp
0,1,Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995,1.0,4.0,2000-07-30 18:45:03
1,1,Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995,5.0,4.0,1996-11-08 06:36:02
2,1,Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995,7.0,4.5,2005-01-25 06:52:26
3,1,Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995,15.0,2.5,2017-11-13 12:59:30
4,1,Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995,17.0,4.5,2011-05-18 05:28:03
...,...,...,...,...,...,...,...
100849,64997,,,,68.0,2.5,2008-12-28 20:55:15
100850,144606,,,,111.0,4.0,2018-01-31 23:27:37
100851,147002,,,,318.0,4.0,2017-08-08 15:45:52
100852,26958,,,,509.0,3.5,2015-07-04 17:42:33


In [28]:
movie_rating.dropna(inplace=True)

In [29]:
movie_rating['genres']=[row.strip().lower().replace('|',',') for row in movie_rating['genres']]

In [30]:
movie_rating['userId'].nunique()

610

In [31]:
stats = movie_rating.filter(['rating', 'timestamp']).describe()
stats

Unnamed: 0,rating
count,100813.0
mean,3.501557
std,1.042494
min,0.5
25%,3.0
50%,3.5
75%,4.0
max,5.0


In [32]:
movie_rating

Unnamed: 0,movieId,genres,Title,year_released,userId,rating,timestamp
0,1,"adventure,animation,children,comedy,fantasy",Toy Story,1995,1.0,4.0,2000-07-30 18:45:03
1,1,"adventure,animation,children,comedy,fantasy",Toy Story,1995,5.0,4.0,1996-11-08 06:36:02
2,1,"adventure,animation,children,comedy,fantasy",Toy Story,1995,7.0,4.5,2005-01-25 06:52:26
3,1,"adventure,animation,children,comedy,fantasy",Toy Story,1995,15.0,2.5,2017-11-13 12:59:30
4,1,"adventure,animation,children,comedy,fantasy",Toy Story,1995,17.0,4.5,2011-05-18 05:28:03
...,...,...,...,...,...,...,...
100843,193581,"action,animation,comedy,fantasy",Black Butler: Book of the Atlantic,2017,184.0,4.0,2018-09-16 14:44:42
100844,193583,"animation,comedy,fantasy",No Game No Life: Zero,2017,184.0,3.5,2018-09-16 14:52:25
100845,193585,drama,Flint,2017,184.0,3.5,2018-09-16 14:56:45
100846,193587,"action,animation",Bungo Stray Dogs: Dead Apple,2018,184.0,3.5,2018-09-16 15:00:21


In [33]:
# movie_rating['genres'] = movie_rating['genres'].to_list()

In [34]:
# pattern = "([a-zA-Z]+(?:'[a-z]+)?)"
# tokenizer = RegexpTokenizer(pattern)
# movie_rating['genres'] = [tokenizer.tokenize(token) for token in movie_rating['genres']]

## Split

In [35]:
train, test = train_test_split(movie_rating, random_state=42)

In [36]:
reader = Reader(rating_scale=(1, 5))
train_data = Dataset.load_from_df(train[['userId', 'movieId', 'rating']], reader)

In [37]:
test_data = Dataset.load_from_df(test[['userId', 'movieId', 'rating']], reader)

## Tuning

### KNNBasics

In [38]:
param_grid = {'k':[10, 50, 100],'min_k': [1, 5, 10]}
base_model = GridSearchCV(KNNBasic,param_grid=param_grid,joblib_verbose=5)
base_model.fit(train_data)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Computing the msd similarity matrix...
Done computing similarity matrix.


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s


Computing the msd similarity matrix...
Done computing similarity matrix.


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.9s remaining:    0.0s


Computing the msd similarity matrix...
Done computing similarity matrix.


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.4s remaining:    0.0s


Computing the msd similarity matrix...
Done computing similarity matrix.


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    1.9s remaining:    0.0s


Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:   26.1s finished


In [39]:
base_model.best_params

{'rmse': {'k': 10, 'min_k': 5}, 'mae': {'k': 10, 'min_k': 5}}

In [38]:
param_grid = {'k':[5, 10, 15],'min_k': [1, 5, 10]}
base_model = GridSearchCV(KNNBasic,param_grid=param_grid,joblib_verbose=5)
base_model.fit(train_data)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Computing the msd similarity matrix...
Done computing similarity matrix.


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s


Computing the msd similarity matrix...
Done computing similarity matrix.


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.9s remaining:    0.0s


Computing the msd similarity matrix...
Done computing similarity matrix.


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.3s remaining:    0.0s


Computing the msd similarity matrix...
Done computing similarity matrix.


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    1.8s remaining:    0.0s


Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:   21.8s finished


In [39]:
base_model.best_params

{'rmse': {'k': 15, 'min_k': 5}, 'mae': {'k': 15, 'min_k': 5}}

In [40]:
param_grid = {'k':[10, 15, 20],'min_k': [1, 5, 10]}
base_model = GridSearchCV(KNNBasic,param_grid=param_grid,joblib_verbose=5)
base_model.fit(train_data)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


Computing the msd similarity matrix...
Done computing similarity matrix.


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s


Computing the msd similarity matrix...
Done computing similarity matrix.


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.9s remaining:    0.0s


Computing the msd similarity matrix...
Done computing similarity matrix.


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.5s remaining:    0.0s


Computing the msd similarity matrix...
Done computing similarity matrix.


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    2.0s remaining:    0.0s


Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

[Parallel(n_jobs=1)]: Done  45 out of  45 | elapsed:   23.3s finished


In [41]:
base_model.best_params

{'rmse': {'k': 15, 'min_k': 5}, 'mae': {'k': 15, 'min_k': 1}}

Best Params for KNNBasics is k=15, min_k = 5.

## Cross Validation

In [42]:
#instantiate KNN model 
model1=KNNBasic(k=15, min_k=5)


In [43]:
#instaniatite and perform cross val with training data
cross_validate(model1, train_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9437  0.9530  0.9613  0.9616  0.9426  0.9524  0.0082  
MAE (testset)     0.7249  0.7314  0.7384  0.7349  0.7227  0.7304  0.0059  
Fit time          0.03    0.04    0.04    0.04    0.04    0.04    0.00    
Test time         0.54    0.47    0.48    0.47    0.48    0.49    0.03    


{'test_rmse': array([0.94370878, 0.95299406, 0.96133316, 0.96156111, 0.94257903]),
 'test_mae': array([0.72486334, 0.73135331, 0.7383573 , 0.73492824, 0.72272605]),
 'fit_time': (0.034474849700927734,
  0.04156970977783203,
  0.04493284225463867,
  0.0423130989074707,
  0.039087772369384766),
 'test_time': (0.5382709503173828,
  0.47374629974365234,
  0.47782015800476074,
  0.4720458984375,
  0.48007702827453613)}

In [50]:
#use model to predict
model1.predict(uid=10, iid = 20)
#this model just predicted what user 10 would predict movie 20

Prediction(uid=10, iid=20, r_ui=None, est=2.7059151340014385, details={'actual_k': 5, 'was_impossible': False})

looking at the output above we can determine that our model predicted that use 10 would rate movie 20 a 2.5 give or take our RSME which is .95.

## Models

### SVD

In [44]:
model2 = SVD()

In [45]:
cross_validate(model2, train_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8836  0.8831  0.8792  0.8893  0.8730  0.8816  0.0054  
MAE (testset)     0.6795  0.6780  0.6768  0.6858  0.6769  0.6794  0.0034  
Fit time          3.06    2.98    2.98    2.98    3.09    3.02    0.05    
Test time         0.14    0.06    0.11    0.07    0.10    0.10    0.03    


{'test_rmse': array([0.88359335, 0.88306213, 0.87918779, 0.88929084, 0.87302769]),
 'test_mae': array([0.67953846, 0.67797359, 0.6767603 , 0.68583881, 0.67687146]),
 'fit_time': (3.0647788047790527,
  2.9783637523651123,
  2.982062816619873,
  2.983582019805908,
  3.089405059814453),
 'test_time': (0.14478778839111328,
  0.0646810531616211,
  0.10504508018493652,
  0.0673820972442627,
  0.1048119068145752)}

In [46]:
param_grid = {'n_factors':[10,20,50],'n_epochs': [5, 10, 15], 'lr_all': [0.002,0.005,0.01],
             'reg_all': [0.2,0.4,0.6]}
gs_model2 = GridSearchCV(SVD,param_grid=param_grid,joblib_verbose=5)
gs_model2.fit(train_data)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    1.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    1.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 405 out of 405 | elapsed:  5.0min finished


In [47]:
gs_model2.best_params

{'rmse': {'n_factors': 50, 'n_epochs': 15, 'lr_all': 0.01, 'reg_all': 0.2},
 'mae': {'n_factors': 50, 'n_epochs': 15, 'lr_all': 0.01, 'reg_all': 0.2}}

In [48]:
param_grid = {'n_factors':[25, 50, 75],'n_epochs': [15, 30, 45], 'lr_all': [0.01,0.05,0.1],
             'reg_all': [0.1,0.2,0.3]}
gs_model2 = GridSearchCV(SVD,param_grid=param_grid,joblib_verbose=5)
gs_model2.fit(train_data)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    1.1s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    2.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:    3.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:    4.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 405 out of 405 | elapsed: 19.3min finished


In [49]:
gs_model2.best_params

{'rmse': {'n_factors': 75, 'n_epochs': 45, 'lr_all': 0.01, 'reg_all': 0.1},
 'mae': {'n_factors': 75, 'n_epochs': 45, 'lr_all': 0.01, 'reg_all': 0.1}}

In [51]:
cross_validate(SVD(n_factors=75,n_epochs=45,lr_all=0.01,reg_all=0.1), train_data, measures=['RMSE', 'MAE'], 
               cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8691  0.8660  0.8571  0.8645  0.8531  0.8620  0.0059  
MAE (testset)     0.6697  0.6631  0.6591  0.6591  0.6573  0.6617  0.0044  
Fit time          5.48    5.63    5.33    5.51    5.46    5.48    0.09    
Test time         0.06    0.16    0.06    0.13    0.06    0.10    0.04    


{'test_rmse': array([0.86910199, 0.86600535, 0.85707017, 0.86451862, 0.85313578]),
 'test_mae': array([0.6696761 , 0.66309994, 0.65910254, 0.65909401, 0.65734777]),
 'fit_time': (5.477036952972412,
  5.625604867935181,
  5.330721855163574,
  5.513956069946289,
  5.464638948440552),
 'test_time': (0.0649251937866211,
  0.162553071975708,
  0.0646810531616211,
  0.1313638687133789,
  0.06158089637756348)}

In [52]:
param_grid = {'n_factors':[75, 100, 125],'n_epochs': [45, 60, 75], 'lr_all': [.005, 0.01, .015],
             'reg_all': [.05, 0.1, 0.15]}
gs_model2 = GridSearchCV(SVD,param_grid=param_grid,joblib_verbose=5)
gs_model2.fit(train_data)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    5.5s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   11.3s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   16.9s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   22.6s remaining:    0.0s
[Parallel(n_jobs=1)]: Done 405 out of 405 | elapsed: 61.2min finished


In [53]:
gs_model2.best_params

{'rmse': {'n_factors': 125, 'n_epochs': 45, 'lr_all': 0.015, 'reg_all': 0.1},
 'mae': {'n_factors': 125, 'n_epochs': 75, 'lr_all': 0.01, 'reg_all': 0.1}}

At some point we have to stop. If we have more time we will run another grid search for the model above. 
Best Params for SVD are n_factors=125, n_epochs= 45, lr_all= .015, reg_all= .1.

In [55]:
cross_validate(SVD(n_factors=125,n_epochs=45,lr_all=0.015,reg_all=0.1), train_data, measures=['RMSE', 'MAE'], 
               cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.8521  0.8617  0.8624  0.8667  0.8577  0.8601  0.0049  
MAE (testset)     0.6547  0.6617  0.6621  0.6646  0.6563  0.6599  0.0038  
Fit time          7.97    7.92    7.89    7.87    7.92    7.91    0.03    
Test time         0.07    0.06    0.12    0.06    0.06    0.08    0.02    


{'test_rmse': array([0.85210958, 0.86165924, 0.86243766, 0.86669774, 0.85770389]),
 'test_mae': array([0.65469781, 0.66169461, 0.6620919 , 0.66460263, 0.6562725 ]),
 'fit_time': (7.966294050216675,
  7.921723127365112,
  7.888775110244751,
  7.873623847961426,
  7.921288251876831),
 'test_time': (0.06583905220031738,
  0.06383705139160156,
  0.1222848892211914,
  0.06290531158447266,
  0.06190013885498047)}

Best model so far with an RMSE of .86.

### NMF

In [37]:
model3 = NMF()

In [38]:
cross_validate(model3, train_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm NMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9412  0.9303  0.9453  0.9486  0.9410  0.9413  0.0062  
MAE (testset)     0.7225  0.7151  0.7272  0.7290  0.7229  0.7234  0.0048  
Fit time          5.98    5.82    6.05    5.74    5.87    5.89    0.11    
Test time         0.20    0.11    0.11    0.12    0.11    0.13    0.03    


{'test_rmse': array([0.94119998, 0.9303245 , 0.94527904, 0.94861899, 0.94104819]),
 'test_mae': array([0.72247869, 0.71512044, 0.72718943, 0.72902898, 0.72294451]),
 'fit_time': (5.9826483726501465,
  5.817740440368652,
  6.0517191886901855,
  5.735777854919434,
  5.866620302200317),
 'test_time': (0.1996474266052246,
  0.11468911170959473,
  0.11469411849975586,
  0.11967945098876953,
  0.10674643516540527)}

In [None]:
param_grid = {'n_factors':[10,15,20],'n_epochs': [25, 50, 75]}
gs_model3 = GridSearchCV(NMF,param_grid=param_grid,joblib_verbose=5)
gs_model3.fit(train_data)

In [None]:
gs_model3.best_params

In [None]:
cross_validate(NMF(n_factors=10,n_epochs=25), train_data, measures=['RMSE', 'MAE'], 
               cv=5, verbose=True,  n_jobs= -2)

In [None]:
param_grid = {'n_factors':[3,5,10],'n_epochs': [15, 20, 25]}
gs_model3 = GridSearchCV(NMF,param_grid=param_grid,joblib_verbose=5)
gs_model3.fit(train_data)

In [None]:
gs_model3.best_params

## Final Model Evaluation 

In [None]:
final_model = SVD(n_factors=125,n_epochs=45,lr_all=0.015,reg_all=0.1)

In [None]:
final_model.test(test_data)

## Deployment 

In [39]:
train['est_rating_user10'] = train['movieId'].apply(lambda x: model3.predict(train.userId[10], x).est)
train.sort_values(by='est_rating_user10', ascending=False, inplace=True)

In [40]:
train

Unnamed: 0,movieId,genres,Title,year_released,userId,rating,timestamp,est_rating_user10
75711,8228,mystery,"Maltese Falcon, The",a.k.a. Dangerous Female (1931,280.0,3.0,2012-09-23 21:13:34,5.0
24072,1136,"adventure,comedy,fantasy",Monty Python and the Holy Grail,1975,288.0,5.0,2000-12-06 16:29:06,5.0
34302,1673,drama,Boogie Nights,1997,560.0,4.0,2016-07-27 21:19:04,5.0
44515,2511,"crime,film-noir","Long Goodbye, The",1973,268.0,5.0,1999-10-18 17:14:02,5.0
78437,27611,"drama,sci-fi,war",Battlestar Galactica,2003,76.0,4.0,2015-08-10 01:04:46,5.0
...,...,...,...,...,...,...,...,...
68704,5863,comedy,Take This Job and Shove It,1981,474.0,1.0,2003-08-05 17:58:26,1.0
78428,27595,"action,comedy,horror,musical",Jesus Christ Vampire Hunter,2001,599.0,0.5,2018-02-21 01:13:16,1.0
54342,3442,"action,crime,drama",Band of the Hand,1986,555.0,1.0,2001-01-06 02:44:45,1.0
76491,8626,"horror,sci-fi",Dr. Terror's House of Horrors,1965,230.0,1.5,2007-11-29 02:52:15,1.0


In [41]:
user = input('userId: ')
genre = input('What genres are you interested in? ')
num_recs = input('How many recomendations would you like? ')

userId: 5
What genres are you interested in? comedy
How many recomendations would you like? 5


In [42]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 75609 entries, 75711 to 69319
Data columns (total 8 columns):
 #   Column             Non-Null Count  Dtype         
---  ------             --------------  -----         
 0   movieId            75609 non-null  int64         
 1   genres             75609 non-null  object        
 2   Title              75609 non-null  object        
 3   year_released      75609 non-null  object        
 4   userId             75609 non-null  float64       
 5   rating             75609 non-null  float64       
 6   timestamp          75609 non-null  datetime64[ns]
 7   est_rating_user10  75609 non-null  float64       
dtypes: datetime64[ns](1), float64(3), int64(1), object(3)
memory usage: 5.2+ MB


In [43]:
genre

'comedy'

In [44]:
genre_df = train[train['genres'].str.contains(genre)]

In [45]:
genre_df

Unnamed: 0,movieId,genres,Title,year_released,userId,rating,timestamp,est_rating_user10
24072,1136,"adventure,comedy,fantasy",Monty Python and the Holy Grail,1975,288.0,5.0,2000-12-06 16:29:06,5.0
62375,4467,"adventure,comedy,fantasy","Adventures of Baron Munchausen, The",1988,182.0,3.0,2003-06-09 11:35:30,5.0
24091,1136,"adventure,comedy,fantasy",Monty Python and the Holy Grail,1975,367.0,3.0,2001-08-14 17:47:31,5.0
24048,1136,"adventure,comedy,fantasy",Monty Python and the Holy Grail,1975,187.0,4.0,2007-06-01 21:52:53,5.0
94155,92535,comedy,Louis C.K.: Live at the Beacon Theater,2011,487.0,1.0,2015-04-18 09:00:56,5.0
...,...,...,...,...,...,...,...,...
70418,6315,comedy,Wildcats,1986,414.0,2.5,2003-05-22 17:18:57,1.0
88039,61348,comedy,Disaster Movie,2008,219.0,0.5,2008-09-13 07:14:44,1.0
33395,1602,comedy,Leave It to Beaver,1997,600.0,2.0,2009-03-22 23:00:56,1.0
68704,5863,comedy,Take This Job and Shove It,1981,474.0,1.0,2003-08-05 17:58:26,1.0


In [46]:
movies['est_rating_user10'] = train['movieId'].apply(lambda x: model3.predict(uid=user, iid=x).est)
movies.sort_values(by='est_rating_user10', ascending=False, inplace=True) 

In [47]:
movies #['Title'].head(int(num_recs))

Unnamed: 0,movieId,genres,Title,year_released,est_rating_user10
0,1,Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995,3.498504
6464,52604,Crime|Drama|Mystery|Thriller,Fracture,2007,3.498504
6462,52462,Action|Adventure|Animation|Comedy|Fantasy|Myst...,Aqua Teen Hunger Force Colon Movie Film for Th...,2007,3.498504
6461,52458,Drama|Thriller,Disturbia,2007,3.498504
6460,52435,Animation|Comedy|Fantasy|Musical,How the Grinch Stole Christmas!,1966,3.498504
...,...,...,...,...,...
9724,190183,Sci-Fi|Thriller,The Darkest Minds,2018,
9731,191005,Action|Adventure|Comedy|Sci-Fi,Gintama,2017,
9732,193565,Action|Animation|Comedy|Sci-Fi,Gintama: The Movie,2010,
9735,193573,Animation,Love Live! The School Idol Movie,2015,


In [48]:
movies['est_rating_user10'].value_counts()

3.498504    7344
Name: est_rating_user10, dtype: int64

In [49]:
movies['Title'].head(int(num_recs))

0                                              Toy Story 
6464                                            Fracture 
6462    Aqua Teen Hunger Force Colon Movie Film for Th...
6461                                           Disturbia 
6460                     How the Grinch Stole Christmas! 
Name: Title, dtype: object

In [50]:
movie_rating

Unnamed: 0,movieId,genres,Title,year_released,userId,rating,timestamp
0,1,"adventure,animation,children,comedy,fantasy",Toy Story,1995,1.0,4.0,2000-07-30 18:45:03
1,1,"adventure,animation,children,comedy,fantasy",Toy Story,1995,5.0,4.0,1996-11-08 06:36:02
2,1,"adventure,animation,children,comedy,fantasy",Toy Story,1995,7.0,4.5,2005-01-25 06:52:26
3,1,"adventure,animation,children,comedy,fantasy",Toy Story,1995,15.0,2.5,2017-11-13 12:59:30
4,1,"adventure,animation,children,comedy,fantasy",Toy Story,1995,17.0,4.5,2011-05-18 05:28:03
...,...,...,...,...,...,...,...
100843,193581,"action,animation,comedy,fantasy",Black Butler: Book of the Atlantic,2017,184.0,4.0,2018-09-16 14:44:42
100844,193583,"animation,comedy,fantasy",No Game No Life: Zero,2017,184.0,3.5,2018-09-16 14:52:25
100845,193585,drama,Flint,2017,184.0,3.5,2018-09-16 14:56:45
100846,193587,"action,animation",Bungo Stray Dogs: Dead Apple,2018,184.0,3.5,2018-09-16 15:00:21


## Deployment 2.0

In [50]:
user = input('userId: ')
genre = input('What genres are you interested in? ')
num_recs = input('How many recomendations would you like? ')

userId: 10
What genres are you interested in? comedy
How many recomendations would you like? 10


In [None]:
df_user = rating_review.reset_index('userId')
df_user.drop(columns=['Title', 'year_released', 'timestamp', 'genres', ], inplace=True)
df_user.head()

In [None]:
reader = Reader(rating_scale=(1, 5))
full_data = Dataset.load_from_df(rating_review[['userId', 'movieId', 'rating']], reader)

In [None]:
final_model.fit(full_data)

In [None]:
#Function
def Stuff():
    user = input('userId: ')
    genre = input('What genres are you interested in? ')
    num_recs = input('How many recomendations would you like? ')
    sub1 = user full_data['']





