# Imports

In [1]:
import pandas as pd

from surprise import Dataset, Reader, accuracy
from surprise.model_selection import cross_validate,GridSearchCV
from surprise.prediction_algorithms import SVD, SVDpp, NMF, BaselineOnly, NormalPredictor
from sklearn.model_selection import train_test_split

# EDA

In [8]:
movies = pd.read_csv('./ml-latest-small/movies.csv')
rating = pd.read_csv('./ml-latest-small/ratings.csv')

In [9]:
#looking at the movies df
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [10]:
movies.isna().sum()

movieId    0
title      0
genres     0
dtype: int64

In [11]:
movies.title.value_counts()

Emma (1996)                               2
War of the Worlds (2005)                  2
Confessions of a Dangerous Mind (2002)    2
Eros (2004)                               2
Saturn 3 (1980)                           2
                                         ..
Lost and Delirious (2001)                 1
Rape Me (Baise-moi) (2000)                1
Alice (1990)                              1
Another Woman (1988)                      1
Andrew Dice Clay: Dice Rules (1991)       1
Name: title, Length: 9737, dtype: int64

In [12]:
movies.title.duplicated().sum()

5

In [13]:
#Drop the 5 duplicated movie titles
movies.drop_duplicates(subset='title', inplace=True)

In [14]:
#Sanity check to ensure all duplicates were dropped from title column

movies.title.duplicated().sum()

0

In [15]:
movies[movies.title.duplicated() == True]

Unnamed: 0,movieId,title,genres


In [16]:
len(movies)

9737

In [17]:
# Split title column into two new columns: Title and year 

movies['Title'] = movies['title'].str.split('(', n=1, expand=True)[0]

movies['year_released'] = movies['title'].str.split('(', n=1, expand=True)[1]

In [18]:
# Drop original column title

movies.drop(columns='title', inplace=True)

In [19]:
# Sanity Check 
movies

Unnamed: 0,movieId,genres,Title,year_released
0,1,Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995)
1,2,Adventure|Children|Fantasy,Jumanji,1995)
2,3,Comedy|Romance,Grumpier Old Men,1995)
3,4,Comedy|Drama|Romance,Waiting to Exhale,1995)
4,5,Comedy,Father of the Bride Part II,1995)
...,...,...,...,...
9737,193581,Action|Animation|Comedy|Fantasy,Black Butler: Book of the Atlantic,2017)
9738,193583,Animation|Comedy|Fantasy,No Game No Life: Zero,2017)
9739,193585,Drama,Flint,2017)
9740,193587,Action|Animation,Bungo Stray Dogs: Dead Apple,2018)


In [20]:
# Format year_released column

movies['year_released'] = movies.year_released.str.replace(')', '')

  movies['year_released'] = movies.year_released.str.replace(')', '')


In [21]:
# Sanity Check to ensure formatting was completed
movies

Unnamed: 0,movieId,genres,Title,year_released
0,1,Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995
1,2,Adventure|Children|Fantasy,Jumanji,1995
2,3,Comedy|Romance,Grumpier Old Men,1995
3,4,Comedy|Drama|Romance,Waiting to Exhale,1995
4,5,Comedy,Father of the Bride Part II,1995
...,...,...,...,...
9737,193581,Action|Animation|Comedy|Fantasy,Black Butler: Book of the Atlantic,2017
9738,193583,Animation|Comedy|Fantasy,No Game No Life: Zero,2017
9739,193585,Drama,Flint,2017
9740,193587,Action|Animation,Bungo Stray Dogs: Dead Apple,2018


In [22]:
#checking rating df
rating

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [23]:
#converted timestamp to datetime
rating['timestamp'] = pd.to_datetime(rating['timestamp'], unit='s')

In [24]:
rating.rating.value_counts(normalize=True)

4.0    0.265957
3.0    0.198808
5.0    0.131015
3.5    0.130271
4.5    0.084801
2.0    0.074884
2.5    0.055040
1.0    0.027877
1.5    0.017762
0.5    0.013586
Name: rating, dtype: float64

In [25]:
rating.duplicated().sum()

0

In [26]:
rating.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [27]:
# merging rating and movies
movie_rating = movies.merge(rating, on='movieId', how='outer')

In [28]:
movie_rating

Unnamed: 0,movieId,genres,Title,year_released,userId,rating,timestamp
0,1,Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995,1.0,4.0,2000-07-30 18:45:03
1,1,Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995,5.0,4.0,1996-11-08 06:36:02
2,1,Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995,7.0,4.5,2005-01-25 06:52:26
3,1,Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995,15.0,2.5,2017-11-13 12:59:30
4,1,Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995,17.0,4.5,2011-05-18 05:28:03
...,...,...,...,...,...,...,...
100849,64997,,,,68.0,2.5,2008-12-28 20:55:15
100850,144606,,,,111.0,4.0,2018-01-31 23:27:37
100851,147002,,,,318.0,4.0,2017-08-08 15:45:52
100852,26958,,,,509.0,3.5,2015-07-04 17:42:33


In [29]:
movie_rating.dropna(inplace=True)

In [30]:
movie_rating['genres']=[row.strip().lower().replace('|',',') for row in movie_rating['genres']]

In [31]:
movie_rating['userId'].nunique()

610

In [32]:
movie_rating

Unnamed: 0,movieId,genres,Title,year_released,userId,rating,timestamp
0,1,"adventure,animation,children,comedy,fantasy",Toy Story,1995,1.0,4.0,2000-07-30 18:45:03
1,1,"adventure,animation,children,comedy,fantasy",Toy Story,1995,5.0,4.0,1996-11-08 06:36:02
2,1,"adventure,animation,children,comedy,fantasy",Toy Story,1995,7.0,4.5,2005-01-25 06:52:26
3,1,"adventure,animation,children,comedy,fantasy",Toy Story,1995,15.0,2.5,2017-11-13 12:59:30
4,1,"adventure,animation,children,comedy,fantasy",Toy Story,1995,17.0,4.5,2011-05-18 05:28:03
...,...,...,...,...,...,...,...
100843,193581,"action,animation,comedy,fantasy",Black Butler: Book of the Atlantic,2017,184.0,4.0,2018-09-16 14:44:42
100844,193583,"animation,comedy,fantasy",No Game No Life: Zero,2017,184.0,3.5,2018-09-16 14:52:25
100845,193585,drama,Flint,2017,184.0,3.5,2018-09-16 14:56:45
100846,193587,"action,animation",Bungo Stray Dogs: Dead Apple,2018,184.0,3.5,2018-09-16 15:00:21


# Splits

In [33]:
#for function building/final model
reader = Reader(rating_scale=(1, 5))
complete_data = Dataset.load_from_df(movie_rating[['userId', 'movieId', 'rating']], reader)

In [None]:
#use sklearn for model selection/choosing 
train, test = train_test_split(movie_rating, random_state=42)

reader = Reader(rating_scale=(1, 5))
train_data = Dataset.load_from_df(train[['userId', 'movieId', 'rating']], reader)
test_data = Dataset.load_from_df(test[['userId', 'movieId', 'rating']], reader)


train_data2 = train_data.build_full_trainset()
test_data2 = test_data.build_full_trainset().build_testset()

# Modeling

### Baseline/dummy

**INSERT/MAKE THE DUMMY HERE**

### KNNBasics

In [None]:
param_grid = {'k':[10, 50, 100],'min_k': [1, 5, 10]}
base_model = GridSearchCV(KNNBasic,param_grid=param_grid,joblib_verbose=5)
base_model.fit(train_data)

In [None]:
base_model.best_params

In [None]:
#trying different parameters
param_grid = {'k':[5, 10, 15],'min_k': [1, 5, 10]}
base_model = GridSearchCV(KNNBasic,param_grid=param_grid,joblib_verbose=5)
base_model.fit(train_data)

In [None]:
base_model.best_params

In [None]:
param_grid = {'k':[10, 15, 20],'min_k': [1, 5, 10]}
base_model = GridSearchCV(KNNBasic,param_grid=param_grid,joblib_verbose=5)
base_model.fit(train_data)

In [None]:
base_model.best_params

#### KNN Cross Validation

In [None]:
#instantiate KNN model 
model1=KNNBasic(k=15, min_k=5)

In [None]:
#instaniatite and perform cross val with training data
cross_validate(model1, train_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

In [None]:
#use model to predict
model1.predict(uid=10, iid = 20)
#this model just predicted what user 10 would predict movie 20

looking at the output above we can determine that our model predicted that use 10 would rate movie 20 a 2.5 give or take our RSME which is .95.

### SVD

In [None]:
model2 = SVD()

In [None]:
cross_validate(model2, train_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

In [None]:
param_grid = {'n_factors':[10,20,50],'n_epochs': [5, 10, 15], 'lr_all': [0.002,0.005,0.01],
             'reg_all': [0.2,0.4,0.6]}
gs_model2 = GridSearchCV(SVD,param_grid=param_grid,joblib_verbose=5)
gs_model2.fit(train_data)

In [None]:
gs_model2.best_params

In [None]:
param_grid = {'n_factors':[25, 50, 75],'n_epochs': [15, 30, 45], 'lr_all': [0.01,0.05,0.1],
             'reg_all': [0.1,0.2,0.3]}
gs_model2 = GridSearchCV(SVD,param_grid=param_grid,joblib_verbose=5)
gs_model2.fit(train_data)

In [None]:
gs_model2.best_params

In [None]:

cross_validate(SVD(n_factors=75,n_epochs=45,lr_all=0.01,reg_all=0.1), train_data, measures=['RMSE', 'MAE'], 
               cv=5, verbose=True)

In [None]:
param_grid = {'n_factors':[75, 100, 125],'n_epochs': [45, 60, 75], 'lr_all': [.005, 0.01, .015],
             'reg_all': [.05, 0.1, 0.15]}
gs_model2 = GridSearchCV(SVD,param_grid=param_grid,joblib_verbose=5)
gs_model2.fit(train_data)

In [None]:
gs_model2.best_params

In [None]:
cross_validate(SVD(n_factors=125,n_epochs=45,lr_all=0.015,reg_all=0.1), train_data, measures=['RMSE', 'MAE'], 
               cv=5, verbose=True)

### NMF

In [None]:
model3 = NMF()

In [None]:
cross_validate(model3, train_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

In [None]:
param_grid = {'n_factors':[10,15,20],'n_epochs': [25, 50, 75]}
gs_model3 = GridSearchCV(NMF,param_grid=param_grid,joblib_verbose=5)
gs_model3.fit(train_data)

In [None]:
gs_model3.best_params

In [None]:
cross_validate(NMF(n_factors=10,n_epochs=25), train_data, measures=['RMSE', 'MAE'], 
               cv=5, verbose=True,  n_jobs= -2)

In [None]:
param_grid = {'n_factors':[3,5,10],'n_epochs': [15, 20, 25]}
gs_model3 = GridSearchCV(NMF,param_grid=param_grid,joblib_verbose=5)
gs_model3.fit(train_data)

In [None]:
gs_model3.best_params

### Final Model Evaluation

In [34]:
final_model = SVD(n_factors=125,n_epochs=45,lr_all=0.015,reg_all=0.1)

In [3]:
predictions = final_model.fit(train_data2).test(test_data2)

NameError: name 'train_data2' is not defined

In [None]:
accuracy.rmse(predictions)

## Deployment

In [35]:
#create trainset object from surprise dataset
full_data = complete_data.build_full_trainset()

In [36]:
final_model.fit(full_data)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x19046fecfd0>

#### Function Building

In [37]:
#setting index to ensure dropping
userdf = movie_rating.set_index('userId')

In [38]:
userdf

Unnamed: 0_level_0,movieId,genres,Title,year_released,rating,timestamp
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1.0,1,"adventure,animation,children,comedy,fantasy",Toy Story,1995,4.0,2000-07-30 18:45:03
5.0,1,"adventure,animation,children,comedy,fantasy",Toy Story,1995,4.0,1996-11-08 06:36:02
7.0,1,"adventure,animation,children,comedy,fantasy",Toy Story,1995,4.5,2005-01-25 06:52:26
15.0,1,"adventure,animation,children,comedy,fantasy",Toy Story,1995,2.5,2017-11-13 12:59:30
17.0,1,"adventure,animation,children,comedy,fantasy",Toy Story,1995,4.5,2011-05-18 05:28:03
...,...,...,...,...,...,...
184.0,193581,"action,animation,comedy,fantasy",Black Butler: Book of the Atlantic,2017,4.0,2018-09-16 14:44:42
184.0,193583,"animation,comedy,fantasy",No Game No Life: Zero,2017,3.5,2018-09-16 14:52:25
184.0,193585,drama,Flint,2017,3.5,2018-09-16 14:56:45
184.0,193587,"action,animation",Bungo Stray Dogs: Dead Apple,2018,3.5,2018-09-16 15:00:21


In [39]:
#create 
userdf1 = userdf['movieId']

In [40]:
userdf1.loc[9,]

userId
9.0      41
9.0     187
9.0     223
9.0     371
9.0     627
9.0     922
9.0     923
9.0    1037
9.0    1095
9.0    1198
9.0    1270
9.0    1674
9.0    1987
9.0    2011
9.0    2012
9.0    2023
9.0    2300
9.0    2877
9.0    2901
9.0    3173
9.0    3328
9.0    3735
9.0    4131
9.0    4558
9.0    4993
9.0    5218
9.0    5378
9.0    5445
9.0    5447
9.0    5451
9.0    5481
9.0    5507
9.0    5841
9.0    5843
9.0    5872
9.0    5890
9.0    5891
9.0    5893
9.0    5902
9.0    5952
9.0    5956
9.0    5962
9.0    5965
9.0    5988
9.0    6001
9.0    6044
Name: movieId, dtype: int64

In [41]:
user_rate = list(userdf1.loc[9,])

In [42]:
user_rate

[41,
 187,
 223,
 371,
 627,
 922,
 923,
 1037,
 1095,
 1198,
 1270,
 1674,
 1987,
 2011,
 2012,
 2023,
 2300,
 2877,
 2901,
 3173,
 3328,
 3735,
 4131,
 4558,
 4993,
 5218,
 5378,
 5445,
 5447,
 5451,
 5481,
 5507,
 5841,
 5843,
 5872,
 5890,
 5891,
 5893,
 5902,
 5952,
 5956,
 5962,
 5965,
 5988,
 6001,
 6044]

In [43]:
#setting index so we can drop base off of specific movieId
moviedf = movies.set_index('movieId')

In [44]:
moviedf

Unnamed: 0_level_0,genres,Title,year_released
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995
2,Adventure|Children|Fantasy,Jumanji,1995
3,Comedy|Romance,Grumpier Old Men,1995
4,Comedy|Drama|Romance,Waiting to Exhale,1995
5,Comedy,Father of the Bride Part II,1995
...,...,...,...
193581,Action|Animation|Comedy|Fantasy,Black Butler: Book of the Atlantic,2017
193583,Animation|Comedy|Fantasy,No Game No Life: Zero,2017
193585,Drama,Flint,2017
193587,Action|Animation,Bungo Stray Dogs: Dead Apple,2018


In [45]:
moviedf.drop(user_rate, inplace = True)

In [46]:
moviedf

Unnamed: 0_level_0,genres,Title,year_released
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995
2,Adventure|Children|Fantasy,Jumanji,1995
3,Comedy|Romance,Grumpier Old Men,1995
4,Comedy|Drama|Romance,Waiting to Exhale,1995
5,Comedy,Father of the Bride Part II,1995
...,...,...,...
193581,Action|Animation|Comedy|Fantasy,Black Butler: Book of the Atlantic,2017
193583,Animation|Comedy|Fantasy,No Game No Life: Zero,2017
193585,Drama,Flint,2017
193587,Action|Animation,Bungo Stray Dogs: Dead Apple,2018


In [47]:
#reset index to be able to use .apply on the movieId
moviedf = moviedf.reset_index()

In [48]:
moviedf

Unnamed: 0,movieId,genres,Title,year_released
0,1,Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995
1,2,Adventure|Children|Fantasy,Jumanji,1995
2,3,Comedy|Romance,Grumpier Old Men,1995
3,4,Comedy|Drama|Romance,Waiting to Exhale,1995
4,5,Comedy,Father of the Bride Part II,1995
...,...,...,...,...
9686,193581,Action|Animation|Comedy|Fantasy,Black Butler: Book of the Atlantic,2017
9687,193583,Animation|Comedy|Fantasy,No Game No Life: Zero,2017
9688,193585,Drama,Flint,2017
9689,193587,Action|Animation,Bungo Stray Dogs: Dead Apple,2018


In [49]:
moviedf['est_rating'] = moviedf['movieId'].apply(lambda x: final_model.predict(9, x).est)
moviedf.sort_values(by='est_rating', ascending=False, inplace=True)

In [56]:
moviedf.head(5)

Unnamed: 0,movieId,genres,Title,year_released,est_rating
7656,89904,Comedy|Drama|Romance,The Artist,2011,4.780234
4014,5747,Drama|War,Gallipoli,1981,4.705655
4344,6442,Comedy|Romance,Belle époque,1992,4.634932
5574,27156,Action|Animation|Drama|Fantasy|Sci-Fi,Neon Genesis Evangelion: The End of Evangelion,Shin seiki Evangelion Gekijô-ban: Air/Magokoro...,4.620769
4458,6666,Comedy|Drama|Fantasy,"Discreet Charm of the Bourgeoisie, The","Charme discret de la bourgeoisie, Le (1972",4.592631


#### Function

In [60]:
#creating a variable that only has the userid and movies that they've rated
#this uses the original train dataset not surprise's version
users_movies_seen = movie_rating[['movieId', 'userId']]
#setting user id as index to make it easier to use loc on it
users_movies_seen = users_movies_seen.set_index('userId')

In [63]:
def recommender():
    user = float(input('userId: '))
    genre = input('What genres are you interested in? ')
    num_recs = int(input('How many recomendations would you like? '))

    #using the train data
    #create the seen_movie
    seen_movie = list(users_movies_seen.loc[user, 'movieId'])

    #create df with all the unseen/unrated movies
    train_copy = movies.copy()
    train_copy2 = train_copy.set_index('movieId')

    #using the not_seen df, create a rating for a user
    not_seen = train_copy2.drop(index = seen_movie)

    #subsetting the df to unseen movies of a certain genre
    not_seen = not_seen[not_seen['genres'].str.contains(genre)]
    
    #reset not_seen index
    not_seen = not_seen.reset_index()
    
    #creating an est rating and the sorted it
    not_seen['est_rating'] = not_seen['movieId'].apply(lambda x: final_model.predict(user, x).est)
    not_seen.sort_values(by='est_rating', ascending=False, inplace=True)

    #returns the recommendations that the model predicts to be the highest rating
    return (not_seen.head(num_recs))

In [67]:
movies

Unnamed: 0,movieId,genres,Title,year_released
0,1,Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995
1,2,Adventure|Children|Fantasy,Jumanji,1995
2,3,Comedy|Romance,Grumpier Old Men,1995
3,4,Comedy|Drama|Romance,Waiting to Exhale,1995
4,5,Comedy,Father of the Bride Part II,1995
...,...,...,...,...
9737,193581,Action|Animation|Comedy|Fantasy,Black Butler: Book of the Atlantic,2017
9738,193583,Animation|Comedy|Fantasy,No Game No Life: Zero,2017
9739,193585,Drama,Flint,2017
9740,193587,Action|Animation,Bungo Stray Dogs: Dead Apple,2018


In [66]:
recommender()

userId: 3
What genres are you interested in? Comedy|Romance
How many recomendations would you like? 5


Unnamed: 0,movieId,genres,Title,year_released,est_rating
4166,141718,Comedy|Horror,Deathgasm,2015,3.70497
2535,25947,Comedy,Unfaithfully Yours,1948,3.641275
3528,87234,Comedy|Drama|Romance,Submarine,2010,3.61342
3631,93988,Drama|Romance,North & South,2004,3.590857
1668,4914,Crime|Drama|Romance,Breathless,À bout de souffle (1960,3.579387
