In [38]:
import pandas as pd

import surprise
from surprise.prediction_algorithms import *
import pandas as pd
import numpy as np
import datetime as dt

from surprise import Dataset
from surprise import Reader

from surprise.prediction_algorithms import knns
from surprise.similarities import cosine, msd, pearson
from surprise import accuracy
from surprise.model_selection import GridSearchCV

In [39]:
movies = pd.read_csv('./ml-latest-small/movies.csv')

In [40]:
link = pd.read_csv('./ml-latest-small/links.csv')

In [41]:
rating = pd.read_csv('./ml-latest-small/ratings.csv')

In [42]:
tags = pd.read_csv('./ml-latest-small/tags.csv')

In [43]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [44]:
movies.isna().sum()

movieId    0
title      0
genres     0
dtype: int64

In [45]:
movies.title.value_counts()

Saturn 3 (1980)                           2
Eros (2004)                               2
War of the Worlds (2005)                  2
Emma (1996)                               2
Confessions of a Dangerous Mind (2002)    2
                                         ..
True Crime (1996)                         1
Junior (1994)                             1
Cloudburst (2011)                         1
Ariel (1988)                              1
White Hunter, Black Heart (1990)          1
Name: title, Length: 9737, dtype: int64

In [46]:
#Check to see if there are any duplicate titles
movies.title.duplicated().sum()

5

In [47]:
#Drop the 5 duplicated movie titles
movies.drop_duplicates(subset='title', inplace=True)

In [48]:
#Sanity check to ensure all duplicates were dropped from title column

movies.title.duplicated().sum()

0

In [49]:
movies[movies.title.duplicated() == True]

Unnamed: 0,movieId,title,genres


In [50]:
#Check length of DataFrame

len(movies)

9737

In [51]:
# Split title column into two new columns: Title and year 

movies['Title'] = movies['title'].str.split('(', n=1, expand=True)[0]

movies['year_released'] = movies['title'].str.split('(', n=1, expand=True)[1]

In [53]:
# Drop original column title

movies.drop(columns='title', inplace=True)

In [54]:
# Sanity Check 
movies

Unnamed: 0,movieId,genres,Title,year_released
0,1,Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995)
1,2,Adventure|Children|Fantasy,Jumanji,1995)
2,3,Comedy|Romance,Grumpier Old Men,1995)
3,4,Comedy|Drama|Romance,Waiting to Exhale,1995)
4,5,Comedy,Father of the Bride Part II,1995)
...,...,...,...,...
9737,193581,Action|Animation|Comedy|Fantasy,Black Butler: Book of the Atlantic,2017)
9738,193583,Animation|Comedy|Fantasy,No Game No Life: Zero,2017)
9739,193585,Drama,Flint,2017)
9740,193587,Action|Animation,Bungo Stray Dogs: Dead Apple,2018)


In [58]:
# Format year_released column

movies['year_released'] = movies.year_released.str.replace(')', '')

In [60]:
# Sanity Check to ensure formatting was completed
movies

Unnamed: 0,movieId,genres,Title,year_released
0,1,Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995
1,2,Adventure|Children|Fantasy,Jumanji,1995
2,3,Comedy|Romance,Grumpier Old Men,1995
3,4,Comedy|Drama|Romance,Waiting to Exhale,1995
4,5,Comedy,Father of the Bride Part II,1995
...,...,...,...,...
9737,193581,Action|Animation|Comedy|Fantasy,Black Butler: Book of the Atlantic,2017
9738,193583,Animation|Comedy|Fantasy,No Game No Life: Zero,2017
9739,193585,Drama,Flint,2017
9740,193587,Action|Animation,Bungo Stray Dogs: Dead Apple,2018


In [7]:
link

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0
...,...,...,...
9737,193581,5476944,432131.0
9738,193583,5914996,445030.0
9739,193585,6397426,479308.0
9740,193587,8391976,483455.0


In [8]:
rating

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [9]:
tags

Unnamed: 0,userId,movieId,tag,timestamp
0,2,60756,funny,1445714994
1,2,60756,Highly quotable,1445714996
2,2,60756,will ferrell,1445714992
3,2,89774,Boxing story,1445715207
4,2,89774,MMA,1445715200
...,...,...,...,...
3678,606,7382,for katie,1171234019
3679,606,7936,austere,1173392334
3680,610,3265,gun fu,1493843984
3681,610,3265,heroic bloodshed,1493843978


In [14]:
movie_rating = movies.join(rating, on='movieId', lsuffix='movie_', rsuffix='rating_', how='outer')

In [15]:
movie_rating

Unnamed: 0,movieId,movieIdmovie_,title,genres,userId,movieIdrating_,rating,timestamp
0.0,1,1.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,1.0,3.0,4.0,9.649812e+08
1.0,2,2.0,Jumanji (1995),Adventure|Children|Fantasy,1.0,6.0,4.0,9.649822e+08
2.0,3,3.0,Grumpier Old Men (1995),Comedy|Romance,1.0,47.0,5.0,9.649838e+08
3.0,4,4.0,Waiting to Exhale (1995),Comedy|Drama|Romance,1.0,50.0,5.0,9.649829e+08
4.0,5,5.0,Father of the Bride Part II (1995),Comedy,1.0,70.0,3.0,9.649824e+08
...,...,...,...,...,...,...,...,...
,100831,,,,610.0,166534.0,4.0,1.493848e+09
,100832,,,,610.0,168248.0,5.0,1.493850e+09
,100833,,,,610.0,168250.0,5.0,1.494273e+09
,100834,,,,610.0,168252.0,5.0,1.493846e+09


In [45]:
movie_rating.dropna(inplace=True)

In [46]:
movie_rating['userId'].nunique()

397

In [47]:
stats = movie_rating[['rating', 'timestamp']].describe()
stats

Unnamed: 0,rating,timestamp
count,8113.0,8113.0
mean,3.461666,1167825000.0
std,1.094076,227393800.0
min,0.5,828124600.0
25%,3.0,965706000.0
50%,3.5,1117507000.0
75%,4.0,1419025000.0
max,5.0,1537235000.0


In [48]:
print(dt.datetime.fromtimestamp(stats.loc['min', 'timestamp']))
print(dt.datetime.fromtimestamp(stats.loc['max', 'timestamp']))

1996-03-29 12:36:55
2018-09-17 20:49:14


## Split

In [34]:
movie_rating.columns

Index(['movieId', 'movieIdmovie_', 'title', 'genres', 'userId',
       'movieIdrating_', 'rating', 'timestamp'],
      dtype='object')

In [50]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(movie_rating[['userId', 'movieId', 'rating']], reader)

In [51]:
data

<surprise.dataset.DatasetAutoFolds at 0x7fb2dabf0070>

In [52]:
train, test = surprise.model_selection.train_test_split(data, random_state=42)

In [53]:
train

<surprise.trainset.Trainset at 0x7fb2dabe4340>

In [54]:
model = KNNBasic().fit(train)

Computing the msd similarity matrix...
Done computing similarity matrix.


In [55]:
model2 = SVD().fit(train)

In [56]:
model3 = NMF().fit(train)

In [74]:
model4 = knns.KNNWithMeans().fit(train)

Computing the msd similarity matrix...
Done computing similarity matrix.


## Tuning

In [76]:
model = KNNBasic()

In [78]:
train.

<surprise.trainset.Trainset at 0x7fb2dabe4340>

In [77]:
param_grid = {'k':[10, 50, 100],'min_k': [1, 5, 10]}
base_model = GridSearchCV(model,param_grid=param_grid,joblib_verbose=5)
base_model.fit(train)

AttributeError: 'Trainset' object has no attribute 'raw_ratings'

In [None]:
# param_grid = {'n_factors':[20, 100],'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
#               'reg_all': [0.4, 0.6]}
# gs_model = GridSearchCV(SVD,param_grid=param_grid,n_jobs = -1,joblib_verbose=5)
# gs_model.fit(jokes)

## Cross Validation

In [66]:
CV1 = surprise.model_selection.split.KFold()

In [None]:
surprise.model_selection.split.RepeatedKFold()