In [3]:
import pandas as pd

import surprise
from surprise.prediction_algorithms import *
import pandas as pd
import numpy as np
import datetime as dt

from surprise import Dataset
from surprise import Reader

from surprise.prediction_algorithms import knns
from surprise.similarities import cosine, msd, pearson
from surprise import accuracy
from surprise.model_selection import GridSearchCV

In [4]:
movies = pd.read_csv('./ml-latest-small/movies.csv')

In [5]:
link = pd.read_csv('./ml-latest-small/links.csv')

In [6]:
rating = pd.read_csv('./ml-latest-small/ratings.csv')

In [7]:
tags = pd.read_csv('./ml-latest-small/tags.csv')

In [8]:
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [9]:
movies.isna().sum()

movieId    0
title      0
genres     0
dtype: int64

In [10]:
movies.title.value_counts()

War of the Worlds (2005)                  2
Eros (2004)                               2
Confessions of a Dangerous Mind (2002)    2
Emma (1996)                               2
Saturn 3 (1980)                           2
                                         ..
North (1994)                              1
Barcelona (1994)                          1
It's a Boy Girl Thing (2006)              1
Thing: Terror Takes Shape, The (1998)     1
Jeremiah Johnson (1972)                   1
Name: title, Length: 9737, dtype: int64

In [11]:
#Check to see if there are any duplicate titles
movies.title.duplicated().sum()

5

In [12]:
#Drop the 5 duplicated movie titles
movies.drop_duplicates(subset='title', inplace=True)

In [13]:
#Sanity check to ensure all duplicates were dropped from title column

movies.title.duplicated().sum()

0

In [14]:
movies[movies.title.duplicated() == True]

Unnamed: 0,movieId,title,genres


In [15]:
#Check length of DataFrame

len(movies)

9737

In [16]:
# Split title column into two new columns: Title and year 

movies['Title'] = movies['title'].str.split('(', n=1, expand=True)[0]

movies['year_released'] = movies['title'].str.split('(', n=1, expand=True)[1]

In [17]:
# Drop original column title

movies.drop(columns='title', inplace=True)

In [18]:
# Sanity Check 
movies

Unnamed: 0,movieId,genres,Title,year_released
0,1,Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995)
1,2,Adventure|Children|Fantasy,Jumanji,1995)
2,3,Comedy|Romance,Grumpier Old Men,1995)
3,4,Comedy|Drama|Romance,Waiting to Exhale,1995)
4,5,Comedy,Father of the Bride Part II,1995)
...,...,...,...,...
9737,193581,Action|Animation|Comedy|Fantasy,Black Butler: Book of the Atlantic,2017)
9738,193583,Animation|Comedy|Fantasy,No Game No Life: Zero,2017)
9739,193585,Drama,Flint,2017)
9740,193587,Action|Animation,Bungo Stray Dogs: Dead Apple,2018)


In [19]:
# Format year_released column

movies['year_released'] = movies.year_released.str.replace(')', '')

In [20]:
# Sanity Check to ensure formatting was completed
movies

Unnamed: 0,movieId,genres,Title,year_released
0,1,Adventure|Animation|Children|Comedy|Fantasy,Toy Story,1995
1,2,Adventure|Children|Fantasy,Jumanji,1995
2,3,Comedy|Romance,Grumpier Old Men,1995
3,4,Comedy|Drama|Romance,Waiting to Exhale,1995
4,5,Comedy,Father of the Bride Part II,1995
...,...,...,...,...
9737,193581,Action|Animation|Comedy|Fantasy,Black Butler: Book of the Atlantic,2017
9738,193583,Animation|Comedy|Fantasy,No Game No Life: Zero,2017
9739,193585,Drama,Flint,2017
9740,193587,Action|Animation,Bungo Stray Dogs: Dead Apple,2018


In [21]:
link

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0
...,...,...,...
9737,193581,5476944,432131.0
9738,193583,5914996,445030.0
9739,193585,6397426,479308.0
9740,193587,8391976,483455.0


In [22]:
rating

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [29]:
#converted timestamp to datetime
rating['timestamp'] = pd.to_datetime(rating['timestamp'], unit='s')

In [39]:
rating.rating.value_counts(normalize=True)

4.0    0.265957
3.0    0.198808
5.0    0.131015
3.5    0.130271
4.5    0.084801
2.0    0.074884
2.5    0.055040
1.0    0.027877
1.5    0.017762
0.5    0.013586
Name: rating, dtype: float64

In [41]:
rating.duplicated().sum()

0

In [42]:
rating.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [None]:
tags

In [None]:
movie_rating = movies.join(rating, on='movieId', lsuffix='movie_', rsuffix='rating_', how='outer')

In [None]:
movie_rating

In [None]:
movie_rating.dropna(inplace=True)

In [None]:
movie_rating['userId'].nunique()

In [None]:
stats = movie_rating[['rating', 'timestamp']].describe()
stats

In [None]:
print(dt.datetime.fromtimestamp(stats.loc['min', 'timestamp']))
print(dt.datetime.fromtimestamp(stats.loc['max', 'timestamp']))

## Split

In [None]:
movie_rating.columns

In [None]:
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(movie_rating[['userId', 'movieId', 'rating']], reader)

In [None]:
data

In [None]:
train, test = surprise.model_selection.train_test_split(data, random_state=42)

In [None]:
train

In [None]:
model = KNNBasic().fit(train)

In [None]:
model2 = SVD().fit(train)

In [None]:
model3 = NMF().fit(train)

In [None]:
model4 = knns.KNNWithMeans().fit(train)

## Tuning

In [None]:
model = KNNBasic()

In [None]:
train.

In [None]:
param_grid = {'k':[10, 50, 100],'min_k': [1, 5, 10]}
base_model = GridSearchCV(model,param_grid=param_grid,joblib_verbose=5)
base_model.fit(train)

In [None]:
# param_grid = {'n_factors':[20, 100],'n_epochs': [5, 10], 'lr_all': [0.002, 0.005],
#               'reg_all': [0.4, 0.6]}
# gs_model = GridSearchCV(SVD,param_grid=param_grid,n_jobs = -1,joblib_verbose=5)
# gs_model.fit(jokes)

## Cross Validation

In [None]:
CV1 = surprise.model_selection.split.KFold()

In [None]:
surprise.model_selection.split.RepeatedKFold()