# Netflix Movie Recommendation
* Notebook: https://www.kaggle.com/laowingkin/netflix-movie-recommendation
* Data: https://www.kaggle.com/netflix-inc/netflix-prize-data?select=qualifying.txt

In [1]:
import pandas as pd
import numpy as np
import math
import re
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from surprise import Reader, Dataset, SVD

'''
According to the documentation, the evaluate() method 
was deprecated in version 1.0.5 (functionally replaced 
by model_selection.cross_validate()) and was removed in version 1.1.0, 
which is likely what you have installed.
'''
# from surprise import evaluate 
from surprise.model_selection.validation import cross_validate

sns.set_style("darkgrid")

In [2]:
def readFile(file_path, rows=100000):
    data_dict = {'Cust_Id' : [], 'Movie_Id' : [], 'Rating' : [], 'Date' : []}
    f = open(file_path, "r")
    count = 0
    for line in f:
        count += 1
        if count > rows:
            break
            
        if ':' in line:
            movidId = line[:-2] # remove the last character ':'
            movieId = int(movidId)
        else:
            customerID, rating, date = line.split(',')
            data_dict['Cust_Id'].append(customerID)
            data_dict['Movie_Id'].append(movieId)
            data_dict['Rating'].append(rating)
            data_dict['Date'].append(date.rstrip("\n"))
    f.close()
            
    return pd.DataFrame(data_dict)

In [3]:
df1 = readFile('./data/netflix/combined_data_1.txt', rows=100000)
df2 = readFile('./data/netflix/combined_data_2.txt', rows=100000)
df3 = readFile('./data/netflix/combined_data_3.txt', rows=100000)
df4 = readFile('./data/netflix/combined_data_4.txt', rows=100000)

In [4]:
df1.head()

Unnamed: 0,Cust_Id,Movie_Id,Rating,Date
0,1488844,1,3,2005-09-06
1,822109,1,5,2005-05-13
2,885013,1,4,2005-10-19
3,30878,1,4,2005-12-26
4,823519,1,3,2004-05-03


In [5]:
df2.head()

Unnamed: 0,Cust_Id,Movie_Id,Rating,Date
0,2532865,4500,4,2005-07-26
1,573364,4500,3,2005-06-20
2,1696725,4500,3,2004-02-27
3,1253431,4500,3,2004-03-31
4,1265574,4500,2,2003-09-01


In [6]:
df3.head()

Unnamed: 0,Cust_Id,Movie_Id,Rating,Date
0,1277134,9211,1,2003-12-02
1,2435457,9211,2,2005-06-01
2,2338545,9211,3,2001-02-17
3,2218269,9211,1,2002-12-27
4,441153,9211,4,2002-10-11


In [7]:
df4.head()

Unnamed: 0,Cust_Id,Movie_Id,Rating,Date
0,2385003,13368,4,2004-07-08
1,659432,13368,3,2005-03-16
2,751812,13368,2,2002-12-16
3,2625420,13368,2,2004-05-25
4,1650301,13368,1,2005-08-30


In [8]:
df1['Rating'] = df1['Rating'].astype(float)
df2['Rating'] = df2['Rating'].astype(float)
df3['Rating'] = df3['Rating'].astype(float)
df4['Rating'] = df4['Rating'].astype(float)

In [9]:
df = df1.copy()
df = df.append(df2)
df = df.append(df3)
df = df.append(df4)

df.index = np.arange(0,len(df))
df.shape

(399899, 4)

In [10]:
df.head(10)

Unnamed: 0,Cust_Id,Movie_Id,Rating,Date
0,1488844,1,3.0,2005-09-06
1,822109,1,5.0,2005-05-13
2,885013,1,4.0,2005-10-19
3,30878,1,4.0,2005-12-26
4,823519,1,3.0,2004-05-03
5,893988,1,3.0,2005-11-17
6,124105,1,4.0,2004-08-05
7,1248029,1,3.0,2004-04-22
8,1842128,1,4.0,2004-05-09
9,2238063,1,3.0,2005-05-11


In [11]:
df['Movie_Id'].value_counts()

28       39752
13384    37884
4506     33731
4520     26500
9235     20214
         ...  
9221       105
4502       104
9           95
7           93
9227        88
Name: Movie_Id, Length: 101, dtype: int64

In [18]:
df_title = pd.read_csv('./data/netflix/movie_titles.csv', encoding = "ISO-8859-1", header = None, names = ['Movie_Id', 'Year', 'Name'])
df_title.head(10)

Unnamed: 0,Movie_Id,Year,Name
0,1,2003.0,Dinosaur Planet
1,2,2004.0,Isle of Man TT 2004 Review
2,3,1997.0,Character
3,4,1994.0,Paula Abdul's Get Up & Dance
4,5,2004.0,The Rise and Fall of ECW
5,6,1997.0,Sick
6,7,1992.0,8 Man
7,8,2004.0,What the #$*! Do We Know!?
8,9,1991.0,Class of Nuke 'Em High 2
9,10,2001.0,Fighter


In [13]:
reader = Reader()

data = Dataset.load_from_df(df[['Cust_Id', 'Movie_Id', 'Rating']], reader)
svd = SVD()
# Run 5-fold cross-validation and print results
cross_validate(svd, data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0186  1.0161  1.0157  1.0136  1.0190  1.0166  0.0020  
MAE (testset)     0.8120  0.8077  0.8083  0.8058  0.8083  0.8084  0.0020  
Fit time          16.85   15.99   15.30   15.41   15.17   15.74   0.62    
Test time         0.66    0.67    0.65    0.67    0.65    0.66    0.01    


{'test_rmse': array([1.01858453, 1.01612624, 1.01568493, 1.0136429 , 1.01899016]),
 'test_mae': array([0.81203629, 0.80771617, 0.80829617, 0.80582657, 0.80827487]),
 'fit_time': (16.84959387779236,
  15.994084119796753,
  15.30280089378357,
  15.407926082611084,
  15.16874074935913),
 'test_time': (0.657844066619873,
  0.6682610511779785,
  0.6464948654174805,
  0.6709449291229248,
  0.6467597484588623)}

In [14]:
df_785314 = df[(df['Cust_Id'] == '785314') & (df['Rating'] >= 5)]
df_785314 = df_785314.set_index('Movie_Id')
df_785314 = df_785314.join(df_title)['Name']
df_785314.head(df_785314.shape[0])

Movie_Id
9236                       The Witches
13378    Kim Possible: A Sitch in Time
Name: Name, dtype: object

In [15]:
# getting full dataset
# data = Dataset.load_from_df(df[['Cust_Id', 'Movie_Id', 'Rating']], reader)

trainset = data.build_full_trainset()
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x12d599310>

In [16]:
titles = df_title.copy()

titles['Estimate_Score'] = titles['Movie_Id'].apply(lambda x: svd.predict(785314, x).est)
titles = titles.sort_values(by=['Estimate_Score'], ascending=False)

In [17]:
titles.head(10)

Unnamed: 0,Movie_Id,Year,Name,Estimate_Score
12,13,2003.0,Lord of the Rings: The Return of the King: Ext...,4.486798
9235,9236,1998.0,South Park: Season 2,4.122711
4505,4506,1961.0,Breakfast at Tiffany's,4.044488
24,25,1997.0,Inspector Morse 31: Death Is Now My Neighbour,4.008793
13379,13380,1949.0,Stray Dog,3.990185
4508,4509,1977.0,Little House on the Prairie: Season 4,3.989363
4520,4521,2002.0,Wire in the Blood: Justice Painted Blind,3.982499
13374,13375,1963.0,Andy Griffith Show: Classic Favorites,3.953485
13376,13377,1963.0,Winter Light,3.918503
4,5,2004.0,The Rise and Fall of ECW,3.91657
