# Movie recommendation using MovieLens data

Importing required packages

In [144]:
import pandas as pd
import graphlab
from sklearn.model_selection  import train_test_split,KFold,cross_val_predict
from matplotlib import pyplot as plt
import numpy as np
import nltk

## Data exploration

Importing datasets from csv file

In [74]:
links = pd.read_csv("links.csv")
movies = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")
tags = pd.read_csv("tags.csv")

Exploring Links dataset

In [75]:
#checking the head of the links dataset (at movie ID level)
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [76]:
#checking the shape of the links dataset
links_shape = links.shape
print("Shape of links dataset = {}".format(links_shape))

Shape of links dataset = (9125, 3)


In [77]:
#number of unique movie Ids in the links dataset
uni_movieIds_links = links["movieId"].nunique()
print("No of unique movieIds in links dataset = {}".format(uni_movieIds_links))

No of unique movieIds in links dataset = 9125


In [78]:
#checking if a column has null values 
links.isnull().any()

movieId    False
imdbId     False
tmdbId      True
dtype: bool

In [79]:
#looking at rows with missing tmdbid
links[links["tmdbId"].isnull()].head()

Unnamed: 0,movieId,imdbId,tmdbId
607,720,118114,
608,721,114103,
640,769,116992,
910,1133,111357,
2282,2851,81454,


In [80]:
#number of movies from which tmdbid is missing (13)
missing_tmdbId = links.loc[links["tmdbId"].isnull(), "movieId"].nunique()
print("{} movies have missing tmdbId".format(missing_tmdbId))

13 movies have missing tmdbId


In [81]:
#Replacing the missing TmdbId with Zero
links.loc[(links.tmdbId.isnull()) , "tmdbId"] = 0

Exploring Movies Dataset

In [82]:
#checking the head of the movies dataset (at movie ID level)
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [83]:
#checking the shape of the movies dataset
movies_shape = movies.shape
print("Shape of movies dataset = {}".format(movies_shape))

Shape of movies dataset = (9125, 3)


In [84]:
#Data is unique at movie Id level
no_movies = movies["movieId"].nunique()
print("No of unique movies in movies dataset = {}".format(no_movies))

No of unique movies in movies dataset = 9125


In [85]:
#checking for mising values 
movies.isnull().any()

movieId    False
title      False
genres     False
dtype: bool

In [86]:
#checking if the genres have not been listed
no_genre = movies.loc[movies.genres == "(no genres listed)"].movieId.count()
print("{} movies dont have genres listed".format(no_genre) )

18 movies dont have genres listed


In [87]:
#creating a new dataset with genres split into different columns
movies2 = pd.DataFrame(movies.genres.str.split('|').tolist(), columns = ["genres1","genres2","genres3","genres4","genres5","genres6","genres7","genres8","genres9","genres10"])

In [88]:
#concatenating the two datasets to create the final movies dataset
movies_new = pd.concat([movies,movies2],axis = 1)
movies_new.head()

Unnamed: 0,movieId,title,genres,genres1,genres2,genres3,genres4,genres5,genres6,genres7,genres8,genres9,genres10
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Adventure,Animation,Children,Comedy,Fantasy,,,,,
1,2,Jumanji (1995),Adventure|Children|Fantasy,Adventure,Children,Fantasy,,,,,,,
2,3,Grumpier Old Men (1995),Comedy|Romance,Comedy,Romance,,,,,,,,
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Comedy,Drama,Romance,,,,,,,
4,5,Father of the Bride Part II (1995),Comedy,Comedy,,,,,,,,,


In [89]:
#creating the year of release dataset from the title column
movies_new["Year_release"] = movies_new['title'].str.extract('(\d\d\d\d)', expand=True)
movies_new.head()

Unnamed: 0,movieId,title,genres,genres1,genres2,genres3,genres4,genres5,genres6,genres7,genres8,genres9,genres10,Year_release
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Adventure,Animation,Children,Comedy,Fantasy,,,,,,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Adventure,Children,Fantasy,,,,,,,,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Comedy,Romance,,,,,,,,,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Comedy,Drama,Romance,,,,,,,,1995
4,5,Father of the Bride Part II (1995),Comedy,Comedy,,,,,,,,,,1995


In [90]:
movies_new.isnull().any()

movieId         False
title           False
genres          False
genres1         False
genres2          True
genres3          True
genres4          True
genres5          True
genres6          True
genres7          True
genres8          True
genres9          True
genres10         True
Year_release     True
dtype: bool

In [93]:
#No of movies with missing year of release 
missing_year = movies_new.loc[movies_new['Year_release'].isnull(),'movieId'].nunique()
print("{} movies have missing year of release".format(missing_year))

0 movies have missing year of release


In [92]:
#Adding the missing Year_release for 4 movies manually
movies_new.loc[9017,"Year_release"] = "2015"
movies_new.loc[9063,"Year_release"] = "2016"
movies_new.loc[9118,"Year_release"] = "2016"
movies_new.loc[9124,"Year_release"] = "2014"

In [94]:
#merging the the links and movies datasets as both are on movie ID level
movies_links = pd.merge(movies_new,links, on = "movieId")
movies_links.head()
movies_links.shape
movies_links_shape = movies_links.shape
print("Shape of movies_links dataset = {}".format(movies_links_shape))

Shape of movies_links dataset = (9125, 16)


In [95]:
movies_links.isnull().any()

movieId         False
title           False
genres          False
genres1         False
genres2          True
genres3          True
genres4          True
genres5          True
genres6          True
genres7          True
genres8          True
genres9          True
genres10         True
Year_release    False
imdbId          False
tmdbId          False
dtype: bool

In [96]:
#storing movie and links data as Sframe
movies_links.to_csv("movies_links.csv")
movie_metadata = graphlab.SFrame.read_csv('movies_links.csv' )

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[int,int,str,str,str,str,str,str,str,str,str,str,str,str,int,int,float]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


Exploring Ratings Data

In [97]:
#checking the head of the ratings data (unique at userID, movieID, timestamp level)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [98]:
#checking the shape of the ratings data
ratings_shape = ratings.shape
print("{} is the shape of ratings data".format(ratings_shape))

(100004, 4) is the shape of ratings data


In [99]:
#No missing entries in ratings data
ratings.isnull().any()

userId       False
movieId      False
rating       False
timestamp    False
dtype: bool

In [100]:
#No of unique users 671
m = ratings.nunique()
uni_user = m[0]
uni_movies = m[1]
uni_ratings = m[2]
print("No of unique users who have rated a movie = {} ".format(uni_user))
print("No of unique movies that have been rated = {} ".format(uni_movies))
print("No of unique values of ratings = {} ".format(uni_ratings))

AttributeError: 'DataFrame' object has no attribute 'nunique'

In [101]:
#unique values of ratings 
k = ratings['rating'].unique()
k = np.sort(k, kind= "quicksort")
print("Values of ratings =  {}".format(k))

Values of ratings =  [ 0.5  1.   1.5  2.   2.5  3.   3.5  4.   4.5  5. ]


In [102]:
#overall distribution of ratings 
ratings.groupby('rating').userId.count()

rating
0.5     1101
1.0     3326
1.5     1687
2.0     7271
2.5     4449
3.0    20064
3.5    10538
4.0    28750
4.5     7723
5.0    15095
Name: userId, dtype: int64

In [103]:
#Every customer has rated a movie only once
ratings.groupby(['userId','movieId']).userId.count().value_counts()

1    100004
Name: userId, dtype: int64

In [104]:
#converting time since epoch to datetime 
ratings["timestamp"] = pd.to_datetime(ratings['timestamp'], unit='ms')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1970-01-15 14:12:39.144
1,1,1029,3.0,1970-01-15 14:12:39.179
2,1,1061,3.0,1970-01-15 14:12:39.182
3,1,1129,2.0,1970-01-15 14:12:39.185
4,1,1172,4.0,1970-01-15 14:12:39.205


Explaoring Tags data

In [105]:
#checking tags data 
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,15,339,sandra 'boring' bullock,1138537770
1,15,1955,dentist,1193435061
2,15,7478,Cambodia,1170560997
3,15,32892,Russian,1170626366
4,15,34162,forgettable,1141391765


In [106]:
#converting time since epoch to datetime 
tags["timestamp"] = pd.to_datetime(tags['timestamp'], unit='ms')
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,15,339,sandra 'boring' bullock,1970-01-14 04:15:37.770
1,15,1955,dentist,1970-01-14 19:30:35.061
2,15,7478,Cambodia,1970-01-14 13:09:20.997
3,15,32892,Russian,1970-01-14 13:10:26.366
4,15,34162,forgettable,1970-01-14 05:03:11.765


In [107]:
#shape of tags data
tags_shape = tags.shape
print("Shape of tags dataset = {}".format(tags_shape))

Shape of tags dataset = (1296, 4)


In [108]:
#No of unique values in tags dataset
m = tags.nunique()
uni_user = m[0]
uni_movies = m[1]
uni_tags = m[2]
print("No of unique users who have tagged a movie = {} ".format(uni_user))
print("No of unique movies that have been tagged = {} ".format(uni_movies))
print("No of unique values of tags = {} ".format(uni_tags))

AttributeError: 'DataFrame' object has no attribute 'nunique'

In [109]:
#checking Null values in the tags dataset
tags.isnull().any()

userId       False
movieId      False
tag          False
timestamp    False
dtype: bool

In [111]:
#looking at data with more than 5 tags for a movie
df = tags.groupby(['userId','movieId']).count()
df.loc[df['tag'] > 5].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,tag,timestamp
userId,movieId,Unnamed: 2_level_1,Unnamed: 3_level_1
138,48780,6,6
138,79132,7,7
138,109487,12,12
212,64957,8,8
212,66097,7,7


In [112]:
#creating combined tag for a particular user movie combination
tags_new =tags.groupby(['userId','movieId'], as_index=False).sum()

In [113]:
tags_new.head()

Unnamed: 0,userId,movieId,tag
0,15,339,sandra 'boring' bullock
1,15,1955,dentist
2,15,7478,Cambodia
3,15,32892,Russian
4,15,34162,forgettable


In [114]:
#New shape of tags dataset
tags_shape = tags_new.shape
print("shape of tags dataset = {}".format(tags_shape))

shape of tags dataset = (772, 3)


In [115]:
#merging the ratings and tags data : information at user level
ratings_tags = pd.merge(ratings,tags_new[['userId','movieId','tag']],how = 'left', left_on = ['userId','movieId'], right_on = ['userId','movieId'])

In [116]:
#shape of the new ratings _tags dataset
ratings_tags_shape = ratings_tags.shape
print("shape of ratings_tags dataset = {}".format(ratings_tags_shape))

shape of ratings_tags dataset = (100004, 5)


In [117]:
#checking data for a particular entry
ratings_tags.loc[(ratings_tags.userId == 15) & (ratings_tags.movieId == 339)]

Unnamed: 0,userId,movieId,rating,timestamp,tag
1048,15,339,2.5,1970-01-13 23:49:36.622,sandra 'boring' bullock


In [118]:
#storing ratings and tags data as Sframe
ratings_tags.to_csv("ratings_tags.csv")
user_metadata = graphlab.SFrame.read_csv('ratings_tags.csv' )

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[int,int,int,float,str,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


In [119]:
#Including the movie information to the ratings_tags data
all_data = pd.merge(ratings_tags, movies_links,how = 'left', left_on = ['movieId'], right_on = ['movieId'])

In [120]:
#checking the head of the combined data
all_data.head()

Unnamed: 0,userId,movieId,rating,timestamp,tag,title,genres,genres1,genres2,genres3,genres4,genres5,genres6,genres7,genres8,genres9,genres10,Year_release,imdbId,tmdbId
0,1,31,2.5,1970-01-15 14:12:39.144,,Dangerous Minds (1995),Drama,Drama,,,,,,,,,,1995,112792,9909.0
1,1,1029,3.0,1970-01-15 14:12:39.179,,Dumbo (1941),Animation|Children|Drama|Musical,Animation,Children,Drama,Musical,,,,,,,1941,33563,11360.0
2,1,1061,3.0,1970-01-15 14:12:39.182,,Sleepers (1996),Thriller,Thriller,,,,,,,,,,1996,117665,819.0
3,1,1129,2.0,1970-01-15 14:12:39.185,,Escape from New York (1981),Action|Adventure|Sci-Fi|Thriller,Action,Adventure,Sci-Fi,Thriller,,,,,,,1981,82340,1103.0
4,1,1172,4.0,1970-01-15 14:12:39.205,,Cinema Paradiso (Nuovo cinema Paradiso) (1989),Drama,Drama,,,,,,,,,,1989,95765,11216.0


In [121]:
#shape of the combined data
all_data_shape = all_data.shape
print("shape of combined dataset = {}".format(all_data_shape))

shape of combined dataset = (100004, 20)


In [122]:
#converting the tags column to string type
all_data['tag'] = all_data['tag'].astype(str)

In [123]:
#checking data type of all columns 
all_data.dtypes

userId                   int64
movieId                  int64
rating                 float64
timestamp       datetime64[ns]
tag                     object
title                   object
genres                  object
genres1                 object
genres2                 object
genres3                 object
genres4                 object
genres5                 object
genres6                 object
genres7                 object
genres8                 object
genres9                 object
genres10                object
Year_release            object
imdbId                   int64
tmdbId                 float64
dtype: object

# Model based recommendation with Graphlabs

In [None]:
#splitting the data into test and train 80/20
train_data, test_data  = train_test_split(all_data , test_size=0.2, random_state=42)

In [None]:
#shape of train data
train_shape = train_data.shape
print("shape of train dataset = {}".format(train_shape))

In [None]:
#No of unique values in train data
m = train_data.nunique()
uni_user = m[0]
uni_movies = m[1]
uni_ratings = m[2]
print("No of unique users who have rated a movie = {} ".format(uni_user))
print("No of unique movies that have been rated = {} ".format(uni_movies))
print("No of unique values of ratings = {} ".format(uni_ratings))

In [None]:
#shape of test dataset
test_shape = test_data.shape
print("shape of train dataset = {}".format(test_shape))

In [None]:
#No of unique values in test data
m = test_data.nunique()
uni_user = m[0]
uni_movies = m[1]
uni_ratings = m[2]
print("No of unique users who have rated a movie = {} ".format(uni_user))
print("No of unique movies that have been rated = {} ".format(uni_movies))
print("No of unique values of ratings = {} ".format(uni_ratings))

In [None]:
#Keeping a copy of the test and train dataframes to use for sampling later
trainData = train_data
testData = test_data
#verifying
print(trainData.shape)
print(testData.shape)

In [None]:
#exporting the test and train dataset to csv 
train_data.to_csv('train_data.csv')
test_data.to_csv('test_data.csv')

In [None]:
#converting the test and train dataset to Sframe
hints = [int,int ,int ,float,str, str, str, str, str, str,str,str,str,str,str,str, str,str,float, int, float]
train = graphlab.SFrame.read_csv('train_data.csv',column_type_hints = hints )
test = graphlab.SFrame.read_csv('test_data.csv',column_type_hints = hints )

Using the follwoing three approaches to solve the regression.
* Adaptive Gradient Stochastic Gradient Descent
* Stochastic Gradient Descent
* Alternating Least Squares

In [None]:
#building a factorization_recommender model using the graphLab package with 4 latent factors
#Using adaptive gradient stochastic descent 
m1 = graphlab.factorization_recommender.create(train, target='rating', user_id='userId', item_id='movieId',solver= "adagrad" , user_data = user_metadata, item_data = movie_metadata, num_factors=4)

In [None]:
#Using stochastic gradient descent 
m2 = graphlab.factorization_recommender.create(train, target='rating', user_id='userId', item_id='movieId', user_data = user_metadata, item_data = movie_metadata ,solver= "sgd" , num_factors=4)


In [None]:
#Using Implicit Alternating Least Squares 
m3 = graphlab.factorization_recommender.create(train, target='rating', user_id='userId', item_id='movieId',solver= "als" ,user_data = user_metadata, item_data = movie_metadata , num_factors=4)


Comparing the performance of the three models on the test data, to finalize the approach

In [None]:
 model_comp = graphlab.compare(test, [m1,m2, m3])

In [None]:
m1.evaluate_rmse(test,target='rating')

In [None]:
m2.evaluate_rmse(test,target='rating')

In [None]:
m3.evaluate_rmse(test,target='rating')

ADA model performance for different value of max iterations

In [None]:
#25 iterations
ada_m1 = graphlab.factorization_recommender.create(train, target='rating', user_id='userId', item_id='movieId',solver= "adagrad",user_data = user_metadata, item_data = movie_metadata  ,max_iterations=25,num_factors=4)

In [None]:
#50 iterations
ada_m2 = graphlab.factorization_recommender.create(train, target='rating', user_id='userId', item_id='movieId',user_data = user_metadata, item_data = movie_metadata,solver= "adagrad" ,max_iterations=50,num_factors=4)

In [None]:
ada_m2['coefficients']

Creating a visualization of the Model to helps explore and qualitatively evaluate the recommendations made by the model

In [None]:
view = ada_m2.views.explore(train)
view.show()

In [None]:
 ada_model_iter = graphlab.compare(test, [ada_m1, ada_m2])

In [None]:
ada_m1.evaluate_rmse(test,target='rating')

In [None]:
ada_m2.evaluate_rmse(test,target='rating')

In [None]:
#use ada_m2 to predict ratings 
ada_m2.recommend(users=range(1,11),k=1000)

Model performance for different values of L2 regularization

In [None]:
ada_R1 = graphlab.factorization_recommender.create(train, target='rating', user_id='userId', item_id='movieId',user_data = user_metadata, item_data = movie_metadata,solver= "adagrad" ,regularization=1e-10,num_factors=4)


In [None]:
ada_R2 = graphlab.factorization_recommender.create(train, target='rating', user_id='userId', item_id='movieId',user_data = user_metadata, item_data = movie_metadata,solver= "adagrad" ,regularization=1e-8,num_factors=4)


In [None]:
ada_R3 = graphlab.factorization_recommender.create(train, target='rating', user_id='userId', item_id='movieId',user_data = user_metadata, item_data = movie_metadata,solver= "adagrad" ,regularization=1e-6,num_factors=4)


In [None]:
ada_R4 = graphlab.factorization_recommender.create(train, target='rating', user_id='userId', item_id='movieId',user_data = user_metadata, item_data = movie_metadata,solver= "adagrad" ,regularization=1e-4,num_factors=4)


In [None]:
ada_R5 = graphlab.factorization_recommender.create(train, target='rating', user_id='userId', item_id='movieId',user_data = user_metadata, item_data = movie_metadata,solver= "adagrad" ,regularization=1e-2,num_factors=4)


Precision-Recall Plot of ada_R1,ada_R2,ada_R3,ada_R4,ada_R5

In [None]:
model_comp1 = graphlab.compare(test, [ada_R1,ada_R2,ada_R3,ada_R4,ada_R5],target='rating')
model_comp1_df = model_comp1.to_dataframe()

fig, ax = plt.subplots(1, 1)
for i in range(0,len(model_comp1_df.index)):
    xy_dict= model_comp1_df.loc[i,"results"]['precision_recall']
    xy = pd.DataFrame(xy_dict)    
    label1= model_comp1_df.loc[i,"model"]
    xy.plot(x='recall',y= 'precision',ax=ax,label=label1)
    
plt.show()


In [None]:
model_comp1 =graphlab.recommender.util.compare_models(test, [ada_R1,ada_R2,ada_R3,ada_R4,ada_R5], model_names=None,
                                         user_sample=1.0, metric='rmse',
                                         target='rating', exclude_known_for_precision_recall=True, 
                                         make_plot=False, verbose=False)
#fig, ax = plt.subplots(1, 1)
rmse_data = {'model':['ada_R1','ada_R2','ada_R3','ada_R4','ada_R5'], 'rmse':[]}
for i in range(0,len(model_comp1)):
    rmse_data['rmse'].append(model_comp1[i]['rmse_overall'])
    

rmse_df=pd.DataFrame(rmse_data)
rmse_df.plot(x='model',y='rmse')
plt.show()

In [None]:
ada_R1.evaluate_rmse(test,target='rating')

In [None]:
ada_R2.evaluate_rmse(test,target='rating')

In [None]:
ada_R3.evaluate_rmse(test,target='rating')

In [None]:
ada_NF1 = graphlab.factorization_recommender.create(train, target='rating', user_id='userId', item_id='movieId',user_data = user_metadata, item_data = movie_metadata,solver= "adagrad" ,regularization=1e-6,num_factors=2)


In [None]:
ada_NF2 = graphlab.factorization_recommender.create(train, target='rating', user_id='userId', item_id='movieId',user_data = user_metadata, item_data = movie_metadata,solver= "adagrad" ,regularization=1e-6,num_factors=4)


In [None]:
ada_NF3 = graphlab.factorization_recommender.create(train, target='rating', user_id='userId', item_id='movieId',user_data = user_metadata, item_data = movie_metadata,solver= "adagrad" ,regularization=1e-6,num_factors=6)


In [None]:
ada_NF4 = graphlab.factorization_recommender.create(train, target='rating', user_id='userId', item_id='movieId',user_data = user_metadata, item_data = movie_metadata,solver= "adagrad" ,regularization=1e-6,num_factors=8)


In [None]:
ada_NF5 = graphlab.factorization_recommender.create(train, target='rating', user_id='userId', item_id='movieId',user_data = user_metadata, item_data = movie_metadata,solver= "adagrad" ,regularization=1e-6,num_factors=10)


In [None]:
model_comp1 = graphlab.compare(test, [ada_NF1,ada_NF2,ada_NF3,ada_NF4,ada_NF5],target='rating')
model_comp1_df = model_comp1.to_dataframe()

In [None]:
fig, ax = plt.subplots(1, 1)
for i in range(0,len(model_comp1_df.index)):
    xy_dict= model_comp1_df.loc[i,"results"]['precision_recall']
    xy = pd.DataFrame(xy_dict)    
    #xy.plot(x='recall',y= 'precision',ax=ax)
    label1= model_comp1_df.loc[i,"model"]
    xy.plot(x='recall',y= 'precision',ax=ax,label=label1)
    
plt.show()

In [None]:
model_comp1 =graphlab.recommender.util.compare_models(test, [ada_NF1,ada_NF2,ada_NF3,ada_NF4,ada_NF5], model_names=None,
                                         user_sample=1.0, metric='rmse',
                                         target='rating', exclude_known_for_precision_recall=True, 
                                         make_plot=False, verbose=False)
#fig, ax = plt.subplots(1, 1)
rmse_data = {'model':['ada_NF1','ada_NF2','ada_NF3','ada_NF4','ada_NF5'], 'rmse':[]}
for i in range(0,len(model_comp1)):
    rmse_data['rmse'].append(model_comp1[i]['rmse_overall'])
    

rmse_df=pd.DataFrame(rmse_data)
rmse_df.plot(x='model',y='rmse')
plt.show()


# Evaluating Model Performance For Different Sample Sizes

In [None]:
train_sample1 = trainData.sample(frac =0.25,replace =False)
train_sample2 = trainData.sample(frac =0.5,replace =False)
train_sample3 = trainData.sample(frac =0.75,replace =False)
print("Training sample-1 shape - {}".format(train_sample1.shape))
print("Training sample-2 shape - {}".format(train_sample2.shape))
print("Training sample-3 shape - {}".format(train_sample3.shape))

In [None]:
#converting data to csv for Sframe parsing
train_sample1.to_csv("train_data1.csv")
train_sample2.to_csv("train_data2.csv")
train_sample3.to_csv("train_data3.csv")

In [None]:
#converting the train datasets to Sframe
hints = [int,int ,int ,float,str, str, str, str, str, str,str,str,str,str,str,str, str,str,float, int, float]
train1 = graphlab.SFrame.read_csv('train_data1.csv',column_type_hints = hints )
train2 = graphlab.SFrame.read_csv('train_data2.csv',column_type_hints = hints )
train3 = graphlab.SFrame.read_csv('train_data3.csv',column_type_hints = hints )

Building a factorization_recommender model using the graphLab package with 4 latent factors
Using adaptive gradient stochastic descent 

In [None]:
m1_t1 = graphlab.factorization_recommender.create(train1, target='rating', user_id='userId',
                                               item_id='movieId',solver= "adagrad" , user_data = user_metadata,
                                               item_data = movie_metadata, num_factors=4)
m1_t2 = graphlab.factorization_recommender.create(train2, target='rating', user_id='userId',
                                               item_id='movieId',solver= "adagrad" , user_data = user_metadata,
                                               item_data = movie_metadata, num_factors=4)
m1_t3 = graphlab.factorization_recommender.create(train3, target='rating', user_id='userId',
                                               item_id='movieId',solver= "adagrad" , user_data = user_metadata,
                                               item_data = movie_metadata, num_factors=4)


In [None]:
graphlab.recommender.util.compare_models(test, [m1_t1,m1_t2,m1_t3,m1], model_names=["m1_t1","m1_t2","m1_t3","m1"],
                                         user_sample=1.0, metric='auto',
                                         target='rating', exclude_known_for_precision_recall=True, 
                                         make_plot=True, verbose=False)

In [None]:
model_comp1 = graphlab.compare(test, [m1_t1,m1_t2,m1_t3,m1],target='rating')

# Plotting Model Performance Using The Evaluation Results (to-do)

In [None]:
model_comp1.dtype

In [None]:
model_comp1_df = model_comp1.to_dataframe()
xy_dict= model_comp1_df.loc[0,"results"]
xy_dict

In [None]:
fig, ax = plt.subplots(1, 1)
for i in range(0,len(model_comp1_df.index)):
    xy_dict= model_comp1_df.loc[i,"results"]['precision_recall']
    xy = pd.DataFrame(xy_dict)    
    xy.plot(x='recall',y= 'precision',ax=ax)
plt.show()

# Cross Validation 

In [138]:
#K-Fold data split
all_data.to_csv("full_dataset.csv")
hints = [int,int ,int ,float,str, str, str, str, str, str,str,str,str,str,str,str, str,str,float, int, float]
data = graphlab.SFrame.read_csv('full_dataset.csv',column_type_hints= hints)

data.head()

kfolds = graphlab.cross_validation.KFold(data, 5)
for train, test in kfolds:    
    model = graphlab.factorization_recommender.create(train, target='rating', user_id='userId',
                                               item_id='movieId',solver= "adagrad" , user_data = user_metadata,
                                              item_data = movie_metadata, num_factors=4)
    print(model.evaluate(test))




Precision and recall summary statistics by cutoff
+--------+----------------+-------------+
| cutoff | mean_precision | mean_recall |
+--------+----------------+-------------+
|   1    |      0.0       |     0.0     |
|   2    |      0.0       |     0.0     |
|   3    |      0.0       |     0.0     |
|   4    |      0.0       |     0.0     |
|   5    |      0.0       |     0.0     |
|   6    |      0.0       |     0.0     |
|   7    |      0.0       |     0.0     |
|   8    |      0.0       |     0.0     |
|   9    |      0.0       |     0.0     |
|   10   |      0.0       |     0.0     |
+--------+----------------+-------------+
[10 rows x 3 columns]

('\nOverall RMSE: ', 1.0144651692126367)

Per User RMSE (best)
+--------+-------+----------------+
| userId | count |      rmse      |
+--------+-------+----------------+
|   72   |  191  | 0.499426973379 |
+--------+-------+----------------+
[1 rows x 3 columns]


Per User RMSE (worst)
+--------+-------+---------------+
| userId | coun


Precision and recall summary statistics by cutoff
+--------+------------------+-------------------+
| cutoff |  mean_precision  |    mean_recall    |
+--------+------------------+-------------------+
|   1    |     0.03125      |  0.00018883183251 |
|   2    |     0.015625     |  0.00018883183251 |
|   3    |      0.0125      | 0.000215888109566 |
|   4    |     0.009375     | 0.000215888109566 |
|   5    |      0.0075      | 0.000215888109566 |
|   6    |     0.00625      | 0.000215888109566 |
|   7    | 0.00535714285714 | 0.000215888109566 |
|   8    |    0.0046875     | 0.000215888109566 |
|   9    | 0.00416666666667 | 0.000215888109566 |
|   10   |     0.00375      | 0.000215888109566 |
+--------+------------------+-------------------+
[10 rows x 3 columns]

('\nOverall RMSE: ', 0.984820434213491)

Per User RMSE (best)
+--------+-------+---------------+
| userId | count |      rmse     |
+--------+-------+---------------+
|  229   |   33  | 0.48429334213 |
+--------+-------+------


Precision and recall summary statistics by cutoff
+--------+----------------+-------------+
| cutoff | mean_precision | mean_recall |
+--------+----------------+-------------+
|   1    |      0.0       |     0.0     |
|   2    |      0.0       |     0.0     |
|   3    |      0.0       |     0.0     |
|   4    |      0.0       |     0.0     |
|   5    |      0.0       |     0.0     |
|   6    |      0.0       |     0.0     |
|   7    |      0.0       |     0.0     |
|   8    |      0.0       |     0.0     |
|   9    |      0.0       |     0.0     |
|   10   |      0.0       |     0.0     |
+--------+----------------+-------------+
[10 rows x 3 columns]

('\nOverall RMSE: ', 0.9917488847414712)

Per User RMSE (best)
+--------+-------+----------------+
| userId | count |      rmse      |
+--------+-------+----------------+
|  397   |   59  | 0.505949904847 |
+--------+-------+----------------+
[1 rows x 3 columns]


Per User RMSE (worst)
+--------+-------+---------------+
| userId | coun


Precision and recall summary statistics by cutoff
+--------+------------------+-------------------+
| cutoff |  mean_precision  |    mean_recall    |
+--------+------------------+-------------------+
|   1    |       0.0        |        0.0        |
|   2    | 0.00434782608696 | 5.53510641242e-06 |
|   3    | 0.0202898550725  | 0.000132848175022 |
|   4    | 0.0173913043478  | 0.000138383281435 |
|   5    |  0.015652173913  |  0.00017082974477 |
|   6    | 0.0130434782609  |  0.00017082974477 |
|   7    | 0.0111801242236  |  0.00017082974477 |
|   8    | 0.0108695652174  | 0.000185226519893 |
|   9    | 0.00966183574879 | 0.000185226519893 |
|   10   | 0.00869565217391 | 0.000185226519893 |
+--------+------------------+-------------------+
[10 rows x 3 columns]

('\nOverall RMSE: ', 0.9861612134221129)

Per User RMSE (best)
+--------+-------+----------------+
| userId | count |      rmse      |
+--------+-------+----------------+
|  494   |   78  | 0.594704618082 |
+--------+-------+-


Precision and recall summary statistics by cutoff
+--------+------------------+-------------------+
| cutoff |  mean_precision  |    mean_recall    |
+--------+------------------+-------------------+
|   1    |      0.016       | 0.000146270627063 |
|   2    |      0.008       | 0.000146270627063 |
|   3    | 0.00533333333333 | 0.000146270627063 |
|   4    |      0.004       | 0.000146270627063 |
|   5    |      0.0032      | 0.000146270627063 |
|   6    | 0.00533333333333 | 0.000171899740497 |
|   7    | 0.00457142857143 | 0.000171899740497 |
|   8    |      0.004       | 0.000171899740497 |
|   9    | 0.00444444444444 | 0.000198926767524 |
|   10   |      0.0056      | 0.000265197394586 |
+--------+------------------+-------------------+
[10 rows x 3 columns]

('\nOverall RMSE: ', 0.9891129559476167)

Per User RMSE (best)
+--------+-------+----------------+
| userId | count |      rmse      |
+--------+-------+----------------+
|  583   |   20  | 0.581184344515 |
+--------+-------+-

In [139]:
params = dict([('target', 'rating'), ('user_id', 'userId'), ('item_id', 'movieId'), ('solver', 'adagrad')
               , ('user_data', 'user_metadata'), ('item_data', 'movie_metadata'), ('num_factors', '4')])
graphlab.toolkits.cross_validation.cross_val_score(kfolds, graphlab.factorization_recommender.create, params)


[INFO] graphlab.deploy.job: Validating job.
[INFO] graphlab.deploy.map_job: Validation complete. Job: 'Cross-Validation-Nov-07-2017-17-28-35-717361-55734694' ready for execution
[INFO] graphlab.deploy.map_job: Job: 'Cross-Validation-Nov-07-2017-17-28-35-717361-55734694' scheduled.


Info
------
Job                : Cross-Validation-Nov-07-2017-17-28-35-717361-55734694
Function(s)        : _train_test_model-0-0, _train_test_model-0-1, _train_test_model-0-2 ... (total 6 functions).
Status             : Pending

Help
------
Visualize progress : self.show()
Query status       : self.get_status()
Get results        : self.get_results()

Environment
----------
LocalAsync: ["name": async]

Metrics
-------
Start time         : None
End time           : None
None

Execution Information
---------------------
Process pid          : 5251
Execution Directory  : /Users/B.Patel@ibm.com/.graphlab/artifacts/results/job-results-a7d39a85-0fbf-4501-82cb-753214bc84c4
Log file             : /Users/B.Patel@ibm.com/.graphlab/artifacts/results/job-results-a7d39a85-0fbf-4501-82cb-753214bc84c4/execution.log

In [141]:
#Sample fold
(train, test) = kfolds[1]
(train, test)

(Columns:
 	X1	int
 	userId	int
 	movieId	int
 	rating	float
 	timestamp	str
 	tag	str
 	title	str
 	genres	str
 	genres1	str
 	genres2	str
 	genres3	str
 	genres4	str
 	genres5	str
 	genres6	str
 	genres7	str
 	genres8	str
 	genres9	str
 	genres10	str
 	Year_release	float
 	imdbId	int
 	tmdbId	float
 
 Rows: 80003
 
 Data:
 +----+--------+---------+--------+-------------------------+-----+
 | X1 | userId | movieId | rating |        timestamp        | tag |
 +----+--------+---------+--------+-------------------------+-----+
 | 0  |   1    |    31   |  2.5   | 1970-01-15 14:12:39.144 | nan |
 | 1  |   1    |   1029  |  3.0   | 1970-01-15 14:12:39.179 | nan |
 | 2  |   1    |   1061  |  3.0   | 1970-01-15 14:12:39.182 | nan |
 | 3  |   1    |   1129  |  2.0   | 1970-01-15 14:12:39.185 | nan |
 | 4  |   1    |   1172  |  4.0   | 1970-01-15 14:12:39.205 | nan |
 | 5  |   1    |   1263  |  2.0   | 1970-01-15 14:12:39.151 | nan |
 | 6  |   1    |   1287  |  2.0   | 1970-01-15 14:12:39.187 | 

The cross-validation error for K=5 is .9933. 