# Movie recommendation using MovieLens data

Importing required packages

In [1]:
import pandas as pd
import graphlab
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import numpy as np
import nltk

This non-commercial license of GraphLab Create for academic use is assigned to ritu.tak@columbia.edu and will expire on November 01, 2018.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1509952183.log


## Data exploration

Importing datasets from csv file

In [2]:
links = pd.read_csv("links.csv")
movies = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")
tags = pd.read_csv("tags.csv")

Exploring Links dataset

In [3]:
#checking the head of the links dataset (at movie ID level)
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [4]:
#checking the shape of the links dataset
links_shape = links.shape
print("Shape of links dataset = {}".format(links_shape))

Shape of links dataset = (9125, 3)


In [5]:
#number of unique movie Ids in the links dataset
uni_movieIds_links = links["movieId"].nunique()
print("No of unique movieIds in links dataset = {}".format(uni_movieIds_links))

No of unique movieIds in links dataset = 9125


In [6]:
#checking if a column has null values 
links.isnull().any()

movieId    False
imdbId     False
tmdbId      True
dtype: bool

In [7]:
#looking at rows with missing tmdbid
links[links["tmdbId"].isnull()].head()

Unnamed: 0,movieId,imdbId,tmdbId
607,720,118114,
608,721,114103,
640,769,116992,
910,1133,111357,
2282,2851,81454,


In [8]:
#number of movies from which tmdbid is missing (13)
missing_tmdbId = links.loc[links["tmdbId"].isnull(), "movieId"].nunique()
print("{} movies have missing tmdbId".format(missing_tmdbId))

13 movies have missing tmdbId


In [9]:
#Replacing the missing TmdbId with Zero
links.loc[(links.tmdbId.isnull()) , "tmdbId"] = 0

Exploring Movies Dataset

In [10]:
#checking the head of the movies dataset (at movie ID level)
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [11]:
#checking the shape of the movies dataset
movies_shape = movies.shape
print("Shape of movies dataset = {}".format(movies_shape))

Shape of movies dataset = (9125, 3)


In [12]:
#Data is unique at movie Id level
no_movies = movies["movieId"].nunique()
print("No of unique movies in movies dataset = {}".format(no_movies))

No of unique movies in movies dataset = 9125


In [13]:
#checking for mising values 
movies.isnull().any()

movieId    False
title      False
genres     False
dtype: bool

In [14]:
#checking if the genres have not been listed
no_genre = movies.loc[movies.genres == "(no genres listed)"].movieId.count()
print("{} movies dont have genres listed".format(no_genre) )

18 movies dont have genres listed


In [15]:
#creating a new dataset with genres split into different columns
movies2 = pd.DataFrame(movies.genres.str.split('|').tolist(), columns = ["genres1","genres2","genres3","genres4","genres5","genres6","genres7","genres8","genres9","genres10"])

In [16]:
#concatenating the two datasets to create the final movies dataset
movies_new = pd.concat([movies,movies2],axis = 1)
movies_new.head()

Unnamed: 0,movieId,title,genres,genres1,genres2,genres3,genres4,genres5,genres6,genres7,genres8,genres9,genres10
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Adventure,Animation,Children,Comedy,Fantasy,,,,,
1,2,Jumanji (1995),Adventure|Children|Fantasy,Adventure,Children,Fantasy,,,,,,,
2,3,Grumpier Old Men (1995),Comedy|Romance,Comedy,Romance,,,,,,,,
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Comedy,Drama,Romance,,,,,,,
4,5,Father of the Bride Part II (1995),Comedy,Comedy,,,,,,,,,


In [17]:
#creating the year of release dataset from the title column
movies_new["Year_release"] = movies_new['title'].str.extract('(\d\d\d\d)', expand=True)
movies_new.head()

Unnamed: 0,movieId,title,genres,genres1,genres2,genres3,genres4,genres5,genres6,genres7,genres8,genres9,genres10,Year_release
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Adventure,Animation,Children,Comedy,Fantasy,,,,,,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Adventure,Children,Fantasy,,,,,,,,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Comedy,Romance,,,,,,,,,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Comedy,Drama,Romance,,,,,,,,1995
4,5,Father of the Bride Part II (1995),Comedy,Comedy,,,,,,,,,,1995


In [18]:
movies_new.isnull().any()

movieId         False
title           False
genres          False
genres1         False
genres2          True
genres3          True
genres4          True
genres5          True
genres6          True
genres7          True
genres8          True
genres9          True
genres10         True
Year_release     True
dtype: bool

In [19]:
#No of movies with missing year of release 
missing_year = movies_new.loc[movies_new['Year_release'].isnull(),'movieId'].nunique()
print("{} movies have missing year of release".format(missing_year))

4 movies have missing year of release


In [20]:
#Adding the missing Year_release for 4 movies manually
movies_new.loc[9017,"Year_release"] = "2015"
movies_new.loc[9063,"Year_release"] = "2016"
movies_new.loc[9118,"Year_release"] = "2016"
movies_new.loc[9124,"Year_release"] = "2014"

In [21]:
#merging the the links and movies datasets as both are on movie ID level
movies_links = pd.merge(movies_new,links, on = "movieId")
movies_links.head()
movies_links.shape
movies_links_shape = movies_links.shape
print("Shape of movies_links dataset = {}".format(movies_links_shape))

Shape of movies_links dataset = (9125, 16)


In [22]:
movies_links.isnull().any()

movieId         False
title           False
genres          False
genres1         False
genres2          True
genres3          True
genres4          True
genres5          True
genres6          True
genres7          True
genres8          True
genres9          True
genres10         True
Year_release    False
imdbId          False
tmdbId          False
dtype: bool

In [23]:
#storing movie and links data as Sframe
movies_links.to_csv("movies_links.csv")
movie_metadata = graphlab.SFrame.read_csv('movies_links.csv' )

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[int,int,str,str,str,str,str,str,str,str,str,str,str,str,int,int,float]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


Exploring Ratings Data

In [24]:
#checking the head of the ratings data (unique at userID, movieID, timestamp level)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [25]:
#checking the shape of the ratings data
ratings_shape = ratings.shape
print("{} is the shape of ratings data".format(ratings_shape))

(100004, 4) is the shape of ratings data


In [26]:
#No missing entries in ratings data
ratings.isnull().any()

userId       False
movieId      False
rating       False
timestamp    False
dtype: bool

In [27]:
#No of unique users 671
m = ratings.nunique()
uni_user = m[0]
uni_movies = m[1]
uni_ratings = m[2]
print("No of unique users who have rated a movie = {} ".format(uni_user))
print("No of unique movies that have been rated = {} ".format(uni_movies))
print("No of unique values of ratings = {} ".format(uni_ratings))

No of unique users who have rated a movie = 671 
No of unique movies that have been rated = 9066 
No of unique values of ratings = 10 


In [28]:
#unique values of ratings 
k = ratings['rating'].unique()
k = np.sort(k, kind= "quicksort")
print("Values of ratings =  {}".format(k))

Values of ratings =  [ 0.5  1.   1.5  2.   2.5  3.   3.5  4.   4.5  5. ]


In [29]:
#overall distribution of ratings 
ratings.groupby('rating').userId.count()

rating
0.5     1101
1.0     3326
1.5     1687
2.0     7271
2.5     4449
3.0    20064
3.5    10538
4.0    28750
4.5     7723
5.0    15095
Name: userId, dtype: int64

In [30]:
#Every customer has rated a movie only once
ratings.groupby(['userId','movieId']).userId.count().value_counts()

1    100004
Name: userId, dtype: int64

In [31]:
#converting time since epoch to datetime 
ratings["timestamp"] = pd.to_datetime(ratings['timestamp'], unit='ms')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1970-01-15 14:12:39.144
1,1,1029,3.0,1970-01-15 14:12:39.179
2,1,1061,3.0,1970-01-15 14:12:39.182
3,1,1129,2.0,1970-01-15 14:12:39.185
4,1,1172,4.0,1970-01-15 14:12:39.205


Explaoring Tags data

In [32]:
#checking tags data 
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,15,339,sandra 'boring' bullock,1138537770
1,15,1955,dentist,1193435061
2,15,7478,Cambodia,1170560997
3,15,32892,Russian,1170626366
4,15,34162,forgettable,1141391765


In [33]:
#converting time since epoch to datetime 
tags["timestamp"] = pd.to_datetime(tags['timestamp'], unit='ms')
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,15,339,sandra 'boring' bullock,1970-01-14 04:15:37.770
1,15,1955,dentist,1970-01-14 19:30:35.061
2,15,7478,Cambodia,1970-01-14 13:09:20.997
3,15,32892,Russian,1970-01-14 13:10:26.366
4,15,34162,forgettable,1970-01-14 05:03:11.765


In [34]:
#shape of tags data
tags_shape = tags.shape
print("Shape of tags dataset = {}".format(tags_shape))

Shape of tags dataset = (1296, 4)


In [35]:
#No of unique values in tags dataset
m = tags.nunique()
uni_user = m[0]
uni_movies = m[1]
uni_tags = m[2]
print("No of unique users who have tagged a movie = {} ".format(uni_user))
print("No of unique movies that have been tagged = {} ".format(uni_movies))
print("No of unique values of tags = {} ".format(uni_tags))

No of unique users who have tagged a movie = 61 
No of unique movies that have been tagged = 689 
No of unique values of tags = 582 


In [36]:
#checking Null values in the tags dataset
tags.isnull().any()

userId       False
movieId      False
tag          False
timestamp    False
dtype: bool

In [37]:
#looking at data with more than 5 tags for a movie
df = tags.groupby(['userId','movieId']).count()
df.loc[df['tag'] > 5].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,tag,timestamp
userId,movieId,Unnamed: 2_level_1,Unnamed: 3_level_1
138,48780,6,6
138,79132,7,7
138,109487,12,12
212,64957,8,8
212,66097,7,7


In [38]:
#creating combined tag for a particular user movie combination
tags_new =tags.groupby(['userId','movieId'], as_index=False).sum()

In [39]:
tags_new.head()

Unnamed: 0,userId,movieId,tag
0,15,339,sandra 'boring' bullock
1,15,1955,dentist
2,15,7478,Cambodia
3,15,32892,Russian
4,15,34162,forgettable


In [40]:
#New shape of tags dataset
tags_shape = tags_new.shape
print("shape of tags dataset = {}".format(tags_shape))

shape of tags dataset = (772, 3)


In [41]:
#merging the ratings and tags data : information at user level
ratings_tags = pd.merge(ratings,tags_new[['userId','movieId','tag']],how = 'left', left_on = ['userId','movieId'], right_on = ['userId','movieId'])

In [42]:
#shape of the new ratings _tags dataset
ratings_tags_shape = ratings_tags.shape
print("shape of ratings_tags dataset = {}".format(ratings_tags_shape))

shape of ratings_tags dataset = (100004, 5)


In [43]:
#checking data for a particular entry
ratings_tags.loc[(ratings_tags.userId == 15) & (ratings_tags.movieId == 339)]

Unnamed: 0,userId,movieId,rating,timestamp,tag
1048,15,339,2.5,1970-01-13 23:49:36.622,sandra 'boring' bullock


In [44]:
#storing ratings and tags data as Sframe
ratings_tags.to_csv("ratings_tags.csv")
user_metadata = graphlab.SFrame.read_csv('ratings_tags.csv' )

------------------------------------------------------
Inferred types from first 100 line(s) of file as 
column_type_hints=[int,int,int,float,str,str]
If parsing fails due to incorrect types, you can correct
the inferred type list above and pass it to read_csv in
the column_type_hints argument
------------------------------------------------------


In [45]:
#Including the movie information to the ratings_tags data
all_data = pd.merge(ratings_tags, movies_links,how = 'left', left_on = ['movieId'], right_on = ['movieId'])

In [46]:
#checking the head of the combined data
all_data.head()

Unnamed: 0,userId,movieId,rating,timestamp,tag,title,genres,genres1,genres2,genres3,genres4,genres5,genres6,genres7,genres8,genres9,genres10,Year_release,imdbId,tmdbId
0,1,31,2.5,1970-01-15 14:12:39.144,,Dangerous Minds (1995),Drama,Drama,,,,,,,,,,1995,112792,9909.0
1,1,1029,3.0,1970-01-15 14:12:39.179,,Dumbo (1941),Animation|Children|Drama|Musical,Animation,Children,Drama,Musical,,,,,,,1941,33563,11360.0
2,1,1061,3.0,1970-01-15 14:12:39.182,,Sleepers (1996),Thriller,Thriller,,,,,,,,,,1996,117665,819.0
3,1,1129,2.0,1970-01-15 14:12:39.185,,Escape from New York (1981),Action|Adventure|Sci-Fi|Thriller,Action,Adventure,Sci-Fi,Thriller,,,,,,,1981,82340,1103.0
4,1,1172,4.0,1970-01-15 14:12:39.205,,Cinema Paradiso (Nuovo cinema Paradiso) (1989),Drama,Drama,,,,,,,,,,1989,95765,11216.0


In [47]:
#shape of the combined data
all_data_shape = all_data.shape
print("shape of combined dataset = {}".format(all_data_shape))

shape of combined dataset = (100004, 20)


In [48]:
#converting the tags column to string type
all_data['tag'] = all_data['tag'].astype(str)

In [49]:
#checking data type of all columns 
all_data.dtypes

userId                   int64
movieId                  int64
rating                 float64
timestamp       datetime64[ns]
tag                     object
title                   object
genres                  object
genres1                 object
genres2                 object
genres3                 object
genres4                 object
genres5                 object
genres6                 object
genres7                 object
genres8                 object
genres9                 object
genres10                object
Year_release            object
imdbId                   int64
tmdbId                 float64
dtype: object

# Model based recommendation with Graphlabs

In [50]:
#splitting the data into test and train 80/20
train_data, test_data  = train_test_split(all_data , test_size=0.2, random_state=42)

In [51]:
#shape of train data
train_shape = train_data.shape
print("shape of train dataset = {}".format(train_shape))

shape of train dataset = (80003, 20)


In [52]:
#No of unique values in train data
m = train_data.nunique()
uni_user = m[0]
uni_movies = m[1]
uni_ratings = m[2]
print("No of unique users who have rated a movie = {} ".format(uni_user))
print("No of unique movies that have been rated = {} ".format(uni_movies))
print("No of unique values of ratings = {} ".format(uni_ratings))

No of unique users who have rated a movie = 671 
No of unique movies that have been rated = 8399 
No of unique values of ratings = 10 


In [53]:
#shape of test dataset
test_shape = test_data.shape
print("shape of train dataset = {}".format(test_shape))

shape of train dataset = (20001, 20)


In [54]:
#No of unique values in test data
m = test_data.nunique()
uni_user = m[0]
uni_movies = m[1]
uni_ratings = m[2]
print("No of unique users who have rated a movie = {} ".format(uni_user))
print("No of unique movies that have been rated = {} ".format(uni_movies))
print("No of unique values of ratings = {} ".format(uni_ratings))

No of unique users who have rated a movie = 671 
No of unique movies that have been rated = 4901 
No of unique values of ratings = 10 


In [55]:
#exporting the test and train dataset to csv 
train_data.to_csv('train_data.csv')
test_data.to_csv('test_data.csv')

In [56]:
#converting the test and train dataset to Sframe
hints = [int,int ,int ,float,str, str, str, str, str, str,str,str,str,str,str,str, str,str,float, int, float]
train = graphlab.SFrame.read_csv('train_data.csv',column_type_hints = hints )
test = graphlab.SFrame.read_csv('test_data.csv',column_type_hints = hints )

Using the follwoing three approaches to solve the regression.
* Adaptive Gradient Stochastic Gradient Descent
* Stochastic Gradient Descent
* Alternating Least Squares

In [57]:
#building a factorization_recommender model using the graphLab package with 4 latent factors
#Using adaptive gradient stochastic descent 
m1 = graphlab.factorization_recommender.create(train, target='rating', user_id='userId', item_id='movieId',solver= "adagrad" , user_data = user_metadata, item_data = movie_metadata, num_factors=4)

In [58]:
#Using stochastic gradient descent 
m2 = graphlab.factorization_recommender.create(train, target='rating', user_id='userId', item_id='movieId', user_data = user_metadata, item_data = movie_metadata ,solver= "sgd" , num_factors=4)


In [59]:
#Using Implicit Alternating Least Squares 
m3 = graphlab.factorization_recommender.create(train, target='rating', user_id='userId', item_id='movieId',solver= "als" ,user_data = user_metadata, item_data = movie_metadata , num_factors=4)


Comparing the performance of the three models on the test data, to finalize the approach

In [60]:
 model_comp = graphlab.compare(test, [m1,m2, m3])

PROGRESS: Evaluate model M0

Precision and recall summary statistics by cutoff
+--------+-------------------+-------------------+
| cutoff |   mean_precision  |    mean_recall    |
+--------+-------------------+-------------------+
|   1    |  0.00298062593145 | 4.54208768326e-05 |
|   2    |  0.00149031296572 | 4.54208768326e-05 |
|   3    | 0.000993541977149 | 4.54208768326e-05 |
|   4    |  0.00111773472429 | 0.000160060335734 |
|   5    | 0.000894187779434 | 0.000160060335734 |
|   6    | 0.000745156482861 | 0.000160060335734 |
|   7    | 0.000638705556738 | 0.000160060335734 |
|   8    | 0.000558867362146 | 0.000160060335734 |
|   9    | 0.000496770988574 | 0.000160060335734 |
|   10   | 0.000596125186289 | 0.000238497860246 |
+--------+-------------------+-------------------+
[10 rows x 3 columns]

PROGRESS: Evaluate model M1

Precision and recall summary statistics by cutoff
+--------+------------------+-------------------+
| cutoff |  mean_precision  |    mean_recall    |
+----

In [61]:
m1.evaluate_rmse(test,target='rating')

{'rmse_by_item': Columns:
 	movieId	int
 	count	int
 	rmse	float
 
 Rows: 4901
 
 Data:
 +---------+-------+----------------+
 | movieId | count |      rmse      |
 +---------+-------+----------------+
 |   7899  |   2   | 0.39652808978  |
 |   3143  |   1   | 0.595284576338 |
 |   5684  |   1   | 1.27760788491  |
 |   2779  |   3   | 0.703986772848 |
 |   3988  |   4   | 1.08629855128  |
 |   2847  |   1   | 0.186511718776 |
 |  64614  |   9   | 0.924421067075 |
 |   2925  |   2   | 0.423509537794 |
 |   2871  |   5   | 1.45262986778  |
 |   3913  |   1   | 0.433394469565 |
 +---------+-------+----------------+
 [4901 rows x 3 columns]
 Note: Only the head of the SFrame is printed.
 You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.,
 'rmse_by_user': Columns:
 	userId	int
 	count	int
 	rmse	float
 
 Rows: 671
 
 Data:
 +--------+-------+----------------+
 | userId | count |      rmse      |
 +--------+-------+----------------+
 |  118   |   34  | 0.87727

In [62]:
m2.evaluate_rmse(test,target='rating')

{'rmse_by_item': Columns:
 	movieId	int
 	count	int
 	rmse	float
 
 Rows: 4901
 
 Data:
 +---------+-------+-----------------+
 | movieId | count |       rmse      |
 +---------+-------+-----------------+
 |   7899  |   2   |  0.522340933412 |
 |   3143  |   1   |  0.205963887482 |
 |   5684  |   1   |  2.06886128989  |
 |   2779  |   3   |  0.645115488303 |
 |   3988  |   4   |   1.022573678   |
 |   2847  |   1   |  0.107805869628 |
 |  64614  |   9   |  0.693141330215 |
 |   2925  |   2   |  0.475602542774 |
 |   2871  |   5   |  0.938756426007 |
 |   3913  |   1   | 0.0705656712917 |
 +---------+-------+-----------------+
 [4901 rows x 3 columns]
 Note: Only the head of the SFrame is printed.
 You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.,
 'rmse_by_user': Columns:
 	userId	int
 	count	int
 	rmse	float
 
 Rows: 671
 
 Data:
 +--------+-------+----------------+
 | userId | count |      rmse      |
 +--------+-------+----------------+
 |  118   |  

In [63]:
m3.evaluate_rmse(test,target='rating')

{'rmse_by_item': Columns:
 	movieId	int
 	count	int
 	rmse	float
 
 Rows: 4901
 
 Data:
 +---------+-------+-----------------+
 | movieId | count |       rmse      |
 +---------+-------+-----------------+
 |   7899  |   2   |  0.377070653049 |
 |   3143  |   1   |  0.458251565566 |
 |   5684  |   1   |  1.54174843443  |
 |   2779  |   3   |  0.732886479385 |
 |   3988  |   4   |  1.12254425497  |
 |   2847  |   1   | 0.0679417549279 |
 |  64614  |   9   |  0.688428340507 |
 |   2925  |   2   |  3.13057083689  |
 |   2871  |   5   |  0.961986389764 |
 |   3913  |   1   |  0.148556619754 |
 +---------+-------+-----------------+
 [4901 rows x 3 columns]
 Note: Only the head of the SFrame is printed.
 You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.,
 'rmse_by_user': Columns:
 	userId	int
 	count	int
 	rmse	float
 
 Rows: 671
 
 Data:
 +--------+-------+----------------+
 | userId | count |      rmse      |
 +--------+-------+----------------+
 |  118   |  

ADA model performance for different value of max iterations

In [64]:
#25 iterations
ada_m1 = graphlab.factorization_recommender.create(train, target='rating', user_id='userId', item_id='movieId',solver= "adagrad",user_data = user_metadata, item_data = movie_metadata  ,max_iterations=25,num_factors=4)

In [65]:
#50 iterations
ada_m2 = graphlab.factorization_recommender.create(train, target='rating', user_id='userId', item_id='movieId',user_data = user_metadata, item_data = movie_metadata,solver= "adagrad" ,max_iterations=50,num_factors=4)

In [67]:
ada_m2['coefficients']

{'intercept': 3.5417484344337007, 'movieId': Columns:
 	movieId	int
 	linear_terms	float
 	factors	array
 
 Rows: 9125
 
 Data:
 +---------+-------------------+-------------------------------+
 | movieId |    linear_terms   |            factors            |
 +---------+-------------------+-------------------------------+
 |    1    |  0.0520994104445  | [0.328759938478, 0.2199734... |
 |    2    |   0.296872794628  | [0.255938112736, 0.1548187... |
 |    3    | -0.00372883328237 | [-0.147408932447, 0.411493... |
 |    4    |  -0.286986917257  | [0.138155683875, -0.260167... |
 |    5    |  0.0836127474904  | [0.244942814112, 0.2018130... |
 |    6    |   0.188226297498  | [0.126912251115, -0.314835... |
 |    7    |  -0.017520731315  | [-0.309333771467, -0.26947... |
 |    8    |  -0.0693372413516 | [-0.255168944597, 0.071023... |
 |    9    |  -0.0361034013331 | [0.357314109802, 0.1417492... |
 |    10   |  0.0535211935639  | [-0.202059090137, 0.302821... |
 +---------+---------------

Creating a visualization of the Model to helps explore and qualitatively evaluate the recommendations made by the model

In [68]:
view = ada_m2.views.explore(train)
view.show()

View object

URI: 		http://localhost:32213/view/bf5e9867-3ae3-4a8a-8989-1ed1024912c9
HTML: 		
<gl-recommender-explore
    uri="http://localhost:32213/view/7d16d45c-1dd6-4969-b105-b8b4158d4792"
    api_key=""
/>
        

In [69]:
 ada_model_iter = graphlab.compare(test, [ada_m1, ada_m2])

PROGRESS: Evaluate model M0

Precision and recall summary statistics by cutoff
+--------+-------------------+-------------------+
| cutoff |   mean_precision  |    mean_recall    |
+--------+-------------------+-------------------+
|   1    |        0.0        |        0.0        |
|   2    | 0.000745156482861 | 8.27951647624e-05 |
|   3    |  0.00149031296572 | 0.000311061068202 |
|   4    |  0.00111773472429 | 0.000311061068202 |
|   5    | 0.000894187779434 | 0.000311061068202 |
|   6    | 0.000993541977149 | 0.000476651397726 |
|   7    |  0.00106450926123 | 0.000497641721187 |
|   8    |  0.00111773472429 | 0.000503555661527 |
|   9    | 0.000993541977149 | 0.000503555661527 |
|   10   |  0.00104321907601 | 0.000506693162508 |
+--------+-------------------+-------------------+
[10 rows x 3 columns]

PROGRESS: Evaluate model M1

Precision and recall summary statistics by cutoff
+--------+------------------+-------------------+
| cutoff |  mean_precision  |    mean_recall    |
+----

In [70]:
ada_m1.evaluate_rmse(test,target='rating')

{'rmse_by_item': Columns:
 	movieId	int
 	count	int
 	rmse	float
 
 Rows: 4901
 
 Data:
 +---------+-------+----------------+
 | movieId | count |      rmse      |
 +---------+-------+----------------+
 |   7899  |   2   | 0.892427993991 |
 |   3143  |   1   | 0.571538181501 |
 |   5684  |   1   |  1.4543405819  |
 |   2779  |   3   | 0.996127955976 |
 |   3988  |   4   | 0.963188769515 |
 |   2847  |   1   | 0.143713430264 |
 |  64614  |   9   | 0.855278967112 |
 |   2925  |   2   | 0.769083057627 |
 |   2871  |   5   | 0.895937446247 |
 |   3913  |   1   | 0.803816567404 |
 +---------+-------+----------------+
 [4901 rows x 3 columns]
 Note: Only the head of the SFrame is printed.
 You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.,
 'rmse_by_user': Columns:
 	userId	int
 	count	int
 	rmse	float
 
 Rows: 671
 
 Data:
 +--------+-------+----------------+
 | userId | count |      rmse      |
 +--------+-------+----------------+
 |  118   |   34  | 0.82202

In [71]:
ada_m2.evaluate_rmse(test,target='rating')

{'rmse_by_item': Columns:
 	movieId	int
 	count	int
 	rmse	float
 
 Rows: 4901
 
 Data:
 +---------+-------+----------------+
 | movieId | count |      rmse      |
 +---------+-------+----------------+
 |   7899  |   2   | 0.751248769983 |
 |   3143  |   1   | 0.978389637078 |
 |   5684  |   1   | 1.32678363876  |
 |   2779  |   3   |  1.0372591451  |
 |   3988  |   4   | 1.22145814886  |
 |   2847  |   1   | 0.458662887778 |
 |  64614  |   9   | 0.645958569121 |
 |   2925  |   2   | 0.966421851509 |
 |   2871  |   5   | 1.15026122377  |
 |   3913  |   1   | 0.593691277506 |
 +---------+-------+----------------+
 [4901 rows x 3 columns]
 Note: Only the head of the SFrame is printed.
 You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.,
 'rmse_by_user': Columns:
 	userId	int
 	count	int
 	rmse	float
 
 Rows: 671
 
 Data:
 +--------+-------+----------------+
 | userId | count |      rmse      |
 +--------+-------+----------------+
 |  118   |   34  | 0.78960

In [72]:
#use ada_m2 to predict ratings 
ada_m2.recommend(users=range(1,11),k=1000)

userId,movieId,score,rank
1,163949,4.6517430348,1
1,112183,4.47938575932,2
1,131724,4.47584725278,3
1,108981,4.46932068392,4
1,127152,4.46102318022,5
1,141668,4.41671554944,6
1,160718,4.41572714886,7
1,139757,4.37400980124,8
1,152081,4.34241155764,9
1,151307,4.33557830096,10


Model performance for different values of L2 regularization

In [73]:
ada_R1 = graphlab.factorization_recommender.create(train, target='rating', user_id='userId', item_id='movieId',user_data = user_metadata, item_data = movie_metadata,solver= "adagrad" ,regularization=1e-10,num_factors=4)


In [74]:
ada_R2 = graphlab.factorization_recommender.create(train, target='rating', user_id='userId', item_id='movieId',user_data = user_metadata, item_data = movie_metadata,solver= "adagrad" ,regularization=1e-8,num_factors=4)


In [75]:
ada_R3 = graphlab.factorization_recommender.create(train, target='rating', user_id='userId', item_id='movieId',user_data = user_metadata, item_data = movie_metadata,solver= "adagrad" ,regularization=1e-6,num_factors=4)


In [76]:
 ada_model_reg = graphlab.compare(test, [ada_R1, ada_R2 ,ada_R3])

PROGRESS: Evaluate model M0

Precision and recall summary statistics by cutoff
+--------+------------------+-------------------+
| cutoff |  mean_precision  |    mean_recall    |
+--------+------------------+-------------------+
|   1    | 0.00596125186289 | 0.000182871235997 |
|   2    | 0.00372578241431 | 0.000206526997358 |
|   3    | 0.00248385494287 | 0.000206526997358 |
|   4    | 0.00186289120715 | 0.000206526997358 |
|   5    | 0.00149031296572 | 0.000206526997358 |
|   6    | 0.00149031296572 | 0.000210797235082 |
|   7    | 0.00149031296572 | 0.000217827013222 |
|   8    | 0.00130402384501 | 0.000217827013222 |
|   9    | 0.00149031296572 | 0.000425714872571 |
|   10   | 0.0016393442623  | 0.000444030166594 |
+--------+------------------+-------------------+
[10 rows x 3 columns]

PROGRESS: Evaluate model M1

Precision and recall summary statistics by cutoff
+--------+-------------------+-------------------+
| cutoff |   mean_precision  |    mean_recall    |
+--------+-------

In [77]:
ada_R1.evaluate_rmse(test,target='rating')

{'rmse_by_item': Columns:
 	movieId	int
 	count	int
 	rmse	float
 
 Rows: 4901
 
 Data:
 +---------+-------+----------------+
 | movieId | count |      rmse      |
 +---------+-------+----------------+
 |   7899  |   2   | 0.258587158942 |
 |   3143  |   1   | 0.747133156962 |
 |   5684  |   1   | 1.44507143119  |
 |   2779  |   3   | 0.926701275881 |
 |   3988  |   4   | 0.93650724547  |
 |   2847  |   1   | 0.221362879229 |
 |  64614  |   9   | 0.759506007397 |
 |   2925  |   2   | 1.33987519159  |
 |   2871  |   5   | 1.39284310725  |
 |   3913  |   1   | 0.36085953014  |
 +---------+-------+----------------+
 [4901 rows x 3 columns]
 Note: Only the head of the SFrame is printed.
 You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.,
 'rmse_by_user': Columns:
 	userId	int
 	count	int
 	rmse	float
 
 Rows: 671
 
 Data:
 +--------+-------+----------------+
 | userId | count |      rmse      |
 +--------+-------+----------------+
 |  118   |   34  | 0.91668

In [78]:
ada_R2.evaluate_rmse(test,target='rating')

{'rmse_by_item': Columns:
 	movieId	int
 	count	int
 	rmse	float
 
 Rows: 4901
 
 Data:
 +---------+-------+----------------+
 | movieId | count |      rmse      |
 +---------+-------+----------------+
 |   7899  |   2   | 0.490269126276 |
 |   3143  |   1   | 0.226173744892 |
 |   5684  |   1   | 1.57510679373  |
 |   2779  |   3   | 0.835510381516 |
 |   3988  |   4   | 0.819734445408 |
 |   2847  |   1   | 0.110462617344 |
 |  64614  |   9   | 0.796728211145 |
 |   2925  |   2   | 0.927408188074 |
 |   2871  |   5   | 0.898754714723 |
 |   3913  |   1   | 0.235724501857 |
 +---------+-------+----------------+
 [4901 rows x 3 columns]
 Note: Only the head of the SFrame is printed.
 You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.,
 'rmse_by_user': Columns:
 	userId	int
 	count	int
 	rmse	float
 
 Rows: 671
 
 Data:
 +--------+-------+----------------+
 | userId | count |      rmse      |
 +--------+-------+----------------+
 |  118   |   34  | 0.78009

In [79]:
ada_R3.evaluate_rmse(test,target='rating')

{'rmse_by_item': Columns:
 	movieId	int
 	count	int
 	rmse	float
 
 Rows: 4901
 
 Data:
 +---------+-------+----------------+
 | movieId | count |      rmse      |
 +---------+-------+----------------+
 |   7899  |   2   | 0.591126191995 |
 |   3143  |   1   | 0.540916633661 |
 |   5684  |   1   | 1.39918696549  |
 |   2779  |   3   | 0.371713502663 |
 |   3988  |   4   | 1.02225046852  |
 |   2847  |   1   | 0.127944276775 |
 |  64614  |   9   | 0.675814668016 |
 |   2925  |   2   | 0.319005747806 |
 |   2871  |   5   | 0.803685322664 |
 |   3913  |   1   | 0.535754216071 |
 +---------+-------+----------------+
 [4901 rows x 3 columns]
 Note: Only the head of the SFrame is printed.
 You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.,
 'rmse_by_user': Columns:
 	userId	int
 	count	int
 	rmse	float
 
 Rows: 671
 
 Data:
 +--------+-------+----------------+
 | userId | count |      rmse      |
 +--------+-------+----------------+
 |  118   |   34  | 0.81687

In [81]:
ada_NF1 = graphlab.factorization_recommender.create(train, target='rating', user_id='userId', item_id='movieId',user_data = user_metadata, item_data = movie_metadata,solver= "adagrad" ,regularization=1e-6,num_factors=2)


In [80]:
ada_NF2 = graphlab.factorization_recommender.create(train, target='rating', user_id='userId', item_id='movieId',user_data = user_metadata, item_data = movie_metadata,solver= "adagrad" ,regularization=1e-6,num_factors=4)


In [82]:
ada_NF3 = graphlab.factorization_recommender.create(train, target='rating', user_id='userId', item_id='movieId',user_data = user_metadata, item_data = movie_metadata,solver= "adagrad" ,regularization=1e-6,num_factors=6)
