In [44]:
#Importing required packages
import pandas as pd
import graphlab
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
import numpy as np

In [2]:
#Importing datasets all the datasets from csv file
links = pd.read_csv("links.csv")
movies = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")
tags = pd.read_csv("tags.csv")

In [3]:
#checking the head of the links dataset (at movie ID level)
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [4]:
#checking the shape of the links dataset
links_shape = links.shape
print("Shape of links dataset = {}".format(links_shape))

Shape of links dataset = (9125, 3)


In [5]:
#number of unique movie Ids in the links dataset
uni_movieIds_links = links["movieId"].nunique()
print("No of unique movieIds in links dataset = {}".format(uni_movieIds_links))

No of unique movieIds in links dataset = 9125


In [7]:
#checking if a column has null values 
links.isnull().any()

movieId    False
imdbId     False
tmdbId      True
dtype: bool

In [8]:
#looking at rows with missing tmdbid
links[links["tmdbId"].isnull()].head()

Unnamed: 0,movieId,imdbId,tmdbId
607,720,118114,
608,721,114103,
640,769,116992,
910,1133,111357,
2282,2851,81454,


In [9]:
#number of movies from which tmdbid is missing (13)
missing_tmdbId = links.loc[links["tmdbId"].isnull(), "movieId"].nunique()
print("{} movies have missing tmdbId".format(missing_tmdbId))

13 movies have missing tmdbId


In [10]:
#checking the head of the movies dataset (at movie ID level)
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [11]:
#checking the shape of the movies dataset
movies_shape = movies.shape
print("Shape of movies dataset = {}".format(movies_shape))

Shape of movies dataset = (9125, 3)


In [12]:
#Data is unique at movie Id level
no_movies = movies["movieId"].nunique()
print("No of unique movies in movies dataset = {}".format(no_movies))

No of unique movies in movies dataset = 9125


In [13]:
#checking for mising values (None)
movies.isnull().any()

movieId    False
title      False
genres     False
dtype: bool

In [14]:
#creating a new dataset with genres split into different columns
movies2 = pd.DataFrame(movies.genres.str.split('|').tolist(), columns = ["genres1","genres2","genres3","genres4","genres5","genres6","genres7","genres8","genres9","genres10"])

In [15]:
#concatenating the two datasets to create the final movies dataset
movies_new = pd.concat([movies,movies2],axis = 1)
movies_new.head()

Unnamed: 0,movieId,title,genres,genres1,genres2,genres3,genres4,genres5,genres6,genres7,genres8,genres9,genres10
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Adventure,Animation,Children,Comedy,Fantasy,,,,,
1,2,Jumanji (1995),Adventure|Children|Fantasy,Adventure,Children,Fantasy,,,,,,,
2,3,Grumpier Old Men (1995),Comedy|Romance,Comedy,Romance,,,,,,,,
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Comedy,Drama,Romance,,,,,,,
4,5,Father of the Bride Part II (1995),Comedy,Comedy,,,,,,,,,


In [16]:
#creating the year of release dataset from the title column
movies_new["Year_release"] = movies_new['title'].str.extract('(\d\d\d\d)', expand=True)
movies_new.head()

Unnamed: 0,movieId,title,genres,genres1,genres2,genres3,genres4,genres5,genres6,genres7,genres8,genres9,genres10,Year_release
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Adventure,Animation,Children,Comedy,Fantasy,,,,,,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Adventure,Children,Fantasy,,,,,,,,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Comedy,Romance,,,,,,,,,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Comedy,Drama,Romance,,,,,,,,1995
4,5,Father of the Bride Part II (1995),Comedy,Comedy,,,,,,,,,,1995


In [17]:
#merging the the links and movies datasets as both are on movie ID level
movies_links = pd.merge(movies_new,links, on = "movieId")
movies_links.head()
movies_links.shape
movies_links_shape = movies_links.shape
print("Shape of movies_links dataset = {}".format(movies_links_shape))

Shape of movies_links dataset = (9125, 16)


In [19]:
movies_links.isnull().any()

movieId         False
title           False
genres          False
genres1         False
genres2          True
genres3          True
genres4          True
genres5          True
genres6          True
genres7          True
genres8          True
genres9          True
genres10         True
Year_release     True
imdbId          False
tmdbId           True
dtype: bool

In [24]:
#No of movies with missing year of release 
missing_year = movies_new.loc[movies_new['Year_release'].isnull(),'movieId'].nunique()
print("{} movies have missing year of release".format(missing_year))

4 movies have missing year of release


In [19]:
#checking the head of the ratings data (unique at userID, movieID, timestamp level)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [49]:
#checking the shape of the ratings data
ratings_shape = ratings.shape
print("{} is the shape of ratings data".format(ratings_shape))

(100004, 4) is the shape of ratings data


In [50]:
#No missing entries in ratings data
ratings.isnull().any()

userId       False
movieId      False
rating       False
timestamp    False
dtype: bool

In [35]:
#No of unique users 671
m = ratings.nunique()
uni_user = m[0]
uni_movies = m[1]
uni_ratings = m[2]
print("No of unique users who have rated a movie = {} ".format(uni_user))
print("No of unique movies that have been rated = {} ".format(uni_movies))
print("No of unique values of ratings = {} ".format(uni_ratings))

No of unique users who have rated a movie = 671 
No of unique movies that have been rated = 9066 
No of unique values of ratings = 10 


In [47]:
#unique values of ratings 
k = ratings['rating'].unique()
k = np.sort(k, kind= "quicksort")
print("Values of ratings =  {}".format(k))

Values of ratings =  [ 0.5  1.   1.5  2.   2.5  3.   3.5  4.   4.5  5. ]


In [48]:
#overall distribution of ratings 
ratings.groupby('rating').userId.count()

rating
0.5     1101
1.0     3326
1.5     1687
2.0     7271
2.5     4449
3.0    20064
3.5    10538
4.0    28750
4.5     7723
5.0    15095
Name: userId, dtype: int64

In [25]:
#Every customer has rated a movie only once
ratings.groupby(['userId','movieId']).userId.count().value_counts()

1    100004
Name: userId, dtype: int64

In [26]:
#converting time since epoch to datetime 
ratings["timestamp"] = pd.to_datetime(ratings['timestamp'], unit='ms')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1970-01-15 14:12:39.144
1,1,1029,3.0,1970-01-15 14:12:39.179
2,1,1061,3.0,1970-01-15 14:12:39.182
3,1,1129,2.0,1970-01-15 14:12:39.185
4,1,1172,4.0,1970-01-15 14:12:39.205


In [55]:
#checking tags data 
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,15,339,sandra 'boring' bullock,1138537770
1,15,1955,dentist,1193435061
2,15,7478,Cambodia,1170560997
3,15,32892,Russian,1170626366
4,15,34162,forgettable,1141391765


In [57]:
#converting time since epoch to datetime 
tags["timestamp"] = pd.to_datetime(tags['timestamp'], unit='ms')
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,15,339,sandra 'boring' bullock,1970-01-14 04:15:37.770
1,15,1955,dentist,1970-01-14 19:30:35.061
2,15,7478,Cambodia,1970-01-14 13:09:20.997
3,15,32892,Russian,1970-01-14 13:10:26.366
4,15,34162,forgettable,1970-01-14 05:03:11.765


In [58]:
#shape of tags data
tags_shape = tags.shape
print("Shape of tags dataset = {}".format(tags_shape))

Shape of tags dataset = (1296, 4)


In [60]:
#No of unique values in tags dataset
m = tags.nunique()
uni_user = m[0]
uni_movies = m[1]
uni_tags = m[2]
print("No of unique users who have tagged a movie = {} ".format(uni_user))
print("No of unique movies that have been tagged = {} ".format(uni_movies))
print("No of unique values of tags = {} ".format(uni_tags))

No of unique users who have tagged a movie = 61 
No of unique movies that have been tagged = 689 
No of unique values of tags = 582 


In [61]:
#checking Null values in the tags dataset
tags.isnull().any()

userId       False
movieId      False
tag          False
timestamp    False
dtype: bool

In [73]:
#looking at data with more than 5 tags for a movie
df = tags.groupby(['userId','movieId']).count()
df.loc[df['tag'] > 5].head()

Unnamed: 0_level_0,Unnamed: 1_level_0,tag,timestamp
userId,movieId,Unnamed: 2_level_1,Unnamed: 3_level_1
138,48780,6,6
138,79132,7,7
138,109487,12,12
212,64957,8,8
212,66097,7,7


In [64]:
#creating combined tag for a particular user movie combination
tags_new =tags.groupby(['userId','movieId'], as_index=False).sum()

In [65]:
tags_new.head()

Unnamed: 0,userId,movieId,tag
0,15,339,sandra 'boring' bullock
1,15,1955,dentist
2,15,7478,Cambodia
3,15,32892,Russian
4,15,34162,forgettable


In [66]:
#New shape of tags dataset
tags_shape = tags_new.shape
print("shape of tags dataset = {}".format(tags_shape))

shape of tags dataset = (772, 3)


In [68]:
#merning the ratings and tags data : information at user level
ratings_tags = pd.merge(ratings,tags_new[['userId','movieId','tag']],how = 'left', left_on = ['userId','movieId'], right_on = ['userId','movieId'])

In [69]:
#shape of the new ratings _tags dataset
ratings_tags_shape = ratings_tags.shape
print("shape of ratings_tags dataset = {}".format(ratings_tags_shape))

shape of ratings_tags dataset = (100004, 5)


In [70]:
#checking data for a particular entry
ratings_tags.loc[(ratings_tags.userId == 15) & (ratings_tags.movieId == 339)]

Unnamed: 0,userId,movieId,rating,timestamp,tag
1048,15,339,2.5,1122576622,sandra 'boring' bullock


In [71]:
#Including the movie information to the ratings_tags data
all_data = pd.merge(ratings_tags, movies_links,how = 'left', left_on = ['movieId'], right_on = ['movieId'])

In [74]:
#checking the head of the combined data
all_data.head()

Unnamed: 0,userId,movieId,rating,timestamp,tag,title,genres,genres1,genres2,genres3,genres4,genres5,genres6,genres7,genres8,genres9,genres10,Year_release,imdbId,tmdbId
0,1,31,2.5,1260759144,,Dangerous Minds (1995),Drama,Drama,,,,,,,,,,1995,112792,9909.0
1,1,1029,3.0,1260759179,,Dumbo (1941),Animation|Children|Drama|Musical,Animation,Children,Drama,Musical,,,,,,,1941,33563,11360.0
2,1,1061,3.0,1260759182,,Sleepers (1996),Thriller,Thriller,,,,,,,,,,1996,117665,819.0
3,1,1129,2.0,1260759185,,Escape from New York (1981),Action|Adventure|Sci-Fi|Thriller,Action,Adventure,Sci-Fi,Thriller,,,,,,,1981,82340,1103.0
4,1,1172,4.0,1260759205,,Cinema Paradiso (Nuovo cinema Paradiso) (1989),Drama,Drama,,,,,,,,,,1989,95765,11216.0


In [75]:
#shape of the combined data
all_data_shape = all_data.shape
print("shape of combined dataset = {}".format(all_data_shape))

shape of combined dataset = (100004, 20)


In [76]:
#converting the tags column to string type
all_data['tag'] = all_data['tag'].astype(str)

In [77]:
#checking data type of all columns 
all_data.dtypes

userId            int64
movieId           int64
rating          float64
timestamp         int64
tag              object
title            object
genres           object
genres1          object
genres2          object
genres3          object
genres4          object
genres5          object
genres6          object
genres7          object
genres8          object
genres9          object
genres10         object
Year_release     object
imdbId            int64
tmdbId          float64
dtype: object

In [79]:
#replacing missing values of year release and tmdbID to 0
all_data.loc[(all_data.Year_release.isnull()) , "Year_release"] = 0
all_data.loc[(all_data.tmdbId.isnull()) , "tmdbId"] = 0

In [49]:
#checking data where tmdbId is 0
all_data.loc[all_data.tmdbId ==0]

Unnamed: 0,userId,movieId,rating,timestamp,tag,title,genres,genres1,genres2,genres3,genres4,genres5,genres6,genres7,genres8,genres9,genres10,Year_release,imdbId,tmdbId
95,2,720,4.0,1970-01-10 16:02:35.978,,Wallace & Gromit: The Best of Aardman Animatio...,Adventure|Animation|Comedy,Adventure,Animation,Comedy,,,,,,,,1996,118114,0.0
537,7,720,5.0,1970-01-10 20:37:48.019,,Wallace & Gromit: The Best of Aardman Animatio...,Adventure|Animation|Comedy,Adventure,Animation,Comedy,,,,,,,,1996,118114,0.0
1118,15,720,3.0,1970-01-12 23:02:25.122,,Wallace & Gromit: The Best of Aardman Animatio...,Adventure|Animation|Comedy,Adventure,Animation,Comedy,,,,,,,,1996,118114,0.0
2130,15,26587,5.0,1970-01-17 09:22:00.087,,"Decalogue, The (Dekalog) (1989)",Crime|Drama|Romance,Crime,Drama,Romance,,,,,,,,1989,92337,0.0
3559,20,720,5.0,1970-01-15 04:00:43.166,,Wallace & Gromit: The Best of Aardman Animatio...,Adventure|Animation|Comedy,Adventure,Animation,Comedy,,,,,,,,1996,118114,0.0
4806,26,720,3.0,1970-01-16 15:25:44.331,,Wallace & Gromit: The Best of Aardman Animatio...,Adventure|Animation|Comedy,Adventure,Animation,Comedy,,,,,,,,1996,118114,0.0
5600,30,2851,2.0,1970-01-11 22:34:55.588,,Saturn 3 (1980),Adventure|Sci-Fi|Thriller,Adventure,Sci-Fi,Thriller,,,,,,,,1980,81454,0.0
6617,36,720,4.0,1970-01-10 19:17:37.827,,Wallace & Gromit: The Best of Aardman Animatio...,Adventure|Animation|Comedy,Adventure,Animation,Comedy,,,,,,,,1996,118114,0.0
6733,38,26587,4.5,1970-01-17 02:02:02.725,,"Decalogue, The (Dekalog) (1989)",Crime|Drama|Romance,Crime,Drama,Romance,,,,,,,,1989,92337,0.0
9085,59,720,4.5,1970-01-14 05:59:16.414,,Wallace & Gromit: The Best of Aardman Animatio...,Adventure|Animation|Comedy,Adventure,Animation,Comedy,,,,,,,,1996,118114,0.0


In [81]:
#splitting the data into test and train 80/20
train_data, test_data  = train_test_split(all_data , test_size=0.2, random_state=42)

In [88]:
#shape of train data
train_shape = train_data.shape
print("shape of train dataset = {}".format(train_shape))

shape of train dataset = (80003, 20)


In [92]:
#No of unique values in train data
m = train_data.nunique()
uni_user = m[0]
uni_movies = m[1]
uni_ratings = m[2]
print("No of unique users who have rated a movie = {} ".format(uni_user))
print("No of unique movies that have been rated = {} ".format(uni_movies))
print("No of unique values of ratings = {} ".format(uni_ratings))

No of unique users who have rated a movie = 671 
No of unique movies that have been rated = 8399 
No of unique values of ratings = 10 


In [93]:
#shape of test dataset
test_shape = test_data.shape
print("shape of train dataset = {}".format(test_shape))

shape of train dataset = (20001, 20)


In [94]:
#No of unique values in test data
m = test_data.nunique()
uni_user = m[0]
uni_movies = m[1]
uni_ratings = m[2]
print("No of unique users who have rated a movie = {} ".format(uni_user))
print("No of unique movies that have been rated = {} ".format(uni_movies))
print("No of unique values of ratings = {} ".format(uni_ratings))

No of unique users who have rated a movie = 671 
No of unique movies that have been rated = 4901 
No of unique values of ratings = 10 


In [95]:
#exporting the test and train dataset to csv 
train_data.to_csv('train_data.csv')
test_data.to_csv('test_data.csv')

In [96]:
#converting the test and train dataset to Sframe
hints = [int,int ,int ,float,str, str, str, str, str, str,str,str,str,str,str,str, str,str,float, int, float]
train = graphlab.SFrame.read_csv('train_data.csv',column_type_hints = hints )
test = graphlab.SFrame.read_csv('test_data.csv',column_type_hints = hints )

In [97]:
#building a factorization_recommender model using the graphLab package with 4 latent factors
m = graphlab.factorization_recommender.create(train, target='rating', user_id='userId', item_id='movieId', num_factors=4)

In [57]:
 model_comp = graphlab.compare(test, [m])

PROGRESS: Evaluate model M0

Precision and recall summary statistics by cutoff
+--------+-----------------+------------------+
| cutoff |  mean_precision |   mean_recall    |
+--------+-----------------+------------------+
|   1    | 0.0655737704918 | 0.00332210249596 |
|   2    | 0.0365126676602 | 0.00354726333367 |
|   3    |  0.030303030303 | 0.00397718737926 |
|   4    | 0.0249627421759 | 0.00409180098365 |
|   5    | 0.0223546944858 | 0.00455557016286 |
|   6    | 0.0193740685544 | 0.00467890371489 |
|   7    | 0.0170321481797 | 0.00471541813175 |
|   8    | 0.0154619970194 | 0.0048317690086  |
|   9    | 0.0137439973506 | 0.0048317690086  |
|   10   | 0.0131147540984 | 0.00545391815788 |
+--------+-----------------+------------------+
[10 rows x 3 columns]

Model compare metric: precision_recall
