In [48]:
import pandas as pd
import graphlab
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt

In [2]:
#importing datasets all the datasets from csv file
links = pd.read_csv("links.csv")
movies = pd.read_csv("movies.csv")
ratings = pd.read_csv("ratings.csv")
tags = pd.read_csv("tags.csv")

In [3]:
#checking the head of the links dataset (at movie ID level)
links.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [4]:
links.describe()

Unnamed: 0,movieId,imdbId,tmdbId
count,9125.0,9125.0,9112.0
mean,31123.291836,479824.4,39104.545544
std,40782.633604,743177.4,62814.519801
min,1.0,417.0,2.0
25%,2850.0,88846.0,9451.75
50%,6290.0,119778.0,15852.0
75%,56274.0,428441.0,39160.5
max,164979.0,5794766.0,416437.0


In [5]:
#checking the shape of the links dataset
links_shape = links.shape
print("Shape of links dataset = {}".format(links_shape))

Shape of links dataset = (9125, 3)


In [6]:
links["movieId"].nunique()

9125

In [7]:
links.isnull().any()

movieId    False
imdbId     False
tmdbId      True
dtype: bool

In [8]:
links[links["tmdbId"].isnull()].head()

Unnamed: 0,movieId,imdbId,tmdbId
607,720,118114,
608,721,114103,
640,769,116992,
910,1133,111357,
2282,2851,81454,


In [9]:
#number of movies from which tmdbid is missing (13)
links.loc[links["tmdbId"].isnull(), "movieId"].nunique()

13

In [10]:
#checking the head of the movies dataset (at movie ID level)
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [11]:
#checking the shape of the movies dataset
movies_shape = movies.shape
print("Shape of movies dataset = {}".format(movies_shape))

Shape of movies dataset = (9125, 3)


In [12]:
#Data is unique at movie Id level
movies["movieId"].nunique()

9125

In [13]:
#checking for mising values (None)
movies.isnull().any()

movieId    False
title      False
genres     False
dtype: bool

In [14]:
#creating a new dataset with genres split into different columns
movies2 = pd.DataFrame(movies.genres.str.split('|').tolist(), columns = ["genres1","genres2","genres3","genres4","genres5","genres6","genres7","genres8","genres9","genres10"])

In [15]:
#concatenating the two datasets to create the final movies dataset
movies_new = pd.concat([movies,movies2],axis = 1)
movies_new.head()

Unnamed: 0,movieId,title,genres,genres1,genres2,genres3,genres4,genres5,genres6,genres7,genres8,genres9,genres10
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Adventure,Animation,Children,Comedy,Fantasy,,,,,
1,2,Jumanji (1995),Adventure|Children|Fantasy,Adventure,Children,Fantasy,,,,,,,
2,3,Grumpier Old Men (1995),Comedy|Romance,Comedy,Romance,,,,,,,,
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Comedy,Drama,Romance,,,,,,,
4,5,Father of the Bride Part II (1995),Comedy,Comedy,,,,,,,,,


In [16]:
#creating the year of release dataset from the title column
movies_new["Year_release"] = movies_new['title'].str.extract('(\d\d\d\d)', expand=True)
movies_new.head()

Unnamed: 0,movieId,title,genres,genres1,genres2,genres3,genres4,genres5,genres6,genres7,genres8,genres9,genres10,Year_release
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,Adventure,Animation,Children,Comedy,Fantasy,,,,,,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,Adventure,Children,Fantasy,,,,,,,,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,Comedy,Romance,,,,,,,,,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,Comedy,Drama,Romance,,,,,,,,1995
4,5,Father of the Bride Part II (1995),Comedy,Comedy,,,,,,,,,,1995


In [17]:
#merging the the links and movies datasets as both are on movie ID level
movies_links = pd.merge(movies_new,links, on = "movieId")
movies_links.head()
movies_links.shape
movies_links_shape = movies_links.shape
print("Shape of movies_links dataset = {}".format(movies_links_shape))

Shape of movies_links dataset = (9125, 16)


In [18]:
movies_links.isnull().any()

movieId         False
title           False
genres          False
genres1         False
genres2          True
genres3          True
genres4          True
genres5          True
genres6          True
genres7          True
genres8          True
genres9          True
genres10         True
Year_release     True
imdbId          False
tmdbId           True
dtype: bool

In [19]:
#checking the head of the ratings data (unique at userID, movieID, timestamp level)
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [20]:
#checking the shape of the ratings data
ratings.shape

(100004, 4)

In [21]:
#No of unique users 671
ratings.nunique()

userId         671
movieId       9066
rating          10
timestamp    78141
dtype: int64

In [22]:
ratings['rating'].unique()

array([ 2.5,  3. ,  2. ,  4. ,  3.5,  1. ,  5. ,  4.5,  1.5,  0.5])

In [23]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [24]:
ratings.groupby('rating').userId.count()

rating
0.5     1101
1.0     3326
1.5     1687
2.0     7271
2.5     4449
3.0    20064
3.5    10538
4.0    28750
4.5     7723
5.0    15095
Name: userId, dtype: int64

In [25]:
#Every customer has rated a movie only once
ratings.groupby(['userId','movieId']).userId.count().value_counts()

1    100004
Name: userId, dtype: int64

In [26]:
ratings["timestamp"] = pd.to_datetime(ratings['timestamp'], unit='ms')
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1970-01-15 14:12:39.144
1,1,1029,3.0,1970-01-15 14:12:39.179
2,1,1061,3.0,1970-01-15 14:12:39.182
3,1,1129,2.0,1970-01-15 14:12:39.185
4,1,1172,4.0,1970-01-15 14:12:39.205


In [27]:
ratings_shape = ratings.shape
print("Shape of ratings dataset = {}".format(ratings_shape))

Shape of ratings dataset = (100004, 4)


In [28]:
#No missing entries in ratings data
ratings.isnull().any()

userId       False
movieId      False
rating       False
timestamp    False
dtype: bool

In [29]:
tags["timestamp"] = pd.to_datetime(tags['timestamp'], unit='ms')
tags.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,15,339,sandra 'boring' bullock,1970-01-14 04:15:37.770
1,15,1955,dentist,1970-01-14 19:30:35.061
2,15,7478,Cambodia,1970-01-14 13:09:20.997
3,15,32892,Russian,1970-01-14 13:10:26.366
4,15,34162,forgettable,1970-01-14 05:03:11.765


In [30]:
tags_shape = tags.shape
print("Shape of tags dataset = {}".format(tags_shape))

Shape of tags dataset = (1296, 4)


In [31]:
#No of unique values
tags.nunique()

userId         61
movieId       689
tag           582
timestamp    1245
dtype: int64

In [32]:
tags.isnull().any()

userId       False
movieId      False
tag          False
timestamp    False
dtype: bool

In [33]:
tags.groupby(['userId','movieId'], as_index=False).count().shape

(772, 4)

In [34]:
df = tags.groupby(['userId','movieId']).count()
df.loc[df['tag'] > 5]

Unnamed: 0_level_0,Unnamed: 1_level_0,tag,timestamp
userId,movieId,Unnamed: 2_level_1,Unnamed: 3_level_1
138,48780,6,6
138,79132,7,7
138,109487,12,12
212,64957,8,8
212,66097,7,7
212,66934,7,7
212,68157,8,8
364,47,8,8
364,318,8,8
364,1210,10,10


In [35]:
#creating combined tag for a particular user movie combination
tags_new = tags.groupby(['userId','movieId'], as_index=False).aggregate(lambda x: set(x))

In [36]:
tags_new.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,15,339,{sandra 'boring' bullock},{1970-01-14 04:15:37.770000}
1,15,1955,{dentist},{1970-01-14 19:30:35.061000}
2,15,7478,{Cambodia},{1970-01-14 13:09:20.997000}
3,15,32892,{Russian},{1970-01-14 13:10:26.366000}
4,15,34162,{forgettable},{1970-01-14 05:03:11.765000}


In [37]:
tags_new.shape

(772, 4)

In [38]:
ratings_tags = pd.merge(ratings,tags_new[['userId','movieId','tag']],how = 'left', left_on = ['userId','movieId'], right_on = ['userId','movieId'])

In [39]:
ratings_tags.shape

(100004, 5)

In [40]:
ratings_tags.loc[(ratings_tags.userId == 15) & (ratings_tags.movieId == 339)]

Unnamed: 0,userId,movieId,rating,timestamp,tag
1048,15,339,2.5,1970-01-13 23:49:36.622,{sandra 'boring' bullock}


In [41]:
all_data = pd.merge(ratings_tags, movies_links,how = 'left', left_on = ['movieId'], right_on = ['movieId'])

In [42]:
all_data.head()

Unnamed: 0,userId,movieId,rating,timestamp,tag,title,genres,genres1,genres2,genres3,genres4,genres5,genres6,genres7,genres8,genres9,genres10,Year_release,imdbId,tmdbId
0,1,31,2.5,1970-01-15 14:12:39.144,,Dangerous Minds (1995),Drama,Drama,,,,,,,,,,1995,112792,9909.0
1,1,1029,3.0,1970-01-15 14:12:39.179,,Dumbo (1941),Animation|Children|Drama|Musical,Animation,Children,Drama,Musical,,,,,,,1941,33563,11360.0
2,1,1061,3.0,1970-01-15 14:12:39.182,,Sleepers (1996),Thriller,Thriller,,,,,,,,,,1996,117665,819.0
3,1,1129,2.0,1970-01-15 14:12:39.185,,Escape from New York (1981),Action|Adventure|Sci-Fi|Thriller,Action,Adventure,Sci-Fi,Thriller,,,,,,,1981,82340,1103.0
4,1,1172,4.0,1970-01-15 14:12:39.205,,Cinema Paradiso (Nuovo cinema Paradiso) (1989),Drama,Drama,,,,,,,,,,1989,95765,11216.0


In [43]:
all_data.shape

(100004, 20)

In [47]:
all_data.to_csv('user_rating_part1.csv')

In [51]:
X_train, X_test = train_test_split(all_data,test_size=0.2)
X_train.shape
X_test.shape

(20001, 20)

In [45]:
graphlab.recommender.factorization_recommender.create(all_data,user_id = 'userId', item_id='movieId', target= 'rating', user_data=ratings_tags, item_data= movies_new, num_factors=8, regularization=1e-08, linear_regularization=1e-10, side_data_factorization=True, nmf=False, binary_target=False, max_iterations=50, sgd_step_size=0, random_seed=0, solver='auto', verbose=True )


















This non-commercial license of GraphLab Create for academic use is assigned to ritu.tak@columbia.edu and will expire on November 01, 2018.


[INFO] graphlab.cython.cy_server: GraphLab Create v2.1 started. Logging: /tmp/graphlab_server_1509571658.log


RuntimeError: Runtime Exception. boost::bad_get: failed value get using boost::get