In [2]:
# Ignore  the warnings
import warnings
warnings.filterwarnings('always')
warnings.filterwarnings('ignore')
import time
# data visualisation and manipulation
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
 
#configure
# sets matplotlib to inline and displays graphs below the corressponding cell.
%matplotlib inline  
style.use('fivethirtyeight')
sns.set(style='whitegrid',color_codes=True)

#model selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score,precision_score,recall_score,confusion_matrix,roc_curve,roc_auc_score
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder

In [6]:


# reading users file:
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('u.user', sep='|', names=u_cols,encoding='latin-1')

# reading ratings file:
r_cols = ['user_id', 'movie_id', 'rating', 'unix_timestamp']
ratings = pd.read_csv('u.data', sep='\t', names=r_cols,encoding='latin-1')

# reading items file:
i_cols = ['movie id', 'movie title' ,'release date','video release date', 'IMDb URL', 'unknown', 'Action', 'Adventure',
'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 'Drama', 'Fantasy',
'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western']
items = pd.read_csv('u.item', sep='|', names=i_cols,
encoding='latin-1')

# After loading the dataset, we should look at the content of each file (users, ratings, items).

# Looking at the user file
print("\nUser Data :")
print("shape : ", users.shape)
#print(users.head())

# We have 943 users in the dataset and each user has 5 features, i.e. user_ID, age, sex, occupation and zip_code. Now let’s look at the ratings file.

# Ratings Data
print("\nRatings Data :")
print("shape : ", ratings.shape)
#print(ratings.head())

# We have 100k ratings for different user and movie combinations. Now finally examine the items file.

# Item Data
print("\nItem Data :")
print("shape : ", items.shape)
#print(items.head())


User Data :
shape :  (943, 5)

Ratings Data :
shape :  (100000, 4)

Item Data :
shape :  (1682, 24)


In [None]:
users.head()

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213


In [None]:
ratings.head()

Unnamed: 0,user_id,movie_id,rating,unix_timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596


In [None]:
#converting the unix timestamp of ratings to a  date
from datetime import datetime
def time_stamp(k):
  return datetime.fromtimestamp(k).strftime('%d-%m-%Y')
ratings['rating_date'] = ratings['unix_timestamp'].apply(time_stamp)
ratings.head()

In [8]:
print(pd.DatetimeIndex(ratings['rating_date']).year.min())
print(pd.DatetimeIndex(ratings['rating_date']).year.max())

1997
1998


In [None]:
#finding the no of days since the rating is given on movies from a specified date('01-11-1998')
date_format = "%d-%m-%Y"
def sub_dates(a):
  return (datetime.strptime('01-11-1998', date_format)-a).days   
def dat_strp(a):
    return datetime.strptime(a, date_format)
ratings['new_date']=ratings['rating_date'].apply(dat_strp)    
ratings['days_diff']=ratings['new_date'].apply(sub_dates) 
#conversion of no of days to years
ratings['years_diff']=round(ratings['days_diff']/365,2)
ratings.head() 

In [10]:
#dropping the below columns 
ratings.drop(['unix_timestamp','new_date'],axis=1,inplace=True)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,rating_date,days_diff,years_diff
0,196,242,3,04-12-1997,332,0.91
1,186,302,3,04-04-1998,211,0.58
2,22,377,1,07-11-1997,359,0.98
3,244,51,2,27-11-1997,339,0.93
4,166,346,1,02-02-1998,272,0.75


In [None]:
ratings['years_diff'].describe()

In [None]:
#for these values select m=0.425 and n=0.25, this idea is based on max and min values of years difference to get weighted difference
m,n=0.425,0.25
ratings['weighted_diff']=m*ratings['years_diff']+n
ratings.head()

In [12]:
#Now I am gonna create new ratings which are time based/temporal
ratings['final_ratings']=round(ratings['rating']/ratings['weighted_diff'],2)
ratings.head()

Unnamed: 0,user_id,movie_id,rating,rating_date,days_diff,years_diff,weighted_diff,final_ratings
0,196,242,3,04-12-1997,332,0.91,0.63675,4.71
1,186,302,3,04-04-1998,211,0.58,0.4965,6.04
2,22,377,1,07-11-1997,359,0.98,0.6665,1.5
3,244,51,2,27-11-1997,339,0.93,0.64525,3.1
4,166,346,1,02-02-1998,272,0.75,0.56875,1.76


In [None]:
ratings['final_ratings'].describe()

In [15]:
n_users = ratings.user_id.unique().shape[0]
n_items = ratings.movie_id.unique().shape[0]
data_matrix = np.zeros((n_users, n_items))

In [16]:
#basically here I am filling the respective user ids(line[1]) and movies(line[2]) with the value final ratings(line[8])
for line in ratings.itertuples():
    data_matrix[line[1]-1,line[2]-1] = line[8]
data_matrix    

array([[6.93, 4.31, 5.96, ..., 0.  , 0.  , 0.  ],
       [7.42, 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       ...,
       [6.93, 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 0.  , 0.  , ..., 0.  , 0.  , 0.  ],
       [0.  , 9.35, 0.  , ..., 0.  , 0.  , 0.  ]])

## Building a simple popularity,collaborative filtering and matrix factorization model using Turicreate

**Turi Create simplifies the development of custom machine learning models. You don’t have to be a machine learning expert to add recommendations, object detection, image classification, image similarity or activity classification to your app**

In [18]:
import turicreate
#Turi Create use its own data type, called SFrame, similar to Pandas Dataframe, giving some verbose output too.
train_data = turicreate.SFrame(ratings)
train_data

user_id,movie_id,rating,rating_date,days_diff,years_diff,weighted_diff,final_ratings
196,242,3,04-12-1997,332,0.91,0.6367499999999999,4.71
186,302,3,04-04-1998,211,0.58,0.4964999999999999,6.04
22,377,1,07-11-1997,359,0.98,0.6665,1.5
244,51,2,27-11-1997,339,0.93,0.64525,3.1
166,346,1,02-02-1998,272,0.75,0.56875,1.76
298,474,4,07-01-1998,298,0.82,0.5985,6.68
115,265,2,03-12-1997,333,0.91,0.6367499999999999,3.14
253,465,5,03-04-1998,212,0.58,0.4964999999999999,10.07
305,451,3,01-02-1998,273,0.75,0.56875,5.27
6,86,3,31-12-1997,305,0.84,0.607,4.94


In [None]:
#visualisations of train data final ratings
train_data['final_ratings'].show()

In [20]:
#split into train data and validation data
training_data,validation_data = turicreate.recommender.util.random_split_by_user(train_data, 'user_id', 'movie_id',item_test_proportion=0.2)

In [None]:
#A simple recommender model which will recommend movies based on the most popular choices, i.e., a model where all the users receive the same recommendation(s).
popularity_model = turicreate.popularity_recommender.create(train_data, user_id='user_id', item_id='movie_id', target='final_ratings')

In [23]:
popularity_recomm = popularity_model.recommend(users=[1,2,3,4,5,6,7],k=5)
popularity_recomm.print_rows(num_rows=35)


+---------+----------+-------------------+------+
| user_id | movie_id |       score       | rank |
+---------+----------+-------------------+------+
|    1    |   1201   |        9.99       |  1   |
|    1    |   1189   | 9.503333333333334 |  2   |
|    1    |   1653   |        9.5        |  3   |
|    1    |   1594   | 9.469999999999999 |  4   |
|    1    |   1293   | 9.386666666666667 |  5   |
|    2    |   1201   |        9.99       |  1   |
|    2    |   1189   | 9.503333333333334 |  2   |
|    2    |   1653   |        9.5        |  3   |
|    2    |   1594   | 9.469999999999999 |  4   |
|    2    |   1293   | 9.386666666666667 |  5   |
|    3    |   1201   |        9.99       |  1   |
|    3    |   1189   | 9.503333333333334 |  2   |
|    3    |   1653   |        9.5        |  3   |
|    3    |   1594   | 9.469999999999999 |  4   |
|    3    |   1293   | 9.386666666666667 |  5   |
|    4    |   1201   |        9.99       |  1   |
|    4    |   1189   | 9.503333333333334 |  2   |


In [None]:
#evaluating the popularity model ('Overall RMSE=1.83)
popularity_model.evaluate(validation_data)

The recommendations for all 7 users are the same . And they’re also in the same order. All the users who have watched these movies gave it a top rating and this was obviously what was expected from this ***Popularity Recommender***

In [24]:
#Now creating a collaborative filtering model using similarity recommender
#Training the model
item_sim_model = turicreate.item_similarity_recommender.create(training_data, user_id='user_id', item_id='movie_id', target='final_ratings', similarity_type='cosine')

#Creating Recommendations
item_sim_recomm = item_sim_model.recommend(users=[1,2,3,4,5,6,7],k=5)
item_sim_recomm.print_rows(num_rows=35)

+---------+----------+--------------------+------+
| user_id | movie_id |       score        | rank |
+---------+----------+--------------------+------+
|    1    |   204    | 1.3385929975796629 |  1   |
|    1    |   195    | 1.218967752048263  |  2   |
|    1    |   423    | 1.1703393409649532 |  3   |
|    1    |   210    | 1.078538604080677  |  4   |
|    1    |    28    | 1.0219033178355958 |  5   |
|    2    |   121    | 1.4054655748970655 |  1   |
|    2    |   181    | 1.401206106555705  |  2   |
|    2    |   286    | 1.3796823085570822 |  3   |
|    2    |   117    | 1.3309194920014362 |  4   |
|    2    |    7     | 1.2173429362627926 |  5   |
|    3    |   313    | 1.1348579208901588 |  1   |
|    3    |   300    | 1.0395310850853616 |  2   |
|    3    |   315    | 0.8929527072196312 |  3   |
|    3    |   895    | 0.8386510585216765 |  4   |
|    3    |   286    | 0.789850893172812  |  5   |
|    4    |   258    | 2.2700142396820917 |  1   |
|    4    |   313    | 1.893356

***In this model, we do not have the ratings for each movie given by each user. ***

In [None]:
#evaluating the collaborative filtering model using similarity recommender('Overall RMSE=5.802)
item_sim_model.evaluate(validation_data)

In [27]:
#Training the Matrix Factorization Model
fact_model = turicreate.recommender.ranking_factorization_recommender.create(training_data, user_id='user_id', item_id='movie_id', target='final_ratings')

#Making recommendations
fact_recomm = fact_model.recommend(users=[1,2,3,4,5,6,7],k=5)
fact_recomm.print_rows(num_rows=25)

+---------+----------+--------------------+------+
| user_id | movie_id |       score        | rank |
+---------+----------+--------------------+------+
|    1    |    7     | 7.105852937008046  |  1   |
|    1    |   405    | 7.086041876728438  |  2   |
|    1    |   546    | 7.042463713939093  |  3   |
|    1    |   1047   |  7.0313975900959   |  4   |
|    1    |    25    | 6.972714120204352  |  5   |
|    2    |   181    | 7.070090228373908  |  1   |
|    2    |   172    | 6.941142977828883  |  2   |
|    2    |   286    | 6.937798717672728  |  3   |
|    2    |   174    | 6.919526608343028  |  4   |
|    2    |    98    | 6.9186161324929225 |  5   |
|    3    |   313    | 7.272018154765271  |  1   |
|    3    |    50    | 7.253635724688672  |  2   |
|    3    |   172    | 7.045737033392094  |  3   |
|    3    |    98    | 7.039024850035809  |  4   |
|    3    |    22    | 7.035968130136632  |  5   |
|    4    |   313    | 7.3870760590385425 |  1   |
|    4    |   181    | 7.323020

In [28]:
#evaluating the matrix factorization model('Overall RMSE=1.64')
fact_model.evaluate(validation_data)


Precision and recall summary statistics by cutoff
+--------+---------------------+----------------------+
| cutoff |    mean_precision   |     mean_recall      |
+--------+---------------------+----------------------+
|   1    | 0.24628450106157115 | 0.015828688514704203 |
|   2    | 0.21178343949044587 | 0.02629948228805904  |
|   3    | 0.20488322717622065 | 0.04071270138741779  |
|   4    | 0.19851380042462846 | 0.05090579560961576  |
|   5    |  0.1951167728237791 | 0.06379460824002249  |
|   6    |  0.1873673036093418 | 0.07416042327539159  |
|   7    | 0.18365180467091283 | 0.08352004015760031  |
|   8    | 0.18006900212314225 | 0.09380484126340348  |
|   9    | 0.17622080679405527 | 0.10140977115131883  |
|   10   | 0.17144373673036087 | 0.10946886913945302  |
+--------+---------------------+----------------------+
[10 rows x 3 columns]


Overall RMSE: 1.6431082547351172

Per User RMSE (best)
+---------+---------------------+-------+
| user_id |         rmse        | count |
+-

{'precision_recall_by_user': Columns:
 	user_id	int
 	cutoff	int
 	precision	float
 	recall	float
 	count	int
 
 Rows: 16956
 
 Data:
 +---------+--------+-----------+--------+-------+
 | user_id | cutoff | precision | recall | count |
 +---------+--------+-----------+--------+-------+
 |   196   |   1    |    0.0    |  0.0   |   7   |
 |   196   |   2    |    0.0    |  0.0   |   7   |
 |   196   |   3    |    0.0    |  0.0   |   7   |
 |   196   |   4    |    0.0    |  0.0   |   7   |
 |   196   |   5    |    0.0    |  0.0   |   7   |
 |   196   |   6    |    0.0    |  0.0   |   7   |
 |   196   |   7    |    0.0    |  0.0   |   7   |
 |   196   |   8    |    0.0    |  0.0   |   7   |
 |   196   |   9    |    0.0    |  0.0   |   7   |
 |   196   |   10   |    0.0    |  0.0   |   7   |
 +---------+--------+-----------+--------+-------+
 [16956 rows x 5 columns]
 Note: Only the head of the SFrame is printed.
 You can use print_rows(num_rows=m, num_columns=n) to print more rows and colum