# Movie Ratings Recommendation System

Let's compare some of the recommendation systems for the movie ratings dataset provided by MovieLens.

# Import libraries and environment settings

### Git clone the microsoft recommender repository

In [114]:
!pip install -q xlrd
!git clone https://github.com/microsoft/recommenders.git

fatal: destination path 'recommenders' already exists and is not an empty directory.


In [115]:
!pip install surprise



In [116]:
!pip install papermill



In [0]:
import pandas as pd
import sys
import os
sys.path.append(os.path.join('/content', 'recommenders')) # Running on Colab enviroment
import surprise
import papermill as pm
from sklearn.metrics import mean_squared_error
import tensorflow as tf
import keras
import numpy as np

#microsoft recommender
from reco_utils.common.spark_utils import start_or_get_spark
from reco_utils.dataset.download_utils import maybe_download
from reco_utils.dataset.python_splitters import (
    python_random_split, 
    python_chrono_split, 
    python_stratified_split
)
from reco_utils.dataset.spark_splitters import (
    spark_random_split, 
    spark_chrono_split, 
    spark_stratified_split,
    spark_timestamp_split
)
from reco_utils.recommender.surprise.surprise_utils import compute_rating_predictions, compute_ranking_predictions
from reco_utils.evaluation.python_evaluation import (rmse, mae, rsquared, exp_var, map_at_k, ndcg_at_k, precision_at_k, 
                                                     recall_at_k, get_top_k_items)
from reco_utils.recommender.ncf.ncf_singlenode import NCF
from reco_utils.recommender.ncf.dataset import Dataset as NCFDataset


# Data Import

In [0]:
DATA_URL = "http://files.grouplens.org/datasets/movielens/ml-100k/u.data"
DATA_PATH = "ml-100k.data"

COL_USER = "userID"
COL_ITEM = "itemID"
COL_RATING = "rating"
COL_PREDICTION = "rating"
COL_TIMESTAMP = "Timestamp"

In [0]:
filepath = maybe_download(DATA_URL, DATA_PATH)

In [0]:
data = pd.read_csv(filepath, sep="\t", names=[COL_USER, COL_ITEM, COL_RATING, COL_TIMESTAMP])
data = data.drop(columns = ['Timestamp'])

In [121]:
data.head()

Unnamed: 0,userID,itemID,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1


In [122]:
data.describe()

Unnamed: 0,userID,itemID,rating
count,100000.0,100000.0,100000.0
mean,462.48475,425.53013,3.52986
std,266.61442,330.798356,1.125674
min,1.0,1.0,1.0
25%,254.0,175.0,3.0
50%,447.0,322.0,4.0
75%,682.0,631.0,4.0
max,943.0,1682.0,5.0


# Data Split

For the modelling, let's split the data into train, validation, and test set.  
The splitting threshold that I used is 60 - 20 - 20.

### Shuffle & Split the dataset

In [0]:
data_train, data_test = python_random_split(data, ratio=0.7)

In [124]:
data_train.shape, data_test.shape

((70000, 3), (30000, 3))

In [125]:
data_train.head()

Unnamed: 0,userID,itemID,rating
76513,907,628,5
60406,622,206,1
27322,18,480,4
53699,484,699,4
65412,871,690,3


In [126]:
data_test.head()

Unnamed: 0,userID,itemID,rating
75721,877,381,4
80184,815,602,3
19864,94,431,4
76699,416,875,2
92991,500,182,2


# Modelling

##Model List

###1. Baseline
###2. SVD
###3. Neural Collaborative Filtering

## Evaluation Method 

###Mean Squared Error





##1. Baseline

: for the baseline model, I used mean rating for each of the user.


In [127]:
users_ratings = data_train.groupby(['userID'])['rating'].mean()
users_ratings = users_ratings.to_frame().reset_index()
users_ratings.rename(columns = {'Rating': 'MeanRating'}, inplace = True)
users_ratings.head()

Unnamed: 0,userID,rating
0,1,3.705263
1,2,3.897436
2,3,2.789474
3,4,4.5
4,5,2.858333


In [128]:
baseline_data = pd.merge(data_test, users_ratings, on=['userID'], how='inner')
baseline_data.head()

Unnamed: 0,userID,itemID,rating_x,rating_y
0,877,381,4,3.672727
1,877,451,4,3.672727
2,877,557,4,3.672727
3,877,692,4,3.672727
4,877,56,5,3.672727


### Evaluate with RMSE

In [0]:
baseline_test = baseline_data[['userID', 'itemID', 'rating_x']]
baseline_predictions = baseline_data[['userID', 'itemID', 'rating_y']]

In [0]:
baseline_test.columns = ['userID', 'itemID', 'rating']
baseline_predictions.columns = ['userID', 'itemID', 'rating']

In [0]:
cols = {
    "col_user" : 'userID',
    "col_item" : "itemID",
    "col_prediction" : "rating"
}

In [132]:
eval_rmse = rmse(baseline_test, baseline_predictions, **cols)
print(eval_rmse)
errors = {}
errors['baseline'] = eval_rmse

1.046143073881678


### Scikit learn 

## 2.SVD

: for the matrix factorization model, I used Surprise library's SVD algorithm.

In [133]:
train_set = surprise.Dataset.load_from_df(data_train, reader=surprise.Reader('ml-100k')).build_full_trainset()
train_set

<surprise.trainset.Trainset at 0x7fb6f017e710>

In [134]:
svd = surprise.SVD(random_state=0, n_factors=200, n_epochs=30, verbose=True)

svd.fit(train_set)

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29


<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7fb6efd1eba8>

In [0]:
svd_data = compute_rating_predictions(svd, data_test, usercol='userID', itemcol='itemID')
svd_data.head()
svd_data.columns = ['userID', 'itemID', 'rating']

In [180]:
eval_rmse = rmse(data_test, svd_data, **cols)
print(eval_rmse)

0.9513392485318333


In [0]:
errors['svd'] = eval_rmse

## 3-1.Neural Collaborative Filtering (using Microsoft Recommender)

In [0]:
# dataset class for NCF
data = NCFDataset(train=data_train, test=data_test, seed=1)

In [0]:
model = NCF (
    n_users=data.n_users, 
    n_items=data.n_items,
    model_type="NeuMF",
    n_factors=4,
    layer_sizes=[8,8,4],
    n_epochs=50,
    batch_size=256,
    learning_rate=1e-3,
    verbose=10,
    seed=1
)

In [0]:
model.fit(data)

In [0]:
predictions = [[row.userID, row.itemID, model.predict(row.userID, row.itemID)]
               for (_, row) in data_test.iterrows()]

In [150]:
predictions = pd.DataFrame(predictions, columns=['userID', 'itemID', 'rating'])
predictions.head()

Unnamed: 0,userID,itemID,rating
0,877,381,0.533057
1,815,602,0.339252
2,94,431,0.788292
3,416,875,0.143191
4,500,182,0.298132


In [151]:
eval_rmse = rmse(data_test, predictions, **cols)
print(eval_rmse)

3.224251224188875


In [0]:
model_2 = NCF (
    n_users=data.n_users, 
    n_items=data.n_items,
    model_type="NeuMF",
    n_factors=8,
    layer_sizes= [64,32,16,8],
    n_epochs=20,
    batch_size=256,
    learning_rate=1e-3,
    verbose=10,
    seed=1
)

In [0]:
model_2.fit(data)

In [0]:
predictions = [[row.userID, row.itemID, model_2.predict(row.userID, row.itemID)]
               for (_, row) in data_test.iterrows()]

In [156]:
predictions = pd.DataFrame(predictions, columns=['userID', 'itemID', 'rating'])
predictions.head()

Unnamed: 0,userID,itemID,rating
0,877,381,0.826964
1,815,602,0.521168
2,94,431,0.854729
3,416,875,0.751871
4,500,182,0.235201


In [157]:
eval_rmse = rmse(data_test, predictions, **cols)
print(eval_rmse)

3.2353893268186016


# 3-2. Neural Collaborative Filtering (using Keras)

Neural Net Structure referred to https://nipunbatra.github.io/blog/2017/neural-collaborative-filtering.html

In [0]:
data = pd.read_csv(filepath, sep="\t", names=[COL_USER, COL_ITEM, COL_RATING, COL_TIMESTAMP])
data = data.drop(columns = ['Timestamp'])

In [172]:
import keras
from keras.layers import concatenate

n_latent_factors_user = 8
n_latent_factors_movie = 10
n_latent_factors_mf = 3
n_users, n_movies = len(data.userID.unique()), len(data.itemID.unique())

movie_input = keras.layers.Input(shape=[1],name='Item')
movie_embedding_mlp = keras.layers.Embedding(n_movies + 1, n_latent_factors_movie, name='Movie-Embedding-MLP')(movie_input)
movie_vec_mlp = keras.layers.Flatten(name='FlattenMovies-MLP')(movie_embedding_mlp)
movie_vec_mlp = keras.layers.Dropout(0.2)(movie_vec_mlp)

movie_embedding_mf = keras.layers.Embedding(n_movies + 1, n_latent_factors_mf, name='Movie-Embedding-MF')(movie_input)
movie_vec_mf = keras.layers.Flatten(name='FlattenMovies-MF')(movie_embedding_mf)
movie_vec_mf = keras.layers.Dropout(0.2)(movie_vec_mf)


user_input = keras.layers.Input(shape=[1],name='User')
user_vec_mlp = keras.layers.Flatten(name='FlattenUsers-MLP')(keras.layers.Embedding(n_users + 1, n_latent_factors_user,name='User-Embedding-MLP')(user_input))
user_vec_mlp = keras.layers.Dropout(0.2)(user_vec_mlp)

user_vec_mf = keras.layers.Flatten(name='FlattenUsers-MF')(keras.layers.Embedding(n_users + 1, n_latent_factors_mf,name='User-Embedding-MF')(user_input))
user_vec_mf = keras.layers.Dropout(0.2)(user_vec_mf)


concat = keras.layers.concatenate([movie_vec_mlp, user_vec_mlp], axis = 1)
concat_dropout = keras.layers.Dropout(0.2)(concat)
dense = keras.layers.Dense(200,name='FullyConnected')(concat_dropout)
dense_batch = keras.layers.BatchNormalization(name='Batch')(dense)
dropout_1 = keras.layers.Dropout(0.2,name='Dropout-1')(dense_batch)
dense_2 = keras.layers.Dense(100,name='FullyConnected-1')(dropout_1)
dense_batch_2 = keras.layers.BatchNormalization(name='Batch-2')(dense_2)


dropout_2 = keras.layers.Dropout(0.2,name='Dropout-2')(dense_batch_2)
dense_3 = keras.layers.Dense(50,name='FullyConnected-2')(dropout_2)
dense_4 = keras.layers.Dense(20,name='FullyConnected-3', activation='relu')(dense_3)

pred_mf = keras.layers.concatenate([movie_vec_mf, user_vec_mf], axis = 1)


pred_mlp = keras.layers.Dense(1, activation='relu',name='Activation')(dense_4)

combine_mlp_mf = keras.layers.concatenate([pred_mf, pred_mlp], axis = 1)
result_combine = keras.layers.Dense(100,name='Combine-MF-MLP')(combine_mlp_mf)
deep_combine = keras.layers.Dense(100,name='FullyConnected-4')(result_combine)


result = keras.layers.Dense(1,name='Prediction')(deep_combine)


model_keras = keras.Model([user_input, movie_input], result)
opt = keras.optimizers.Adam(lr =0.01)
model_keras.compile(optimizer='adam',loss= 'mean_absolute_error')




In [173]:
model_keras.summary()

Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Item (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
User (InputLayer)               (None, 1)            0                                            
__________________________________________________________________________________________________
Movie-Embedding-MLP (Embedding) (None, 1, 10)        16830       Item[0][0]                       
__________________________________________________________________________________________________
User-Embedding-MLP (Embedding)  (None, 1, 8)         7552        User[0][0]                       
____________________________________________________________________________________________

In [0]:
history = model_keras.fit([data_train.userID, data_train.itemID], data_train.rating, epochs=25, verbose=0, validation_split=0.1)

In [182]:
from sklearn.metrics import mean_squared_error
y_hat_2 = np.round(model_keras.predict([data_test.userID, data_test.itemID]),0)
ncf_error = mean_squared_error(data_test.rating, y_hat_2)
print(ncf_error)

1.0136


In [0]:
errors['NCF'] = ncf_error

In [184]:
errors

{'NCF': 1.0136, 'baseline': 1.046143073881678, 'svd': 0.9513392485318333}