In [1]:
import os
from google.colab import drive
# Mount Google Drive inside the Colab VM and make the dataset folder the
# working directory, so later cells can open the CSV files by bare name.
MOUNT_POINT = '/content/gdrive'
DATASET_DIR = '/content/gdrive/My Drive/AML Project Group 20/ml-25m'

drive.mount(MOUNT_POINT)
os.chdir(DATASET_DIR)

Mounted at /content/gdrive


In [2]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import re

import warnings
warnings.filterwarnings('ignore')

# Data Loading
This study uses two files from the MovieLens 25M dataset: ratings.csv and movies.csv.

ratings.csv contains the ratings that users gave to movies:
+ userId
+ movieId
+ rating
+ timestamp

movies.csv contains movie information:
+ movieId
+ title
+ genres

In [3]:
# Load the movie catalogue and the user ratings from the working directory.
movies_df, ratings_df = (
    pd.read_csv(csv_name) for csv_name in ('movies.csv', 'ratings.csv')
)

In [6]:
# Attach every rating to its movie. A left join keeps all rows of movies_df,
# so a movie without any rating still appears once, with NaN in the rating
# columns.
df = pd.merge(
    movies_df,
    ratings_df,
    how="left",
    on="movieId",
)
df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,2.0,3.5,1141416000.0
1,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,3.0,4.0,1439472000.0
2,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,4.0,3.0,1573944000.0
3,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,5.0,4.0,858625900.0
4,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,8.0,4.0,890492500.0


# Data Processing

In [4]:
!pip install surprise
import pandas as pd
!pip install openpyxl
from surprise import Reader, SVD, Dataset, accuracy
from surprise.model_selection import GridSearchCV, train_test_split, cross_validate

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[K     |████████████████████████████████| 771 kB 4.9 MB/s 
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp38-cp38-linux_x86_64.whl size=2626501 sha256=4e877087965041e735fb729231b679b8bf727c5e198c1f5f346a0929aed928cb
  Stored in directory: /root/.cache/pip/wheels/af/db/86/2c18183a80ba05da35bf0fb7417aac5cddbd93bcb1b92fd3ea
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.3 surprise-0.1
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


We select five popular movies as a sample.

In [54]:
# Sample of five movies. `movie_ids` and `movies` are parallel lists: the i-th
# id belongs to the i-th title. The merged frame shows that 356 is
# "Forrest Gump (1994)", so 356 must sit at the same index as that title.
# NOTE(review): the remaining id/title pairs are assumed from list order --
# confirm against movies.csv.
movie_ids = [130219, 4422, 356, 541, 388]
movies = ["The Dark Knight (2011)",
          "Cries and Whispers (Viskningar och rop) (1972)",
          "Forrest Gump (1994)",
          "Blade Runner (1982)",
          "Boys Life (1995)"]

In [55]:
# Keep only the rows of the merged frame that belong to the sampled movies.
in_sample = df["movieId"].isin(movie_ids)
sample_df = df[in_sample]
sample_df.shape

(119488, 6)

In [58]:
# Preview the first rows of the sampled ratings.
sample_df.head()

Unnamed: 0,movieId,title,genres,userId,rating,timestamp
2466478,356,Forrest Gump (1994),Comedy|Drama|Romance|War,2.0,4.5,1141417000.0
2466479,356,Forrest Gump (1994),Comedy|Drama|Romance|War,3.0,4.0,1439472000.0
2466480,356,Forrest Gump (1994),Comedy|Drama|Romance|War,5.0,4.0,833146700.0
2466481,356,Forrest Gump (1994),Comedy|Drama|Romance|War,8.0,5.0,890489600.0
2466482,356,Forrest Gump (1994),Comedy|Drama|Romance|War,9.0,5.0,859383400.0


Create a user-item matrix with users as rows and movies as columns.

In [59]:
# Reshape the long ratings table into a wide matrix: one row per user, one
# column per movie title, each cell holding that user's rating of the movie
# (pivot_table averages duplicates by default).
user_movie_df = pd.pivot_table(
    sample_df,
    values="rating",
    index=["userId"],
    columns=["title"],
)
user_movie_df.shape

(94051, 5)

In [60]:
# Preview the user-item matrix; empty (NaN) cells are user/movie pairs with no rating in the sample.
user_movie_df.head()

title,Blade Runner (1982),Boys Life (1995),Cries and Whispers (Viskningar och rop) (1972),Forrest Gump (1994),The Dark Knight (2011)
userId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1.0,,,3.0,,
2.0,,,,4.5,
3.0,5.0,,,4.0,
4.0,4.5,,,,
5.0,,,,4.0,


# Singular Value Decomposition (SVD) Modeling

In [61]:
# Tell Surprise the range of valid ratings. The Reader does not rescale the
# data; it declares the scale, and predictions are clipped to it. MovieLens
# ratings use half-star steps from 0.5 to 5.0, so a lower bound of 1 would make
# a 0.5 prediction impossible.
reader = Reader(rating_scale=(0.5, 5.0))

# The merged frame comes from a left join, so a movie without ratings would
# contribute a row of NaNs; drop such rows before handing the data to Surprise.
ratings_for_surprise = sample_df[['userId', 'movieId', 'rating']].dropna()
data = Dataset.load_from_df(ratings_for_surprise, reader)

In [62]:
# SVD learns the unknown user-factor (p) and item-factor (q) matrices from the
# observed ratings; Surprise uses 100 latent factors unless n_factors is given.
# Both the train/test split and the SVD initialisation are random, so seed them
# to make the reported metrics reproducible across kernel restarts.
trainset, testset = train_test_split(data, test_size=.25, random_state=42)
svd_model = SVD(random_state=42)
svd_model.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f2878a5dfa0>

In [63]:
# Predict the held-out ratings and report the root-mean-squared error.
predictions = svd_model.test(testset)
accuracy.rmse(predictions, verbose=True)

RMSE: 0.9363


0.9363485096951216



Next, we apply 5-fold cross-validation to check that the result above does not depend on one particular train/test split.

In [64]:
# Evaluate the SVD algorithm with 5-fold cross-validation, printing RMSE and
# MAE for every fold together with their mean and standard deviation.
cross_validate(
    algo=svd_model,
    data=data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9395  0.9377  0.9282  0.9318  0.9333  0.9341  0.0041  
MAE (testset)     0.7224  0.7258  0.7165  0.7210  0.7153  0.7202  0.0039  
Fit time          2.36    2.22    2.29    2.25    2.83    2.39    0.23    
Test time         0.41    0.21    0.42    0.42    0.76    0.44    0.18    


{'test_rmse': array([0.93950855, 0.93765175, 0.92821849, 0.93180865, 0.93332851]),
 'test_mae': array([0.72236477, 0.72577923, 0.71647853, 0.72097782, 0.71525608]),
 'fit_time': (2.359659433364868,
  2.22340989112854,
  2.288630723953247,
  2.2453296184539795,
  2.832789659500122),
 'test_time': (0.40523815155029297,
  0.20647025108337402,
  0.41715049743652344,
  0.4226059913635254,
  0.7582738399505615)}

Use the SVD model to predict one user's rating for one movie.

In [65]:
# Estimate the rating user 1.0 would give to movie 541. No true rating is
# passed in, which is why the printed r_ui is None.
# NOTE(review): cross_validate above presumably refits svd_model on each fold,
# so this estimate may come from the last fold's fit rather than from
# `trainset` -- confirm.
svd_model.predict(uid=1.0, iid=541, verbose=True)

user: 1.0        item: 541        r_ui = None   est = 4.18   {'was_impossible': False}


Prediction(uid=1.0, iid=541, r_ui=None, est=4.179883557980326, details={'was_impossible': False})

Tuning the hyperparameters of the SVD model.

In [66]:
# Candidate values for the number of SGD epochs and the learning rate that is
# shared by all model parameters.
param_grid = {
    'n_epochs': [5, 10, 15],
    'lr_all': [0.001, 0.002, 0.005],
}

# Exhaustive search over the grid with 3-fold cross-validation, using every
# available CPU core.
GS = GridSearchCV(
    algo_class=SVD,
    param_grid=param_grid,
    measures=['rmse', 'mae'],
    cv=3,
    n_jobs=-1,
    joblib_verbose=True,
)
GS.fit(data)

# Best mean RMSE over the folds.
GS.best_score['rmse']

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  27 out of  27 | elapsed:   37.3s finished


0.9286914201643829

In [68]:
# Hyper-parameter combination that gave the best RMSE in the grid search.
GS.best_params['rmse']

{'n_epochs': 15, 'lr_all': 0.001}

After hyperparameter tuning, the SVD model performs slightly better: the RMSE decreases from 0.9363 (single hold-out split) to 0.9287 (mean over 3-fold cross-validation). The best hyperparameters are 15 epochs and a learning rate of 0.001. We then refit the model with these hyperparameters on all the data and apply it to the example above, which gives a different predicted rating.

In [69]:
# Rebuild SVD with the best hyper-parameters found by the grid search.
svd_model = SVD(**GS.best_params['rmse'])

# Train on every available rating (no hold-out). Use a separate name for the
# trainset so that `data` stays a Surprise Dataset: rebinding `data` to a
# Trainset would make this cell fail when re-run and would break any cell that
# expects a Dataset (cross_validate, GridSearchCV.fit, train_test_split).
full_trainset = data.build_full_trainset()
svd_model.fit(full_trainset)

# Same user/movie pair as before, now scored by the tuned model.
svd_model.predict(uid=1.0, iid=541, verbose=True)

user: 1.0        item: 541        r_ui = None   est = 4.22   {'was_impossible': False}


Prediction(uid=1.0, iid=541, r_ui=None, est=4.222047113331523, details={'was_impossible': False})