# Import Packages

In [1]:
import gc

import pandas as pd
import numpy as np

import surprise 
from surprise.model_selection import cross_validate
from surprise import KNNWithMeans
from surprise.model_selection import GridSearchCV

# Load Data

In [2]:
df_ratings = pd.read_csv('../data/task2/ratings_small.csv')

print(df_ratings.shape)
df_ratings.head()

(100004, 4)


Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205


In [3]:
##### transform dataset #####
df_ratings = df_ratings[['userId', 'movieId', 'rating']]

movie_reader = surprise.Reader(rating_scale=(1, 5))
data = surprise.Dataset.load_from_df(df_ratings, movie_reader)

del(df_ratings)

# Analysis

In [4]:
class PMF(surprise.AlgoBase):

    def __init__(self, learning_rate, num_epochs, num_factors):

        self.alpha = learning_rate 
        self.num_epochs = num_epochs
        self.num_factors = num_factors
        
        
    def fit(self, train):
        
        P = np.random.normal(0, 0.1, (train.n_users, self.num_factors))
        Q = np.random.normal(0, 0.1, (train.n_items, self.num_factors))

        for epoch in range(self.num_epochs):
            for u,i,r_ui in train.all_ratings():
                residual = r_ui - np.dot(P[u], Q[i])
                temp = P[u, :] 
                
                P[u,:] += self.alpha * residual * Q[i]
                Q[i,:] += self.alpha * residual * temp 

                
        self.P = P
        self.Q = Q

        self.trainset = train
    
    
    def estimate(self, u, i):

        if self.trainset.knows_user(u) and self.trainset.knows_item(i):
            nanCheck = np.dot(self.P[u], self.Q[i])
            
            if np.isnan(nanCheck):
                return self.trainset.global_mean
            else:
                return np.dot(self.P[u,:],self.Q[i,:])
        
        else:
            return self.trainset.global_mean

## 3(c), 3(d)

In [7]:
##### compute rmse and mae for Probabilistic Matrix Factorization (PMF) #####
pmf = PMF(learning_rate=0.05, num_epochs=5, num_factors=10)

cross_validate(pmf, data, 
               measures=['RMSE', 'MAE'], cv=5, verbose=True)

Evaluating RMSE, MAE of algorithm PMF on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0493  1.0434  1.0453  1.0742  1.0534  1.0531  0.0111  
MAE (testset)     0.8077  0.8069  0.8026  0.8243  0.8102  0.8103  0.0074  
Fit time          1.92    1.87    1.86    1.86    1.85    1.87    0.02    
Test time         0.12    0.13    0.17    0.12    0.17    0.14    0.02    


{'test_rmse': array([1.04926729, 1.04341216, 1.04526324, 1.07419732, 1.05343396]),
 'test_mae': array([0.8076676 , 0.80692042, 0.8026196 , 0.82430141, 0.81022014]),
 'fit_time': (1.9190654754638672,
  1.8726131916046143,
  1.8603920936584473,
  1.8614134788513184,
  1.84981107711792),
 'test_time': (0.11936211585998535,
  0.13021349906921387,
  0.16975975036621094,
  0.11609244346618652,
  0.1739966869354248)}

In [8]:
##### compute rmse and mae for User-based Collaborative Filtering #####
##### name: cosine #####
sim_options = {'name': 'cosine', 'user_based': True}; user_algo = KNNWithMeans(sim_options=sim_options)

cross_validate(user_algo, data, 
               measures=['RMSE', 'MAE'], cv=5, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9270  0.9191  0.9250  0.9118  0.9247  0.9215  0.0055  
MAE (testset)     0.7104  0.7053  0.7096  0.7000  0.7097  0.7070  0.0039  
Fit time          0.14    0.17    0.23    0.20    0.18    0.19    0.03    
Test time         0.82    0.83    0.92    0.88    0.79    0.85    0.05    


{'test_rmse': array([0.92701321, 0.91907191, 0.92504458, 0.91182768, 0.9246893 ]),
 'test_mae': array([0.7103943 , 0.70533666, 0.70962461, 0.69997665, 0.70968548]),
 'fit_time': (0.14395737648010254,
  0.16860198974609375,
  0.2346024513244629,
  0.20336008071899414,
  0.18266582489013672),
 'test_time': (0.8151421546936035,
  0.8307774066925049,
  0.9164149761199951,
  0.8835556507110596,
  0.7948973178863525)}

In [9]:
##### compute rmse and mae for Item-based Collaborative Filtering #####
##### name: cosine #####
sim_options = {'name': 'cosine', 'user_based': False}; item_algo = KNNWithMeans(sim_options=sim_options)

cross_validate(item_algo, data, 
               measures=['RMSE', 'MAE'], cv=5, verbose=True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9230  0.9298  0.9228  0.9176  0.9373  0.9261  0.0068  
MAE (testset)     0.7093  0.7120  0.7076  0.7044  0.7181  0.7103  0.0046  
Fit time          3.28    3.34    3.46    3.34    3.43    3.37    0.07    
Test time         3.71    3.59    3.85    3.65    3.74    3.71    0.09    


{'test_rmse': array([0.92297538, 0.92977994, 0.92278553, 0.91760466, 0.93726409]),
 'test_mae': array([0.70931316, 0.7120402 , 0.70763508, 0.70440726, 0.71811834]),
 'fit_time': (3.283848762512207,
  3.338536024093628,
  3.4628963470458984,
  3.3390493392944336,
  3.433276653289795),
 'test_time': (3.7141618728637695,
  3.593128204345703,
  3.8541290760040283,
  3.6477551460266113,
  3.7387216091156006)}

## 3(e)

In [10]:
##### compute rmse and mae for User-based Collaborative Filtering #####
##### name: msd #####
sim_options = {'name': 'msd', 'user_based': True}; user_algo = KNNWithMeans(sim_options=sim_options)

cross_validate(user_algo, data, 
               measures=['RMSE', 'MAE'], cv=5, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9213  0.9240  0.9136  0.9176  0.9184  0.9190  0.0035  
MAE (testset)     0.7067  0.7057  0.6985  0.7033  0.7031  0.7035  0.0028  
Fit time          0.11    0.15    0.11    0.14    0.11    0.12    0.02    
Test time         0.81    0.87    0.94    0.85    0.82    0.86    0.04    


{'test_rmse': array([0.92127364, 0.92401114, 0.91360495, 0.91761804, 0.91837236]),
 'test_mae': array([0.7066581 , 0.70571853, 0.69848866, 0.70326436, 0.7031458 ]),
 'fit_time': (0.10574698448181152,
  0.15415620803833008,
  0.1122746467590332,
  0.14043688774108887,
  0.10959768295288086),
 'test_time': (0.8139958381652832,
  0.8715102672576904,
  0.9356298446655273,
  0.8533451557159424,
  0.822089433670044)}

In [11]:
##### compute rmse and mae for Item-based Collaborative Filtering #####
##### name: msd #####
sim_options = {'name': 'msd', 'user_based': False}; item_algo = KNNWithMeans(sim_options=sim_options)

cross_validate(item_algo, data, 
               measures=['RMSE', 'MAE'], cv=5, verbose=True)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9179  0.9123  0.9121  0.9265  0.9074  0.9152  0.0066  
MAE (testset)     0.7032  0.6972  0.6968  0.7072  0.6982  0.7005  0.0040  
Fit time          2.08    2.17    2.25    2.21    2.25    2.19    0.07    
Test time         3.69    3.82    3.94    3.91    3.89    3.85    0.09    


{'test_rmse': array([0.91794877, 0.91230046, 0.91206795, 0.92651742, 0.90737439]),
 'test_mae': array([0.7032312 , 0.69719026, 0.69684267, 0.70715609, 0.6982284 ]),
 'fit_time': (2.0774383544921875,
  2.1701581478118896,
  2.2545392513275146,
  2.2075233459472656,
  2.251655101776123),
 'test_time': (3.6900172233581543,
  3.8153743743896484,
  3.9378738403320312,
  3.9068808555603027,
  3.890315055847168)}

In [12]:
##### compute rmse and mae for User-based Collaborative Filtering #####
##### name: pearson #####
sim_options = {'name': 'pearson', 'user_based': True}; user_algo = KNNWithMeans(sim_options=sim_options)

cross_validate(user_algo, data, 
               measures=['RMSE', 'MAE'], cv=5, verbose=True)

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9304  0.9218  0.9265  0.9279  0.9127  0.9239  0.0062  
MAE (testset)     0.7053  0.7041  0.7076  0.7079  0.6983  0.7046  0.0035  
Fit time          0.23    0.25    0.22    0.30    0.21    0.24    0.03    
Test time         0.84    0.87    0.98    0.84    0.84    0.87    0.06    


{'test_rmse': array([0.93037588, 0.92184808, 0.92653583, 0.92793188, 0.91273066]),
 'test_mae': array([0.70532283, 0.70410408, 0.70758262, 0.70787063, 0.69829003]),
 'fit_time': (0.23227906227111816,
  0.25476789474487305,
  0.21914029121398926,
  0.29950785636901855,
  0.21060633659362793),
 'test_time': (0.835533857345581,
  0.8686585426330566,
  0.9841725826263428,
  0.8445603847503662,
  0.8417208194732666)}

In [13]:
##### compute rmse and mae for Item-based Collaborative Filtering #####
##### name: pearson #####
sim_options = {'name': 'pearson', 'user_based': False}; item_algo = KNNWithMeans(sim_options=sim_options)

cross_validate(item_algo, data, 
               measures=['RMSE', 'MAE'], cv=5, verbose=True)

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9312  0.9202  0.9339  0.9356  0.9211  0.9284  0.0065  
MAE (testset)     0.7088  0.7059  0.7102  0.7086  0.7044  0.7076  0.0021  
Fit time          4.55    4.59    4.59    4.44    4.48    4.53    0.06    
Test time         3.72    3.79    3.78    3.63    3.75    3.73    0.06    


{'test_rmse': array([0.93120639, 0.92016492, 0.93387879, 0.93560859, 0.92107981]),
 'test_mae': array([0.70879431, 0.70592712, 0.71022951, 0.70858843, 0.7043958 ]),
 'fit_time': (4.551377058029175,
  4.589977025985718,
  4.593186855316162,
  4.437203884124756,
  4.482363939285278),
 'test_time': (3.7184770107269287,
  3.7869553565979004,
  3.7829689979553223,
  3.6326050758361816,
  3.7521798610687256)}

## 3(f), 3(g)

In [11]:
##### examine neigbouring impacts on User-based Collaborative Filtering #####
sim_options = {'name': ['cosine', 'msd', 'pearson'], 'user_based': [True]}
param_grid = {'sim_options': sim_options, 'k': [25, 30, 35, 40, 45, 50, 60, 70, 80, 90, 100, 150, 200, 300]}

grid_search = GridSearchCV(KNNWithMeans, param_grid, measures=['rmse'], cv=5)
grid_search.fit(data)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing th

In [12]:
print(grid_search.best_params['rmse']); print(grid_search.best_score['rmse'])

{'sim_options': {'name': 'msd', 'user_based': True}, 'k': 40}
0.9192799310583138


In [19]:
##### grid_searchine neigbouring impacts on Item-based Collaborative Filtering #####
sim_options = {'name': ['cosine', 'msd', 'pearson'], 'user_based': [False]}
param_grid = {'sim_options': sim_options, 'k': [50, 60, 70, 80, 90, 100, 150, 200, 300]}

grid_search = GridSearchCV(KNNWithMeans, param_grid, measures=['rmse'], cv=5)
grid_search.fit(data)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing th

In [20]:
print(grid_search.best_params['rmse']); print(grid_search.best_score['rmse'])

{'sim_options': {'name': 'msd', 'user_based': False}, 'k': 150}
0.9097211832400618


In [21]:
##### find optimum neighbours for User-Based Collaborative Filtering #####
sim_options = {'name': 'msd', 'user_based': True}

for k in range(5, 250, 5):
    print('k = ', k)
    
    user_algo = KNNWithMeans(sim_options=sim_options, k = k)
    
    cross_validate(user_algo, data, 
                   measures=['RMSE'], cv=5, verbose=True)
    
    print('\n')

k =  5
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9559  0.9557  0.9658  0.9579  0.9521  0.9575  0.0045  
Fit time          0.09    0.11    0.11    0.11    0.11    0.11    0.01    
Test time         0.64    0.56    0.70    0.58    0.57    0.61    0.05    


k =  10
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Do

In [22]:
##### find optimum neighbours for Item-Based Collaborative Filtering #####
sim_options = {'name': 'msd', 'user_based': False}

for k in range(5, 250, 5):
    print('k = ', k)
    
    item_algo = KNNWithMeans(sim_options=sim_options, k = k)
    
    cross_validate(item_algo, data, 
                   measures=['RMSE'], cv=5, verbose=True)
    
    print('\n')

k =  5
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9806  0.9891  0.9753  0.9607  0.9789  0.9769  0.0093  
Fit time          2.28    2.20    2.17    2.39    2.19    2.25    0.08    
Test time         3.19    3.29    3.12    3.16    2.98    3.15    0.10    


k =  10
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Do