## Loading the dataset

In [6]:
import pandas as pd
import numpy as np


import surprise
from surprise.model_selection import cross_validate
from surprise import KNNWithMeans
from surprise.model_selection import GridSearchCV

In [7]:
# Read the ratings table from the local archive folder and preview its first rows.
df = pd.read_csv(filepath_or_buffer='./archive/ratings_small.csv')
df.head(n=5)

Unnamed: 0,userId,movieId,rating,timestamp
0,1,31,2.5,1260759144
1,1,1029,3.0,1260759179
2,1,1061,3.0,1260759182
3,1,1129,2.0,1260759185
4,1,1172,4.0,1260759205



## Defining the class for pmf and ranking

ref : https://towardsdatascience.com/machine-learning-for-building-recommender-system-in-python-9e4922dd7e97
ref: https://github.com/NicolasHug/Surprise/tree/fa7455880192383f01475162b4cbd310d91d29ca

In [8]:
class ProbabilisticMatrixFactorization(surprise.AlgoBase):
    """Latent-factor rating predictor trained with stochastic gradient descent.

    Every user and every item is given a vector of ``num_factors`` latent
    features (the rows of ``P`` and ``Q``). A rating is predicted as the dot
    product of the matching user and item vectors, and the vectors are learned
    by repeatedly stepping along the gradient of the squared prediction error.

    Parameters
    ----------
    learning_rate : float
        SGD step size (stored as ``self.alpha``).
    num_epochs : int
        Number of full passes over the training ratings.
    num_factors : int
        Dimensionality of the user and item latent vectors.
    """

    def __init__(self, learning_rate, num_epochs, num_factors):
        # Initialise the Surprise base class so that its inherited attributes
        # exist before the algorithm is handed to cross_validate / GridSearchCV.
        surprise.AlgoBase.__init__(self)
        self.alpha = learning_rate
        self.num_epochs = num_epochs
        self.num_factors = num_factors

    def fit(self, train):
        """Learn the user (``P``) and item (``Q``) factor matrices.

        Parameters
        ----------
        train : surprise.Trainset
            Training data; ``all_ratings()`` yields ``(u, i, r_ui)`` triples
            that are used directly as row indices into ``P`` and ``Q``.

        Returns
        -------
        ProbabilisticMatrixFactorization
            ``self``, following the Surprise ``fit`` convention.
        """
        # Let the base class do its own bookkeeping for the new trainset.
        surprise.AlgoBase.fit(self, train)

        # Small random initial factors, drawn from N(0, 0.1).
        P = np.random.normal(0, .1, (train.n_users, self.num_factors))
        Q = np.random.normal(0, .1, (train.n_items, self.num_factors))

        for _ in range(self.num_epochs):
            for u, i, r_ui in train.all_ratings():
                residual = r_ui - np.dot(P[u], Q[i])
                # Slicing a NumPy array returns a *view*, so the old user vector
                # must be copied explicitly; otherwise the item update below
                # would see the already-updated P[u] instead of the value the
                # residual was computed from.
                user_factors_before = P[u].copy()
                P[u] += self.alpha * residual * Q[i]
                Q[i] += self.alpha * residual * user_factors_before

        self.P = P
        self.Q = Q
        self.trainset = train
        return self

    def estimate(self, u, i):
        """Predict the rating of user ``u`` for item ``i``.

        Falls back to the training set's global mean rating when the user or
        the item is unknown to the trainset, or when the learned factors give a
        NaN prediction (e.g. after the SGD updates diverged).
        """
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            return self.trainset.global_mean

        prediction = np.dot(self.P[u], self.Q[i])
        if np.isnan(prediction):
            return self.trainset.global_mean
        return prediction


## Load the dataset using the Surprise module

In [9]:
# Wrap the ratings frame in a Surprise Dataset; the Reader declares the rating scale.
reader = surprise.Reader(rating_scale=(1, 5))
data = surprise.Dataset.load_from_df(
    df.loc[:, ['userId', 'movieId', 'rating']],
    reader,
)


## PMF 

In [10]:
# PMF: 5-fold cross-validation of the custom matrix-factorization model.
clf = ProbabilisticMatrixFactorization(
    learning_rate=0.05,
    num_epochs=5,
    num_factors=10,
)

# Last expression of the cell, so the returned metrics dict is displayed.
cross_validate(
    clf,
    data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)



Evaluating RMSE, MAE of algorithm ProbabilisticMatrixFactorization on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0425  1.0499  1.0555  1.0569  1.0575  1.0525  0.0057  
MAE (testset)     0.8022  0.8058  0.8485  0.8164  0.8147  0.8175  0.0164  
Fit time          3.03    3.29    3.16    3.10    3.10    3.14    0.09    
Test time         0.19    0.19    0.15    0.20    0.18    0.18    0.02    


{'test_rmse': array([1.04247514, 1.04989327, 1.05551789, 1.05688958, 1.05752714]),
 'test_mae': array([0.8022014 , 0.80577388, 0.84847803, 0.81635665, 0.81470968]),
 'fit_time': (3.0314552783966064,
  3.2863094806671143,
  3.157055139541626,
  3.0954554080963135,
  3.104997158050537),
 'test_time': (0.19130206108093262,
  0.19201254844665527,
  0.15488576889038086,
  0.19862961769104004,
  0.18157672882080078)}

## User-Based with cosine

In [11]:
# User-based collaborative filtering: cosine similarity computed between users.
sim_options = dict(name="cosine", user_based=True)

model = KNNWithMeans(sim_options=sim_options)

# 5-fold cross-validation reporting RMSE and MAE; the result dict is the cell output.
cross_validate(
    model,
    data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9245  0.9299  0.9188  0.9097  0.9264  0.9219  0.0071  
MAE (testset)     0.7099  0.7113  0.7050  0.6960  0.7108  0.7066  0.0058  
Fit time          0.42    0.42    0.44    0.42    0.43    0.43    0.01    
Test time         1.21    1.22    1.32    1.20    1.30    1.25    0.05    


{'test_rmse': array([0.92446787, 0.92991556, 0.91883935, 0.909688  , 0.92636274]),
 'test_mae': array([0.70994543, 0.71131235, 0.70504742, 0.69597221, 0.71084639]),
 'fit_time': (0.42414379119873047,
  0.4221363067626953,
  0.4410858154296875,
  0.4166853427886963,
  0.4287400245666504),
 'test_time': (1.21421217918396,
  1.2213215827941895,
  1.3228814601898193,
  1.201566457748413,
  1.302232027053833)}

## Item-Based with cosine

https://buomsoo-kim.github.io/recommender%20systems/2020/09/06/Recommender-systems-collab-filtering-8.md/

In [12]:
# Item-based collaborative filtering: cosine similarity computed between items.
sim_options = dict(name="cosine", user_based=False)

model = KNNWithMeans(sim_options=sim_options)

# 5-fold cross-validation reporting RMSE and MAE; the result dict is the cell output.
cross_validate(
    model,
    data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9213  0.9259  0.9195  0.9339  0.9297  0.9260  0.0053  
MAE (testset)     0.7037  0.7091  0.7043  0.7147  0.7144  0.7092  0.0047  
Fit time          9.72    9.66    9.99    10.26   9.75    9.87    0.22    
Test time         4.59    4.92    4.84    4.92    4.61    4.78    0.15    


{'test_rmse': array([0.9212596 , 0.92588429, 0.9194733 , 0.93390967, 0.92971316]),
 'test_mae': array([0.70372687, 0.70905593, 0.70428974, 0.71467422, 0.71437509]),
 'fit_time': (9.720046520233154,
  9.6625497341156,
  9.986820459365845,
  10.257053852081299,
  9.747756242752075),
 'test_time': (4.589172124862671,
  4.92116379737854,
  4.839760065078735,
  4.923788547515869,
  4.607419967651367)}

## User-Based with msd

In [13]:
# User-based collaborative filtering: MSD (mean squared difference) similarity between users.
sim_options = dict(name="msd", user_based=True)

model = KNNWithMeans(sim_options=sim_options)

# 5-fold cross-validation reporting RMSE and MAE; the result dict is the cell output.
cross_validate(
    model,
    data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9142  0.9313  0.9160  0.9178  0.9183  0.9195  0.0061  
MAE (testset)     0.7024  0.7104  0.7043  0.6998  0.7015  0.7037  0.0037  
Fit time          0.16    0.17    0.17    0.16    0.16    0.16    0.00    
Test time         1.21    1.23    1.21    1.23    1.24    1.23    0.01    


{'test_rmse': array([0.91421739, 0.93132716, 0.91598493, 0.9178237 , 0.91832955]),
 'test_mae': array([0.70235453, 0.71038054, 0.70432813, 0.69978264, 0.70148022]),
 'fit_time': (0.1635115146636963,
  0.1665806770324707,
  0.16679000854492188,
  0.16181635856628418,
  0.1569361686706543),
 'test_time': (1.2135398387908936,
  1.2326433658599854,
  1.21380615234375,
  1.234137773513794,
  1.238480567932129)}

## Item-Based with MSD

In [14]:
# Item-based collaborative filtering: MSD (mean squared difference) similarity between items.
sim_options = dict(name="msd", user_based=False)

model = KNNWithMeans(sim_options=sim_options)

# 5-fold cross-validation reporting RMSE and MAE; the result dict is the cell output.
cross_validate(
    model,
    data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9145  0.9207  0.9166  0.9095  0.9144  0.9151  0.0036  
MAE (testset)     0.7024  0.7035  0.7006  0.6959  0.7004  0.7006  0.0026  
Fit time          3.04    2.85    2.96    3.10    2.99    2.99    0.08    
Test time         4.76    4.83    4.99    5.36    5.10    5.01    0.21    


{'test_rmse': array([0.91451065, 0.92066022, 0.91661906, 0.90954634, 0.91439801]),
 'test_mae': array([0.70244918, 0.70347063, 0.70057322, 0.6959473 , 0.7004458 ]),
 'fit_time': (3.0373449325561523,
  2.8545494079589844,
  2.9585301876068115,
  3.103058338165283,
  2.9935860633850098),
 'test_time': (4.763417959213257,
  4.825172185897827,
  4.990392446517944,
  5.3612072467803955,
  5.095532655715942)}

## User-Based with pearson

In [15]:


# User-based collaborative filtering: Pearson correlation computed between users.
sim_options = dict(name="pearson", user_based=True)

model = KNNWithMeans(sim_options=sim_options)

# 5-fold cross-validation reporting RMSE and MAE; the result dict is the cell output.
cross_validate(
    model,
    data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9246  0.9169  0.9255  0.9247  0.9252  0.9234  0.0032  
MAE (testset)     0.7044  0.6993  0.7057  0.7046  0.7073  0.7043  0.0027  
Fit time          0.65    0.84    0.51    0.52    0.52    0.61    0.13    
Test time         1.54    1.41    1.19    1.21    1.18    1.31    0.15    


{'test_rmse': array([0.92461792, 0.91694191, 0.92550191, 0.92467858, 0.92519284]),
 'test_mae': array([0.70438627, 0.6993294 , 0.70569047, 0.70458712, 0.70734222]),
 'fit_time': (0.649306058883667,
  0.844123125076294,
  0.5132365226745605,
  0.5150206089019775,
  0.5228025913238525),
 'test_time': (1.5434739589691162,
  1.4121382236480713,
  1.1922178268432617,
  1.2067344188690186,
  1.1784141063690186)}

## Item-Based with pearson

In [16]:
# Item-based collaborative filtering: Pearson correlation computed between items.
sim_options = dict(name="pearson", user_based=False)

model = KNNWithMeans(sim_options=sim_options)

# 5-fold cross-validation reporting RMSE and MAE; the result dict is the cell output.
cross_validate(
    model,
    data,
    measures=['RMSE', 'MAE'],
    cv=5,
    verbose=True,
)

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9275  0.9249  0.9232  0.9329  0.9315  0.9280  0.0037  
MAE (testset)     0.7061  0.7053  0.7022  0.7089  0.7109  0.7067  0.0030  
Fit time          13.75   13.98   14.01   13.49   14.50   13.95   0.34    
Test time         5.29    5.11    5.22    5.21    4.90    5.15    0.14    


{'test_rmse': array([0.92750548, 0.92488617, 0.92323358, 0.93293688, 0.93154156]),
 'test_mae': array([0.70613803, 0.70530024, 0.70223171, 0.70894408, 0.71089286]),
 'fit_time': (13.747706651687622,
  13.981603622436523,
  14.01398754119873,
  13.490431785583496,
  14.503575325012207),
 'test_time': (5.294806241989136,
  5.110508680343628,
  5.2182042598724365,
  5.209561824798584,
  4.900641679763794)}

## Grid search for K


In [25]:
# Item-based MSD configuration: user_based=False means similarities are
# computed between items, not between users.
sim_options = dict(name="msd", user_based=False)

# Sweep the neighbourhood size k; each run prints its own 5-fold RMSE table.
for k in (20, 30, 40, 50, 60, 70, 80, 100, 150, 200):
    clf = KNNWithMeans(k=k, sim_options=sim_options)
    cross_validate(
        clf,
        data,
        measures=['RMSE'],
        cv=5,
        verbose=True,
    )

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9133  0.9324  0.9296  0.9267  0.9247  0.9253  0.0066  
Fit time          3.06    3.08    3.62    3.32    3.63    3.34    0.25    
Test time         4.87    5.05    5.33    5.28    5.44    5.19    0.21    
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing simi

In [26]:
# User-based MSD configuration: similarities are computed between users.
sim_options = dict(name="msd", user_based=True)

# Sweep the neighbourhood size k; each run prints its own 5-fold RMSE table.
for k in (20, 30, 40, 50, 60, 70, 80, 100, 150, 200):
    clf = KNNWithMeans(k=k, sim_options=sim_options)
    cross_validate(
        clf,
        data,
        measures=['RMSE'],
        cv=5,
        verbose=True,
    )

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE of algorithm KNNWithMeans on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9221  0.9325  0.9181  0.9150  0.9225  0.9220  0.0059  
Fit time          0.16    0.15    0.17    0.17    0.16    0.16    0.01    
Test time         1.06    1.07    1.06    1.07    1.00    1.05    0.03    
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing simi

### best K

In [17]:
# Candidate similarity settings for the user-based search. GridSearchCV expects
# every option as a list of values to try.
sim_options2 = {
    "name": ["cosine", "msd", "pearson"],
    "user_based": [True]
}

# The grid is named `param_grid2` to match `sim_options2` and the name passed to
# GridSearchCV below. The original cell defined `param_grid` but passed
# `param_grid2`, which raises NameError on a fresh kernel.
param_grid2 = {"sim_options": sim_options2, "k": [30, 35, 40, 45, 50]}

# Exhaustive 5-fold search over similarity measure and neighbourhood size k,
# scored by RMSE.
result = GridSearchCV(KNNWithMeans, param_grid2, measures=["rmse"], cv=5)
result.fit(data)

Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing th

In [18]:
# Report the RMSE-optimal parameter combination and its score, one per line.
print(result.best_params["rmse"], result.best_score["rmse"], sep="\n")

{'sim_options': {'name': 'msd', 'user_based': True}, 'k': 45}
0.9178361753869739



### Best k for item-based MSD

In [19]:
# Item-based MSD only; GridSearchCV expects each option as a list of candidates.
sim_options = dict(name=["msd"], user_based=[False])

# Search over larger neighbourhood sizes for the item-based model.
param_grid = dict(
    sim_options=sim_options,
    k=[70, 80, 90, 100, 150, 200, 300],
)

# 5-fold grid search scored by RMSE.
result2 = GridSearchCV(KNNWithMeans, param_grid, measures=["rmse"], cv=5)
result2.fit(data)

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computi

In [20]:
# Report the RMSE-optimal parameter combination and its score, one per line.
print(result2.best_params["rmse"], result2.best_score["rmse"], sep="\n")

{'sim_options': {'name': 'msd', 'user_based': False}, 'k': 150}
0.9112137376135001


In [24]:
# NOTE(review): `cross_validate` is a function, so attribute access such as
# `.rmse` raises AttributeError (see the recorded output of this cell). The
# per-fold RMSE values are in the dict that a cross_validate(...) call returns,
# under the 'test_rmse' key. TODO: replace or delete this cell.
cross_validate.rmse

AttributeError: 'function' object has no attribute 'rmse'