In [326]:
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt

from surprise.model_selection import cross_validate
from surprise import SVD, KNNBasic
from surprise import Dataset
from surprise import Reader


## Part A

In [327]:
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines = 1)
data = Dataset.load_from_file('ratings_small.csv', reader=reader)


## Part C

In [328]:
# Probabilistic Matrix Factorization (PMF)
pmf = cross_validate(SVD(biased=False), data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
pmf

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0089  1.0027  1.0040  1.0167  1.0166  1.0098  0.0060  
MAE (testset)     0.7801  0.7708  0.7747  0.7855  0.7847  0.7791  0.0057  
Fit time          3.95    3.88    3.81    3.91    3.78    3.87    0.06    
Test time         0.10    0.10    0.31    0.10    0.10    0.14    0.08    


{'test_rmse': array([1.00893193, 1.00269997, 1.00397836, 1.0167497 , 1.01660245]),
 'test_mae': array([0.78007444, 0.77076163, 0.77465331, 0.78547063, 0.78470598]),
 'fit_time': (3.9529011249542236,
  3.882063150405884,
  3.812962770462036,
  3.9064950942993164,
  3.78027606010437),
 'test_time': (0.10492992401123047,
  0.09775090217590332,
  0.31012797355651855,
  0.10341310501098633,
  0.09830713272094727)}

In [329]:
# User based Collaborative Filtering (ubcf)
ubcf = cross_validate(KNNBasic(sim_options={'userBased':True}), data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
ubcf

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9654  0.9637  0.9731  0.9618  0.9776  0.9683  0.0060  
MAE (testset)     0.7441  0.7405  0.7479  0.7403  0.7487  0.7443  0.0035  
Fit time          0.09    0.10    0.10    0.10    0.10    0.10    0.00    
Test time         1.22    1.23    1.18    1.52    1.25    1.28    0.12    


{'test_rmse': array([0.9654149 , 0.96374807, 0.97312949, 0.96183155, 0.97756316]),
 'test_mae': array([0.74410236, 0.74051773, 0.74785872, 0.74028775, 0.74873161]),
 'fit_time': (0.0888056755065918,
  0.09901881217956543,
  0.09933185577392578,
  0.0984640121459961,
  0.10077309608459473),
 'test_time': (1.2185091972351074,
  1.2267882823944092,
  1.181535005569458,
  1.5249810218811035,
  1.2540233135223389)}

In [330]:
# Item based Collaborative Filtering (ibcf)
ibcf = cross_validate(KNNBasic(sim_options={'userBased':False}), data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
ibcf

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9700  0.9720  0.9711  0.9645  0.9604  0.9676  0.0044  
MAE (testset)     0.7460  0.7471  0.7450  0.7418  0.7388  0.7437  0.0031  
Fit time          0.09    0.10    0.13    0.11    0.10    0.10    0.02    
Test time         1.19    1.20    1.32    1.41    1.22    1.27    0.09    


{'test_rmse': array([0.97002539, 0.97203659, 0.97110567, 0.96452369, 0.9604452 ]),
 'test_mae': array([0.74598846, 0.74712037, 0.74498909, 0.74179439, 0.73875568]),
 'fit_time': (0.0886073112487793,
  0.09775781631469727,
  0.13315606117248535,
  0.10678505897521973,
  0.09824013710021973),
 'test_time': (1.186065912246704,
  1.1974551677703857,
  1.3166239261627197,
  1.4113941192626953,
  1.2151758670806885)}

## Part D

In [331]:
def avg_metrics(metric):
    return {'mae': np.mean(metric['test_mae']), 'rmse': np.mean(metric['test_rmse'])}

In [332]:
pmf_avg = avg_metrics(pmf)
ubcf_avg = avg_metrics(ubcf)
ibcf_avg = avg_metrics(ibcf)

mae_avg = {
    'pmf': pmf_avg['mae'],
    'ubcf': ubcf_avg['mae'],
    'ibcf': ibcf_avg['mae']
}

rmse_avg = {
    'pmf': pmf_avg['rmse'],
    'ubcf': ubcf_avg['rmse'],
    'ibcf': ibcf_avg['rmse']
}

print('Avegare MAE for PMF, UbCF, and IbCF', mae_avg)
print('Avegare RMSE for PMF, UbCF, and IbCF', rmse_avg)
    



Avegare MAE for PMF, UbCF, and IbCF {'pmf': 0.7791331981877198, 'ubcf': 0.7442996346374398, 'ibcf': 0.743729598975488}
Avegare RMSE for PMF, UbCF, and IbCF {'pmf': 1.0097924824528124, 'ubcf': 0.9683374336603796, 'ibcf': 0.9676273090968468}


## Part E

#### Compare cosine, MSD, and Pearson similarities

In [333]:
def avg_metric_part2(metric, model):
    return {
        'mae': np.average(cross_validate(metric, model)['test_mae']),
        'rmse': np.average(cross_validate(metric, model)['test_rmse'])
        }

In [334]:
ubcf_part2 = []
ibcf_part2 = []

for i in ['cosine', 'msd', 'pearson']:
    print('==Computing: ', i, ' for UbCf==')
    ubcf_part2.append(avg_metric_part2(KNNBasic(sim_options={'name': i, 'user_based': True}, verbose=False), data))
    print('==Finished: ', i, ' for UbCF==')


    print('==Computing: ', i, ' for IbCf==')
    ibcf_part2.append(avg_metric_part2(KNNBasic(sim_options={'name': i, 'user_based': False}, verbose=False), data))
    print('==Finished: ', i, ' for IbCF==')


==Computing:  cosine  for UbCf==
==Finished:  cosine  for UbCF==
==Computing:  cosine  for IbCf==
==Finished:  cosine  for IbCF==
==Computing:  msd  for UbCf==
==Finished:  msd  for UbCF==
==Computing:  msd  for IbCf==
==Finished:  msd  for IbCF==
==Computing:  pearson  for UbCf==
==Finished:  pearson  for UbCF==
==Computing:  pearson  for IbCf==
==Finished:  pearson  for IbCF==


In [335]:
ubcf_mae = []
ibcf_mae = []

for i in ubcf_part2:
    ubcf_mae.append(i['mae'])
for i in ibcf_part2:
    ibcf_mae.append(i['mae'])

print('Mean Square Error for UbCF: ', ubcf_mae)
print('Mean Square Error for IbCF: ', ibcf_mae)


Mean Square Error for UbCF:  [0.7680476239622408, 0.744280371419273, 0.7725484286901374]
Mean Square Error for IbCF:  [0.7741497821193646, 0.7214952813673312, 0.7681559878522499]


In [336]:
maeDF = pd.DataFrame()
maeDF['ubcf'] = ubcf_mae
maeDF['ibcf'] = ibcf_mae

fig = px.bar(
    maeDF, 
    barmode='group', 
    title='Comparison of MAE between UbCF and IbCF',
    labels={
        'index': 'Cosine | MSD | Pearson',
        'value': 'Value'
    }
)
fig.show()

## Part F


In [337]:
ubcf_knn = []
ibcf_knn = []

for i in range(15):
    ubcf_knn.append(cross_validate(KNNBasic(k=i, sim_options = {'name':'pearson','user_based': True}), data, measures=['RMSE'], cv=5, verbose=False))

    ibcf_knn.append(cross_validate(KNNBasic(k=i, sim_options = {'name':'pearson','user_based': False}), data, measures=['RMSE'], cv=5, verbose=False))

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.

In [338]:
ubcf_knn_rmse = []
ibcf_knn_rmse = []

for i in ubcf_knn:
    for j in i['test_rmse']:
        ubcf_knn_rmse.append(j)

for i in ibcf_knn:
    for j in i['test_rmse']:
        ibcf_knn_rmse.append(j)

In [339]:
knnDF = pd.DataFrame()

knnDF['ubcf'] = ubcf_knn_rmse
knnDF['ibcf'] = ibcf_knn_rmse

knnDF.head()

Unnamed: 0,ubcf,ibcf
0,1.06456,1.055479
1,1.067785,1.057542
2,1.057579,1.06492
3,1.044023,1.063039
4,1.056315,1.049287


In [340]:
fig2 = px.line(title='Impact performance on number of neighbors')

fig2.add_scatter(
    x = np.arange(0, 15),
    y = knnDF['ubcf'],
    name='UbCF'
    
)

fig2.add_scatter(
    x = np.arange(0, 15),
    y = knnDF['ibcf'],
    name='IbCF'
)

## Part G

For User CF, the best K is at K =  43 with Minimum RMSE: 1.0121099695172417


In [341]:
def findMin(knn_target):
    target = np.min(knn_target)

    for i in range(len(knn_target)):
        if target == knn_target[i]:
            return target, i


In [343]:
ubcfMin, ubcfIndex = findMin(ubcf_knn_rmse)
ibcfMin, ibcfIndex = findMin(ibcf_knn_rmse)


print('Target KNN for UbCF is: ', ubcfIndex, ', With value of: ', ubcfMin)
print('Target KNN for IbCF is: ', ibcfIndex, ', With value of: ', ibcfMin)

Target KNN for UbCF is:  64 , With value of:  0.9986603099293345
Target KNN for IbCF is:  65 , With value of:  0.9983778939815987
