In [1]:
import numpy as np
import pandas as pd
import plotly.express as px
import matplotlib.pyplot as plt

from surprise.model_selection import cross_validate
from surprise import SVD, KNNBasic
from surprise import Dataset
from surprise import Reader


## Part A

In [2]:
reader = Reader(line_format='user item rating timestamp', sep=',', skip_lines = 1)
data = Dataset.load_from_file('ratings_small.csv', reader=reader)


## Part C

In [3]:
# Probabilistic Matrix Factorization (PMF)
pmf = cross_validate(SVD(biased=False), data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
pmf

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    1.0108  1.0058  1.0113  1.0122  1.0031  1.0086  0.0036  
MAE (testset)     0.7794  0.7778  0.7791  0.7816  0.7765  0.7789  0.0017  
Fit time          3.16    3.12    3.12    3.16    3.14    3.14    0.02    
Test time         0.09    0.09    0.09    0.13    0.09    0.10    0.02    


{'test_rmse': array([1.01075254, 1.00577431, 1.01131052, 1.01224283, 1.00310723]),
 'test_mae': array([0.77939173, 0.77784924, 0.77908837, 0.78162953, 0.7764546 ]),
 'fit_time': (3.157052993774414,
  3.1205475330352783,
  3.121046781539917,
  3.1555533409118652,
  3.13604998588562),
 'test_time': (0.09001612663269043,
  0.08601498603820801,
  0.08751559257507324,
  0.13152289390563965,
  0.08951568603515625)}

In [4]:
# User based Collaborative Filtering (ubcf)
ubcf = cross_validate(KNNBasic(sim_options={'userBased':True}), data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
ubcf

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9727  0.9664  0.9716  0.9659  0.9596  0.9672  0.0047  
MAE (testset)     0.7477  0.7429  0.7462  0.7431  0.7384  0.7437  0.0032  
Fit time          0.14    0.15    0.14    0.14    0.16    0.15    0.01    
Test time         1.24    1.24    1.22    1.27    1.44    1.28    0.08    


{'test_rmse': array([0.97267799, 0.96641201, 0.97156371, 0.96591269, 0.95963362]),
 'test_mae': array([0.74770381, 0.74288554, 0.74619165, 0.74312724, 0.73835864]),
 'fit_time': (0.13852405548095703,
  0.1505262851715088,
  0.14402532577514648,
  0.14102482795715332,
  0.15652704238891602),
 'test_time': (1.2422184944152832,
  1.2442185878753662,
  1.2192139625549316,
  1.2672219276428223,
  1.4447531700134277)}

In [5]:
# Item based Collaborative Filtering (ibcf)
ibcf = cross_validate(KNNBasic(sim_options={'userBased':False}), data, measures=['RMSE', 'MAE'], cv=5, verbose=True)
ibcf

Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Evaluating RMSE, MAE of algorithm KNNBasic on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9558  0.9730  0.9646  0.9704  0.9766  0.9681  0.0073  
MAE (testset)     0.7339  0.7483  0.7428  0.7479  0.7508  0.7447  0.0060  
Fit time          0.13    0.14    0.14    0.15    0.15    0.14    0.01    
Test time         1.22    1.20    1.27    1.29    1.21    1.24    0.03    


{'test_rmse': array([0.95581971, 0.97300445, 0.9646487 , 0.97036974, 0.97660879]),
 'test_mae': array([0.73392936, 0.74834536, 0.74280303, 0.74788729, 0.75077727]),
 'fit_time': (0.1325235366821289,
  0.1395244598388672,
  0.1390242576599121,
  0.14952564239501953,
  0.14552569389343262),
 'test_time': (1.2227141857147217,
  1.2007105350494385,
  1.2657217979431152,
  1.2912266254425049,
  1.2132127285003662)}

## Part D

In [6]:
def avg_metrics(metric):
    return {'mae': np.mean(metric['test_mae']), 'rmse': np.mean(metric['test_rmse'])}

In [7]:
pmf_avg = avg_metrics(pmf)
ubcf_avg = avg_metrics(ubcf)
ibcf_avg = avg_metrics(ibcf)

mae_avg = {
    'pmf': pmf_avg['mae'],
    'ubcf': ubcf_avg['mae'],
    'ibcf': ibcf_avg['mae']
}

rmse_avg = {
    'pmf': pmf_avg['rmse'],
    'ubcf': ubcf_avg['rmse'],
    'ibcf': ibcf_avg['rmse']
}

print('Avegare MAE for PMF, UbCF, and IbCF', mae_avg)
print('Avegare RMSE for PMF, UbCF, and IbCF', rmse_avg)
    



Avegare MAE for PMF, UbCF, and IbCF {'pmf': 0.7788826960776459, 'ubcf': 0.7436533763263031, 'ibcf': 0.744748462861382}
Avegare RMSE for PMF, UbCF, and IbCF {'pmf': 1.0086374865596626, 'ubcf': 0.9672400034797727, 'ibcf': 0.9680902757601135}


## Part E

#### Compare cosine, MSD, and Pearson similarities

In [8]:
def avg_metric_part2(metric, model):
    return {
        'mae': np.average(cross_validate(metric, model)['test_mae']),
        'rmse': np.average(cross_validate(metric, model)['test_rmse'])
        }

In [9]:
ubcf_part2 = []
ibcf_part2 = []

for i in ['cosine', 'msd', 'pearson']:
    print('==Computing: ', i, ' for UbCf==')
    ubcf_part2.append(avg_metric_part2(KNNBasic(sim_options={'name': i, 'user_based': True}, verbose=False), data))
    print('==Finished: ', i, ' for UbCF==')


    print('==Computing: ', i, ' for IbCf==')
    ibcf_part2.append(avg_metric_part2(KNNBasic(sim_options={'name': i, 'user_based': False}, verbose=False), data))
    print('==Finished: ', i, ' for IbCF==')


==Computing:  cosine  for UbCf==
==Finished:  cosine  for UbCF==
==Computing:  cosine  for IbCf==
==Finished:  cosine  for IbCF==
==Computing:  msd  for UbCf==
==Finished:  msd  for UbCF==
==Computing:  msd  for IbCf==
==Finished:  msd  for IbCF==
==Computing:  pearson  for UbCf==
==Finished:  pearson  for UbCF==
==Computing:  pearson  for IbCf==
==Finished:  pearson  for IbCF==


In [10]:
ubcf_mae = []
ibcf_mae = []
ubcf_rmse = []
ibcf_rmse = []

for i in ubcf_part2:
    ubcf_mae.append(i['mae'])
    ubcf_rmse.append(i['rmse'])
for i in ibcf_part2:
    ibcf_mae.append(i['mae'])
    ibcf_rmse.append(i['rmse'])

print('Mean Square Error for UbCF: ', ubcf_mae)
print('Mean Square Error for IbCF: ', ibcf_mae)

print('Root Mean Square Deviation for UbCF: ', ubcf_rmse)
print('Root Mean Square Deviation for IbCF: ', ibcf_rmse)



Mean Square Error for UbCF:  [0.7675927628689191, 0.7442725647401618, 0.7724117624508985]
Mean Square Error for IbCF:  [0.7740503835546557, 0.7209060545865444, 0.7678403327245263]
Root Mean Square Deviation for UbCF:  [0.9940622034873166, 0.9692845404786944, 0.996779564599044]
Root Mean Square Deviation for IbCF:  [0.9946588836771235, 0.9345627310022417, 0.9888502253557082]


In [12]:
maeDF = pd.DataFrame()
rmseDF = pd.DataFrame()
maeDF['ubcf'] = ubcf_mae
maeDF['ibcf'] = ibcf_mae

rmseDF['ubcf'] = ubcf_rmse
rmseDF['ibcf'] = ibcf_rmse

figMAE = px.bar(
    maeDF, 
    barmode='group', 
    title='Comparison of MAE between UbCF and IbCF',
    labels={
        'index': 'Cosine | MSD | Pearson',
        'value': 'Value'
    }
)

figRMSE = px.bar(
    rmseDF, 
    barmode='group', 
    title='Comparison of RMSE between UbCF and IbCF',
    labels={
        'index': 'Cosine | MSD | Pearson',
        'value': 'Value'
    }
)

figMAE.show()
figRMSE.show()

## Part F


In [13]:
ubcf_knn = []
ibcf_knn = []

for i in range(30):
    ubcf_knn.append(cross_validate(KNNBasic(k=i, sim_options = {'name':'pearson','user_based': True}), data, measures=['RMSE'], cv=5, verbose=False))

    ibcf_knn.append(cross_validate(KNNBasic(k=i, sim_options = {'name':'pearson','user_based': False}), data, measures=['RMSE'], cv=5, verbose=False))

Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.

In [23]:
ubcf_knn_rmse = []
ibcf_knn_rmse = []

for i in range(30):
    ubcf_knn_rmse.append(ubcf_knn[i]['test_rmse'].mean())

for i in range(30):
    ibcf_knn_rmse.append(ibcf_knn[i]['test_rmse'].mean())

In [25]:
knnDF = pd.DataFrame()

knnDF['ubcf'] = ubcf_knn_rmse
knnDF['ibcf'] = ibcf_knn_rmse

knnDF.head()

Unnamed: 0,ubcf,ibcf
0,1.058062,1.058056
1,1.302862,1.350587
2,1.14451,1.19155
3,1.087562,1.134061
4,1.062,1.097812


In [35]:
fig2 = px.line(title='Impact performance on number of neighbors')

fig2.add_scatter(
    x = np.arange(0, 30),
    y = knnDF['ubcf'],
    name='UbCF'
    
)

fig2.add_scatter(
    x = np.arange(0, 30),
    y = knnDF['ibcf'],
    name='IbCF'
)

fig2.show()

## Part G

In [36]:
def findMin(knn_target):
    target = np.min(knn_target)

    for i in range(len(knn_target)):
        if target == knn_target[i]:
            return target, i


In [37]:
ubcfMin, ubcfIndex = findMin(ubcf_knn_rmse)
ibcfMin, ibcfIndex = findMin(ibcf_knn_rmse)


print('Target KNN for UbCF is: ', ubcfIndex, ', With value of: ', ubcfMin)
print('Target KNN for IbCF is: ', ibcfIndex, ', With value of: ', ibcfMin)

Target KNN for UbCF is:  28 , With value of:  0.9967226882462853
Target KNN for IbCF is:  29 , With value of:  0.9950773898292425
