### Analysis of KNN

In [139]:
from surprise import AlgoBase
from surprise import Dataset
from surprise import accuracy
from surprise import Reader
from surprise import KNNBasic
from surprise.model_selection import KFold
from surprise.model_selection import train_test_split
from surprise.model_selection import cross_validate
import pandas as pd
import random

#### 1. implement KNN from surprise

In [105]:
# to fit with surprise, maybe we can save train and test as two files
# import os
# train_file = os.path.expanduser('~') + 'ml-20m/ratings.csv'
# data = Dataset.load_from_folds([(train_file, test_file)], Reader(rating_scale = (1,5)))


In [167]:
# load dataset
ratings = pd.read_csv('ml-20m/ratings.csv',header = 0,nrows=1000)
reader = Reader(rating_scale=(0.0,5.0))
data = Dataset.load_from_df(ratings[['userId','movieId','rating']],reader)

In [129]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [140]:
trainset,testset = train_test_split(data,test_size =0.2)

In [141]:
# split that is suitable for tuning
#raw_ratings = data.raw_ratings
#print(len(raw_ratings))

# shuffle
#random.shuffle(raw_ratings)

# split
#threshold = int(.8 * len(raw_ratings))
#print(threshold)
#A_raw_ratings = raw_ratings[:threshold] # train
#B_raw_ratings = raw_ratings[threshold:] # for unbiased, test

#data.raw_ratings = A_raw_ratings # data is now the A
#testset = data.construct_testset(B_raw_ratings)

In [99]:
# train test split
# trainset, testset = train_test_split(data, test_size = .2)

In [143]:
# set default KNN to item-based and calculate similarity with cosine
sim_options = {'name':'cosine', 'user_based':False}
algo = KNNBasic(sim_options = sim_options)

In [146]:
# get RMSE
algo.fit(trainset)
predictions = algo.test(testset)
accuracy.rmse(predictions)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9619


0.9619249128351179

In [148]:
# function to convert surprise dataframe to pd dataframe

def get_Iu(uid):
    """Return the number of items rated by given user
    
    Args:
        uid: The raw id of the user.
    Returns:
        The number of items rated by the user.
    """
    
    try:
        return len(trainset.ur[trainset.to_inner_uid(uid)])
    except ValueError:  # user was not part of the trainset
        return 0
    
def get_Ui(iid):
    """Return the number of users that have rated given item
    
    Args:
        iid: The raw id of the item.
    Returns:
        The number of users that have rated the item.
    """
    
    try:
        return len(trainset.ir[trainset.to_inner_iid(iid)])
    except ValueError:  # item was not part of the trainset
        return 0

df = pd.DataFrame(predictions, columns=['uid', 'iid', 'rui', 'est', 'details'])    
df['Iu'] = df.uid.apply(get_Iu)
df['Ui'] = df.iid.apply(get_Ui)
df['err'] = abs(df.est - df.rui)

In [152]:
df.head()

Unnamed: 0,uid,iid,rui,est,details,Iu,Ui,err
0,1,3438,3.5,3.732422,"{'was_impossible': True, 'reason': 'User and/o...",95,0,0.232422
1,11,344,3.5,4.25,"{'actual_k': 6, 'was_impossible': False}",15,1,0.75
2,7,3108,3.0,3.732422,"{'was_impossible': True, 'reason': 'User and/o...",138,0,0.732422
3,3,457,5.0,4.125,"{'actual_k': 8, 'was_impossible': False}",94,2,0.875
4,3,1230,5.0,3.732422,"{'was_impossible': True, 'reason': 'User and/o...",94,0,1.267578


#### 2. evaluation setup

In [149]:
# 2.1 cross-validation setup

kf = KFold(n_splits = 5)

In [150]:
for trainset,testset in kf.split(data):
    # train and test
    algo.fit(trainset)
    predictions = algo.test(testset)
    
    # accuracy
    accuracy.rmse(predictions, verbose = True)

Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9678
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8783
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.8690
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 0.9432
Computing the cosine similarity matrix...
Done computing similarity matrix.
RMSE: 1.0229


In [86]:
# 2.2 precision - recall
from collections import defaultdict
from surprise.model_selection import KFold

# 2.2.1 define function
def precision_recall_at_k (predictions, k =10, threshold = 3.5):
    
    # map prerdictions to each user
    user_est_true = defaultdict(list)
    for uid, _,true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))
        
    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():
        
        # sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse = True)
        
        # number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
        
        # number of rec items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])
        
        # number of relevant and rec items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])
        
        # precision@k: proportion of recommended items that are relevant
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 1
        
        # recall@k: proportion of relevant items that are reced
        recalls[uid] = n_rel_and_rec_k / n_rel if n_rel != 0 else 1
        
    return precisions, recalls

In [164]:
# 2.2.2 run precision-recall function

kf = KFold(n_splits = 5)

for trainset, testset in kf.split(data):
    algo.fit(trainset)
    predictions = algo.test(testset)
    precisions, recalls = precision_recall_at_k(predictions, k =5, threshold = 4)
    
    # precision and recall averaged over all users
    
    #avg_precision = sum(prec for prec in precisions.values())/len(precisions)
    #avg_recall = sum(rec for rec in recalls.values())/len(recalls)
    # precision
    print(sum(prec for prec in precisions.values())/len(precisions))
    
    # recall
    print(sum(rec for rec in recalls.values())/len(recalls))
    

Computing the cosine similarity matrix...
Done computing similarity matrix.
0.6590909090909091
0.34242424242424246
Computing the cosine similarity matrix...
Done computing similarity matrix.
0.6933333333333332
0.36719314998726765
Computing the cosine similarity matrix...
Done computing similarity matrix.
0.7909090909090909
0.3042929292929293
Computing the cosine similarity matrix...
Done computing similarity matrix.
0.7727272727272727
0.2992007992007992
Computing the cosine similarity matrix...
Done computing similarity matrix.
0.9393939393939394
0.2608225108225108


In [153]:
# 2.3 coverage


#### 3. Tune KNN parameters with GridSearchCV

In [173]:
# load dataset
ratings = pd.read_csv('ml-20m/ratings.csv',header = 0,nrows=10000)
reader = Reader(rating_scale=(0.0,5.0))
data = Dataset.load_from_df(ratings[['userId','movieId','rating']],reader)

In [174]:
# split that is suitable for tuning
raw_ratings = data.raw_ratings
print(len(raw_ratings))

# shuffle
random.shuffle(raw_ratings)

# split
threshold = int(.8 * len(raw_ratings))
print(threshold)
A_raw_ratings = raw_ratings[:threshold] # train
B_raw_ratings = raw_ratings[threshold:] # for unbiased, test

data.raw_ratings = A_raw_ratings # data is now the A
testset = data.construct_testset(B_raw_ratings)

10000
8000


In [175]:
from surprise.model_selection import GridSearchCV

print('Grid Search...')
param_grid = {'k':[5,7,9,11,13,15,17,20],
             'sim_options':{'name':['msd','cosine','pearson','pearson_baseline'],
                           'user_based':[False]}}
gs = GridSearchCV(KNNBasic, param_grid, measures = ['rmse','mae'],cv=5)
gs.fit(data)

# best RMSE score
print(gs.best_score['rmse'])

# best combination
print(gs.best_params['rmse'])

print(gs.best_estimator['rmse'])

# compute biased accuracy on A
trainset = data.build_full_trainset()
algo.fit(trainset)

predictions = algo.test(trainset.build_testset())
print('Biased accuracy on A,', end =' ')
accuracy.rmse(predictions)

# compute unbiased on B
testset = data.construct_testset(B_raw_ratings) # testset = B set
predictions = algo.test(testset)
print('Unbiased accuracy on B', end = ' ')
accuracy.rmse(predictions)

Grid Search...
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing the pearson similarity matrix...
Done computing similarity matrix.
Computing

Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the msd similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine similarity matrix...
Done computing similarity matrix.
Computing the cosine s

1.0115224994805054