# Collaborative Filtering Recommender System


For this part, you can use the python library Scikit-Surprise. Please find the
documentation here: https://surprise.readthedocs.io/en/stable/. However,
you are free to use any libraries of your choice.


In [18]:
# Load cleaned data splits
import pandas as pd 
train_data = pd.read_pickle("train_dataframe.pkl") 
test_data = pd.read_pickle("test_dataframe.pkl")

## Step 1

Based on the frequency of the most rated items computed in Week 6,
implement the TopPop recommender system, which always recommends
the same top-k items sorted decreasingly by the number of “high” ratings
(e.g., ≥ 3) in the training split, train.tsv.


In [19]:
# get top 10 recommendations from TopPop recommender system for next week
def TopPop(train_data, items_rating_count, k):
    high_ratings = train_data[train_data['rating'] >= 3]
    stats = high_ratings.groupby('item_id').agg(num_ratings=('rating', 'count'), avg_rating=('rating', 'mean')).reset_index()
    items_rating_count = stats.sort_values(by=['avg_rating', 'num_ratings'], ascending=False)
    return items_rating_count.head(k)


top_k = 10 
items_rating_count = train_data['item_id'].value_counts()
top_items = TopPop(train_data, items_rating_count, top_k)

print(top_items)
# save it for next week comparison with knn ,svd
top_items.to_csv('top_items.csv', index=False)


        item_id  num_ratings  avg_rating
286  B07N2HQ1T7           28         5.0
214  B0742RB7JK           14         5.0
254  B07C9YCY5J           14         5.0
31   B000U0DU34           13         5.0
473  B0BT2W3TTM           13         5.0
298  B07S19XSPV           12         5.0
123  B00VOQQDHI           11         5.0
200  B06XH7487S           11         5.0
279  B07L6RCDP7           11         5.0
312  B07YK57N2M           10         5.0


## Step 2
Choose at least one neighborhood-based model and one latent factor model
that uses the observed user-item ratings in the training set to predict the
unobserved ratings. Report your choice of models.


### Neighborhood-based model(KNN, K=5)

In [20]:
import random
import pandas as pd
import numpy as np
from surprise import Reader
from surprise import Dataset
from surprise import KNNWithMeans
from surprise.model_selection import KFold
from sklearn.metrics import mean_squared_error as mse
from surprise import accuracy

In [21]:
# Convert train data format
reader = Reader(rating_scale=(1, 5))
training_matrix = Dataset.load_from_df(train_data[['user_id', 'item_id', 'rating']], reader)

In [22]:
# test the implemented KNN models with some randomly selected parameters
sim_options = {'name': 'cosine',
               'user_based': True
               }
algo_knn = KNNWithMeans(k= 5, 
                    random_state=0,
                    sim_options= sim_options,
                    verbose=False)

trainset = training_matrix.build_full_trainset()# includes the entire dataset for training
algo_knn.fit(trainset)

unobserved_ratings = trainset.build_anti_testset()
pred_KNN = algo_knn.test(unobserved_ratings)
test_users = set(test_data['user_id'].unique())
unobserved_users = set(user_id for user_id, item_id, _ in unobserved_ratings)
num_users_not_in_test = len(unobserved_users - test_users)# number of the users that are not in the test set

predictions_new = [pred for pred in pred_KNN if pred.uid in test_users] 
num_predictions = len(predictions_new)
average_prediction = round(np.mean([pred.est for pred in predictions_new]), 3)


print("Number of predictions: ", num_predictions)
print("Average of predictions: ", average_prediction)

Number of predictions:  217724
Average of predictions:  4.441


### Latent Factor Model(SVD)

In [23]:
# test the implemented SVD models with some randomly selected parameters
from surprise import SVD

algo_SVD = SVD(n_factors=30, n_epochs=100)
algo_SVD.fit(trainset)

pred_SVD = algo_SVD.test(unobserved_ratings)
predictions = [pred for pred in pred_SVD if pred.uid in test_users]

average_preds = round(np.mean([pred.est for pred in predictions]), 3)

print("Number of predictions:", len(predictions))
print("Average of predictions:", average_preds)

Number of predictions: 217724
Average of predictions: 4.479


## Step 3
Use 5-fold cross-validation on the training set to tune the hyperparameters
of the chosen models (similarity measure and number of neighbors for the
neighborhood-based model; number of latent factors and number of epochs
for the latent factor model).


Choose an evaluation measure that is suitable for this task and justify your
motivation in using it. Report the optimal hyperparameters together with
the scores of your chosen measure, averaged over the 5 folds.


### KNN Model tune+ RMSE as evaluation measure

In [24]:
# the random seed
my_seed = 0
random.seed(my_seed)
np.random.seed(my_seed)

# Define a cross-validation iterator
kf = KFold(n_splits=5)  #5 fold

rmse_result = dict()

# Parameter options
list_neighbour =  range(1, 11)
similarity_measures = ["cosine", "pearson", "msd"]

# test parameter sets by turns
for neighbour in list_neighbour:
    rmse_result[neighbour] = {}

    for sim_measure in similarity_measures:
        rmse_result[neighbour][sim_measure] = {}
        
        fold = 0
        for trainset, testset in kf.split(training_matrix):
            algo = KNNWithMeans(k=neighbour,
                                sim_options={"name": sim_measure, "user_based": True},
                                verbose=False)

            algo.fit(trainset)
            predictions_KNN = algo.test(testset)

            rmse = accuracy.rmse(predictions_KNN, verbose=False)
            rmse_result[neighbour][sim_measure][fold] = rmse

            fold += 1


# show and find the optimal set of parameters by selecting the lowest mean RMSE
rmse_df = pd.DataFrame.from_dict(
    {f"{n}_{m}": v for n, sim_res in rmse_result.items() for m, v in sim_res.items()},
    orient='index')

rmse_df['mean_rmse'] = rmse_df.mean(axis=1)

print(rmse_df[:30])

best_params = rmse_df['mean_rmse'].idxmin()
best_k, best_sim = best_params.split('_')
print()
print(f'The best number of neighbors is: {best_k} with similarity measure: {best_sim} ')


                   0         1         2         3         4  mean_rmse
1_cosine    1.065002  1.061466  1.095047  1.080334  1.009007   1.062171
1_pearson   0.923623  0.919883  0.916026  0.919373  0.877042   0.911189
1_msd       1.068735  1.065228  1.021629  1.041940  1.051525   1.049812
2_cosine    0.968527  0.963304  0.944011  0.978766  0.955238   0.961969
2_pearson   0.879325  0.955120  0.896603  0.920848  0.889040   0.908187
2_msd       0.979183  0.968180  0.962749  0.953998  0.980197   0.968862
3_cosine    0.941616  0.928014  0.918780  0.939354  0.967233   0.938999
3_pearson   0.911825  0.889045  0.929867  0.903160  0.931849   0.913149
3_msd       0.944145  0.931048  0.943715  0.949273  0.934045   0.940445
4_cosine    0.938677  0.920548  0.923938  0.927643  0.949022   0.931966
4_pearson   0.885429  0.943124  0.916486  0.874496  0.923464   0.908600
4_msd       0.961687  0.927534  0.913367  0.877817  0.954423   0.926966
5_cosine    0.906464  0.978296  0.924445  0.887473  0.892235   0

### SVD Model tune + RMSE as evaluation measure

In [25]:
kf = KFold(n_splits=5)  #5 fold
rmse_result = dict()

# parameter options
list_factor = [5, 10, 20, 30, 50]
list_epoch = [10, 20, 30, 50, 100]

# test parameter sets by turns
for factor in list_factor:
    rmse_result[factor] = {}

    for epoch in list_epoch:
        rmse_result[factor][epoch] = {}
        
        fold = 0
        for trainset, testset in kf.split(training_matrix):
            algo = SVD(n_factors=factor, 
                       n_epochs=epoch)

            algo.fit(trainset)
            predictions_SVD = algo.test(testset)

            rmse = accuracy.rmse(predictions_SVD, verbose=False)
            rmse_result[factor][epoch][fold] = rmse

            fold += 1

rmse_df = pd.DataFrame.from_dict(
    {f"{n}_{m}": v for n, sim_res in rmse_result.items() for m, v in sim_res.items()},
    orient='index')

rmse_df['mean_rmse'] = rmse_df.mean(axis=1)
print(rmse_df)
best_params = rmse_df['mean_rmse'].idxmin()
best_f, best_e = best_params.split('_')
print()
print(f'The best number of latent factors is: {best_f} with number of epoch: {best_e} ')


               0         1         2         3         4  mean_rmse
5_10    0.878515  0.850194  0.863530  0.822427  0.830085   0.848950
5_20    0.842092  0.846689  0.879939  0.785697  0.857102   0.842304
5_30    0.844363  0.866111  0.813899  0.869472  0.824034   0.843576
5_50    0.836984  0.860521  0.827236  0.896429  0.858753   0.855985
5_100   0.894512  0.887232  0.914474  0.835995  0.874504   0.881344
10_10   0.889657  0.834599  0.844190  0.863775  0.811347   0.848714
10_20   0.815711  0.825405  0.845125  0.869117  0.871999   0.845471
10_30   0.838348  0.826131  0.846737  0.848327  0.879816   0.847872
10_50   0.880804  0.848567  0.888898  0.851764  0.809302   0.855867
10_100  0.864501  0.934130  0.856033  0.874372  0.888957   0.883599
20_10   0.813738  0.908554  0.793544  0.862888  0.859245   0.847594
20_20   0.846977  0.832777  0.808091  0.862994  0.871437   0.844455
20_30   0.883228  0.865015  0.822541  0.794975  0.855066   0.844165
20_50   0.836222  0.833646  0.863734  0.868888  

## Step 4
Run the models with the optimal hyperparameters to the whole training
set.


### KNN(K = 6, Measure = pearson)

In [26]:
sim_options = {'name': best_sim,
               'user_based': True
               }
algo_knn = KNNWithMeans(k= 6, 
                    random_state=0,
                    sim_options= sim_options,
                    verbose=False)

trainset = training_matrix.build_full_trainset()# includes the entire dataset for training
algo_knn.fit(trainset)
pred_KNN = algo_knn.test(unobserved_ratings)

average_prediction = round(np.mean([pred.est for pred in pred_KNN if pred.uid in test_users]), 3)

print("Average of predictions: ", average_prediction)

Average of predictions:  4.504


### SVD(latent factor=30， epoch=20)

In [27]:
algo_SVD = SVD(n_factors=30, n_epochs=20)
algo_SVD.fit(trainset)

pred_SVD = algo_SVD.test(unobserved_ratings)

average_preds = round(np.mean([pred.est for pred in pred_SVD if pred.uid in test_users]), 3)

print("Average of predictions:", average_preds)

Average of predictions: 4.498


## Step 5
Use the final models to rank the non-rated items for each user. This
ranking will be used for the evaluation part next week.


In [28]:
#general method for top-k recommendations
from collections import defaultdict
from surprise.prediction_algorithms.predictions import Prediction
from typing import Dict, List
import numpy as np

def get_top_k(predictions: List[Prediction], 
              k: int) -> Dict[str, List]:
    """Compute the top-K recommendation for each user from a set of predictions.
    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        k(int): The number of recommendation to output for each user.
    Returns:
        A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    """
    topk = defaultdict(list)

    for pred in predictions:
        uid = pred.uid 
        iid = pred.iid 
        est = pred.est 
        topk[uid].append((iid, est))
    
    # Sort the predictions
    for uid, user_ratings in topk.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        topk[uid] = user_ratings[:k]
    return topk


def print_top_k(user_id: str, topk: Dict[str, List]) -> None:
    user_ratings = topk[user_id]
    print(f"TOP-{len(user_ratings)} predictions for user {user_id}: {[(item, round(rating,2)) for (item, rating) in user_ratings]}")


### KNN

In [29]:
top5_knn = get_top_k(pred_KNN, k=5)
top10_knn = get_top_k(pred_KNN, k=10)
top20_knn = get_top_k(pred_KNN, k=20)

print_top_k("AE23LDQTB7L76AP6E6WPBFVYL5DA", top5_knn)
print_top_k("AE23LDQTB7L76AP6E6WPBFVYL5DA", top10_knn)
print_top_k("AE23LDQTB7L76AP6E6WPBFVYL5DA", top20_knn)

TOP-5 predictions for user AE23LDQTB7L76AP6E6WPBFVYL5DA: [('B000NGVQKO', 4.8), ('B005PGGU9O', 4.8), ('B00CPLODUU', 4.8), ('B00RX5HQS4', 4.8), ('B00VSYN25M', 4.8)]
TOP-10 predictions for user AE23LDQTB7L76AP6E6WPBFVYL5DA: [('B000NGVQKO', 4.8), ('B005PGGU9O', 4.8), ('B00CPLODUU', 4.8), ('B00RX5HQS4', 4.8), ('B00VSYN25M', 4.8), ('B0150YC54E', 4.8), ('B015QK3GUO', 4.8), ('B06XB3FQKB', 4.8), ('B0719KM5Y8', 4.8), ('B07BHJH2Y8', 4.8)]
TOP-20 predictions for user AE23LDQTB7L76AP6E6WPBFVYL5DA: [('B000NGVQKO', 4.8), ('B005PGGU9O', 4.8), ('B00CPLODUU', 4.8), ('B00RX5HQS4', 4.8), ('B00VSYN25M', 4.8), ('B0150YC54E', 4.8), ('B015QK3GUO', 4.8), ('B06XB3FQKB', 4.8), ('B0719KM5Y8', 4.8), ('B07BHJH2Y8', 4.8), ('B07V46KRD8', 4.8), ('B08R5GM6YB', 4.8), ('B09G5KLKX2', 4.8), ('B0B2LSX437', 4.8), ('B0B95V41NR', 4.8), ('B0BPLFP5P6', 4.8), ('B0BSCFTV2G', 4.8), ('B0BTC9YJ2W', 4.8), ('B00EM5UOE6', 4.8), ('B00JL5I61A', 4.8)]


### SVD

In [30]:
top5_svd = get_top_k(pred_SVD, k=5)
top10_svd = get_top_k(pred_SVD, k=10)
top20_svd = get_top_k(pred_SVD, k=20)

print_top_k("AE23LDQTB7L76AP6E6WPBFVYL5DA", top5_svd)
print_top_k("AE23LDQTB7L76AP6E6WPBFVYL5DA", top10_svd)
print_top_k("AE23LDQTB7L76AP6E6WPBFVYL5DA", top20_svd)

TOP-5 predictions for user AE23LDQTB7L76AP6E6WPBFVYL5DA: [('B07DK59QNR', 5), ('B07N2HQ1T7', 5), ('B00721BB44', 5), ('B0BT9R8MMV', 4.97), ('B0BQ4HSKC9', 4.97)]
TOP-10 predictions for user AE23LDQTB7L76AP6E6WPBFVYL5DA: [('B07DK59QNR', 5), ('B07N2HQ1T7', 5), ('B00721BB44', 5), ('B0BT9R8MMV', 4.97), ('B0BQ4HSKC9', 4.97), ('B09857JRP2', 4.96), ('B00WT7KPCK', 4.96), ('B0719KM5Y8', 4.96), ('B00RVE9X06', 4.95), ('B07YK57N2M', 4.95)]
TOP-20 predictions for user AE23LDQTB7L76AP6E6WPBFVYL5DA: [('B07DK59QNR', 5), ('B07N2HQ1T7', 5), ('B00721BB44', 5), ('B0BT9R8MMV', 4.97), ('B0BQ4HSKC9', 4.97), ('B09857JRP2', 4.96), ('B00WT7KPCK', 4.96), ('B0719KM5Y8', 4.96), ('B00RVE9X06', 4.95), ('B07YK57N2M', 4.95), ('B0C3H7SNPQ', 4.94), ('B07N5MJDBF', 4.94), ('B09KWWS7DH', 4.94), ('B00VOQQDHI', 4.94), ('B001W99HE8', 4.93), ('B0BKR2ZM9X', 4.93), ('B0B8F6LD9F', 4.92), ('B0BT2ZRCY2', 4.92), ('B0B3VSZQHL', 4.92), ('B06XH7487S', 4.92)]
