In neighborhood based models, we've tested the following:

1. Basic Similarity: Taking into account the similarity between users (user-based) or between items (item-based)
2. Rating Normalization: Users may use different rating values to quantify the same level of appreciation for an item, even when they are given an explicit definition of each value (e.g., 1=“strongly disagree”, 2=“disagree”, 3=“neutral”, etc.)
    * Mean Normalization
    * ZScore Normalization
3. Significance Weighting: Significance weighting reduces the magnitude of a similarity weight when this weight is computed using only a few ratings.

In [1]:
import heapq
import os
import pickle

import fastparquet
import numpy as np
import pandas as pd
import surprise
from surprise.accuracy import fcp, mae, mse, rmse
from surprise.dataset import DatasetAutoFolds
from surprise.model_selection import GridSearchCV
from surprise.prediction_algorithms import PredictionImpossible
from surprise.prediction_algorithms.knns import KNNBasic, KNNWithMeans, KNNWithZScore
from surprise.reader import Reader

In [2]:
def _load_pickle(path):
    """Deserialize and return the object stored in the pickle file at `path`."""
    # NOTE(review): pickle.load can execute arbitrary code; only point this at
    # files produced by this project.
    with open(path, "rb") as fh:
        return pickle.load(fh)


# Encode/decode lookup tables for users and anime.
# NOTE(review): presumably raw-id <-> encoded-id mappings; confirm against the
# notebook that wrote these files.
user_encode = _load_pickle("encode_decode/user_encode.pkl")
user_decode = _load_pickle("encode_decode/user_decode.pkl")

anime_encode = _load_pickle("encode_decode/anime_encode.pkl")
anime_decode = _load_pickle("encode_decode/anime_decode.pkl")

# Train / test rating tables, both read with the fastparquet engine.
df_train, df_test = (
    pd.read_parquet(parquet_path, engine="fastparquet")
    for parquet_path in ("valid_train.parquet", "valid_test.parquet")
)

Building Training and Testing Datasets for Rating Prediction

In [3]:
# Scores are parsed on a 1-10 rating scale.
reader = Reader(rating_scale=(1, 10))

# Un-split training dataset: this is what GridSearchCV folds internally.
# Build it once and reuse it for the full trainset, instead of converting
# df_train into surprise's raw-ratings form twice.
_raw_data_train = DatasetAutoFolds(
    reader=reader,
    df=df_train[["user_id", "anime_id", "score"]],
)

# Full training set (every training rating, no hold-out).
data_train = _raw_data_train.build_full_trainset()

# Held-out test ratings: load them as a dataset, then convert to the testset
# form that `.test()` consumes.
data_test = (
    DatasetAutoFolds(
        reader=reader,
        df=df_test[["user_id", "anime_id", "score"]],
    )
    .build_full_trainset()
    .build_testset()
)

## KNNBasic: Taking similarity into account

Hyperparameter Tuning with GridSearchCV (item-based)


In [None]:
# Search space for item-based KNNBasic with MSD similarity:
#   k           - (max) number of neighbours aggregated per prediction
#   min_support - minimum number of common ratings for a non-zero similarity
param_grid = {
    "k": [10, 15, 20, 25, 30],
    "sim_options": {
        "name": ["msd"],
        "min_support": [5, 10, 15, 20, 25, 30],
        "user_based": [False],
    },
}

# refit=True refits on the whole dataset with the best parameters for the
# first listed measure ("mse"), so gs.test() later scores the MSE-optimal model.
gs = GridSearchCV(
    KNNBasic,
    param_grid,
    measures=["mse", "mae"],
    cv=5,
    refit=True,
    n_jobs=-1,
    joblib_verbose=5,
)

# The 5 folds are shuffled with NumPy's global RNG; seed it so the fold
# assignment (and therefore every CV score) is reproducible across re-runs.
np.random.seed(0)
gs.fit(_raw_data_train)

Saving GridSearch CV Results

In [5]:
# to_json does not create missing directories; make sure ./results exists so
# the cell cannot fail after the expensive search has already finished.
os.makedirs("./results", exist_ok=True)

# Full cross-validation table from the search, persisted before trimming.
df = pd.DataFrame(gs.cv_results)
df.to_json("./results/KNNBasic_ii_gridCV.json")

# Keep only the aggregate test metrics and show the ten lowest-MSE settings.
df_final = df[
    ["mean_test_mae", "std_test_mae", "mean_test_mse", "std_test_mse", "params"]
]
df_final.sort_values("mean_test_mse", ascending=True).head(10)

Unnamed: 0,mean_test_mae,std_test_mae,mean_test_mse,std_test_mse,params
11,0.891507,0.00109,1.46363,0.005977,"{'k': 15, 'sim_options': {'name': 'msd', 'min_..."
7,0.891511,0.001093,1.463636,0.006005,"{'k': 15, 'sim_options': {'name': 'msd', 'min_..."
6,0.891511,0.001093,1.463637,0.006003,"{'k': 15, 'sim_options': {'name': 'msd', 'min_..."
10,0.891508,0.001089,1.463638,0.005975,"{'k': 15, 'sim_options': {'name': 'msd', 'min_..."
8,0.89151,0.001092,1.463641,0.005995,"{'k': 15, 'sim_options': {'name': 'msd', 'min_..."
9,0.891511,0.001092,1.463645,0.005991,"{'k': 15, 'sim_options': {'name': 'msd', 'min_..."
5,0.891531,0.001232,1.467472,0.007212,"{'k': 10, 'sim_options': {'name': 'msd', 'min_..."
0,0.891537,0.001232,1.467475,0.007216,"{'k': 10, 'sim_options': {'name': 'msd', 'min_..."
4,0.891535,0.00123,1.467476,0.007202,"{'k': 10, 'sim_options': {'name': 'msd', 'min_..."
1,0.891537,0.001232,1.467477,0.007214,"{'k': 10, 'sim_options': {'name': 'msd', 'min_..."


In [6]:
# Winning hyperparameters and best mean CV score for each measure ("mse", "mae").
(gs.best_params, gs.best_score)

({'mse': {'k': 15,
   'sim_options': {'name': 'msd', 'min_support': 30, 'user_based': False}},
  'mae': {'k': 15,
   'sim_options': {'name': 'msd', 'min_support': 30, 'user_based': False}}},
 {'mse': 1.4636304232268464, 'mae': 0.8915072064924056})

Evaluating Optimized Model Performance

In [7]:
#! Optimized for MSE
# Scores the refit model held by `gs` on the held-out test set; relies on `gs`
# still being the item-based KNNBasic search fitted above.
gs_r = gs.test(data_test)
for label, metric in ((" FCP", fcp), (" MAE", mae), (" MSE", mse), ("RMSE", rmse)):
    print(f"{label}: {metric(gs_r, verbose=False)}")

 FCP: 0.7344595198417686
 MAE: 0.8783007433644731
 MSE: 1.4229437407614212
RMSE: 1.1928720554868495


## KNNBasic: Taking similarity into account 

Hyperparameter Tuning with GridSearchCV (user-based)

$$\hat{r}_{ui} = \frac{\sum_{v \in N_{ui}} w_{uv} \cdot r_{vi}}{\sum_{v \in N_{ui}} |w_{uv}|}$$


In [None]:
# Search space for user-based KNNBasic with MSD similarity:
#   k           - (max) number of neighbours aggregated per prediction
#   min_support - minimum number of common ratings for a non-zero similarity
param_grid = {
    "k": [10, 15, 20, 25, 30],
    "sim_options": {
        "name": ["msd"],
        "min_support": [5, 10, 15, 20, 25, 30],
        "user_based": [True],
    },
}

# refit=True refits on the whole dataset with the best parameters for the
# first listed measure ("mse"), so gs.test() later scores the MSE-optimal model.
gs = GridSearchCV(
    KNNBasic,
    param_grid,
    measures=["mse", "mae"],
    cv=5,
    refit=True,
    n_jobs=-1,
    joblib_verbose=5,
)

# The 5 folds are shuffled with NumPy's global RNG; seed it so the fold
# assignment (and therefore every CV score) is reproducible across re-runs.
np.random.seed(0)
gs.fit(_raw_data_train)

In [9]:
# to_json does not create missing directories; make sure ./results exists so
# the cell cannot fail after the expensive search has already finished.
os.makedirs("./results", exist_ok=True)

# Full cross-validation table from the search, persisted before trimming.
df = pd.DataFrame(gs.cv_results)
df.to_json("./results/KNNBasic_uu_gridCV.json")

# Keep only the aggregate test metrics and show the ten lowest-MSE settings.
df_final = df[
    ["mean_test_mae", "std_test_mae", "mean_test_mse", "std_test_mse", "params"]
]
df_final.sort_values("mean_test_mse", ascending=True).head(10)

Unnamed: 0,mean_test_mae,std_test_mae,mean_test_mse,std_test_mse,params
21,0.910641,0.000936,1.520355,0.0042,"{'k': 25, 'sim_options': {'name': 'msd', 'min_..."
27,0.911518,0.000938,1.520959,0.004301,"{'k': 30, 'sim_options': {'name': 'msd', 'min_..."
22,0.912331,0.001275,1.521163,0.004844,"{'k': 25, 'sim_options': {'name': 'msd', 'min_..."
16,0.91172,0.001144,1.521437,0.004541,"{'k': 20, 'sim_options': {'name': 'msd', 'min_..."
15,0.910361,0.000868,1.521818,0.004078,"{'k': 20, 'sim_options': {'name': 'msd', 'min_..."
28,0.913568,0.001201,1.523219,0.004734,"{'k': 30, 'sim_options': {'name': 'msd', 'min_..."
10,0.912328,0.00113,1.525835,0.004647,"{'k': 15, 'sim_options': {'name': 'msd', 'min_..."
26,0.912372,0.000973,1.527487,0.004172,"{'k': 30, 'sim_options': {'name': 'msd', 'min_..."
20,0.911863,0.000948,1.527968,0.004337,"{'k': 25, 'sim_options': {'name': 'msd', 'min_..."
9,0.911572,0.000785,1.528339,0.004225,"{'k': 15, 'sim_options': {'name': 'msd', 'min_..."


In [10]:
# Winning hyperparameters and best mean CV score for each measure ("mse", "mae").
(gs.best_params, gs.best_score)

({'mse': {'k': 25,
   'sim_options': {'name': 'msd', 'min_support': 20, 'user_based': True}},
  'mae': {'k': 20,
   'sim_options': {'name': 'msd', 'min_support': 20, 'user_based': True}}},
 {'mse': 1.5203547958903783, 'mae': 0.9103609955155723})

In [11]:
#! Optimized for MSE
# Scores the refit model held by `gs` on the held-out test set; relies on `gs`
# still being the user-based KNNBasic search fitted above.
gs_r = gs.test(data_test)
for label, metric in ((" FCP", fcp), (" MAE", mae), (" MSE", mse), ("RMSE", rmse)):
    print(f"{label}: {metric(gs_r, verbose=False)}")

 FCP: 0.7291924897042287
 MAE: 0.8963191403178333
 MSE: 1.484046554039336
RMSE: 1.2182144942658235



## KNNWithMeans: Rating Normalization using mean

Hyperparameter Tuning with GridSearchCV (item-based)


In [None]:
# Search space for item-based KNNWithMeans with MSD similarity:
#   k           - (max) number of neighbours aggregated per prediction
#   min_support - minimum number of common ratings for a non-zero similarity
param_grid = {
    "k": [10, 15, 20, 25, 30],
    "sim_options": {
        "name": ["msd"],
        "min_support": [5, 10, 15, 20, 25, 30],
        "user_based": [False],
    },
}

# refit=True refits on the whole dataset with the best parameters for the
# first listed measure ("mse"), so gs.test() later scores the MSE-optimal model.
gs = GridSearchCV(
    KNNWithMeans,
    param_grid,
    measures=["mse", "mae"],
    cv=5,
    refit=True,
    n_jobs=-1,
    joblib_verbose=5,
)

# The 5 folds are shuffled with NumPy's global RNG; seed it so the fold
# assignment (and therefore every CV score) is reproducible across re-runs.
np.random.seed(0)
gs.fit(_raw_data_train)

In [13]:
# to_json does not create missing directories; make sure ./results exists so
# the cell cannot fail after the expensive search has already finished.
os.makedirs("./results", exist_ok=True)

# Full cross-validation table from the search, persisted before trimming.
df = pd.DataFrame(gs.cv_results)
df.to_json("./results/KNNMean_ii_gridCV.json")

# Keep only the aggregate test metrics and show the ten lowest-MSE settings.
df_final = df[
    ["mean_test_mae", "std_test_mae", "mean_test_mse", "std_test_mse", "params"]
]
df_final.sort_values("mean_test_mse", ascending=True).head(10)

Unnamed: 0,mean_test_mae,std_test_mae,mean_test_mse,std_test_mse,params
17,0.879535,0.001162,1.408679,0.006923,"{'k': 20, 'sim_options': {'name': 'msd', 'min_..."
16,0.879536,0.001159,1.408687,0.006909,"{'k': 20, 'sim_options': {'name': 'msd', 'min_..."
14,0.87954,0.001159,1.408699,0.006908,"{'k': 20, 'sim_options': {'name': 'msd', 'min_..."
15,0.87954,0.00116,1.408702,0.006908,"{'k': 20, 'sim_options': {'name': 'msd', 'min_..."
12,0.879541,0.001159,1.408704,0.006911,"{'k': 20, 'sim_options': {'name': 'msd', 'min_..."
13,0.879542,0.00116,1.408705,0.006912,"{'k': 20, 'sim_options': {'name': 'msd', 'min_..."
11,0.879524,0.001133,1.410491,0.006906,"{'k': 15, 'sim_options': {'name': 'msd', 'min_..."
10,0.879525,0.001131,1.410495,0.006898,"{'k': 15, 'sim_options': {'name': 'msd', 'min_..."
8,0.879527,0.001131,1.410503,0.006908,"{'k': 15, 'sim_options': {'name': 'msd', 'min_..."
9,0.879528,0.00113,1.410511,0.006896,"{'k': 15, 'sim_options': {'name': 'msd', 'min_..."


In [14]:
# Winning hyperparameters and best mean CV score for each measure ("mse", "mae").
(gs.best_params, gs.best_score)

({'mse': {'k': 20,
   'sim_options': {'name': 'msd', 'min_support': 30, 'user_based': False}},
  'mae': {'k': 15,
   'sim_options': {'name': 'msd', 'min_support': 30, 'user_based': False}}},
 {'mse': 1.4086794569849994, 'mae': 0.8795241945134151})

In [15]:
#! Optimized for MSE
# Scores the refit model held by `gs` on the held-out test set; relies on `gs`
# still being the item-based KNNWithMeans search fitted above.
gs_r = gs.test(data_test)
for label, metric in ((" FCP", fcp), (" MAE", mae), (" MSE", mse), ("RMSE", rmse)):
    print(f"{label}: {metric(gs_r, verbose=False)}")

 FCP: 0.7364464725806266
 MAE: 0.8707253875205995
 MSE: 1.381777903810236
RMSE: 1.1754904949893197



## KNNWithMeans: Rating Normalization using mean

Hyperparameter Tuning with GridSearchCV (user-based)


In [None]:
# Search space for user-based KNNWithMeans with MSD similarity:
#   k           - (max) number of neighbours aggregated per prediction
#   min_support - minimum number of common ratings for a non-zero similarity
# NOTE(review): the recorded run selected k=50, the largest value tried here;
# consider extending the k grid upward.
param_grid = {
    "k": [20, 30, 40, 50],
    "sim_options": {
        "name": ["msd"],
        "min_support": [20, 30, 40, 50],
        "user_based": [True],
    },
}

# refit=True refits on the whole dataset with the best parameters for the
# first listed measure ("mse"), so gs.test() later scores the MSE-optimal model.
gs = GridSearchCV(
    KNNWithMeans,
    param_grid,
    measures=["mse", "mae"],
    cv=5,
    refit=True,
    n_jobs=-1,
    joblib_verbose=5,
)

# The 5 folds are shuffled with NumPy's global RNG; seed it so the fold
# assignment (and therefore every CV score) is reproducible across re-runs.
np.random.seed(0)
gs.fit(_raw_data_train)

In [29]:
# to_json does not create missing directories; make sure ./results exists so
# the cell cannot fail after the expensive search has already finished.
os.makedirs("./results", exist_ok=True)

# Full cross-validation table from the search, persisted before trimming.
df = pd.DataFrame(gs.cv_results)
df.to_json("./results/KNNMean_uu_gridCV.json")

# Keep only the aggregate test metrics and show the ten lowest-MSE settings.
df_final = df[
    ["mean_test_mae", "std_test_mae", "mean_test_mse", "std_test_mse", "params"]
]
df_final.sort_values("mean_test_mse", ascending=True).head(10)

Unnamed: 0,mean_test_mae,std_test_mae,mean_test_mse,std_test_mse,params
13,0.903056,0.001414,1.469731,0.006513,"{'k': 50, 'sim_options': {'name': 'msd', 'min_..."
9,0.903305,0.001493,1.470193,0.006751,"{'k': 40, 'sim_options': {'name': 'msd', 'min_..."
5,0.904128,0.00158,1.472574,0.007068,"{'k': 30, 'sim_options': {'name': 'msd', 'min_..."
12,0.905634,0.001646,1.476712,0.006942,"{'k': 50, 'sim_options': {'name': 'msd', 'min_..."
8,0.906409,0.001584,1.478628,0.006704,"{'k': 40, 'sim_options': {'name': 'msd', 'min_..."
1,0.906776,0.001605,1.479866,0.006887,"{'k': 20, 'sim_options': {'name': 'msd', 'min_..."
4,0.907811,0.001734,1.482308,0.007013,"{'k': 30, 'sim_options': {'name': 'msd', 'min_..."
0,0.911175,0.001773,1.491527,0.007142,"{'k': 20, 'sim_options': {'name': 'msd', 'min_..."
10,0.910565,0.00155,1.494017,0.006645,"{'k': 40, 'sim_options': {'name': 'msd', 'min_..."
6,0.910862,0.001558,1.494406,0.006631,"{'k': 30, 'sim_options': {'name': 'msd', 'min_..."


In [30]:
# Winning hyperparameters and best mean CV score for each measure ("mse", "mae").
(gs.best_params, gs.best_score)

({'mse': {'k': 50,
   'sim_options': {'name': 'msd', 'min_support': 30, 'user_based': True}},
  'mae': {'k': 50,
   'sim_options': {'name': 'msd', 'min_support': 30, 'user_based': True}}},
 {'mse': 1.4697308690458821, 'mae': 0.9030564633147661})

In [31]:
#! Optimized for MSE
# Scores the refit model held by `gs` on the held-out test set; relies on `gs`
# still being the user-based KNNWithMeans search fitted above.
gs_r = gs.test(data_test)
for label, metric in ((" FCP", fcp), (" MAE", mae), (" MSE", mse), ("RMSE", rmse)):
    print(f"{label}: {metric(gs_r, verbose=False)}")

 FCP: 0.7275342674726564
 MAE: 0.8983147222055778
 MSE: 1.4492929528700467
RMSE: 1.2038658367401438



## KNNWithZScore: Rating Normalization using ZScore

Hyperparameter Tuning with GridSearchCV (item-based)


In [None]:
# Search space for item-based KNNWithZScore with MSD similarity:
#   k           - (max) number of neighbours aggregated per prediction
#   min_support - minimum number of common ratings for a non-zero similarity
param_grid = {
    "k": [10, 15, 20, 25, 30],
    "sim_options": {
        "name": ["msd"],
        "min_support": [5, 10, 15, 20, 25, 30],
        "user_based": [False],
    },
}

# refit=True refits on the whole dataset with the best parameters for the
# first listed measure ("mse"), so gs.test() later scores the MSE-optimal model.
gs = GridSearchCV(
    KNNWithZScore,
    param_grid,
    measures=["mse", "mae"],
    cv=5,
    refit=True,
    n_jobs=-1,
    joblib_verbose=5,
)

# The 5 folds are shuffled with NumPy's global RNG; seed it so the fold
# assignment (and therefore every CV score) is reproducible across re-runs.
np.random.seed(0)
gs.fit(_raw_data_train)

In [33]:
# to_json does not create missing directories; make sure ./results exists so
# the cell cannot fail after the expensive search has already finished.
os.makedirs("./results", exist_ok=True)

# Full cross-validation table from the search, persisted before trimming.
df = pd.DataFrame(gs.cv_results)
df.to_json("./results/KNNZscore_ii_gridCV.json")

# Keep only the aggregate test metrics and show the ten lowest-MSE settings.
df_final = df[
    ["mean_test_mae", "std_test_mae", "mean_test_mse", "std_test_mse", "params"]
]
df_final.sort_values("mean_test_mse", ascending=True).head(10)

Unnamed: 0,mean_test_mae,std_test_mae,mean_test_mse,std_test_mse,params
17,0.880537,0.001245,1.420308,0.004411,"{'k': 20, 'sim_options': {'name': 'msd', 'min_..."
16,0.880542,0.001247,1.420319,0.004429,"{'k': 20, 'sim_options': {'name': 'msd', 'min_..."
12,0.880546,0.001245,1.420321,0.004422,"{'k': 20, 'sim_options': {'name': 'msd', 'min_..."
13,0.880547,0.001245,1.420321,0.004421,"{'k': 20, 'sim_options': {'name': 'msd', 'min_..."
14,0.880547,0.001245,1.420325,0.004423,"{'k': 20, 'sim_options': {'name': 'msd', 'min_..."
15,0.880547,0.001246,1.420329,0.004423,"{'k': 20, 'sim_options': {'name': 'msd', 'min_..."
23,0.881144,0.001413,1.420829,0.004453,"{'k': 25, 'sim_options': {'name': 'msd', 'min_..."
22,0.881148,0.001415,1.420843,0.004457,"{'k': 25, 'sim_options': {'name': 'msd', 'min_..."
18,0.881153,0.001412,1.420849,0.004452,"{'k': 25, 'sim_options': {'name': 'msd', 'min_..."
19,0.881153,0.001412,1.42085,0.004452,"{'k': 25, 'sim_options': {'name': 'msd', 'min_..."


In [34]:
# Winning hyperparameters and best mean CV score for each measure ("mse", "mae").
(gs.best_params, gs.best_score)

({'mse': {'k': 20,
   'sim_options': {'name': 'msd', 'min_support': 30, 'user_based': False}},
  'mae': {'k': 20,
   'sim_options': {'name': 'msd', 'min_support': 30, 'user_based': False}}},
 {'mse': 1.420308222546089, 'mae': 0.8805367124146282})

In [35]:
#! Optimized for MSE
# Scores the refit model held by `gs` on the held-out test set; relies on `gs`
# still being the item-based KNNWithZScore search fitted above.
gs_r = gs.test(data_test)
for label, metric in ((" FCP", fcp), (" MAE", mae), (" MSE", mse), ("RMSE", rmse)):
    print(f"{label}: {metric(gs_r, verbose=False)}")

 FCP: 0.7351209351395782
 MAE: 0.872138555286969
 MSE: 1.3947539031062466
RMSE: 1.1809969953840893



## KNNWithZScore: Rating Normalization using ZScore

Hyperparameter Tuning with GridSearchCV (user-based)


In [None]:
# Search space for user-based KNNWithZScore with MSD similarity:
#   k           - (max) number of neighbours aggregated per prediction
#   min_support - minimum number of common ratings for a non-zero similarity
# NOTE(review): the recorded run selected k=60, the largest value tried here;
# consider extending the k grid upward.
param_grid = {
    "k": [20, 30, 40, 50, 60],
    "sim_options": {
        "name": ["msd"],
        "min_support": [20, 30, 40, 50, 60],
        "user_based": [True],
    },
}

# refit=True refits on the whole dataset with the best parameters for the
# first listed measure ("mse"), so gs.test() later scores the MSE-optimal model.
gs = GridSearchCV(
    KNNWithZScore,
    param_grid,
    measures=["mse", "mae"],
    cv=5,
    refit=True,
    n_jobs=-1,
    joblib_verbose=5,
)

# The 5 folds are shuffled with NumPy's global RNG; seed it so the fold
# assignment (and therefore every CV score) is reproducible across re-runs.
np.random.seed(0)
gs.fit(_raw_data_train)

In [37]:
# to_json does not create missing directories; make sure ./results exists so
# the cell cannot fail after the expensive search has already finished.
os.makedirs("./results", exist_ok=True)

# Full cross-validation table from the search, persisted before trimming.
df = pd.DataFrame(gs.cv_results)
df.to_json("./results/KNNZscore_uu_gridCV.json")

# Keep only the aggregate test metrics and show the ten lowest-MSE settings.
df_final = df[
    ["mean_test_mae", "std_test_mae", "mean_test_mse", "std_test_mse", "params"]
]
df_final.sort_values("mean_test_mse", ascending=True).head(10)

Unnamed: 0,mean_test_mae,std_test_mae,mean_test_mse,std_test_mse,params
21,0.896781,0.001104,1.465613,0.004979,"{'k': 60, 'sim_options': {'name': 'msd', 'min_..."
16,0.897472,0.001107,1.467773,0.005132,"{'k': 50, 'sim_options': {'name': 'msd', 'min_..."
11,0.898679,0.001135,1.47144,0.0056,"{'k': 40, 'sim_options': {'name': 'msd', 'min_..."
20,0.900688,0.001274,1.476848,0.005506,"{'k': 60, 'sim_options': {'name': 'msd', 'min_..."
6,0.901177,0.00118,1.478858,0.006084,"{'k': 30, 'sim_options': {'name': 'msd', 'min_..."
15,0.901692,0.001329,1.479933,0.005791,"{'k': 50, 'sim_options': {'name': 'msd', 'min_..."
10,0.90342,0.001339,1.485392,0.005688,"{'k': 40, 'sim_options': {'name': 'msd', 'min_..."
22,0.903176,0.00109,1.488554,0.00519,"{'k': 60, 'sim_options': {'name': 'msd', 'min_..."
17,0.903376,0.001076,1.489179,0.005205,"{'k': 50, 'sim_options': {'name': 'msd', 'min_..."
12,0.904083,0.001035,1.491256,0.00513,"{'k': 40, 'sim_options': {'name': 'msd', 'min_..."


In [38]:
# Winning hyperparameters and best mean CV score for each measure ("mse", "mae").
(gs.best_params, gs.best_score)

({'mse': {'k': 60,
   'sim_options': {'name': 'msd', 'min_support': 30, 'user_based': True}},
  'mae': {'k': 60,
   'sim_options': {'name': 'msd', 'min_support': 30, 'user_based': True}}},
 {'mse': 1.4656126372486558, 'mae': 0.8967812528304078})

In [39]:
#! Optimized for MSE
# Scores the refit model held by `gs` on the held-out test set; relies on `gs`
# still being the user-based KNNWithZScore search fitted above.
gs_r = gs.test(data_test)
for label, metric in ((" FCP", fcp), (" MAE", mae), (" MSE", mse), ("RMSE", rmse)):
    print(f"{label}: {metric(gs_r, verbose=False)}")

 FCP: 0.7274300575362691
 MAE: 0.8955349623011489
 MSE: 1.454077085262926
RMSE: 1.2058511870305249
