## Significance Weighting: discounting a similarity weight when it is based on very few co-rated items

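Common ways to penalize such weights, where $\left|\mathcal{J}_{u v}\right|$ is the number of items co-rated by $u$ and $v$, and $w_{u v}$ is their similarity: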

$$
w_{u v}^{\prime}=\frac{\min \left\{\left|\mathcal{J}_{u v}\right|, \gamma\right\}}{\gamma} \times w_{u v}
$$

$$
w_{u v}^{\prime}=\frac{\left|\mathcal{J}_{u v}\right|}{\left|\mathcal{J}_{u v}\right|+\beta} \times w_{u v},
$$

$$
w_{u, v}^{\prime}=\left\{\begin{array}{l}
\frac{\left|\mathcal{J}_{u v}\right|}{\alpha} \cdot w_{u, v}: \forall_{w_{u, v} \geq 0}, \\
\frac{\left|\mathcal{J}_{u v}\right|}{\alpha+\max \left(\left|\mathcal{J}_{u v}\right|, \alpha\right)} \cdot w_{u, v}: \forall_{w_{u, v}<0} .
\end{array}\right.
$$


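We use the third scheme, as it showed much better accuracy in the following paper: https://core.ac.uk/download/pdf/1510715.pdf

In the implementation below, `beta` plays the role of $\alpha$, and only the $w_{u, v} \geq 0$ branch is applied, because neighbours with non-positive similarity are skipped.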

In [1]:
import heapq
import pickle
import surprise
import numpy as np
import fastparquet
import pandas as pd
from surprise.reader import Reader
from surprise.dataset import DatasetAutoFolds
from surprise.model_selection import GridSearchCV
from surprise.accuracy import fcp, mae, mse, rmse
from surprise.prediction_algorithms import PredictionImpossible
from surprise.prediction_algorithms.knns import KNNWithMeans

In [2]:
# Load the pickled user/anime encode and decode objects.
# NOTE: pickle.load can execute arbitrary code while unpickling; only load
# files that come from a trusted source.
# NOTE(review): presumably these map raw ids <-> encoded ids — confirm against
# the notebook that wrote them.
with open("encode_decode/user_encode.pkl", "rb") as f:
    user_encode = pickle.load(f)
with open("encode_decode/user_decode.pkl", "rb") as f:
    user_decode = pickle.load(f)

with open("encode_decode/anime_encode.pkl", "rb") as f:
    anime_encode = pickle.load(f)
with open("encode_decode/anime_decode.pkl", "rb") as f:
    anime_decode = pickle.load(f)

# Train / test rating tables; the columns user_id, anime_id and score are
# read from them when the Surprise datasets are built.
df_train = pd.read_parquet("valid_train.parquet", engine="fastparquet")
df_test = pd.read_parquet("valid_test.parquet", engine="fastparquet")

In [3]:
# Ratings are declared to lie on a 1-10 scale.
reader = Reader(rating_scale=(1, 10))

# Un-built training dataset; this is the object handed to GridSearchCV.fit,
# which creates its own cross-validation folds from it.
_raw_data_train = DatasetAutoFolds(
    df=df_train[["user_id", "anime_id", "score"]],
    reader=reader,
)

# Full training set built from every row of df_train.
data_train = DatasetAutoFolds(
    df=df_train[["user_id", "anime_id", "score"]],
    reader=reader,
).build_full_trainset()

# Held-out ratings: built as a full trainset first, then converted to a
# testset that GridSearchCV.test() / accuracy functions can consume.
data_test = (
    DatasetAutoFolds(
        df=df_test[["user_id", "anime_id", "score"]],
        reader=reader,
    )
    .build_full_trainset()
    .build_testset()
)

In [4]:
class KNNMCustom(KNNWithMeans):
    """Mean-centred k-NN whose neighbour weights are scaled by co-rating count.

    Each neighbour's similarity is multiplied by ``|co-rated| / beta`` before
    it enters the weighted average of mean-centred ratings, so neighbours that
    share many ratings with the target count for more than neighbours that
    share only a few.

    Args:
        k: Maximum number of neighbours considered for a prediction.
        beta: Divisor of the co-rating count in the significance weight.
            Must be non-zero.
        min_k: Minimum number of positively-similar neighbours required;
            below it the prediction falls back to the mean of ``x``.
        sim_options: Similarity configuration forwarded to ``KNNWithMeans``.
            ``None`` (the default) means an empty configuration.
        verbose: Forwarded to ``KNNWithMeans``.
        **kwargs: Forwarded to ``KNNWithMeans``.

    NOTE(review): ``beta`` multiplies every neighbour weight by the same
    constant ``1 / beta``. Because the estimate divides the weighted sum by
    the sum of the same weights, that constant cancels and ``beta`` does not
    change the prediction; only the relative co-rating counts matter.
    """

    def __init__(self, k=40, beta=100, min_k=1, sim_options=None, verbose=True, **kwargs):
        # Use a fresh dict per instance. A ``{}`` default is created once and
        # shared by every instance built without explicit sim_options, so any
        # write to it by the base class would leak between instances.
        if sim_options is None:
            sim_options = {}
        KNNWithMeans.__init__(self, sim_options=sim_options, verbose=verbose, **kwargs)
        self.k = k
        self.min_k = min_k
        self.beta = beta

    def fit(self, trainset):
        """Fit the base model, then cache per-``x`` rating sets and means.

        Args:
            trainset: Surprise trainset to fit on.

        Returns:
            ``self``.
        """
        KNNWithMeans.fit(self, trainset)

        # For every x, the set of ids it shares a rating with (the first
        # element of each (id, rating) pair in self.xr[x]). Cached so the
        # co-rating count is a plain set intersection at prediction time.
        self.rated_instances = {
            x: {other for other, _ in ratings} for x, ratings in self.xr.items()
        }

        # Mean rating of every x, used to centre the neighbours' ratings.
        # NOTE(review): the base class may already compute this in fit();
        # kept so this class does not rely on that.
        self.means = np.zeros(self.n_x)
        for x, ratings in self.xr.items():
            self.means[x] = np.mean([r for (_, r) in ratings])

        return self

    def _get_cust_w(self, x, usr):
        """Return the significance weight ``|co-rated(x, usr)| / beta``.

        Args:
            x: Inner id of the target user/item.
            usr: Inner id of the neighbour.

        Returns:
            Number of ids rated by both ``x`` and ``usr`` divided by
            ``self.beta``.
        """
        co_rated = len(self.rated_instances[x] & self.rated_instances[usr])
        return co_rated / self.beta

    def estimate(self, u, i):
        """Predict the rating of user ``u`` for item ``i``.

        Args:
            u: Inner user id.
            i: Inner item id.

        Returns:
            Tuple ``(est, details)`` where ``est`` is the estimated rating and
            ``details`` is ``{"actual_k": n}`` with ``n`` the number of
            neighbours that had positive similarity.

        Raises:
            PredictionImpossible: If the user or the item is unknown to the
                trainset.
        """
        if not (self.trainset.knows_user(u) and self.trainset.knows_item(i)):
            raise PredictionImpossible("User and/or item is unknown.")

        # x is the entity similarities are computed between, y the other one
        # (presumably ordered by sim_options["user_based"] — see base class).
        x, y = self.switch(u, i)

        # Candidates are everything that rated y; the k most similar are kept,
        # ranked by the raw (unweighted) similarity.
        neighbors = [(x2, self.sim[x, x2], r) for (x2, r) in self.yr[y]]
        k_neighbors = heapq.nlargest(self.k, neighbors, key=lambda t: t[1])

        est = self.means[x]

        # Weighted average of the neighbours' mean-centred ratings.
        sum_sim = sum_ratings = actual_k = 0
        for nb, sim, r in k_neighbors:
            if sim > 0:
                # Only pay for the set intersection when the neighbour is used.
                cust_sim = self._get_cust_w(x, nb) * sim
                sum_sim += cust_sim
                sum_ratings += cust_sim * (r - self.means[nb])
                actual_k += 1

        if actual_k < self.min_k:
            # Too few usable neighbours: drop their contribution entirely.
            sum_ratings = 0

        try:
            est += sum_ratings / sum_sim
        except ZeroDivisionError:
            pass  # no usable neighbour at all: return the mean of x

        details = {"actual_k": actual_k}
        return est, details


## KNNMCustom: Significance weighting with mean normalization

Hyperparameter Tuning with GridSearchCV (item-based)


In [5]:
# Item-based search space (user_based=False). Key order is left as
# k, beta, sim_options.
# NOTE(review): KNNMCustom.estimate scales numerator and denominator by the
# same 1/beta, so both beta values are expected to score identically.
param_grid = dict(
    k=[15, 20, 25],
    beta=[100, 200],
    sim_options=dict(
        name=["msd"],
        min_support=[10, 30],
        user_based=[False],
    ),
)

# 5-fold search scored on MSE and MAE, using all available cores.
gs_mean_custom = GridSearchCV(
    KNNMCustom,
    param_grid,
    cv=5,
    measures=["mse", "mae"],
    refit=True,
    joblib_verbose=5,
    n_jobs=-1,
)

In [None]:
# Run the item-based grid search (cv=5, n_jobs=-1) on the un-built training dataset.
gs_mean_custom.fit(_raw_data_train)

Saving the grid search results

In [7]:
# Persist the complete item-based CV table, then display the ten
# configurations with the lowest mean test MSE.
df = pd.DataFrame(gs_mean_custom.cv_results)
df.to_json("./results/KNN_M_custom_ii_gridCV.json")

df_final = df.loc[
    :,
    [
        "mean_test_mae",
        "std_test_mae",
        "mean_test_mse",
        "std_test_mse",
        "params",
        "param_sim_options",
    ],
]
df_final.sort_values(by="mean_test_mse", ascending=True).head(10)

Unnamed: 0,mean_test_mae,std_test_mae,mean_test_mse,std_test_mse,params,param_sim_options
5,0.865189,0.00105,1.368513,0.003857,"{'k': 20, 'beta': 100, 'sim_options': {'name':...","{'name': 'msd', 'min_support': 30, 'user_based..."
7,0.865189,0.00105,1.368513,0.003857,"{'k': 20, 'beta': 200, 'sim_options': {'name':...","{'name': 'msd', 'min_support': 30, 'user_based..."
4,0.865192,0.001054,1.36852,0.003872,"{'k': 20, 'beta': 100, 'sim_options': {'name':...","{'name': 'msd', 'min_support': 10, 'user_based..."
6,0.865192,0.001054,1.36852,0.003872,"{'k': 20, 'beta': 200, 'sim_options': {'name':...","{'name': 'msd', 'min_support': 10, 'user_based..."
0,0.864884,0.0012,1.370672,0.004309,"{'k': 15, 'beta': 100, 'sim_options': {'name':...","{'name': 'msd', 'min_support': 10, 'user_based..."
2,0.864884,0.0012,1.370672,0.004309,"{'k': 15, 'beta': 200, 'sim_options': {'name':...","{'name': 'msd', 'min_support': 10, 'user_based..."
1,0.864882,0.001201,1.370684,0.004326,"{'k': 15, 'beta': 100, 'sim_options': {'name':...","{'name': 'msd', 'min_support': 30, 'user_based..."
3,0.864882,0.001201,1.370684,0.004326,"{'k': 15, 'beta': 200, 'sim_options': {'name':...","{'name': 'msd', 'min_support': 30, 'user_based..."
9,0.866523,0.001001,1.370765,0.003767,"{'k': 25, 'beta': 100, 'sim_options': {'name':...","{'name': 'msd', 'min_support': 30, 'user_based..."
11,0.866523,0.001001,1.370765,0.003767,"{'k': 25, 'beta': 200, 'sim_options': {'name':...","{'name': 'msd', 'min_support': 30, 'user_based..."


In [8]:
# Best parameter set and best mean CV score for each measure ("mse", "mae").
gs_mean_custom.best_params, gs_mean_custom.best_score

({'mse': {'k': 20,
   'beta': 100,
   'sim_options': {'name': 'msd', 'min_support': 30, 'user_based': False}},
  'mae': {'k': 15,
   'beta': 100,
   'sim_options': {'name': 'msd', 'min_support': 30, 'user_based': False}}},
 {'mse': 1.3685125342639242, 'mae': 0.8648817048936254})

In [9]:
#! Optimized for MSE, not MAE: with refit=True the search refits on the best
#! parameters of the FIRST entry of `measures`, which is "mse" for this search.
#! (Per the Surprise GridSearchCV documentation — confirm for the installed version.)

# Score the refit estimator on the held-out test set.
gs_mean_custom_result = gs_mean_custom.test(data_test)
print(f" FCP: {fcp(gs_mean_custom_result,verbose=False)}")
print(f" MAE: {mae(gs_mean_custom_result,verbose=False)}")
print(f" MSE: {mse(gs_mean_custom_result,verbose=False)}")
print(f"RMSE: {rmse(gs_mean_custom_result,verbose=False)}")

 FCP: 0.7456713283901973
 MAE: 0.855427501636369
 MSE: 1.3399013137802025
RMSE: 1.1575410635395198



## KNNMCustom: Significance weighting with mean normalization

Hyperparameter Tuning with GridSearchCV (user-based)


In [10]:
# User-based search space (user_based=True). Key order is left as
# k, beta, sim_options.
# NOTE(review): KNNMCustom.estimate scales numerator and denominator by the
# same 1/beta, so both beta values are expected to score identically.
param_grid = dict(
    k=[15, 20, 25],
    beta=[100, 200],
    sim_options=dict(
        name=["msd"],
        min_support=[10, 30],
        user_based=[True],
    ),
)

# 5-fold search scored on MSE and MAE, using all available cores.
# This rebinds gs_mean_custom, replacing the item-based search object.
gs_mean_custom = GridSearchCV(
    KNNMCustom,
    param_grid,
    cv=5,
    measures=["mse", "mae"],
    refit=True,
    joblib_verbose=5,
    n_jobs=-1,
)

In [None]:
# Run the user-based grid search (cv=5, n_jobs=-1) on the un-built training dataset.
gs_mean_custom.fit(_raw_data_train)

Saving the grid search results

In [12]:
# Persist the complete user-based CV table, then display the ten
# configurations with the lowest mean test MSE.
df = pd.DataFrame(gs_mean_custom.cv_results)
df.to_json("./results/KNN_M_custom_uu_gridCV.json")

df_final = df.loc[
    :,
    [
        "mean_test_mae",
        "std_test_mae",
        "mean_test_mse",
        "std_test_mse",
        "params",
        "param_sim_options",
    ],
]
df_final.sort_values(by="mean_test_mse", ascending=True).head(10)

Unnamed: 0,mean_test_mae,std_test_mae,mean_test_mse,std_test_mse,params,param_sim_options
9,0.903884,0.001884,1.471332,0.006827,"{'k': 25, 'beta': 100, 'sim_options': {'name':...","{'name': 'msd', 'min_support': 30, 'user_based..."
11,0.903884,0.001884,1.471332,0.006827,"{'k': 25, 'beta': 200, 'sim_options': {'name':...","{'name': 'msd', 'min_support': 30, 'user_based..."
5,0.905387,0.001857,1.475717,0.006519,"{'k': 20, 'beta': 100, 'sim_options': {'name':...","{'name': 'msd', 'min_support': 30, 'user_based..."
7,0.905387,0.001857,1.475717,0.006519,"{'k': 20, 'beta': 200, 'sim_options': {'name':...","{'name': 'msd', 'min_support': 30, 'user_based..."
1,0.908184,0.001936,1.484016,0.006369,"{'k': 15, 'beta': 100, 'sim_options': {'name':...","{'name': 'msd', 'min_support': 30, 'user_based..."
3,0.908184,0.001936,1.484016,0.006369,"{'k': 15, 'beta': 200, 'sim_options': {'name':...","{'name': 'msd', 'min_support': 30, 'user_based..."
8,0.911951,0.00181,1.494728,0.007464,"{'k': 25, 'beta': 100, 'sim_options': {'name':...","{'name': 'msd', 'min_support': 10, 'user_based..."
10,0.911951,0.00181,1.494728,0.007464,"{'k': 25, 'beta': 200, 'sim_options': {'name':...","{'name': 'msd', 'min_support': 10, 'user_based..."
4,0.91436,0.00185,1.501955,0.00768,"{'k': 20, 'beta': 100, 'sim_options': {'name':...","{'name': 'msd', 'min_support': 10, 'user_based..."
6,0.91436,0.00185,1.501955,0.00768,"{'k': 20, 'beta': 200, 'sim_options': {'name':...","{'name': 'msd', 'min_support': 10, 'user_based..."


In [13]:
# Best parameter set and best mean CV score for each measure ("mse", "mae").
gs_mean_custom.best_params, gs_mean_custom.best_score

({'mse': {'k': 25,
   'beta': 100,
   'sim_options': {'name': 'msd', 'min_support': 30, 'user_based': True}},
  'mae': {'k': 25,
   'beta': 100,
   'sim_options': {'name': 'msd', 'min_support': 30, 'user_based': True}}},
 {'mse': 1.4713317089881384, 'mae': 0.9038841321658833})

In [14]:
#! Optimized for MSE

# Score the refit estimator on the held-out test set.
gs_mean_custom_result = gs_mean_custom.test(data_test)
print(f" FCP: {fcp(gs_mean_custom_result,verbose=False)}")
print(f" MAE: {mae(gs_mean_custom_result,verbose=False)}")
print(f" MSE: {mse(gs_mean_custom_result,verbose=False)}")
print(f"RMSE: {rmse(gs_mean_custom_result,verbose=False)}")

 FCP: 0.7260165571449508
 MAE: 0.89877817460313
 MSE: 1.4503329983147504
RMSE: 1.2042977199657692
