In [1]:
import os
import sys
# Put the parent of the current working directory on the import path so the
# notebook can import modules that live one level up (presumably the local
# `utils` and `knn` modules used by the cells below — confirm).
nb_dir = os.path.dirname(os.getcwd())
if sys.path.count(nb_dir) == 0:
    # Only register the directory once, so re-running the cell is harmless.
    sys.path.append(nb_dir)

### Reimplementation of KNN

There are 3 kNN models available ("kNN", "kNNwithMean" and "kNNBaseline") and 2 similarity measures ("cosine", "pearson"); the "kNNBaseline" example below additionally uses "pearson_baseline".
If you wish to use genome scores to calculate similarity, the `genome` parameter needs to be provided with the genome score matrix.

In [2]:
from utils import DataLoader
from knn import Baseline

# Alternative data location:
# train_set, test_set = DataLoader("../../data").load_csv2ndarray()
data_loader = DataLoader("../movielens100k")
train_set, test_set = data_loader.load_csv2ndarray()

# Default baseline configuration (ALS).
baseline_options = dict(method='als', n_epochs=10, reg_u=15, reg_i=10)

# SGD alternative — swap in to replace the ALS settings above:
# baseline_options = dict(
#     method='sgd',
#     n_epochs=20,
#     learning_rate=0.005,
#     regularization=0.02,
# )

# Fit the baseline estimator on its own. The test split is passed as the
# validation set, which is what the per-epoch val_* lines in the output
# report on.
baseline_model = Baseline()
baseline_model.fit(
    train_set=train_set,
    val_set=test_set,
    baseline_options=baseline_options,
)

Listing all users rated each item, and all items rated by each user ...
Listing took 0 sec
Epoch 0: val_loss: 0.81271 - val_rmse: 0.90150 - val_mae: 0.69939
Epoch 1: val_loss: 0.81195 - val_rmse: 0.90108 - val_mae: 0.69893
Epoch 2: val_loss: 0.81149 - val_rmse: 0.90083 - val_mae: 0.69863
Epoch 3: val_loss: 0.81108 - val_rmse: 0.90060 - val_mae: 0.69839
Epoch 4: val_loss: 0.81083 - val_rmse: 0.90046 - val_mae: 0.69825
Epoch 5: val_loss: 0.81068 - val_rmse: 0.90038 - val_mae: 0.69816
Epoch 6: val_loss: 0.81059 - val_rmse: 0.90033 - val_mae: 0.69810
Epoch 7: val_loss: 0.81053 - val_rmse: 0.90030 - val_mae: 0.69807
Epoch 8: val_loss: 0.81050 - val_rmse: 0.90028 - val_mae: 0.69805
Epoch 9: val_loss: 0.81048 - val_rmse: 0.90027 - val_mae: 0.69804
Time for computing the baseline estimate: 3 sec


In [2]:
from utils import DataLoader
from knn import kNNBaseline, kNNwithMean, kNN

# Alternative data location:
# train_set, test_set = DataLoader("../../data").load_csv2ndarray()
data_loader = DataLoader("../movielens100k")
train_set, test_set = data_loader.load_csv2ndarray()

# Default baseline configuration (ALS), forwarded to kNNBaseline.fit below.
baseline_options = dict(method='als', n_epochs=10, reg_u=15, reg_i=10)

# SGD alternative — swap in to replace the ALS settings above:
# baseline_options = dict(
#     method='sgd',
#     n_epochs=20,
#     learning_rate=0.005,
#     regularization=0.02,
# )

# Active model: kNN on top of baseline estimates with the Pearson-baseline
# similarity. uuCF=False presumably selects item-item similarity (the Surprise
# comparison further down uses 'user_based': False) — confirm.
knn = kNNBaseline(uuCF=False, verbose=True, awareness_constrain=False)
knn.fit(
    train_set=train_set,
    similarity_measure="pearson_baseline",
    baseline_options=baseline_options,
)

# Mean-centred alternative:
# knn = kNNwithMean(uuCF=False, verbose=True, awareness_constrain=False)
# knn.fit(train_set=train_set, similarity_measure="pearson")

# Score the held-out pairs with k=20, then report the metrics.
knn.predict(test_set, k=20)

knn.rmse()
knn.mae()
knn.precision_recall_at_k(k=20)

Listing all users rated each item, and all items rated by each user ...
Listing took 0 sec
Time for computing the baseline estimate: 2 sec
Computing similarity matrix ...
Computing Pearson Baseline similarity matrix took 3 sec
Predicting 20000 pairs of user-item with k=20 ...

Time for predicting: 2 sec
RMSE: 0.88581
MAE: 0.67993
Precision: 0.70673
Recall: 0.61788


In [7]:
# Re-run prediction on the already fitted `knn` model with k=30 (no refit is
# performed in this cell) to compare against the k=20 run above.
# NOTE(review): this presumably replaces the predictions stored on `knn` by the
# previous k=20 call — confirm before relying on `knn.predictions` later.
knn.predict(test_set, k=30)

# Report error metrics for the predictions just made (see the RMSE / MAE lines
# in the cell output).
knn.rmse()
knn.mae()

Predicting 20000 pairs of user-item with k=30 ...
Time for predicting: 1 sec
RMSE: 0.88453
MAE: 0.67963



### KNN from NicolasHug/Surprise (baseline and mean-normalized variants), for comparison

In [3]:
from surprise.prediction_algorithms.knns import KNNWithMeans, KNNBasic, KNNBaseline
from surprise.prediction_algorithms import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise import accuracy

import pandas as pd
import time


# Rating files for the Surprise comparison run. The commented paths point at
# an alternative data directory.
# train = pd.read_csv('../../data/rating_train.csv')
# test = pd.read_csv('../../data/rating_test.csv')
train = pd.read_csv('../movielens100k/rating_train.csv')
test = pd.read_csv('../movielens100k/rating_test.csv')

rating_columns = ['userId', 'movieId', 'rating']
reader = Reader(rating_scale=(0.5, 5))

# Build the Surprise objects directly instead of binding the intermediate
# Dataset wrappers to `train_set` / `test_set`. Those names already hold the
# data returned by DataLoader(...).load_csv2ndarray() that `knn.predict(test_set, ...)`
# consumes; rebinding them here would silently hand a Surprise Dataset to any
# kNN cell that is re-run after this one.
trainset = Dataset.load_from_df(train[rating_columns], reader=reader).build_full_trainset()
# The held-out ratings are wrapped in a full trainset first and then converted
# with build_testset(), so they can be passed to algo.test().
testset = (
    Dataset.load_from_df(test[rating_columns], reader=reader)
    .build_full_trainset()
    .build_testset()
)

# Config for surprise similarity function
sim_options = {
    'name': 'pearson_baseline',
    'user_based': False
}

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed intervals; time.time() can jump if the system clock is adjusted.
t1 = time.perf_counter()

algo = KNNBaseline(k=20, sim_options=sim_options)
# algo = KNNWithMeans(k=20, sim_options=sim_options)
# algo = BaselineOnly()
algo.fit(trainset)

predictions = algo.test(testset)
accuracy.rmse(predictions)
accuracy.mae(predictions)

t2 = time.perf_counter()
# Elapsed seconds for fit + test + metric computation.
print(t2 - t1)

Estimating biases using als...
Computing the pearson_baseline similarity matrix...
Done computing similarity matrix.
RMSE: 0.8858
MAE:  0.6799
13.305283069610596


In [6]:
# Show the first five Surprise predictions (the `est` field is the estimated
# rating) for a side-by-side check against the reimplementation.
predictions[:5]

[Prediction(uid=1, iid=32, r_ui=3.5, est=3.8329223910734926, details={'actual_k': 20, 'was_impossible': False}),
 Prediction(uid=1, iid=337, r_ui=3.5, est=3.761605869231817, details={'actual_k': 20, 'was_impossible': False}),
 Prediction(uid=1, iid=541, r_ui=4.0, est=4.10927366872051, details={'actual_k': 20, 'was_impossible': False}),
 Prediction(uid=1, iid=593, r_ui=3.5, est=4.152943429071827, details={'actual_k': 20, 'was_impossible': False}),
 Prediction(uid=1, iid=653, r_ui=3.0, est=3.5279103971753205, details={'actual_k': 20, 'was_impossible': False})]

In [7]:
# Show the first five estimates from the reimplemented kNN; these should line
# up with the `est` values of the Surprise predictions displayed above.
# NOTE(review): `knn.predictions` presumably holds the result of the most
# recent `knn.predict` call. On a top-to-bottom run that would be the k=30
# cell, whereas the recorded output matches the k=20 Surprise run — confirm
# and re-run `knn.predict(..., k=20)` first if needed.
knn.predictions[:5]

array([3.83292625, 3.76160869, 4.10927793, 4.15294811, 3.52791222])