In [2]:
import os
import sys
# Put the parent of the current working directory on the import path (once),
# so that modules located one level above this notebook can be imported.
nb_dir = os.path.dirname(os.getcwd())
if sys.path.count(nb_dir) == 0:
    sys.path.append(nb_dir)

### Reimplementation of KNN

There are 3 available normalization methods ("none", "mean" and "baseline") and 2 similarity measures ("cosine", "pearson").
If you wish to use genome scores to calculate similarity, the `genome` parameter needs to be provided with the genome score matrix.

In [3]:
import numpy as np

from utils import DataLoader
from knn import kNN

# Read the train/test splits from the shared data directory.
loader = DataLoader("../../data")
train_data, test_data = loader.load_csv2ndarray()

# Model configuration: 40 neighbours, Pearson similarity, baseline normalization.
# uuCF=False presumably selects item-item collaborative filtering -- confirm in knn.py.
knn_params = dict(
    k=40,
    distance="pearson",
    uuCF=False,
    normalize="baseline",
    verbose=True,
    awareness_constrain=True,
)
knn = kNN(**knn_params)
knn.fit(train_data=train_data)

# Predict every pair in the test split, then report both error metrics.
knn.predict(test_data)

knn.rmse()
knn.mae()

Normalizing the utility matrix ...
Listing all users rated each item (or vice versa if iiCF) ...
Computing similarity matrix ...


|                                                                        |   0%

Computing Pearson similarity matrix took 9 min and 5 sec
Predicting 3957876 pairs of user-item ...


|########################################################################| 100%


Time for predicting: 4 min and 32 sec
RMSE: 0.81296
MAE: 0.61888


### KNN with baseline normalization from NicolasHug/Surprise

In [4]:
from surprise.prediction_algorithms.knns import KNNWithMeans, KNNBasic, KNNBaseline
from surprise import Dataset
from surprise import Reader
from surprise import accuracy

import pandas as pd
import time


# Load both rating splits, ordering rows by (movieId, userId) before they are
# handed to Surprise.
sort_keys = ["movieId", "userId"]
train = pd.read_csv('../../data/rating_train.csv').sort_values(by=sort_keys)
test = pd.read_csv('../../data/rating_test.csv').sort_values(by=sort_keys)

# Wrap the DataFrames as Surprise datasets; ratings lie on a 0.5-5 scale.
reader = Reader(rating_scale=(0.5, 5))
rating_columns = ['userId', 'movieId', 'rating']
train_set = Dataset.load_from_df(train[rating_columns], reader=reader)
test_set = Dataset.load_from_df(test[rating_columns], reader=reader)

trainset = train_set.build_full_trainset()
# The test DataFrame is first turned into a full trainset and then converted
# with build_testset(), which yields the rating list that algo.test() consumes.
testset = test_set.build_full_trainset().build_testset()

# Config for surprise similarity function:
# Pearson correlation, computed between items (user_based=False).
sim_options = dict(name='pearson', user_based=False)

# Time the whole fit + predict + scoring run. perf_counter() is monotonic, so
# the measurement cannot be skewed by system clock adjustments during this
# long-running cell (time.time() offers no such guarantee).
t1 = time.perf_counter()

algo = KNNBaseline(k=40, sim_options=sim_options)
algo.fit(trainset)

predictions = algo.test(testset)
accuracy.rmse(predictions)
accuracy.mae(predictions)

t2 = time.perf_counter()
# Elapsed time in seconds.
print(t2 - t1)

Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 0.8118
MAE:  0.6178
3166.620276451111
