In [11]:
import os
import sys
# Put the parent of the current working directory on the import path (once),
# so that modules located one level above the notebook become importable.
nb_dir = os.path.dirname(os.getcwd())
if nb_dir not in sys.path:
    sys.path.append(nb_dir)

### Reimplementation of KNN

There are 3 available normalization methods ("none", "mean" and "baseline") and 2 similarity measures ("cosine" and "pearson").
If you wish to use genome scores to calculate similarity, the `genome` parameter needs to be provided with the genome score matrix.

In [14]:
import numpy as np

from utils import DataLoader
from knn import kNN

# Load the ratings; the loader returns a (train, test) pair.
loader = DataLoader("../movielens10k")
train_data, test_data = loader.load_csv2ndarray()

# 10-neighbour model with Pearson similarity and baseline normalization.
# uuCF=1 presumably selects user-user collaborative filtering -- confirm in knn.py.
knn = kNN(
    k=10,
    distance="pearson",
    uuCF=1,
    normalize="baseline",
    verbose=True,
)

# Fit on the training ratings, then predict every pair in the test set.
knn.fit(train_data=train_data)
knn.predict(test_data)

# Report both error metrics for the predictions made above.
knn.rmse()
knn.mae()

Normalizing the utility matrix ...
Listing all users rated each item (or vice versa if iiCF) ...
Computing similarity matrix ...
Predicting 2000 pairs of user-item ...
Time for predicting: 1 sec
RMSE: 1.06212
MAE: 0.83180



### KNN with baseline normalization from NicolasHug/Surprise

In [19]:
from surprise.prediction_algorithms.knns import KNNWithMeans, KNNBasic, KNNBaseline
from surprise import Dataset
from surprise import Reader
from surprise import accuracy
from surprise.model_selection import train_test_split


# Load the combined ratings file through Surprise's reader.
# NOTE(review): skip_lines=1 discards the first line of rating.csv. The cell that
# builds rating.csv writes it without a header row, so confirm that the first
# line really is a header and not a rating that is being dropped.
file_path = "../movielens10k/rating.csv"
reader = Reader(line_format='user item rating timestamp', sep=",", skip_lines=1)
data = Dataset.load_from_file(file_path, reader=reader)

# Hold out 20% of the ratings for testing. The seed is fixed so that the split,
# and therefore the RMSE reported below, is reproducible across kernel restarts.
trainset, testset = train_test_split(data, test_size=.2, random_state=42)

# Config for surprise similarity function: Pearson correlation between users.
sim_options = {
    'name': 'pearson',
    'user_based': True
}

# Same neighbourhood size (k=10) as the reimplemented kNN, for comparison.
algo = KNNBaseline(k=10, sim_options=sim_options)
algo.fit(trainset)

# Score the held-out ratings; accuracy.rmse prints the RMSE and returns it.
predictions = algo.test(testset)
accuracy.rmse(predictions)

Estimating biases using als...
Computing the pearson similarity matrix...
Done computing similarity matrix.
RMSE: 1.0614


1.0614047912102296

In [21]:
import pandas as pd
# Merge the train / test / validation rating splits into one CSV file.
# The other cells of this notebook read the dataset from "../movielens10k"
# (including "../movielens10k/rating.csv", the file produced here), so the
# same directory is used for both reading the splits and writing the result.
DATA_DIR = "../movielens10k"
SPLIT_FILES = ["rating_train.csv", "rating_test.csv", "rating_val.csv"]

combined_csv = pd.concat(
    # header=None: every line of each split file is treated as a data row.
    [pd.read_csv(os.path.join(DATA_DIR, name), header=None) for name in SPLIT_FILES],
    ignore_index=True,  # one continuous row index instead of three restarting ones
)

# Export to CSV with neither the index column nor a header row.
combined_csv.to_csv(
    os.path.join(DATA_DIR, "rating.csv"),
    index=False,
    header=False,
    encoding='utf-8',
)