# Коллаборативная фильтрация

Нужно самостоятельно реализовать колоборативную фильтрацию методами:

1. Knn нужно реализовать 2 базовых метода
    1. Простой KNN (в библиотеке surprise называется KNNBasic)
    2. Непараметрическая регрессия Надарайя-Ватсона (в библиотеке surprise называется KNNWithMeans)
2. SVD-разложение
    1. Метод SGD
    2. Метод ALS
3. SVD++

С полученными методами нужно произвести следующие исследования:
- Нужно сравнить время работы всех реализованных алгоритмов. 
- Нужно сравнить точность (в смысле RMSE) всех реализованных алгоритмов.
- Качество (в смысле RMSE) kNN по параметру k
- Качество (в смысле RMSE) SVD по числу факторов
- Качество (в смысле RMSE) SVD по числу итераций в SGD

В качестве датасэта можно использовать, например, https://grouplens.org/datasets/movielens/ (можно любой другой).

Можно вдохновляться библиотеками (но не копировать код): 
- https://implicit.readthedocs.io/en/latest/quickstart.html 
- https://surprise.readthedocs.io/en/stable/getting_started.html

## Load Data

In [None]:
! wget https://files.grouplens.org/datasets/movielens/ml-100k.zip

--2023-10-01 12:59:59--  https://files.grouplens.org/datasets/movielens/ml-100k.zip
Resolving files.grouplens.org (files.grouplens.org)... 128.101.65.152
Connecting to files.grouplens.org (files.grouplens.org)|128.101.65.152|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 4924029 (4,7M) [application/zip]
Saving to: ‘ml-100k.zip’


2023-10-01 13:00:03 (1,27 MB/s) - ‘ml-100k.zip’ saved [4924029/4924029]



In [None]:
! unzip ml-100k.zip

Archive:  ml-100k.zip
   creating: ml-100k/
  inflating: ml-100k/allbut.pl       
  inflating: ml-100k/mku.sh          
  inflating: ml-100k/README          
  inflating: ml-100k/u.data          
  inflating: ml-100k/u.genre         
  inflating: ml-100k/u.info          
  inflating: ml-100k/u.item          
  inflating: ml-100k/u.occupation    
  inflating: ml-100k/u.user          
  inflating: ml-100k/u1.base         
  inflating: ml-100k/u1.test         
  inflating: ml-100k/u2.base         
  inflating: ml-100k/u2.test         
  inflating: ml-100k/u3.base         
  inflating: ml-100k/u3.test         
  inflating: ml-100k/u4.base         
  inflating: ml-100k/u4.test         
  inflating: ml-100k/u5.base         
  inflating: ml-100k/u5.test         
  inflating: ml-100k/ua.base         
  inflating: ml-100k/ua.test         
  inflating: ml-100k/ub.base         
  inflating: ml-100k/ub.test         


In [1]:
import pandas as pd
import numpy as np
import os


def load(fname, path='/home/ilya/repos/recsys/hw1/ml-100k'):
    path = os.path.join(path, fname)
    return pd.read_csv(path, sep='\t', names=['user_id', 'item_id', 'rating', 'timestamp']).drop(columns=['timestamp'])

df = load('ua.base')
df.describe()

Unnamed: 0,user_id,item_id,rating
count,90570.0,90570.0,90570.0
mean,461.494038,428.104891,3.523827
std,266.004364,333.088029,1.126073
min,1.0,1.0,1.0
25%,256.0,174.0,3.0
50%,442.0,324.0,4.0
75%,682.0,636.0,4.0
max,943.0,1682.0,5.0


In [2]:
def load_train():
    return load('ua.base')
def load_test():
    return load('ua.test')

df_train = load_train()
df_test = load_train()

Разделение на сплиты корректно:

In [3]:
print(len(set(df_test.item_id.unique()) - set(df_train.item_id.unique())))
print(len(set(df_test.user_id.unique()) - set(df_train.user_id.unique())))

0
0


## Demo

### KNN

In [13]:
metric = 'msd'
min_support = 5
k = 20
with_means = True

In [26]:
from colfil.knn import _preprocess, _n_common_items, _similarities, _k_neighbors, _predictions, knn_user_based

pred = knn_user_based(df_train, df_test, k, metric, min_support, verbose=True, with_means=False)

In [36]:
imp = pred.loc[pred.impossible]
print(len(imp))
imp.head()

146


Unnamed: 0,rating,impossible,_user_id,_item_id
1048,-inf,True,6,575
1124,-inf,True,6,651
1364,-inf,True,9,687
2100,-inf,True,12,805
2116,-inf,True,12,821


In [28]:
df_pred = df_test.merge(pred, on=['_user_id', '_item_id'], how='left')
df_pred.head()

Unnamed: 0,user_id,item_id,rating_x,_user_id,_item_id,rating_y,impossible
0,1,1,5,0,0,3.90585,False
1,1,2,3,0,1,3.458898,False
2,1,3,4,0,2,2.857541,False
3,1,4,3,0,3,4.091902,False
4,1,5,3,0,4,3.288865,False


In [30]:
def rmse(df):
    df_possible = df[df.impossible == False]
    return ((df_possible.rating_x - df_possible.rating_y) ** 2).mean() ** 0.5

rmse(df_pred)

0.9440897938940396

### SVD

In [4]:
hparams = dict(
    n_factors=100,
    n_epochs=10,
    batch_size=128,
    biased=True,
    init_mean=0,
    init_std_dev=.1,
    lr=.005,
    reg=.02,
    random_state=None,
    return_logs=False
)

In [5]:
from hw1.colfil.svd import svd

df_pred = svd(df_train, df_test, **hparams)

In [7]:
def rmse(df):
    return ((df.rating - df.pred_rating) ** 2).mean() ** 0.5

rmse(df_pred)

0.8356410478766767

In [25]:
def _inversed_root(df):
    user2items = df.groupby('_user_id')['_item_id'].apply(lambda x: list(x))
    user2count = user2items.apply(lambda x: len(x) ** -(0.5))
    df['_inversed_root'] = df._user_id.apply(lambda x: user2count[x]).to_numpy()
    return user2items.to_list()

user2items = _inversed_root(df_pred)

In [26]:
df_pred.head()

Unnamed: 0,user_id,item_id,rating,_user_id,_item_id,pred_rating,_inversed_root
0,1,1,5,0,0,3.987672,0.06178
1,1,2,3,0,1,3.253773,0.06178
2,1,3,4,0,2,3.099597,0.06178
3,1,4,3,0,3,3.501269,0.06178
4,1,5,3,0,4,3.373939,0.06178


In [34]:
len(user2items)

943

### SVD++

In [10]:
hparams = dict(
    n_factors=100,
    n_epochs=10,
    batch_size=128,
    init_mean=0,
    init_std_dev=.1,
    lr=.005,
    reg=.02,
    random_state=None,
    return_logs=False,
    verbose=True
)

In [11]:
from colfil import svdpp

svdpp(df_train, df_test, **hparams)

=== i_epoch=0 ===
i_batch=0, rmse=1.0635
i_batch=177, rmse=1.0346
i_batch=354, rmse=1.0978
i_batch=531, rmse=0.9279

=== i_epoch=1 ===
i_batch=0, rmse=0.9672
i_batch=177, rmse=0.8119
i_batch=354, rmse=0.8507
i_batch=531, rmse=0.9577

=== i_epoch=2 ===
i_batch=0, rmse=0.8732
i_batch=177, rmse=0.9585
i_batch=354, rmse=0.9874
i_batch=531, rmse=0.8403

=== i_epoch=3 ===
i_batch=0, rmse=0.8051
i_batch=177, rmse=0.9053
i_batch=354, rmse=0.9062
i_batch=531, rmse=0.7947

=== i_epoch=4 ===
i_batch=0, rmse=0.8440
i_batch=177, rmse=0.8789
i_batch=354, rmse=0.8547
i_batch=531, rmse=0.8946

=== i_epoch=5 ===
i_batch=0, rmse=0.8159
i_batch=177, rmse=0.8366
i_batch=354, rmse=0.8298
i_batch=531, rmse=0.7408

=== i_epoch=6 ===
i_batch=0, rmse=0.8190
i_batch=177, rmse=0.8432
i_batch=354, rmse=0.9186
i_batch=531, rmse=0.9289

=== i_epoch=7 ===
i_batch=0, rmse=0.8418
i_batch=177, rmse=0.8407
i_batch=354, rmse=0.7625
i_batch=531, rmse=0.8432

=== i_epoch=8 ===
i_batch=0, rmse=0.7519
i_batch=177, rmse=0.742

In [13]:
df_test.head()

Unnamed: 0,user_id,item_id,rating,_user_id,_item_id,pred_rating
0,1,1,5,0,0,3.947967
1,1,2,3,0,1,3.323082
2,1,3,4,0,2,2.982732
3,1,4,3,0,3,3.390697
4,1,5,3,0,4,3.406215


In [14]:
def rmse(df):
    return ((df.rating - df.pred_rating) ** 2).mean() ** 0.5

rmse(df_test)

0.8345658609346708