In [1]:
%cd ../

/Users/macos/Uni/1st_year/period_2/RecSys/hw


In [97]:
from typing import Literal
from itertools import product

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
from tqdm.notebook import tqdm
from sklearn.metrics import mean_squared_error

In [3]:
plt.style.use('seaborn-v0_8')
plt.rcParams.update({'font.size': 8})

## Task 0: Load data

In [4]:
path_train = "week2/data_split/train.csv"
path_test = "week2/data_split/test.csv"
path_val = "week2/data_split/validation.csv"

In [5]:
df_train = pd.read_csv(path_train)
df_val = pd.read_csv(path_val)
df_test = pd.read_csv(path_test)

In [6]:
df_all = pd.concat([df_train, df_val, df_test])

In [7]:
df_all.head()

Unnamed: 0,item_id,user_id,rating
0,2804,655,4.5
1,5957,655,3.0
2,1213,655,4.5
3,134130,655,4.0
4,5065,655,3.0


In [9]:
len(df_all) / (len(df_all['item_id'].unique()) * len(df_all['user_id'].unique()))

0.022570505586922758

In [78]:
EPS = 1e-9

rating_matrix_train = df_train.pivot(index='user_id', columns='item_id', values='rating').fillna(EPS)
rating_matrix_val = df_test.pivot(index='user_id', columns='item_id', values='rating').fillna(EPS)
rating_matrix_test = df_test.pivot(index='user_id', columns='item_id', values='rating').fillna(EPS)

df_trainval = pd.concat([df_train, df_val])
rating_matrix_trainval = df_trainval.pivot(index='user_id', columns='item_id', values='rating').fillna(EPS)

rating_matrix_train.head()

item_id,2,3,4,7,9,11,18,21,22,25,...,232017,233171,233579,233619,233907,233967,234567,234691,235105,235509
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
655,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,...,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06
2448,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,...,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06
3426,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,...,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06
15440,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,...,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06
16095,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,...,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06


## Task 1: Implement knn

In [80]:
def _assert(u: np.ndarray, v: np.ndarray):
    assert len(u.shape) == 2, f"Shape of 'u' must be 2, got {len(u.shape)}"
    assert len(v.shape) == 2, f"Shape of 'v' must be 2, got {len(v.shape)}"

def _sim_cosine(u: np.ndarray, v: np.ndarray) -> np.ndarray:
    _assert(u, v)

    vec_user = np.clip(u / np.linalg.norm(u, axis=-1), a_min=EPS, a_max=None)
    vec_users = np.clip(v / np.linalg.norm(v, axis=-1)[:, None], a_min=EPS, a_max=None)
    cosine = vec_user @ vec_users.T

    return np.squeeze(cosine)

def _sim_euclide(u: np.ndarray, v: np.ndarray) -> np.ndarray:
    _assert(u, v)

    dist = np.sqrt(np.square(u - v).sum(axis=-1))

    return np.squeeze(dist)

def _sim_pearson(u: np.ndarray, v: np.ndarray) -> np.ndarray:
    _assert(u, v)

    corr = np.corrcoef(u, v)[0, 1:]

    return np.squeeze(corr)

def get_score_knn(
    user_id: int,
    list_item_ids: pd.Series,
    k: int = 10,
    sim_method: Literal["cosine", "euclide", "pearson"] = "cosine",
    is_testing: bool = False
) -> list:
    rating_matrix = rating_matrix_train if not is_testing else rating_matrix_trainval

    predicted_rates = []

    for movie in list_item_ids:
        # Get users who rated 'movie'
        valid_users = df_train[df_train['item_id'] == movie]['user_id']
        if len(valid_users) == 0:
            predicted_rates.append(0)
            continue

        # Get vector of user 'user_id' and users who rated 'movie'
        vec_user = rating_matrix.loc[user_id].to_numpy()
        vec_users = rating_matrix.loc[valid_users].to_numpy()

        # Calculate sim
        vec_user = vec_user[None, :]

        if sim_method == "cosine":
            sim = _sim_cosine(vec_user, vec_users)
        elif sim_method == "euclide":
            sim = _sim_euclide(vec_user, vec_users)
        elif sim_method == "pearson":
            sim = _sim_pearson(vec_user, vec_users)
        else:
            raise NotImplementedError()
        if sim.shape == ():
            sim = np.array([sim])
        
        # Get top-k
        k = min(len(sim), k)
        topk = sim.argpartition(-k)[-k:]

        idx_sim_users = valid_users.iloc[topk]
        vec_sim_users = rating_matrix.loc[idx_sim_users]
        cosine_sim_users = sim[topk]

        # Predict rate
        numerator = ((vec_sim_users[movie] - vec_sim_users.mean(axis=1)) * (cosine_sim_users)).sum()
        denominator = cosine_sim_users.sum()
        pred_rate = vec_user.mean() + numerator / denominator

        predicted_rates.append(pred_rate)

    return predicted_rates

## Task 2: Finetune

In [65]:
item_ids_total = set(df_all['item_id'].unique())

def get_uniform_negative_sample(user_id: int, n_samples: int = 100):
    item_ids_rated = set(df_all[df_all['user_id'] == user_id]['item_id'].unique())
    non_rated = item_ids_total.difference(item_ids_rated)

    neg_samples = np.random.choice(list(non_rated), min(n_samples, len(non_rated)), replace=False)

    return neg_samples

In [66]:
def dcg_at_k(r, k):
    r = np.asfarray(r)[:k]
    if r.size:
        return np.sum(r / np.log2(np.arange(1, r.size + 1) + 1))
    return 0.
def ndcg_at_k(r, k):
    dcg_max = dcg_at_k(sorted(r, reverse=True), k)
    if not dcg_max:
        return 0.
    return dcg_at_k(r, k) / dcg_max

In [67]:
K = 5
N_NEG_SAMPELS = 100

In [76]:

list_topk = [5, 7, 10, 15]
list_methods = ["cosine", "euclide", "pearson"]


for topk, method in product(list_topk, list_methods):
    ndcg_knn = []
    for user_id in tqdm(df_test['user_id'].unique(), desc=f"{topk}-{method}"):
        # Get rows in test set
        df_val_user = df_val[df_val['user_id'] == user_id].drop(columns=['user_id'])

        items = df_val_user['item_id']
        items_neg = get_uniform_negative_sample(user_id, n_samples=N_NEG_SAMPELS)
        df_items_neg = pd.DataFrame({'item_id': items_neg, 'rating': np.zeros_like(items_neg)})

        df_val_user = pd.concat([df_val_user, df_items_neg])

        df_val_user['scores_knn'] = get_score_knn(user_id, df_val_user['item_id'], k=topk, sim_method=method)

        ndcg_random_knn = ndcg_at_k(df_val_user.sort_values('scores_knn', ascending=False)['rating'], K)

        ndcg_knn.append(ndcg_random_knn)
        

    print(f"Mean: NDCG@{K} of KNN {topk}-{method} : {np.mean(ndcg_knn).item():.4f}")

5-cosine:   0%|          | 0/447 [00:00<?, ?it/s]

Mean: NDCG@5 of KNN 5-cosine : 0.2115


5-euclide:   0%|          | 0/447 [00:00<?, ?it/s]

Mean: NDCG@5 of KNN 5-euclide : 0.0839


5-pearson:   0%|          | 0/447 [00:00<?, ?it/s]

Mean: NDCG@5 of KNN 5-pearson : 0.2061


7-cosine:   0%|          | 0/447 [00:00<?, ?it/s]

Mean: NDCG@5 of KNN 7-cosine : 0.1953


7-euclide:   0%|          | 0/447 [00:00<?, ?it/s]

Mean: NDCG@5 of KNN 7-euclide : 0.0868


7-pearson:   0%|          | 0/447 [00:00<?, ?it/s]

Mean: NDCG@5 of KNN 7-pearson : 0.1852


10-cosine:   0%|          | 0/447 [00:00<?, ?it/s]

Mean: NDCG@5 of KNN 10-cosine : 0.1850


10-euclide:   0%|          | 0/447 [00:00<?, ?it/s]

Mean: NDCG@5 of KNN 10-euclide : 0.0903


10-pearson:   0%|          | 0/447 [00:00<?, ?it/s]

Mean: NDCG@5 of KNN 10-pearson : 0.1873


15-cosine:   0%|          | 0/447 [00:00<?, ?it/s]

Mean: NDCG@5 of KNN 15-cosine : 0.1838


15-euclide:   0%|          | 0/447 [00:00<?, ?it/s]

Mean: NDCG@5 of KNN 15-euclide : 0.0849


15-pearson:   0%|          | 0/447 [00:00<?, ?it/s]

Mean: NDCG@5 of KNN 15-pearson : 0.1792


## Task 3: Evaluation

In [83]:
indices = df_train['item_id'].unique()
np.random.shuffle(indices)

scores_random = pd.Series(
    np.arange(1, len(indices) + 1),
    index=indices
)

scores_popularity = df_train.groupby('item_id').count()['rating']

In [84]:
def get_score_random(list_item_ids: list) -> list:
    scores = [scores_random[i] if i in scores_random.index else 0 for i in list_item_ids]

    return scores

def get_score_popularity(list_item_ids: list) -> list:
    scores = [scores_popularity[i] if i in scores_popularity.index else 0 for i in list_item_ids]

    return scores


In [86]:
method = "cosine"
topk = 5

ndcg_random, ndcg_popularity, ndcg_knn = [], [], []

for user_id in tqdm(df_test['user_id'].unique(), desc=f"{topk}-{method}"):
    # Get rows in test set
    df_test_user = df_test[df_test['user_id'] == user_id].drop(columns=['user_id'])

    items = df_test_user['item_id']
    items_neg = get_uniform_negative_sample(user_id, n_samples=N_NEG_SAMPELS)
    df_items_neg = pd.DataFrame({'item_id': items_neg, 'rating': np.zeros_like(items_neg)})

    df_test_user = pd.concat([df_test_user, df_items_neg])

    df_test_user['scores_knn'] = get_score_knn(
        user_id,
        df_test_user['item_id'],
        k=topk,
        sim_method=method,
        is_testing=True
    )
    df_test_user['scores_random'] = get_score_random(df_test_user['item_id'])
    df_test_user['scores_popularity'] = get_score_popularity(df_test_user['item_id'])

    ndcg_random_knn = ndcg_at_k(df_test_user.sort_values('scores_knn', ascending=False)['rating'], K)
    ndcg_random_user= ndcg_at_k(df_test_user.sort_values('scores_random', ascending=False)['rating'], K)
    ndcg_popularity_user = ndcg_at_k(df_test_user.sort_values('scores_popularity', ascending=False)['rating'], K)

    ndcg_knn.append(ndcg_random_knn)
    ndcg_random.append(ndcg_random_user)
    ndcg_popularity.append(ndcg_popularity_user)

print(f"Mean: NDCG@{K} of Random              : {np.mean(ndcg_random).item():.4f}")
print(f"Mean: NDCG@{K} of Popularity          : {np.mean(ndcg_popularity).item():.4f}")
print(f"Mean: NDCG@{K} of KNN {topk}-{method} : {np.mean(ndcg_knn).item():.4f}")

5-cosine:   0%|          | 0/446 [00:00<?, ?it/s]

Mean: NDCG@5 of Random              : 0.2198
Mean: NDCG@5 of Popularity          : 0.7492
Mean: NDCG@5 of KNN 5-cosine : 0.2169


## Task 4

In [88]:
user_id = 655

In [94]:
df_user = df_test[df_test['user_id'] == user_id]

df_user.head()

Unnamed: 0,item_id,user_id,rating
0,104841,655,4.0
1,152081,655,4.0
2,435,655,4.0
3,8464,655,3.0
4,105844,655,4.0


In [95]:
df_user.loc[:, 'scores_knn'] = get_score_knn(
    user_id,
    df_user['item_id'],
    k=5,
    sim_method="cosine",
    is_testing=True
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_user.loc[:, 'scores_knn'] = get_score_knn(


In [96]:
df_user.head()

Unnamed: 0,item_id,user_id,rating,scores_knn
0,104841,655,4.0,4.192276
1,152081,655,4.0,3.466926
2,435,655,4.0,1.971452
3,8464,655,3.0,3.390833
4,105844,655,4.0,3.43621


In [98]:
mse = mean_squared_error(df_user['rating'], df_user['scores_knn'])

print(f"MSE: {mse}")

MSE: 0.45098672922213207


In [100]:
df_user.sort_values('scores_knn', ascending=False).iloc[:5]

Unnamed: 0,item_id,user_id,rating,scores_knn
52,356,655,4.5,4.808857
59,7361,655,4.5,4.71102
44,555,655,4.0,4.500576
26,1193,655,3.5,4.500294
42,1222,655,4.5,4.392084


In [102]:
examine_users = df_test['user_id'].unique()[:5]
examine_users

array([  655,  2448,  3426, 15440, 16095])

In [105]:
matrix_examine_user = rating_matrix_trainval.loc[examine_users].to_numpy()

cosine_examine_user = \
    (matrix_examine_user / np.linalg.norm(matrix_examine_user, axis=-1)[:, None]) \
    @ \
    (rating_matrix_trainval / np.linalg.norm(rating_matrix_trainval, axis=-1)[:, None]).to_numpy().T

array([[1.e-06, 1.e-06, 1.e-06, ..., 1.e-06, 1.e-06, 1.e-06],
       [1.e-06, 1.e-06, 1.e-06, ..., 1.e-06, 1.e-06, 1.e-06],
       [1.e-06, 1.e-06, 1.e-06, ..., 1.e-06, 1.e-06, 1.e-06],
       [1.e-06, 1.e-06, 1.e-06, ..., 1.e-06, 1.e-06, 1.e-06],
       [1.e-06, 1.e-06, 1.e-06, ..., 1.e-06, 1.e-06, 1.e-06]])

In [130]:
top3 = cosine_examine_user[0].argpartition(-3)[-3:]
rating_matrix_trainval.iloc[top3]

item_id,2,3,4,7,9,11,18,21,22,25,...,233171,233579,233619,233907,233967,234410,234567,234691,235105,235509
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
964852,4.0,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,3.0,1e-06,1e-06,...,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06
627353,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,...,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06
655,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,...,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06,1e-06


In [131]:
rating_matrix_trainval.loc[examine_users[0]]

item_id
2         0.000001
3         0.000001
4         0.000001
7         0.000001
9         0.000001
            ...   
234410    0.000001
234567    0.000001
234691    0.000001
235105    0.000001
235509    0.000001
Name: 655, Length: 8277, dtype: float64

In [140]:
top3

array([424, 253,   0])

In [142]:
cosine_examine_user[0, top3]

array([0.41320641, 0.44787606, 1.        ])