In [1]:
%cd ../

/Users/macos/Uni/1st_year/period_2/RecSys/hw


In [2]:
from typing import Literal
from itertools import product

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy
import seaborn as sns
from tqdm.notebook import tqdm
from sklearn.metrics import mean_squared_error

In [3]:
plt.style.use('seaborn-v0_8')
plt.rcParams.update({'font.size': 8})

## Task 0: Load data

In [4]:
path_train = "week2/data_split/train.csv"
path_test = "week2/data_split/test.csv"
path_val = "week2/data_split/validation.csv"

In [5]:
df_train = pd.read_csv(path_train)
df_val = pd.read_csv(path_val)
df_test = pd.read_csv(path_test)

In [6]:
df_all = pd.concat([df_train, df_val, df_test])

In [7]:
df_all.head()

Unnamed: 0,item_id,user_id,rating
0,2804,655,4.5
1,5957,655,3.0
2,1213,655,4.5
3,134130,655,4.0
4,5065,655,3.0


In [8]:
len(df_all) / (len(df_all['item_id'].unique()) * len(df_all['user_id'].unique()))

0.022570505586922758

In [9]:
EPS = 1e-9

rating_matrix_train = df_train.pivot(index='user_id', columns='item_id', values='rating').fillna(EPS)
rating_matrix_val = df_test.pivot(index='user_id', columns='item_id', values='rating').fillna(EPS)
rating_matrix_test = df_test.pivot(index='user_id', columns='item_id', values='rating').fillna(EPS)

df_trainval = pd.concat([df_train, df_val])
rating_matrix_trainval = df_trainval.pivot(index='user_id', columns='item_id', values='rating').fillna(EPS)

rating_matrix_train.head()

item_id,2,3,4,7,9,11,18,21,22,25,...,232017,233171,233579,233619,233907,233967,234567,234691,235105,235509
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
655,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09,...,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09
2448,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09,...,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09
3426,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09,...,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09
15440,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09,...,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09
16095,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09,...,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09,1e-09


In [42]:
def _get_rating_sim(rating: pd.DataFrame):
    rating = rating.subtract(rating.mean(axis=1), axis=0)
    rating = rating.dropna(axis=1)

    return rating

rating_matrix_train_sim = _get_rating_sim(rating_matrix_train)
rating_matrix_trainval_sim = _get_rating_sim(rating_matrix_trainval)

## Task 1: Implement KNN, Random and Popularity

In [None]:
indices = df_train['item_id'].unique()
np.random.shuffle(indices)

scores_random = pd.Series(
    np.arange(1, len(indices) + 1),
    index=indices
)

scores_popularity = df_train.groupby('item_id').count()['rating']

In [None]:
def get_score_random(list_item_ids: list) -> list:
    scores = [scores_random[i] if i in scores_random.index else 0 for i in list_item_ids]

    return scores

def get_score_popularity(list_item_ids: list) -> list:
    scores = [scores_popularity[i] if i in scores_popularity.index else 0 for i in list_item_ids]

    return scores


In [55]:
def _assert(u: np.ndarray, v: np.ndarray):
    assert len(u.shape) == 2, f"Shape of 'u' must be 2, got {len(u.shape)}"
    assert len(v.shape) == 2, f"Shape of 'v' must be 2, got {len(v.shape)}"

def _sim_cosine(user_id: np.ndarray, other_user_ids: np.ndarray, df_sim: pd.DataFrame) -> np.ndarray:
    vec_user, vec_other_users = df_sim.loc[user_id].to_numpy()[None, :], df_sim.loc[other_user_ids].to_numpy()

    vec_user = np.clip(vec_user / np.linalg.norm(vec_user, axis=-1), a_min=EPS, a_max=None)
    vec_users = np.clip(vec_other_users / np.linalg.norm(vec_other_users, axis=-1)[:, None], a_min=EPS, a_max=None)
    cosine = vec_user @ vec_users.T

    return np.squeeze(cosine)

def _sim_euclide(user_id: np.ndarray, other_user_ids: np.ndarray, df_sim: pd.DataFrame) -> np.ndarray:
    vec_user, vec_other_users = df_sim.loc[user_id].to_numpy()[None, :], df_sim.loc[other_user_ids].to_numpy()

    dist = np.sqrt(np.square(vec_user - vec_other_users).sum(axis=-1))

    return np.squeeze(dist)

def _sim_pearson(user_id: np.ndarray, other_user_ids: np.ndarray, df_sim: pd.DataFrame) -> np.ndarray:
    vec_user, vec_other_users = df_sim.loc[user_id].to_numpy()[None, :], df_sim.loc[other_user_ids].to_numpy()

    corr = np.corrcoef(vec_user, vec_other_users)[0, 1:]

    return np.squeeze(corr)

def get_score_knn(
    user_id: int,
    list_item_ids: pd.Series,
    k: int = 10,
    sim_method: Literal["cosine", "euclide", "pearson"] = "cosine",
    is_testing: bool = False
) -> list:

    if is_testing is True:
        rating_matrix = rating_matrix_trainval
        df = df_trainval
        rating_sim = rating_matrix_trainval_sim
    else:
        rating_matrix = rating_matrix_train
        df = df_train
        rating_sim = rating_matrix_train_sim

    predicted_rates = []

    for movie in list_item_ids:
        # Get users who rated 'movie'
        valid_users = df[df['item_id'] == movie]['user_id']
        if len(valid_users) == 0:
            predicted_rates.append(0)
            continue

        # Calculate sim
        if sim_method == "cosine":
            sim = _sim_cosine(user_id, valid_users, rating_matrix_trainval_sim)
        elif sim_method == "euclide":
            sim = _sim_euclide(user_id, valid_users, rating_matrix_trainval_sim)
        elif sim_method == "pearson":
            sim = _sim_pearson(user_id, valid_users, rating_matrix_trainval_sim)
        else:
            raise NotImplementedError()
        if sim.shape == ():
            sim = np.array([sim])
        
        # Get top-k
        k = min(len(sim), k)
        topk = sim.argpartition(-k)[-k:]

        vec_user = rating_matrix.loc[user_id].to_numpy()

        idx_sim_users = valid_users.iloc[topk]
        vec_sim_users = rating_matrix.loc[idx_sim_users]
        cosine_sim_users = sim[topk]

        # Predict rate
        numerator = (
            (vec_sim_users[movie] - vec_sim_users.mean(axis=1))
            * cosine_sim_users
        ).sum()
        denominator = cosine_sim_users.sum()
        pred_rate = np.nanmean(vec_user) + numerator / denominator

        predicted_rates.append(pred_rate)

    return predicted_rates

## Task 2: Finetune

In [56]:
item_ids_total = set(df_all['item_id'].unique())

def get_uniform_negative_sample(user_id: int, n_samples: int = 100):
    item_ids_rated = set(df_all[df_all['user_id'] == user_id]['item_id'].unique())
    non_rated = item_ids_total.difference(item_ids_rated)

    neg_samples = np.random.choice(list(non_rated), min(n_samples, len(non_rated)), replace=False)

    return neg_samples

In [51]:
def dcg_at_k(r, k):
    r = np.asfarray(r)[:k]
    if r.size:
        return np.sum(r / np.log2(np.arange(1, r.size + 1) + 1))
    return 0.
def ndcg_at_k(r, k):
    dcg_max = dcg_at_k(sorted(r, reverse=True), k)
    if not dcg_max:
        return 0.
    return dcg_at_k(r, k) / dcg_max

In [52]:
K = 5
N_NEG_SAMPELS = 100

In [57]:
list_topk = [1, 2, 3, 5, 10, 20]
list_methods = ["cosine", "pearson", "euclide"]


for topk, method in product(list_topk, list_methods):
    ndcg_knn = []
    for user_id in tqdm(df_val['user_id'].unique(), desc=f"{topk}-{method}"):
        # Get rows in val set
        df_val_user = df_val[df_val['user_id'] == user_id].drop(columns=['user_id'])

        items = df_val_user['item_id']
        items_neg = get_uniform_negative_sample(user_id, n_samples=N_NEG_SAMPELS)
        df_items_neg = pd.DataFrame({'item_id': items_neg, 'rating': np.zeros_like(items_neg)})

        df_val_user = pd.concat([df_val_user, df_items_neg])

        df_val_user['scores_knn'] = get_score_knn(user_id, df_val_user['item_id'], k=topk, sim_method=method)

        ndcg_random_knn = ndcg_at_k(df_val_user.sort_values('scores_knn', ascending=False)['rating'], K)

        ndcg_knn.append(ndcg_random_knn)
        

    print(f"Mean: NDCG@{K} of KNN {topk}-{method} : {np.mean(ndcg_knn).item():.4f}")

1-cosine:   0%|          | 0/447 [00:00<?, ?it/s]

Mean: NDCG@5 of KNN 1-cosine : 0.5014


1-pearson:   0%|          | 0/447 [00:00<?, ?it/s]

Mean: NDCG@5 of KNN 1-pearson : 0.5098


1-euclide:   0%|          | 0/447 [00:00<?, ?it/s]

Mean: NDCG@5 of KNN 1-euclide : 0.1753


2-cosine:   0%|          | 0/447 [00:00<?, ?it/s]

Mean: NDCG@5 of KNN 2-cosine : 0.3262


2-pearson:   0%|          | 0/447 [00:00<?, ?it/s]

Mean: NDCG@5 of KNN 2-pearson : 0.3194


2-euclide:   0%|          | 0/447 [00:00<?, ?it/s]

Mean: NDCG@5 of KNN 2-euclide : 0.1070


3-cosine:   0%|          | 0/447 [00:00<?, ?it/s]

Mean: NDCG@5 of KNN 3-cosine : 0.2664


3-pearson:   0%|          | 0/447 [00:00<?, ?it/s]

Mean: NDCG@5 of KNN 3-pearson : 0.2635


3-euclide:   0%|          | 0/447 [00:00<?, ?it/s]

Mean: NDCG@5 of KNN 3-euclide : 0.0950


5-cosine:   0%|          | 0/447 [00:00<?, ?it/s]

Mean: NDCG@5 of KNN 5-cosine : 0.2344


5-pearson:   0%|          | 0/447 [00:00<?, ?it/s]

Mean: NDCG@5 of KNN 5-pearson : 0.2258


5-euclide:   0%|          | 0/447 [00:00<?, ?it/s]

Mean: NDCG@5 of KNN 5-euclide : 0.0891


10-cosine:   0%|          | 0/447 [00:00<?, ?it/s]

Mean: NDCG@5 of KNN 10-cosine : 0.2101


10-pearson:   0%|          | 0/447 [00:00<?, ?it/s]

Mean: NDCG@5 of KNN 10-pearson : 0.1993


10-euclide:   0%|          | 0/447 [00:00<?, ?it/s]

Mean: NDCG@5 of KNN 10-euclide : 0.0887


20-cosine:   0%|          | 0/447 [00:00<?, ?it/s]

Mean: NDCG@5 of KNN 20-cosine : 0.1894


20-pearson:   0%|          | 0/447 [00:00<?, ?it/s]

Mean: NDCG@5 of KNN 20-pearson : 0.1821


20-euclide:   0%|          | 0/447 [00:00<?, ?it/s]

Mean: NDCG@5 of KNN 20-euclide : 0.0878


## Task 3: Evaluation

In [86]:
method = "pearson"
topk = 1

ndcg_random, ndcg_popularity, ndcg_knn = [], [], []

for user_id in tqdm(df_test['user_id'].unique(), desc=f"{topk}-{method}"):
    # Get rows in test set
    df_test_user = df_test[df_test['user_id'] == user_id].drop(columns=['user_id'])

    items = df_test_user['item_id']
    items_neg = get_uniform_negative_sample(user_id, n_samples=N_NEG_SAMPELS)
    df_items_neg = pd.DataFrame({'item_id': items_neg, 'rating': np.zeros_like(items_neg)})

    df_test_user = pd.concat([df_test_user, df_items_neg])

    df_test_user['scores_knn'] = get_score_knn(
        user_id,
        df_test_user['item_id'],
        k=topk,
        sim_method=method,
        is_testing=True
    )
    df_test_user['scores_random'] = get_score_random(df_test_user['item_id'])
    df_test_user['scores_popularity'] = get_score_popularity(df_test_user['item_id'])

    ndcg_random_knn = ndcg_at_k(df_test_user.sort_values('scores_knn', ascending=False)['rating'], K)
    ndcg_random_user= ndcg_at_k(df_test_user.sort_values('scores_random', ascending=False)['rating'], K)
    ndcg_popularity_user = ndcg_at_k(df_test_user.sort_values('scores_popularity', ascending=False)['rating'], K)

    ndcg_knn.append(ndcg_random_knn)
    ndcg_random.append(ndcg_random_user)
    ndcg_popularity.append(ndcg_popularity_user)

print(f"Mean: NDCG@{K} of Random              : {np.mean(ndcg_random).item():.4f}")
print(f"Mean: NDCG@{K} of Popularity          : {np.mean(ndcg_popularity).item():.4f}")
print(f"Mean: NDCG@{K} of KNN {topk}-{method} : {np.mean(ndcg_knn).item():.4f}")

1-pearson:   0%|          | 0/446 [00:00<?, ?it/s]

Mean: NDCG@5 of Random              : 0.1653
Mean: NDCG@5 of Popularity          : 0.7438
Mean: NDCG@5 of KNN 1-pearson : 0.4855


## Task 4

In [87]:
user_id = 655

In [88]:
df_user = df_test[df_test['user_id'] == user_id]

df_user.head()

Unnamed: 0,item_id,user_id,rating
0,104841,655,4.0
1,152081,655,4.0
2,435,655,4.0
3,8464,655,3.0
4,105844,655,4.0


In [89]:
df_user.loc[:, 'scores_knn'] = get_score_knn(
    user_id,
    df_user['item_id'],
    k=5,
    sim_method="cosine",
    is_testing=True
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_user.loc[:, 'scores_knn'] = get_score_knn(


In [90]:
df_user.head()

Unnamed: 0,item_id,user_id,rating,scores_knn
0,104841,655,4.0,4.216776
1,152081,655,4.0,3.107092
2,435,655,4.0,2.363636
3,8464,655,3.0,3.392965
4,105844,655,4.0,3.468998


In [91]:
mse = mean_squared_error(df_user['rating'], df_user['scores_knn'])

print(f"MSE: {mse}")

MSE: 0.4728722981434736


In [92]:
df_user.sort_values('scores_knn', ascending=False).iloc[:5]

Unnamed: 0,item_id,user_id,rating,scores_knn
59,7361,655,4.5,4.711514
44,555,655,4.0,4.501752
52,356,655,4.5,4.497583
27,1208,655,4.5,4.388801
6,4973,655,4.0,4.320186


In [93]:
examine_users = df_test['user_id'].unique()[:5]
examine_users

array([  655,  2448,  3426, 15440, 16095])

In [99]:
matrix_examine_user = rating_matrix_trainval_sim.loc[examine_users].to_numpy()

cosine_examine_user = \
    (matrix_examine_user / np.linalg.norm(matrix_examine_user, axis=-1)[:, None]) \
    @ \
    (rating_matrix_trainval / np.linalg.norm(rating_matrix_trainval, axis=-1)[:, None]).to_numpy().T

In [108]:
cosine_examine_user.sort(axis=1)

In [111]:
df_top5_sim = pd.DataFrame({
    'user_id': examine_users,
    'top1_sim': cosine_examine_user[:, -1],
    'top2_sim': cosine_examine_user[:, -2],
    'top3_sim': cosine_examine_user[:, -3],
    'top4_sim': cosine_examine_user[:, -4],
    'top5_sim': cosine_examine_user[:, -5]
})

df_top5_sim

Unnamed: 0,user_id,top1_sim,top2_sim,top3_sim,top4_sim,top5_sim
0,655,0.985485,0.428047,0.378858,0.377949,0.364885
1,2448,0.99905,0.197122,0.147465,0.134586,0.125742
2,3426,0.991063,0.330687,0.314056,0.313118,0.309631
3,15440,0.998672,0.336205,0.326017,0.324605,0.28536
4,16095,0.997134,0.402273,0.389271,0.375923,0.351592
