### Acknowledgement
This code is derived from the template made up from work of Xinyang Chen

In [1]:
%cd ../

/Users/macos/Uni/1st_year/period_2/RecSys/hw


In [19]:
from itertools import product

import pandas as pd
import numpy as np
from tqdm.contrib.itertools import product
from tqdm.notebook import tqdm
from sklearn.metrics import mean_squared_error

# Load data

In [3]:
path_train = "data_split/train.csv"
path_val = "data_split/validation.csv"
path_test = "data_split/test.csv"

In [4]:
# using the provided split

df_train = pd.read_csv(path_train)
df_val = pd.read_csv(path_val)
df_test = pd.read_csv(path_test)

df_trainval = pd.concat([df_train, df_val])

df_all = pd.concat([df_trainval, df_test])

In [5]:
# picking a user for testing
user_sample_id = list(df_trainval['user_id'].unique())[0]
user_sample_items = df_test[df_test['user_id'] == user_sample_id]
print(f"User id: {user_sample_id}, number of ratings: {len(user_sample_items)}")

User id: 655, number of ratings: 60


# Define models

In [6]:
class POP:
    def __init__(self, df):
        pop = df.item_id.value_counts().reset_index()
        pop.columns = ["item_id", "score"]
        self.pop = pop

    def getScores(self, items, user_id):
        # if the item doesn't appear in the df_trainval set, the score of 0 is assigned to it
        return pd.merge(items, self.pop, how='left', on='item_id').fillna(0)

# testing the model with one user
pop = POP(df_trainval)
pop.getScores(user_sample_items, user_sample_id).sort_values('score', ascending=False).head()

Unnamed: 0,item_id,user_id,rating,score
52,356,655,4.5,232
55,608,655,4.5,156
6,4973,655,4.0,147
59,7361,655,4.5,146
34,4995,655,4.0,144


In [7]:
class RND:
    def __init__(self, df):
        item_id_set = set(df.item_id.unique())
        rnd = pd.DataFrame(item_id_set)
        rnd.columns = ["item_id"]
        rnd = rnd.sample(frac=1)
        rnd["score"] = range(len(rnd))
        self.rnd = rnd

    def getScores(self, items, user_id):
        return pd.merge(items, self.rnd, how='left', on='item_id').fillna(0)

# testing the model with one user
rnd = RND(df_trainval)
rnd.getScores(user_sample_items, user_sample_id).sort_values('score', ascending=False).head()

Unnamed: 0,item_id,user_id,rating,score
22,45720,655,3.0,7829
12,68237,655,4.5,7803
2,435,655,4.0,7727
25,454,655,4.0,6518
26,1193,655,3.5,6415


In [8]:
class MatrixFac:
    def __init__(
        self,
        df: pd.DataFrame,
        lr: float = 1e-4,
        k: int = 128,
        beta: float = 1e-2,
        num_epochs: int = 10
    ) -> None:
        self._lr = lr
        self._k = k
        self._beta = beta
        self._num_epochs = num_epochs
        self._df = df

        self.rating = df.pivot(
            index='user_id', 
            columns='item_id', 
            values='rating'
        ).fillna(0)

        self._find_matrix_user_item()

    def _find_matrix_user_item(self):
        # Initialize
        num_users = len(self._df['user_id'].unique())
        num_items = len(self._df['item_id'].unique())

        R = self.rating.to_numpy()
        P = np.random.random((num_users, self._k))
        Qt = np.random.random((self._k, num_items))

        # with tqdm(total=self._num_epochs) as pbar:
        for _ in range(self._num_epochs):
            # E = R - P @ Qt

            # pbar.set_description(f"Loss = {np.square(E).sum():.4f}")
            # pbar.update()

            for i in range(num_users):
                for j in range(num_items):
                    if R[i][j] > 0:
                        eij = R[i][j] - np.dot(P[i,:],Qt[:,j])

                        P[i, :] = P[i, :] + 2 * self._lr * eij * Qt[:, j] - 2 * self._beta * P[i, :]
                        Qt[:, j] = Qt[:, j] + 2 * self._lr * eij * P[i, :] - 2 * self._beta * Qt[:, j]

        self.rating = pd.DataFrame(
            P @ Qt,
            columns=self.rating.columns,
            index=self.rating.index
        )

    def getScores(self, items, user_id):
        items = items.copy()
        items.loc[:, 'score'] = [
            self.rating.loc[user_id, i]
            if i in self.rating
            else 0
            for i in items['item_id']
        ]

        return items

# testing the model with one user
matrixfac = MatrixFac(df_trainval, num_epochs=1)
matrixfac.getScores(user_sample_items, user_sample_id).sort_values('score', ascending=False).head()

Unnamed: 0,item_id,user_id,rating,score
31,5363,655,3.0,0.882313
56,44004,655,3.0,0.8728
54,95088,655,4.0,0.795967
24,6294,655,3.0,0.794739
32,43928,655,2.0,0.768628


# Define metric

In [9]:
NEG_SAMPLES = 100
AT = 5

def ncdg(df: pd.DataFrame, topn):
    '''
    len(predicted), len(truth) >= topn
    '''
    dcg = (
        df
        .sort_values('score', ascending=False)
        .head(topn)['rating']
        / np.log2(np.arange(2, topn + 2))       # Discount rate: 0 +1 -> rank +1 -> discount rate
    ).cumsum().tail(1).values[0]
    idcg = (
        df
        .sort_values('rating', ascending=False)
        .head(topn)['rating']
        / np.log2(np.arange(2, topn + 2))           # Discount rate
    ).cumsum().tail(1).values[0]
    return dcg / idcg

# testing ndcg
ncdg(pop.getScores(user_sample_items, user_sample_id), 5)

0.9314772023011874

## Fine-tune

In [24]:
list_num_epochs = [10, 30, 50, 100]
list_lr = [1e-2, 1e-3, 1e-4]
list_k = [5, 10, 20, 100, 256]
list_beta = [1e-3, 1e-4]

all_items = df_trainval['item_id'].unique()

results = []
for (num_epochs, lr, k, beta) in product(
    list_num_epochs,
    list_lr,
    list_k,
    list_beta
):
    # print(f"Start with num_epoch = {num_epochs}, lr = {lr}, k = {k}, beta = {beta}")
    
    matrixfac = MatrixFac(df_train, num_epochs=num_epochs, k=k ,lr=lr, beta=beta)

    users_ndcg5 = []
    for user in df_val['user_id'].unique():
        pos_items = df_trainval[df_trainval['user_id'] == user]['item_id'].unique()
        neg_samples = np.random.choice(
            all_items[~np.isin(all_items, pos_items)],
            NEG_SAMPLES,
            replace=False
        )
        
        user_test_samples = df_val[df_val['user_id'] == user][['item_id', 'rating']]
        user_neg_samples = pd.DataFrame(
            [(smp, 0.0) for smp in neg_samples],
            columns=['item_id', 'rating']
        )
        items = pd.concat([user_test_samples, user_neg_samples])

        matrixfac_scores = matrixfac.getScores(items, user)

        users_ndcg5.append(ncdg(matrixfac_scores, AT))

    results.append({
        'num_epochs': num_epochs,
        'lr': lr,
        'k': k,
        'beta': beta,
        'ncdg5': np.mean(users_ndcg5)
    })

pd.DataFrame.from_records(results)

  0%|          | 0/120 [00:00<?, ?it/s]

  Qt[:, j] = Qt[:, j] + 2 * self._lr * eij * P[i, :] - 2 * self._beta * Qt[:, j]
  P[i, :] = P[i, :] + 2 * self._lr * eij * Qt[:, j] - 2 * self._beta * P[i, :]
  Qt[:, j] = Qt[:, j] + 2 * self._lr * eij * P[i, :] - 2 * self._beta * Qt[:, j]
  P[i, :] = P[i, :] + 2 * self._lr * eij * Qt[:, j] - 2 * self._beta * P[i, :]
  Qt[:, j] = Qt[:, j] + 2 * self._lr * eij * P[i, :] - 2 * self._beta * Qt[:, j]
  P[i, :] = P[i, :] + 2 * self._lr * eij * Qt[:, j] - 2 * self._beta * P[i, :]
  Qt[:, j] = Qt[:, j] + 2 * self._lr * eij * P[i, :] - 2 * self._beta * Qt[:, j]
  P[i, :] = P[i, :] + 2 * self._lr * eij * Qt[:, j] - 2 * self._beta * P[i, :]
  Qt[:, j] = Qt[:, j] + 2 * self._lr * eij * P[i, :] - 2 * self._beta * Qt[:, j]
  P[i, :] = P[i, :] + 2 * self._lr * eij * Qt[:, j] - 2 * self._beta * P[i, :]
  Qt[:, j] = Qt[:, j] + 2 * self._lr * eij * P[i, :] - 2 * self._beta * Qt[:, j]
  P[i, :] = P[i, :] + 2 * self._lr * eij * Qt[:, j] - 2 * self._beta * P[i, :]
  P[i, :] = P[i, :] + 2 * self._lr * eij

Unnamed: 0,num_epochs,lr,k,beta,ncdg5
0,10,0.0100,5,0.0010,0.434021
1,10,0.0100,5,0.0001,0.312638
2,10,0.0100,10,0.0010,0.268201
3,10,0.0100,10,0.0001,0.301115
4,10,0.0100,20,0.0010,0.051512
...,...,...,...,...,...
115,100,0.0001,20,0.0001,0.019119
116,100,0.0001,100,0.0010,0.014228
117,100,0.0001,100,0.0001,0.017392
118,100,0.0001,256,0.0010,0.012464


In [25]:
pd.DataFrame.from_records(results).sort_values(by='ncdg5', ascending=False)

Unnamed: 0,num_epochs,lr,k,beta,ncdg5
11,10,0.0010,5,0.0001,0.650747
10,10,0.0010,5,0.0010,0.641012
111,100,0.0001,5,0.0001,0.633647
81,50,0.0001,5,0.0001,0.630784
40,30,0.0010,5,0.0010,0.627472
...,...,...,...,...,...
118,100,0.0001,256,0.0010,0.012464
108,100,0.0010,256,0.0010,0.011620
112,100,0.0001,10,0.0010,0.011593
76,50,0.0010,100,0.0010,0.011310


# Final

In [10]:
performance = []
all_items = df_all['item_id'].unique()


# training models
pop = POP(df_trainval)
rnd = RND(df_trainval)
matrixfac = MatrixFac(df_trainval, num_epochs=10, k=5 ,lr=1e-3, beta=1e-4)

# evaluating models
for user in df_test['user_id'].unique():
    pos_items = df_all[df_all['user_id'] == user]['item_id'].unique()
    neg_samples = np.random.choice(all_items[~np.isin(all_items, pos_items)], NEG_SAMPLES, replace=False)
    
    user_test_samples = df_test[df_test['user_id'] == user][['item_id', 'rating']]
    user_neg_samples = pd.DataFrame(
        [(smp, 0.0) for smp in neg_samples],
        columns=['item_id', 'rating']
    )
    items = pd.concat([user_test_samples, user_neg_samples])

    pop_scores = pop.getScores(items, user)
    rand_scores = rnd.getScores(items, user)
    matrixfac_scores = matrixfac.getScores(items, user)

    performance.append((user, ncdg(pop_scores, AT), ncdg(rand_scores, AT), ncdg(matrixfac_scores, AT)))

performance = pd.DataFrame(performance, columns=['user_id', 'popular_ndcg5', 'random_ndcg5', 'matrixfac_ndcg5'])
performance[['popular_ndcg5', 'random_ndcg5', 'matrixfac_ndcg5']].describe()

Unnamed: 0,popular_ndcg5,random_ndcg5,matrixfac_ndcg5
count,446.0,446.0,446.0
mean,0.744613,0.169726,0.627583
std,0.193453,0.189687,0.237287
min,0.0,0.0,0.0
25%,0.648062,0.0,0.490508
50%,0.786175,0.13155,0.667589
75%,0.882494,0.277498,0.804875
max,1.0,0.78787,1.0


In [11]:
print("Popular            : \tNDCG@5=", performance['popular_ndcg5'].mean())
print("Random             : \tNDCG@5=", performance['random_ndcg5'].mean())
print("MatrixFactorization: \tNDCG@5=", performance['matrixfac_ndcg5'].mean())

Popular            : 	NDCG@5= 0.7446125290054115
Random             : 	NDCG@5= 0.1697264354867019
MatrixFactorization: 	NDCG@5= 0.6275834586522915


# Task 4: Recommendation exploration

In [13]:
user_id = 655

In [14]:
df_trainval[df_trainval['user_id'] == user_id]

Unnamed: 0,item_id,user_id,rating
0,2804,655,4.5
1,5957,655,3.0
2,1213,655,4.5
3,134130,655,4.0
4,5065,655,3.0
...,...,...,...
56,8981,655,3.5
57,2001,655,4.0
58,3175,655,4.5
59,61240,655,4.5


In [20]:
items = df_trainval[df_trainval['user_id'] == user][['item_id', 'rating']]
matrixfac_scores = matrixfac.getScores(items, user)

matrixfac_scores.head()

Unnamed: 0,item_id,rating,score
55867,48877,4.0,3.031638
55868,37729,5.0,3.917457
55869,60397,5.0,3.220688
55870,56941,3.0,2.622753
55871,62999,5.0,2.987865


In [21]:
mean_squared_error(matrixfac_scores['rating'], matrixfac_scores['score'])

1.9623639669874895

In [22]:
matrixfac_scores.sort_values(by='score', ascending=False)

Unnamed: 0,item_id,rating,score
18450,296,4.5,4.325832
55868,37729,5.0,3.917457
55872,55052,5.0,3.559452
55875,1721,5.0,3.415667
55876,88163,4.5,3.403536
18448,53121,4.0,3.317211
55869,60397,5.0,3.220688
18447,35836,3.5,3.13065
55874,52975,4.5,3.077265
55867,48877,4.0,3.031638
