### Acknowledgement
This code is derived from the template made up from work of Xinyang Chen

In [1]:
%cd ../

/Users/macos/Uni/1st_year/period_2/RecSys/hw


In [2]:
import pandas as pd
import numpy as np
from tqdm.notebook import trange

# Load data

In [3]:
path_train = "data_split/train.csv"
path_val = "data_split/validation.csv"
path_test = "data_split/test.csv"

In [4]:
# using the provided split
df_trainval = pd.concat([
    pd.read_csv(path_train),
    pd.read_csv(path_val),
])
test = pd.read_csv(path_test)

ratings = pd.concat([df_trainval, test])

In [None]:
df_train = pd.read_csv(path_train)
df_val = pd.read_csv(path_val)
df_test = pd.read_csv(path_test)

In [5]:
# picking a user for testing
user_sample_id = list(df_trainval['user_id'].unique())[0]
user_sample_items = test[test['user_id'] == user_sample_id]
print(f"User id: {user_sample_id}, number of ratings: {len(user_sample_items)}")

User id: 655, number of ratings: 60


# Define models

In [6]:
class POP:
    def __init__(self, df):
        pop = df.item_id.value_counts().reset_index()
        pop.columns = ["item_id", "score"]
        self.pop = pop

    def getScores(self, items, user_id):
        # if the item doesn't appear in the df_trainval set, the score of 0 is assigned to it
        return pd.merge(items, self.pop, how='left', on='item_id').fillna(0)

# testing the model with one user
pop = POP(df_trainval)
pop.getScores(user_sample_items, user_sample_id).sort_values('score', ascending=False).head()

Unnamed: 0,item_id,user_id,rating,score
52,356,655,4.5,232
55,608,655,4.5,156
6,4973,655,4.0,147
59,7361,655,4.5,146
34,4995,655,4.0,144


In [7]:
class RND:
    def __init__(self, df):
        item_id_set = set(df.item_id.unique())
        rnd = pd.DataFrame(item_id_set)
        rnd.columns = ["item_id"]
        rnd = rnd.sample(frac=1)
        rnd["score"] = range(len(rnd))
        self.rnd = rnd

    def getScores(self, items, user_id):
        return pd.merge(items, self.rnd, how='left', on='item_id').fillna(0)

# testing the model with one user
rnd = RND(df_trainval)
rnd.getScores(user_sample_items, user_sample_id).sort_values('score', ascending=False).head()

Unnamed: 0,item_id,user_id,rating,score
57,60069,655,4.0,8270
35,5617,655,3.5,8129
27,1208,655,4.5,8079
15,2395,655,3.5,8076
41,103772,655,3.0,8073


In [12]:
class MatrixFac:
    def __init__(
        self,
        df: pd.DataFrame,
        lr: float = 1e-4,
        k: int = 128,
        beta: float = 1e-2,
        num_epochs: int = 10
    ) -> None:
        self._lr = lr
        self._k = k
        self._beta = beta
        self._num_epochs = num_epochs
        self._df = df

        self.rating = df.pivot(
            index='user_id', 
            columns='item_id', 
            values='rating'
        ).fillna(0)

        self._find_matrix_user_item()

    def _find_matrix_user_item(self):
        # Initialize
        num_users = len(self._df['user_id'].unique())
        num_items = len(self._df['item_id'].unique())

        R = self.rating.to_numpy()
        P = np.random.random((num_users, self._k))
        Qt = np.random.random((self._k, num_items))

        for _ in trange(self._num_epochs):
            E = R - P @ Qt
            print(np.square(E).sum())

            P_new, Qt_new = np.zeros_like(P), np.zeros_like(Qt)

            for i in range(num_users):
                for j in range(num_items):
                    P_new[i, :] = P[i, :] + 2 * self._lr * E[i, j] * Qt[:, j] - 2 * self._beta * P[i, :]
                    Qt_new[:, j] = Qt[:, j] + 2 * self._lr * E[i, j] * P[i, :] - 2 * self._beta * Qt[:, j]

            P, Qt = P_new, Qt_new

        self.rating = pd.DataFrame(
            P @ Qt,
            columns=self.rating.columns,
            index=self.rating.index
        )

    def getScores(self, items, user_id):
        items = items.copy()
        items.loc[:, 'score'] = [
            self.rating.loc[user_id, i]
            if i in self.rating
            else 0
            for i in items['item_id']
        ]

        return items

# testing the model with one user
matrixfac = MatrixFac(df_trainval, num_epochs=1)
matrixfac.getScores(user_sample_items, user_sample_id).sort_values('score', ascending=False).head()

  0%|          | 0/1 [00:00<?, ?it/s]

3801807924.3554463


Unnamed: 0,item_id,user_id,rating,score
35,5617,655,3.5,38.544183
10,5377,655,4.0,36.633244
0,104841,655,4.0,36.632039
46,6586,655,3.0,35.864557
49,2,655,5.0,35.822123


# Define metric

In [16]:
def ncdg(df: pd.DataFrame, topn):
    '''
    len(predicted), len(truth) >= topn
    '''
    dcg = (
        df
        .sort_values('score', ascending=False)
        .head(topn)['rating']
        / np.log2(np.arange(2, topn + 2))       # Discount rate: 0 +1 -> rank +1 -> discount rate
    ).cumsum().tail(1).values[0]
    idcg = (
        df
        .sort_values('rating', ascending=False)
        .head(topn)['rating']
        / np.log2(np.arange(2, topn + 2))           # Discount rate
    ).cumsum().tail(1).values[0]
    return dcg / idcg

# testing ndcg
ncdg(pop.getScores(user_sample_items, user_sample_id), 5)

0.9314772023011874

In [24]:
performance = []
all_items = ratings['item_id'].unique()
NEG_SAMPLES = 100
AT = 5

# training models
pop = POP(df_trainval)
rnd = RND(df_trainval)
matrixfac = MatrixFac(df_trainval, num_epochs=25, k=5 ,lr=5e-3)

# evaluating models
for user in test['user_id'].unique():
    pos_items = ratings[ratings['user_id'] == user]['item_id'].unique()
    neg_samples = np.random.choice(all_items[~np.isin(all_items, pos_items)], NEG_SAMPLES, replace=False) #UPDATE
    
    user_test_samples = test[test['user_id'] == user][['item_id', 'rating']]
    user_neg_samples = pd.DataFrame(
        [(smp, 0.0) for smp in neg_samples],
        columns=['item_id', 'rating']
    )
    items = pd.concat([user_test_samples, user_neg_samples])

    pop_scores = pop.getScores(items, user)
    rand_scores = rnd.getScores(items, user)
    matrixfac_scores = matrixfac.getScores(items, user)

    performance.append((user, ncdg(pop_scores, AT), ncdg(rand_scores, AT), ncdg(matrixfac_scores, AT)))

performance = pd.DataFrame(performance, columns=['user_id', 'popular_ndcg5', 'random_ndcg5', 'matrixfac_ndcg5'])
performance[['popular_ndcg5', 'random_ndcg5', 'matrixfac_ndcg5']].describe()

  0%|          | 0/25 [00:00<?, ?it/s]

6872907.053846599
6088950.242753441
5424221.4265326895
4857468.384874876
4371847.970946807
3953881.055073077
3592684.914574358
3279401.966542993
3006769.486442814
2768791.923446622
2560488.7993276087
2377698.925420539
2216927.0259626806
2075222.6057047474
1950083.5586780016
1839378.9226264928
1741286.567087606
1654242.6166659207
1576900.1606894948
1508095.3598558619
1446819.4814597333
1392195.7141224435
1343459.8569807676
1299944.1661166644
1261063.7865407635


Unnamed: 0,popular_ndcg5,random_ndcg5,matrixfac_ndcg5
count,446.0,446.0,446.0
mean,0.746192,0.144339,0.23753
std,0.19332,0.169303,0.215026
min,0.0,0.0,0.0
25%,0.665696,0.0,0.0
50%,0.78368,0.106676,0.20861
75%,0.885767,0.242135,0.416459
max,1.0,0.774362,0.788086


In [25]:
print("Popular            : \tNDCG@5=", performance['popular_ndcg5'].mean())
print("Random             : \tNDCG@5=", performance['random_ndcg5'].mean())
print("MatrixFactorization: \tNDCG@5=", performance['matrixfac_ndcg5'].mean())

Popular            : 	NDCG@5= 0.7461922119063459
Random             : 	NDCG@5= 0.1443388932438466
MatrixFactorization: 	NDCG@5= 0.23752988242591436
