### Acknowledgement
This code is derived from the template made up from work of Xinyang Chen

In [1]:
%cd ../

/Users/macos/Uni/1st_year/period_2/RecSys/hw


In [2]:
from typing import Union
from itertools import product

import pandas as pd
import numpy as np
from tqdm.contrib.itertools import product
from tqdm.notebook import tqdm
from sklearn.metrics import mean_squared_error

# Load data

In [3]:
path_train = "data_split/train.csv"
path_val = "data_split/validation.csv"
path_test = "data_split/test.csv"
path_tag = "dataset/tags.csv"

In [4]:
# using the provided split

df_train = pd.read_csv(path_train)
df_val = pd.read_csv(path_val)
df_test = pd.read_csv(path_test)
df_tag = pd.read_csv(path_tag)

df_trainval = pd.concat([df_train, df_val])

df_all = pd.concat([df_trainval, df_test])

In [5]:
# picking a user for testing
user_sample_id = list(df_trainval['user_id'].unique())[0]
user_sample_items = df_test[df_test['user_id'] == user_sample_id]
print(f"User id: {user_sample_id}, number of ratings: {len(user_sample_items)}")

User id: 655, number of ratings: 60


# Task 1: Define models

In [6]:
class POP:
    def __init__(self, df):
        pop = df.item_id.value_counts().reset_index()
        pop.columns = ["item_id", "score"]
        self.pop = pop

    def getScores(self, items, user_id):
        # if the item doesn't appear in the df_trainval set, the score of 0 is assigned to it
        return pd.merge(items, self.pop, how='left', on='item_id').fillna(0)

# testing the model with one user
pop = POP(df_trainval)
pop.getScores(user_sample_items, user_sample_id).sort_values('score', ascending=False).head()

Unnamed: 0,item_id,user_id,rating,score
52,356,655,4.5,232
55,608,655,4.5,156
6,4973,655,4.0,147
59,7361,655,4.5,146
34,4995,655,4.0,144


In [7]:
class RND:
    def __init__(self, df):
        item_id_set = set(df.item_id.unique())
        rnd = pd.DataFrame(item_id_set)
        rnd.columns = ["item_id"]
        rnd = rnd.sample(frac=1)
        rnd["score"] = range(len(rnd))
        self.rnd = rnd

    def getScores(self, items, user_id):
        return pd.merge(items, self.rnd, how='left', on='item_id').fillna(0)

# testing the model with one user
rnd = RND(df_trainval)
rnd.getScores(user_sample_items, user_sample_id).sort_values('score', ascending=False).head()

Unnamed: 0,item_id,user_id,rating,score
17,5995,655,3.5,8236
36,51540,655,4.0,8187
46,6586,655,3.0,8142
37,45672,655,3.5,8101
12,68237,655,4.5,7982


In [8]:
class CF:
    def __init__(
        self,
        df_rating: pd.DataFrame,
        df_tag: pd.DataFrame,
        eps: float = 1e-6
    ) -> None:
        self._eps = eps
        self.df_rating = df_rating
        self.df_tag = df_tag.pivot(index='item_id', columns='tag', values='num').fillna(0)
        
        self.idf: Union[pd.DataFrame, None] = None
        self.tfidf_items: Union[pd.DataFrame, None] = None
        self.tfidf_users: Union[pd.DataFrame, None] = None

        self._init_df()

    def _init_df(self):
        # Calculate tf-idf for movies
        self.tfidf_items = self.df_tag.copy()
        self.tfidf_items.loc[:, :] = self._calc_tfidf(self.df_tag)

        # Calculate tf-idf for users
        tags = self.df_tag.columns

        self.tfidf_users = self.df_rating.merge(right=self.df_tag, how='left', on='item_id').groupby('user_id')[tags].sum()
        matrix_tag_user = self.tfidf_users.to_numpy()
        tf_user = matrix_tag_user / matrix_tag_user.sum(axis=1)[:, None]
        self.tfidf_users.loc[:] = tf_user * self.idf
        

    def _calc_tfidf(self, df: pd.DataFrame):
        tag_matrix = df.to_numpy()
        if self.idf is None:
            self.idf = np.log(len(tag_matrix) / np.sum(tag_matrix != 0, axis=0))[None, :]
        tf = tag_matrix /  tag_matrix.sum(axis=1)[:, None]
        tfidf = tf * self.idf

        return tfidf
    
    def _calc_sim_cosine(self, vec_user: np.ndarray, vecs_items: np.ndarray) -> np.ndarray:
        similarity = vec_user @ vecs_items.T / (np.linalg.norm(vec_user, axis=-1) * np.linalg.norm(vecs_items, axis=-1) + self._eps)

        return similarity

    def getScores(self, items: pd.DataFrame, user_id: int):
        # Retrieve tf-idf for user
        vec_user = self.tfidf_users.loc[user_id].to_numpy()

        # Retrieve or calculate tf-idf for movies in 'items'
        vecs_items = items.merge(self.tfidf_items, how='left', on='item_id') \
            [self.tfidf_items.columns] \
            .fillna(0) \
            .to_numpy()

        # Get sim
        sim = self._calc_sim_cosine(vec_user, vecs_items)

        items = items.copy()
        items['score'] = sim

        return items
    
# testing the model with one user
cf = CF(df_trainval, df_tag, 0)
cf.getScores(user_sample_items, user_sample_id).sort_values('score', ascending=False).head()

Unnamed: 0,item_id,user_id,rating,score
12,68237,655,4.5,0.501649
59,7361,655,4.5,0.426272
19,101864,655,3.5,0.369606
57,60069,655,4.0,0.369593
32,43928,655,2.0,0.325154


# Define metric

In [9]:
NEG_SAMPLES = 100
AT = 5

def ncdg(df: pd.DataFrame, topn):
    '''
    len(predicted), len(truth) >= topn
    '''
    dcg = (
        df
        .sort_values('score', ascending=False)
        .head(topn)['rating']
        / np.log2(np.arange(2, topn + 2))       # Discount rate: 0 +1 -> rank +1 -> discount rate
    ).cumsum().tail(1).values[0]
    idcg = (
        df
        .sort_values('rating', ascending=False)
        .head(topn)['rating']
        / np.log2(np.arange(2, topn + 2))           # Discount rate
    ).cumsum().tail(1).values[0]
    return dcg / idcg

# testing ndcg
ncdg(cf.getScores(user_sample_items, user_sample_id), 5)

0.8414831617177216

# Task 2: Algorithm evaluation

In [10]:
performance = []
all_items = df_all['item_id'].unique()


# training models
pop = POP(df_trainval)
rnd = RND(df_trainval)
cf = CF(df_trainval, df_tag)

# evaluating models
for user in df_test['user_id'].unique():
    pos_items = df_all[df_all['user_id'] == user]['item_id'].unique()
    neg_samples = np.random.choice(
        all_items[~np.isin(all_items, pos_items)],
        NEG_SAMPLES,
        replace=False
    )
    
    user_test_samples = df_test[df_test['user_id'] == user][['item_id', 'rating']]
    user_neg_samples = pd.DataFrame(
        [(smp, 0.0) for smp in neg_samples],
        columns=['item_id', 'rating']
    )
    items = pd.concat([user_test_samples, user_neg_samples])

    pop_scores = pop.getScores(items, user)
    rand_scores = rnd.getScores(items, user)
    cf_scores = cf.getScores(items, user)

    performance.append((user, ncdg(pop_scores, AT), ncdg(rand_scores, AT), ncdg(cf_scores, AT)))

performance = pd.DataFrame(performance, columns=['user_id', 'popular_ndcg5', 'random_ndcg5', 'cf_ndcg5'])
performance[['popular_ndcg5', 'random_ndcg5', 'cf_ndcg5']].describe()

Unnamed: 0,popular_ndcg5,random_ndcg5,cf_ndcg5
count,446.0,446.0,446.0
mean,0.752589,0.195247,0.647467
std,0.190303,0.194281,0.222589
min,0.0,0.0,0.0
25%,0.667598,0.0,0.539108
50%,0.795156,0.151177,0.695669
75%,0.888878,0.311344,0.809877
max,1.0,0.818055,1.0


In [11]:
print("Popular           : \tNDCG@5=", performance['popular_ndcg5'].mean())
print("Random            : \tNDCG@5=", performance['random_ndcg5'].mean())
print("Content Filtering : \tNDCG@5=", performance['cf_ndcg5'].mean())

Popular           : 	NDCG@5= 0.75258903874485
Random            : 	NDCG@5= 0.19524671112433006
Content Filtering : 	NDCG@5= 0.6474672721746214


# Task 3: Recommendation exploration

In [12]:
user_id = 655

In [21]:
df_trainval[
    (df_trainval['user_id'] == user_id) 
    # & (df_trainval['item_id'] == 296)
]

Unnamed: 0,item_id,user_id,rating
0,2804,655,4.5
1,5957,655,3.0
2,1213,655,4.5
3,134130,655,4.0
4,5065,655,3.0
...,...,...,...
56,8981,655,3.5
57,2001,655,4.0
58,3175,655,4.5
59,61240,655,4.5


In [22]:
items = df_trainval[df_trainval['user_id'] == user_id][['item_id', 'rating']]
cf_scores = cf.getScores(items, user)

cf_scores.head()

Unnamed: 0,item_id,rating,score
0,2804,4.5,0.058747
1,5957,3.0,0.165833
2,1213,4.5,0.360997
3,134130,4.0,0.019734
4,5065,3.0,0.001669


In [23]:
cf_scores.sort_values(by='score', ascending=False)

Unnamed: 0,item_id,rating,score
53,296,5.0,0.916309
2,1213,4.5,0.360997
63,6016,3.5,0.357231
24,6874,4.5,0.357052
38,5902,4.0,0.317800
...,...,...,...
138,2948,4.0,0.006097
149,1356,4.5,0.005753
67,110730,4.0,0.005746
150,1909,4.0,0.004244
