In [4]:
import pandas as pd
import numpy as np
from numpy.linalg import norm
from tqdm import tqdm
import sys
sys.path.append("../utils/")
from recsys_metric import mrr

time: 411 µs (started: 2022-06-15 05:20:16 +00:00)


In [5]:
def trn_rating(file_path):
    def fun1(x):
        x.loc[x['date']==x['date'].max(),'rating']=5
        return x
    
    trn_data = pd.read_csv(file_path, sep=',', engine='python')
    trn_data['date'] = pd.to_datetime(trn_data['date'])
    trn_data['rating'] = 1
    max_view = trn_data.groupby(['session_id', 'item_id']).sum()['rating'].max()
    trn_data_no_date = trn_data.sort_values(['date']).groupby('session_id').apply(fun1).groupby(['session_id', 'item_id'])['rating'].sum().reset_index().sort_values(['session_id', 'rating'])
    trn_data_date = pd.merge(trn_data.loc[:,['session_id', 'item_id', 'date']], trn_data_no_date, how="left").sort_values(['session_id', 'date'])
    
    return trn_data_no_date, trn_data_date


def rank_itemcf(target_dict, overview=False):
    """
    target_dict : 
      {session_id1:[item_id1, item_id2, ...], session_id2:[item_id5, item_id1, ...]}
    """
    item_id_rank_arr = pd.DataFrame.from_dict(target_dict, orient='index').reset_index().rename(columns={'index': 'session_id'}).fillna(-1).astype(int)
    lack_col_num = 101 - len(item_id_rank_arr.columns)
    for i in range(100-lack_col_num,100):
        item_id_rank_arr[i]=-1
     
    if overview:
        print(f"lack_col_num:{lack_col_num}")
        display(item_id_rank_arr)
    
    res_df = pd.melt(item_id_rank_arr, id_vars=['session_id'], value_vars=range(100)).sort_values(['session_id','variable']).rename(columns={'value': 'item_id'}).drop(['variable'], axis=1)
    res_df["rank"] = (list(range(1,101))*len(target_dict))
    
    
    return res_df


class ItemCFRating():
    def __init__(self,trn_data):
        """
        self.trn_data format:
           | session_id | item_id | rating |
           |-----------:|--------:|-------:|
           |        113 |    2171 |      1 |
           |        113 |    6187 |      1 |
           |        113 |   15738 |      1 | 
                           .
                           .
                           .
        """
        self.user_item_mat = trn_data.pivot(index = ["session_id"],columns = ["item_id"],values = "rating").fillna(0)
        self.item_sim_mat = None
        
    def similarity(self):
        tmp = np.dot(self.user_item_mat.T,self.user_item_mat)
        tmp = tmp/norm(tmp, axis=0)
        self.item_sim_mat = pd.DataFrame(tmp, columns=self.user_item_mat.columns, index=self.user_item_mat.columns) 
        
    def recommend(self, session_list ,K ,N):
        # print(rec_dic) => {session_id1:[item_id1, item_id2, ...], session_id2:[item_id5, item_id1, ...]}
        rec_dic = {}
        print("start recommend...")
        for sid in tqdm(session_list):
            viewed_item_list = self.user_item_mat.loc[sid, self.user_item_mat.loc[sid,:]>0].index.tolist()
            viewed_item_sim_mat = self.item_sim_mat.loc[viewed_item_list,:]
            viewed_item_K_sim_item = viewed_item_sim_mat.T.apply(lambda x:x.sort_values(ascending=False).index[:K+1], axis=0).T
            col_list = list(set(np.unique(viewed_item_K_sim_item.values.reshape(-1,))) - set(viewed_item_list))
            viewed_item_K_sim_mat = self.item_sim_mat.loc[viewed_item_list,col_list]
            rec_items = viewed_item_K_sim_mat.sum(axis=0).sort_values(ascending=False).index.tolist()[:N]
            rec_dic[sid] = rec_items
        return rec_dic
            
            

time: 1.82 ms (started: 2022-06-15 05:20:16 +00:00)


In [6]:
val_label = pd.read_csv('../datasets/purchases_last_1m.csv')
val_data_session_list = pd.read_csv("../datasets/train_last_1m.csv")['session_id'].unique()
display(val_data_session_list)

trn_data, trn_data_date = trn_rating("../datasets/train_last_1m.csv") 
display(trn_data['item_id'].nunique())
display(trn_data[:50], trn_data_date[:50])

array([    113,     115,     261, ..., 4439898, 4439949, 4439986])

7342

Unnamed: 0,session_id,item_id,rating
0,31,264,1
3,31,16289,1
4,31,25972,1
5,31,26457,1
6,31,27579,1
1,31,2069,2
2,31,4230,5
7,113,2171,1
8,113,6187,1
9,113,15738,1


Unnamed: 0,session_id,item_id,date,rating
0,31,25972,2021-04-20 19:38:03.816,1
1,31,16289,2021-04-20 19:39:17.092,1
5,31,2069,2021-04-20 19:39:29.968,2
2,31,2069,2021-04-20 19:40:21.952,2
4,31,26457,2021-04-20 19:41:29.179,1
7,31,264,2021-04-20 19:42:13.190,1
3,31,27579,2021-04-20 19:43:43.813,1
6,31,4230,2021-04-20 19:45:23.203,5
13,113,6187,2021-05-05 13:18:20.994,1
11,113,19760,2021-05-05 13:18:49.495,1


time: 1min 15s (started: 2022-06-15 05:20:17 +00:00)


In [7]:
itemcf_last_2m = ItemCFRating(trn_data)
itemcf_last_2m.similarity()
rec_items = itemcf_last_2m.recommend(val_data_session_list, 20, 100)

start recommend...


100% 81618/81618 [08:33<00:00, 158.93it/s]

time: 8min 51s (started: 2022-06-15 05:21:32 +00:00)





In [8]:
eva_res = rank_itemcf(rec_items)

mrr_score = mrr(eva_res, val_label, overview=False)
print(mrr_score)
display(f"eva_res shape:{eva_res.shape}", eva_res.iloc[95:105,:])

0.032989121107306806


'eva_res shape:(8161800, 3)'

Unnamed: 0,session_id,item_id,rank
7753710,113,4003,96
7835328,113,10217,97
7916946,113,21686,98
7998564,113,8713,99
8080182,113,3049,100
1,115,18936,1
81619,115,9365,2
163237,115,1155,3
244855,115,1059,4
326473,115,10898,5


time: 4.88 s (started: 2022-06-15 05:30:24 +00:00)
