In [52]:
import pandas as pd
import numpy as np
import warnings
import random, math, os
from tqdm import tqdm
from sklearn.model_selection import train_test_split
import sys
sys.path.append("../utils/")
from recsys_metric import mrr
warnings.filterwarnings('ignore')

time: 645 µs (started: 2022-06-13 06:06:46 +00:00)


In [53]:
def get_data(train_file_path, val_file_path, overview=False, concat_val=False):
    # 读取数据
    
    trn_raw_data = pd.read_csv(train_file_path, sep=',', engine='python')
    val_raw_data = pd.read_csv(val_file_path, sep=',', engine='python')
    if concat_val:
        trn_raw_data = pd.concat([trn_raw_data, val_raw_data])
    
    # 分割训练和验证集
    
    trn_data = trn_raw_data.groupby('session_id')['item_id'].apply(list).reset_index()
    val_data = val_raw_data.groupby('session_id')['item_id'].apply(list).reset_index()

    trn_user_items = {}
    val_user_items = {}
    
    # 将数组构造成字典的形式{session_id: [item_id1, item_id2,...,item_idn]}
    for session, item in zip(*(list(trn_data['session_id']), list(trn_data['item_id']))):
        trn_user_items[session] = set(item)

    for session, item in zip(*(list(val_data['session_id']), list(val_data['item_id']))):
        val_user_items[session] = set(item)
    
    if overview:
        print(f'trn_raw_data:\n {trn_raw_data} \n val_raw_data:\n {val_raw_data}')
        print('trn_data:\n',trn_data)
        print('val_data:\n', val_data)
    
    return trn_user_items, val_user_items


time: 955 µs (started: 2022-06-13 06:06:47 +00:00)


In [54]:
def Item_CF(trn_user_items, val_user_items, K, N, candidate_items=None, overview=False, prediction=False):
    '''
    trn_user_items: 表示训练数据，格式为：{user_id1: [item_id1, item_id2,...,item_idn], user_id2...}
    val_user_items: 表示验证数据，格式为：{user_id1: [item_id1, item_id2,...,item_idn], user_id2...}
    K: Ｋ表示的是相似商品的数量，为每个用户交互的每个商品都选择其最相似的K个商品
    N: N表示的是给用户推荐的商品数量，给每个用户推荐相似度最大的N个商品
    '''

    # 建立user->item的倒排表
    # 倒排表的格式为: {user_id1: [item_id1, item_id2,...,item_idn], user_id2: ...} 也就是每个用户交互过的所有商品集合
    # 由于输入的训练数据trn_user_items,本身就是这中格式的，所以这里不需要进行额外的计算
    

    # 计算商品协同过滤矩阵
    # 即利用user-items倒排表统计商品与商品之间被共同的用户交互的次数
    # 商品协同过滤矩阵的表示形式为：sim = {item_id1: {item_id２: num1, item_id3: num3,...}, item_id３: {item_id４: num２,...}, ...}
    # 商品协同过滤矩阵是一个双层的字典，用来表示商品之间共同交互的用户数量
    # 在计算商品协同过滤矩阵的同时还需要记录每个商品被多少不同用户交互的次数，其表示形式为: num = {item_id1：num1, item_id２:num2, ...}
    sim = {}#两个物品同时被某些用户喜欢的总次数(不止一个用户)
    num = {}#每个物品的总共的 被喜欢(交互)数
    print('构建相似性矩阵．．．')
    #遍历每一个user交互过的items列表，统计两个不同的item共同被user(不止一个，而是trn_user_items中的所有user)交互的总次数
    for uid, items in tqdm(trn_user_items.items()):
        #对于当前items列表中的每一个item
        for i in items:
            #统计item i 被用户交互的总次数
            if i not in num:
                num[i] = 0
            num[i] += 1
            #统计 item i 和 item j 被共同交互的总次数
            if i not in sim:
                sim[i] = {}
            for j in items:
                if j not in sim[i]:
                    sim[i][j] = 0
                if i != j:
                    sim[i][j] += 1/math.log1p(len(items) * 1.)
    
    # 计算物品的相似度矩阵
    # 商品协同过滤矩阵其实相当于是余弦相似度的分子部分,还需要除以分母,即两个商品被交互的用户数量的乘积
    # 两个商品被交互的用户数量就是上面统计的num字典
    print('计算协同过滤矩阵．．．')
    for i, items in tqdm(sim.items()):
        for j, score in items.items():
            if i != j:
                sim[i][j] = score / math.sqrt(num[i] * num[j])
    
    for i, relations in sim.items():
        if relations:
            max_num = relations[max(relations, key=relations.get)]
            if max_num!=0:
                # 对字典进行归一化操作之后返回新的字典
                sim[i] = {k: v / max_num for k, v in relations.items()}
                
        
    # 对验证数据中的每个用户进行TopN推荐
    # 在对用户进行推荐之前需要先通过商品相似度矩阵得到 与 当前测试用户交互过的商品最相似的前K个商品，
    # 然后对这K个用户交互的商品中除当前测试用户训练集中交互过的商品以外的商品计算最终的相似度分数
    # 最终推荐的候选商品的相似度分数是由多个相似商品对该商品分数的一个累加和
    items_rank = {}
    print('给用户进行推荐．．．')
    for uid, _ in tqdm(val_user_items.items()):
        items_rank[uid] = {} # 存储用户候选的推荐商品
        for hist_item in trn_user_items[uid]: # 遍历该用户历史喜欢的商品，用来下面寻找其相似的商品
            # 回顾：sim = {item_id1: {item_id２: num1, item_id3: num3,...}, item_id３: {item_id４: num２,...}, ...}
            # 回顾：trn_user_items = {user_id1: [item_id1, item_id2,...,item_idn], user_id2: ...} 
            #print(sim[hist_item])# {item_id２: num1, item_id3: num3,...}
            for item, score in sorted(sim[hist_item].items(), key=lambda x: x[1], reverse=True)[:K]:
                if item not in trn_user_items[uid]: # 进行推荐的商品一定不能在历史喜欢商品中出现
                    #计算当前uid对当前item的打分
                    if item not in items_rank[uid]:
                        items_rank[uid][item] = 0
                    items_rank[uid][item] += score
                    
    
    print('为每个用户筛选出相似度分数最高的Ｎ个商品...')
    if not prediction:
        #print(items_rank)#{user1:{item1:score1,item2:score2,...},user2:{item6:score6,item9:score9,...}}
        items_rank = {k: sorted(v.items(), key=lambda x: x[1], reverse=True)[:N] for k, v in items_rank.items()}
        items_rank = {k: [x[0] for x in v] for k, v in items_rank.items()}
    else:
        items_rank = {k: sorted(v.items(), key=lambda x: x[1], reverse=True) for k, v in items_rank.items()}
        items_rank = {k: [x[0] for x in v if x[0] in candidate_items][:N] for k, v in items_rank.items()}
        
        
        
    
    if overview:
        #查看一下sim   
        for k ,v in sim.items():
            print(f"similarity[{k}]:") 
            print(v)
            break
            
    return items_rank, sim



time: 2.52 ms (started: 2022-06-13 06:06:48 +00:00)


In [55]:
def rank_itemcf(target_dict, overview=False):
    item_id_rank_arr = pd.DataFrame.from_dict(target_dict, orient='index').reset_index().rename(columns={'index': 'session_id'}).fillna(-1).astype(int)
    lack_col_num = 101 - len(item_id_rank_arr.columns)
    for i in range(100-lack_col_num,100):
        item_id_rank_arr[i]=-1
     
    if overview:
        print(f"lack_col_num:{lack_col_num}")
        display(item_id_rank_arr)
    
    res_df = pd.melt(item_id_rank_arr, id_vars=['session_id'], value_vars=range(100)).sort_values(['session_id','variable']).rename(columns={'value': 'item_id'}).drop(['variable'], axis=1)
    res_df["rank"] = (list(range(1,101))*len(target_dict))
    
    
    return res_df

time: 724 µs (started: 2022-06-13 06:06:49 +00:00)


# evaluation

In [57]:
# 读取数据
train_file_path = '../../recsys2022/train_sessions_last_item.csv'
val_file_path = '../datasets/train_last_1m.csv'
trn_user_items, val_user_items = get_data(train_file_path, val_file_path, concat_val=False)
print(f'trn_user_items len:{len(trn_user_items)},val_user_items len:{len(val_user_items)}')

trn_user_items len:1000000,val_user_items len:81618
time: 17.4 s (started: 2022-06-13 06:10:00 +00:00)


In [58]:
# 計算ItemCF
k_list = [2000]
mrr_k_records={}
for k in k_list:
    n=100
    rec_items, sim = Item_CF(trn_user_items, val_user_items, k, n, prediction=False)
    display(rec_items)


    # rank item & output rank result
    eva_res = rank_itemcf(rec_items)
    eva_label = pd.read_csv('../datasets/purchases_last_1m.csv')

    mrr_score = mrr(eva_res, eva_label, overview=False)
    mrr_k_records[k] = mrr_score
    print(mrr_score)
    display(f"eva_res shape:{eva_res.shape}", eva_res.iloc[95:105,:])
    
print(mrr_k_records)

构建相似性矩阵．．．


100% 1000000/1000000 [00:00<00:00, 1473123.59it/s]


计算协同过滤矩阵．．．


100% 20891/20891 [00:00<00:00, 2465411.91it/s]


给用户进行推荐．．．


100% 81618/81618 [00:00<00:00, 659676.81it/s]


为每个用户筛选出相似度分数最高的Ｎ个商品...


{113: [],
 115: [],
 261: [],
 325: [],
 332: [],
 388: [],
 526: [],
 579: [],
 782: [],
 795: [],
 809: [],
 931: [],
 934: [],
 937: [],
 964: [],
 997: [],
 1061: [],
 1129: [],
 1131: [],
 1135: [],
 1155: [],
 1215: [],
 1252: [],
 1333: [],
 1341: [],
 1344: [],
 1382: [],
 1419: [],
 1516: [],
 1545: [],
 1551: [],
 1563: [],
 1757: [],
 1873: [],
 1939: [],
 1949: [],
 1953: [],
 1964: [],
 2036: [],
 2099: [],
 2100: [],
 2123: [],
 2162: [],
 2164: [],
 2185: [],
 2226: [],
 2239: [],
 2255: [],
 2281: [],
 2451: [],
 2487: [],
 2488: [],
 2494: [],
 2502: [],
 2533: [],
 2635: [],
 2674: [],
 2809: [],
 2857: [],
 2938: [],
 2957: [],
 2976: [],
 3007: [],
 3026: [],
 3034: [],
 3089: [],
 3092: [],
 3220: [],
 3402: [],
 3441: [],
 3447: [],
 3538: [],
 3557: [],
 3559: [],
 3598: [],
 3771: [],
 3787: [],
 3863: [],
 3906: [],
 3976: [],
 4136: [],
 4270: [],
 4285: [],
 4292: [],
 4330: [],
 4347: [],
 4356: [],
 4405: [],
 4452: [],
 4486: [],
 4543: [],
 4565: [],
 458

0.0


'eva_res shape:(8161800, 3)'

Unnamed: 0,session_id,item_id,rank
7753710,113,-1,96
7835328,113,-1,97
7916946,113,-1,98
7998564,113,-1,99
8080182,113,-1,100
1,115,-1,1
81619,115,-1,2
163237,115,-1,3
244855,115,-1,4
326473,115,-1,5


{2000: 0.0}
time: 3.95 s (started: 2022-06-13 06:10:23 +00:00)


In [48]:
# eva_res.to_csv("../datasets/results/itemcf_1m_en.csv", index=False)

time: 7.03 s (started: 2022-06-12 15:45:24 +00:00)


### parameters測試結果
#### K值測試
trn_data period:2020/01/01~2021/05/31

{1: 0.09500869597028379, 2: 0.1251135352303383, 3: 0.13965544101748156,  <br />
 4: 0.14787793949921724, 5: 0.15199841935504121, 6: 0.15477301681355773, <br />
 10: 0.1599691541461026, 15: 0.16275048167764544, 30: 0.16575185779216386, <br />
 80: 0.16741300751998583, 100: 0.16779740173617033, 200: 0.16817674228462637, <br />
 300: 0.16833651410145736, 400: 0.16843929730413187, 500: 0.1686072146381592, <br />
 800: 0.16871724946498262, 1000: 0.16877922789723332, 2000: 0.16880877727791824, <br />
 4000: 0.16883626427458198, 8000: 0.16883538514910407, 10000: 0.1688353687598421,  <br />
 20000: 0.1688353687598421, 30000: 0.1688353687598421}
 
#### trn_data period測試
k值:2000

{2021/5/1\~2021/5/31: 0.16680978195700882, <br /> 
 2021/3/1\~2021/5/31: 0.17071956198398217, <br />
 2021/2/1\~2021/5/31: 0.17071705394200648, <br />
 2021/1/1\~2021/5/31: 0.17099863193401307, <br />
 2020/12/1\~2021/5/31: 0.1706558516592896, <br />
 2020/10/1\~2021/5/31: 0.16990188467889028, <br />
 2020/5/31\~2021/5/31: 0.16908869780443572, <br />
 2021/6/1\~2021/6/31: }

# Output

In [49]:
# 读取数据
train_file_path = '../datasets/train_purchases_last_5m.csv'
val_file_path = '../../recsys2022/test_leaderboard_sessions.csv'
trn_user_items, val_user_items = get_data(train_file_path, val_file_path, concat_val=True)
print(f'trn_user_items len:{len(trn_user_items)},val_user_items len:{len(val_user_items)}')


# 計算ItemCF
candidate_items = pd.read_csv('../../recsys2022/candidate_items.csv')['item_id'].tolist()
k=8000 # 4 is also acceptable  
n=100
rec_items, sim = Item_CF(trn_user_items, val_user_items, k, n, candidate_items=candidate_items, prediction=True)


# rank item & output rank result
res = rank_itemcf(rec_items)
res.to_csv("../datasets/results/leader_itemcf_k8000_train_purchases_5m_leader_candidate_filter_0613.csv", index=False)

trn_user_items len:362907,val_user_items len:50000
构建相似性矩阵．．．


100% 362907/362907 [00:09<00:00, 36496.17it/s]


计算协同过滤矩阵．．．


100% 11504/11504 [00:02<00:00, 4725.38it/s]


给用户进行推荐．．．


100% 50000/50000 [03:02<00:00, 274.23it/s]


为每个用户筛选出相似度分数最高的Ｎ个商品...
time: 1h 15min 22s (started: 2022-06-13 04:41:06 +00:00)


In [None]:
# view_session_id = 113
view_session_id = 659598
print(rec_items[view_session_id])
print(val_user_items[view_session_id])
print(trn_user_items[view_session_id])
print(sim[list(trn_user_items[view_session_id])[0]])

In [None]:
rec_items

In [None]:
print(sim.keys())

In [None]:
print([len(sim[k]) for k in sim.keys()])

In [None]:
key = 115
print(sim[key])