In [1]:
import math
import random
import pandas as pd
from collections import defaultdict
from operator import itemgetter
from tqdm import tqdm
import sys
sys.path.append("../utils/")
from recsys_metric import mrr

time: 181 ms (started: 2022-06-10 15:04:02 +00:00)


In [39]:
def LoadMovieLensData(filepath, train_rate):
    ratings = pd.read_table(filepath, sep=",", engine='python')
    ratings = ratings[['session_id','item_id']]

    train = []
    test = []
    random.seed(3)
    for idx, row in ratings.iterrows():
        user = int(row['session_id'])
        item = int(row['item_id'])
        if random.random() < train_rate:
            train.append([user, item])
        else:
            test.append([user, item])
    return PreProcessData(train), PreProcessData(test)

def PreProcessData(originData):
    """
    建立User-Item表，结构如下：
        {"session1": {item_id1, item_id2, item_id3,...}
         "session2": {item_id12, item_id5, item_id8,...}
         ...
        }
    """
    trainData = dict()
    for user, item in originData:
        trainData.setdefault(user, set())
        trainData[user].add(item)
    return trainData


class ItemCF(object):
    """ Item based Collaborative Filtering Algorithm Implementation"""
    def __init__(self, trainData, similarity="cosine", norm=True, pred=False):
        self._trainData = trainData
        self._similarity = similarity
        self._isNorm = norm
        self._itemSimMatrix = dict() # 物品相似度矩阵
        self.prediction = pred

    def similarity(self):
        N = defaultdict(int) #记录每个物品的喜爱人数
        for user, items in self._trainData.items():
            for i in items:
                self._itemSimMatrix.setdefault(i, dict())
                N[i] += 1
                for j in items:
                    if i == j:
                        continue
                    self._itemSimMatrix[i].setdefault(j, 0)
                    if self._similarity == "cosine":
                        self._itemSimMatrix[i][j] += 1
                    elif self._similarity == "iuf":
                        self._itemSimMatrix[i][j] += 1. / math.log1p(len(items) * 1.)
        for i, related_items in self._itemSimMatrix.items():
            for j, cij in related_items.items():
                self._itemSimMatrix[i][j] = cij / math.sqrt(N[i]*N[j])
        # 是否要标准化物品相似度矩阵
        if self._isNorm:
            for i, relations in self._itemSimMatrix.items():
                if relations:
                    max_num = relations[max(relations, key=relations.get)]
                    # 对字典进行归一化操作之后返回新的字典
                    self._itemSimMatrix[i] = {k : v/max_num for k, v in relations.items()}

    def recommend(self, user_list, N, K):
        """
        :param user: 被推荐的用户user
        :param N: 推荐的商品个数
        :param K: 查找的最相似的用户个数
        :return: 按照user对推荐物品的感兴趣程度排序的N个商品
        """
        recommends = dict()
        for user in user_list:
            recommends[user] = {}
            # 先获取user的喜爱物品列表
            items = self._trainData[user]
            for item in items:
                # 对每个用户喜爱物品在物品相似矩阵中找到与其最相似的K个
                for i, sim in sorted(self._itemSimMatrix[item].items(), key=itemgetter(1), reverse=True)[:K]:
                    if i in items:
                        continue  # 如果与user喜爱的物品重复了，则直接跳过
                    recommends[user].setdefault(i, 0.)
                    recommends[user][i] += sim
                    
        # 根据被推荐物品的相似度逆序排列，然后推荐前N个物品给到用户
        if not self.prediction:
            items_rank = {k: sorted(v.items(), key=lambda x: x[1], reverse=True)[:N] for k, v in recommends.items()}
            items_rank = {k: [x[0] for x in v] for k, v in items_rank.items()}
            #print(items_rank)#{user1:{item1:score1,item2:score2,...},user2:{item6:score6,item9:score9,...}}
            return items_rank 

    def train(self):
        self.similarity()

if __name__ == "__main__":
    train, test = LoadMovieLensData("../../recsys2022/train_sessions.csv", 1)
    print("train data size: %d, test data size: %d" % (len(train), len(test)))
    ItemCF = ItemCF(train, similarity='iuf', norm=True)
    ItemCF.train()

    # 分别对以下4个用户进行物品推荐
    # print(ItemCF.recommend(1, 5, 80))
    # print(ItemCF.recommend(2, 5, 80))
    # print(ItemCF.recommend(3, 5, 80))
    # print(ItemCF.recommend(4, 5, 80))

train data size: 1000000, test data size: 0
time: 3min 11s (started: 2022-06-11 06:11:19 +00:00)


In [40]:
def rank_itemcf(target_dict, overview=False):
    item_id_rank_arr = pd.DataFrame.from_dict(target_dict, orient='index').reset_index().rename(columns={'index': 'session_id'}).fillna(-1).astype(int)
    display(item_id_rank_arr)
    lack_col_num = 101 - len(item_id_rank_arr.columns)
    for i in range(100-lack_col_num,100):
        item_id_rank_arr[i]=-1
     
    if overview:
        print(f"lack_col_num:{lack_col_num}")
        display(item_id_rank_arr)
    
    res_df = pd.melt(item_id_rank_arr, id_vars=['session_id'], value_vars=range(100)).sort_values(['session_id','variable']).rename(columns={'value': 'item_id'}).drop(['variable'], axis=1)
    res_df["rank"] = (list(range(1,101))*len(target_dict))
    
    
    return res_df

time: 600 µs (started: 2022-06-11 06:14:30 +00:00)


In [41]:
val_label = pd.read_csv("../datasets/purchases_last_1m.csv")
k=3
n=100
rec_items = ItemCF.recommend(set(val_label['session_id']), n, k)

time: 2min 27s (started: 2022-06-11 06:14:30 +00:00)


In [42]:
rank_itemcf(rec_items, overview=True)

Unnamed: 0,session_id,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
0,2883586,17473,2963,17373,22375,9409,16873,648,10388,12593,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
1,3670023,8829,11584,17948,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,262152,16416,15458,18767,5915,1221,7999,28122,16157,4220,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
3,3670024,13115,4983,1937,17428,7083,3946,7171,17944,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
4,786442,25420,10636,2794,18031,27548,18975,4009,4813,15426,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81613,1572858,3658,18108,12901,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
81614,2359291,25623,7643,23612,102,8861,22751,3773,17288,18981,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
81615,3145724,20372,27409,10997,23719,7455,13108,340,17037,22146,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
81616,1835005,4130,23774,26892,14306,5966,22747,18657,13403,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


lack_col_num:0


Unnamed: 0,session_id,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
0,2883586,17473,2963,17373,22375,9409,16873,648,10388,12593,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
1,3670023,8829,11584,17948,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,262152,16416,15458,18767,5915,1221,7999,28122,16157,4220,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
3,3670024,13115,4983,1937,17428,7083,3946,7171,17944,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
4,786442,25420,10636,2794,18031,27548,18975,4009,4813,15426,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81613,1572858,3658,18108,12901,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
81614,2359291,25623,7643,23612,102,8861,22751,3773,17288,18981,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
81615,3145724,20372,27409,10997,23719,7455,13108,340,17037,22146,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
81616,1835005,4130,23774,26892,14306,5966,22747,18657,13403,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


Unnamed: 0,session_id,item_id,rank
43,113,5433,1
81661,113,17036,2
163279,113,10553,3
244897,113,18947,4
326515,113,11044,5
...,...,...,...
7830615,4439986,-1,96
7912233,4439986,-1,97
7993851,4439986,-1,98
8075469,4439986,-1,99


time: 3.71 s (started: 2022-06-11 06:16:57 +00:00)


In [43]:
eva_res = rank_itemcf(rec_items)
eva_label = val_label 

mrr_score = mrr(eva_res, eva_label, overview=False)

print(mrr_score)
display(f"eva_res shape:{eva_res.shape}", eva_res.iloc[95:105,:])

Unnamed: 0,session_id,0,1,2,3,4,5,6,7,8,...,90,91,92,93,94,95,96,97,98,99
0,2883586,17473,2963,17373,22375,9409,16873,648,10388,12593,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
1,3670023,8829,11584,17948,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
2,262152,16416,15458,18767,5915,1221,7999,28122,16157,4220,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
3,3670024,13115,4983,1937,17428,7083,3946,7171,17944,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
4,786442,25420,10636,2794,18031,27548,18975,4009,4813,15426,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
81613,1572858,3658,18108,12901,-1,-1,-1,-1,-1,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
81614,2359291,25623,7643,23612,102,8861,22751,3773,17288,18981,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
81615,3145724,20372,27409,10997,23719,7455,13108,340,17037,22146,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1
81616,1835005,4130,23774,26892,14306,5966,22747,18657,13403,-1,...,-1,-1,-1,-1,-1,-1,-1,-1,-1,-1


0.13965544101748156


'eva_res shape:(8161800, 3)'

Unnamed: 0,session_id,item_id,rank
7753753,113,-1,96
7835371,113,-1,97
7916989,113,-1,98
7998607,113,-1,99
8080225,113,-1,100
45,115,-1,1
81663,115,-1,2
163281,115,-1,3
244899,115,-1,4
326517,115,-1,5


time: 4.02 s (started: 2022-06-11 06:17:01 +00:00)


In [None]:
val_label = pd.read_csv("../datasets/purchases_last_1m.csv")

k=3
n=100
item_id_rank_arr = pd.DataFrame(columns=['session_id', 'item_id', 'rank'])

for s_id in set(val_label['session_id']):
    temp_df = pd.DataFrame()
    rec_list = list(ItemCF.recommend(s_id, n, k).keys())
    lack_num = 100-len(rec_list)
    rec_list.extend([-1]*lack_num)
    temp_df['session_id'] = [s_id]*100
    temp_df['item_id'] = rec_list
    temp_df['rank'] = list(range(1,101))
    item_id_rank_arr = pd.concat([item_id_rank_arr,temp_df])
    
    

In [38]:
rec_items

{2883586: [(17473, 1.0),
  (2963, 1.0),
  (17373, 1.0),
  (22375, 1.0),
  (9409, 1.0),
  (16873, 1.0),
  (648, 1.0),
  (10388, 1.0),
  (12593, 1.0),
  (16009, 1.0),
  (11275, 1.0),
  (17093, 0.9976260383871072),
  (21700, 0.9428412814883207),
  (6677, 0.939086886052383),
  (3503, 0.9055906725684971),
  (3403, 0.8950838056135877),
  (14443, 0.8319672921816967),
  (19815, 0.8135778598773772),
  (19643, 0.7597100540268887),
  (18732, 0.7577822297034791),
  (13966, 0.7363626530910354),
  (19040, 0.7342654140722884),
  (9083, 0.7297471075082994),
  (16216, 0.6817012944685596),
  (19346, 0.6643642352894337),
  (5289, 0.6057728806608134),
  (24065, 0.5445596932247002),
  (2377, 0.5320902453323518),
  (9767, 0.4358506343130126),
  (15472, 0.4064044691716537),
  (2848, 0.36287269658439264),
  (26237, 0.3158866077883625),
  (15032, 0.312362095305142),
  (21000, 0.29227078783701943),
  (27104, 0.29221762449252364),
  (7571, 0.2890230303295955)],
 3670023: [(8829, 1.0),
  (11584, 0.384500431660261

time: 242 ms (started: 2022-06-11 06:10:19 +00:00)


In [6]:
print(ItemCF._itemSimMatrix.keys())

dict_keys([9655, 15654, 4026, 2507, 18316, 20033, 4385, 27937, 12804, 6341, 23687, 18936, 25772, 8268, 8813, 6704, 25555, 27638, 19896, 8281, 1755, 8316, 16064, 28075, 18539, 18476, 11662, 2927, 10414, 434, 11529, 21902, 16895, 16289, 4230, 264, 25972, 2069, 26457, 27579, 26536, 25417, 17472, 20523, 10395, 22492, 17089, 22747, 26404, 8398, 21358, 24636, 12047, 5409, 24315, 3173, 13214, 7935, 20147, 27477, 19062, 4867, 15237, 24454, 25086, 26130, 13222, 25896, 2995, 5434, 13885, 15421, 12735, 11843, 22860, 4816, 2536, 27499, 25964, 2925, 5747, 7159, 11256, 11386, 382, 17218, 6187, 19760, 15738, 2171, 21215, 19705, 10442, 15969, 16631, 25907, 26148, 26092, 26016, 14790, 23823, 689, 18902, 13914, 18723, 11378, 27770, 6110, 16218, 973, 8806, 21152, 3462, 3320, 13547, 22079, 16417, 10244, 4822, 2036, 23789, 14383, 19960, 4798, 2284, 24641, 22694, 16075, 16626, 26835, 13717, 23638, 4086, 21018, 17932, 26257, 27811, 2354, 20275, 21816, 26075, 15611, 21773, 18518, 19464, 15200, 12434, 26424, 4

In [17]:
print([len(ItemCF._itemSimMatrix[k]) for k in ItemCF._itemSimMatrix.keys()])

[670, 277, 1469, 2165, 1030, 2577, 2579, 977, 2221, 1596, 2656, 2514, 3717, 3095, 2725, 2018, 3372, 2112, 2694, 1904, 2758, 1766, 3017, 436, 1344, 1834, 683, 983, 670, 3551, 1962, 1775, 2258, 957, 222, 822, 880, 1870, 1844, 1515, 1566, 468, 1272, 1136, 1980, 1517, 6352, 3099, 167, 685, 606, 1525, 1905, 893, 2187, 2534, 1324, 2774, 2558, 2646, 2311, 2762, 1502, 3020, 1722, 3039, 763, 2412, 2098, 1290, 1295, 1970, 2628, 2379, 2316, 1688, 968, 2036, 1033, 1577, 2984, 2606, 2450, 3896, 1659, 1908, 2889, 1249, 4656, 916, 5958, 997, 2284, 951, 3731, 2150, 1628, 1158, 133, 554, 336, 433, 231, 2512, 4078, 2559, 38, 304, 2989, 1410, 1452, 4428, 2131, 259, 1222, 1671, 1202, 802, 2517, 1767, 5010, 1220, 2245, 1958, 1060, 2526, 1102, 2207, 1158, 2761, 2357, 1571, 912, 1709, 1469, 2162, 1123, 841, 2184, 1929, 744, 640, 746, 1680, 3459, 2198, 1053, 1162, 1518, 2547, 2433, 1574, 3256, 2904, 2522, 2513, 5803, 2340, 1666, 2559, 1256, 1480, 1190, 1028, 1308, 1382, 1630, 2494, 2124, 3083, 10168, 5667, 60

In [19]:
key = 115
print(ItemCF._itemSimMatrix[key])

{20808: 0.6348946444696385, 3036: 0.3239809109273099, 24213: 0.4156550819207423, 14496: 0.37909713813082835, 17166: 0.12922493522563297, 26812: 0.08550314032752404, 12948: 0.5186813889811476, 25180: 0.3623741156462223, 20573: 0.11947859139363787, 17696: 0.3973687444194953, 11363: 0.17276966535665395, 11498: 0.39497231598057186, 23505: 0.7053446164016923, 24149: 0.36383131347448566, 18873: 0.1406752546233946, 17533: 0.13287999234638923, 10982: 0.510150705330931, 4074: 0.44655116146588575, 11729: 1.0, 6418: 0.35996067646728636, 24868: 0.361584958970149, 27810: 0.3452587853424748, 293: 0.03655421774924465, 4822: 0.015675283135329246, 13261: 0.13842095221329503, 17011: 0.07066395162645754, 27954: 0.3421423556265305, 19319: 0.18817964021963232, 24609: 0.1601565612605614, 6210: 0.16474297096060606, 15611: 0.21761856356855486, 11675: 0.459939398338333, 8027: 0.13371350266115395, 478: 0.17988693424529215, 9735: 0.021983809033643453, 11913: 0.14635950827305377, 4491: 0.009784417540601437, 10640