In [1]:
# import packages
import time, math, os
from tqdm import tqdm
import gc
import pickle
import random
from datetime import datetime
from operator import itemgetter
import numpy as np
import pandas as pd
import warnings
from collections import defaultdict
import collections
warnings.filterwarnings('ignore')

In [2]:
def get_all_click_data(data_path='./', offline=True):
    """Load the interaction log and drop exact duplicate interactions.

    Args:
        data_path: Directory that contains ``action.csv``. A trailing path
            separator is optional.
        offline: Accepted for interface compatibility; it is not used here.

    Returns:
        The DataFrame read from ``action.csv`` with rows de-duplicated on
        ``(user_id, dance_id, lable)`` (the first occurrence is kept).
    """
    # os.path.join tolerates a directory given without a trailing slash;
    # plain string concatenation turned 'data' into the file name 'dataaction.csv'.
    all_click = pd.read_csv(os.path.join(data_path, 'action.csv'))

    # 'lable' (sic) is the spelling of the column in the data file.
    all_click = all_click.drop_duplicates(subset=['user_id', 'dance_id', 'lable'])
    return all_click

In [3]:
# Load the interaction log from the default data path and display it.
data=get_all_click_data()
data

Unnamed: 0.1,Unnamed: 0,dance_id,user_id,lable
0,0,207000097,24696418,1.000
1,1,207000097,32507861,1.000
2,2,207000097,39060478,1.000
3,3,207000097,85909646,1.000
4,4,207000097,172874760,1.000
...,...,...,...,...
845547,845564,294218316,99992300,0.830
845548,845565,717648043,99993075,0.828
845549,845566,849276941,99996967,0.817
845550,845567,295113421,99997083,0.703


In [5]:
# Collect the dance videos each user interacted with.
# Format: {user1: [(item1, lable), (item2, lable), ...], ...}
def get_user_item_time_data(click_df):
    """Group the interaction log into per-user ``(dance_id, lable)`` lists.

    Args:
        click_df: DataFrame with ``user_id``, ``dance_id`` and ``lable`` columns.

    Returns:
        dict mapping each user_id (in sorted order) to the list of
        ``(dance_id, lable)`` tuples of that user, in the row order of
        ``click_df``. An empty frame yields an empty dict.
    """
    # The previous ``groupby(...)['dance_id', 'lable']`` selected columns with a
    # tuple, which pandas deprecated in 1.0 and rejects since 2.0. Iterating the
    # groups directly works on every pandas version and needs no reset_index /
    # rename round trip.
    user_item_time_dict = {}
    for user_id, user_rows in click_df.groupby('user_id'):
        user_item_time_dict[user_id] = list(zip(user_rows['dance_id'], user_rows['lable']))
    return user_item_time_dict

In [6]:
# Popular dance videos:
# pick the dance ids that occur most often in the interaction log.
def get_topk_item(click_df, k):
    """Return the ``k`` most frequent ``dance_id`` values as a pandas Index.

    The ids are ordered from most to least frequent; fewer than ``k`` ids are
    returned when the frame holds fewer distinct dances.
    """
    dance_counts = click_df['dance_id'].value_counts()
    ranked_dance_ids = dance_counts.index
    return ranked_dance_ids[:k]

In [7]:
# The 30 most frequently interacted dance ids.
topd = get_topk_item(data, k=30)

In [8]:
# Display the popular dance ids computed in the previous cell.
topd

Int64Index([504840051, 548500449, 676673014, 379536282, 850118371, 421575207,
            933076405, 720628602, 932346746, 806924735, 718644535, 677104627,
            720855610, 975641872, 547952243, 974831847, 337402390, 378365197,
            933580136, 462071916, 250508491, 590387741, 762172963, 420609102,
            975463977, 632239402, 722762762, 207776215, 975620441, 420017732],
           dtype='int64')

In [8]:
# 协同过滤(基于物品)
# 基于物品的协同过滤itemcf的物品相似度计算

In [9]:
# Directory prefix for the artifacts this notebook writes
# (the similarity pickle and the submission CSV).
save_path='temp/'
# Create it up front: writing into a missing directory fails on a fresh
# checkout, which breaks Restart Kernel -> Run All.
os.makedirs(save_path, exist_ok=True)

In [10]:
def cal_itemcf_sim(df):
    """
        Item-based collaborative filtering: build the dance-to-dance similarity matrix.

        Every pair of distinct dances found in the same user's history adds
        1 / log(1 + n) to the pair's weight, where n is the length of that
        user's history. Each accumulated weight is then divided by
        sqrt(cnt_i * cnt_j), where cnt_x counts the history entries of dance x.

        df: interaction table, forwarded to get_user_item_time_data
        return: nested dict {dance_i: {dance_j: similarity}}; the same object is
                also pickled to save_path + 'itemcf_i2i_sim.pkl'
    """
    # {user: [(dance_id, lable), ...]}
    user_item_time_dict = get_user_item_time_data(df)

    # Accumulate the raw co-occurrence weights.
    i2i_sim = {}
    item_cnt = defaultdict(int)
    for item_time_list in tqdm(user_item_time_dict.values()):
        if not item_time_list:
            # Nothing to pair up; also keeps log(1) == 0 out of the division below.
            continue
        # Loop-invariant for this user, so compute it once instead of per pair.
        co_click_weight = 1 / math.log(len(item_time_list) + 1)
        for i, _ in item_time_list:
            item_cnt[i] += 1
            related_items = i2i_sim.setdefault(i, {})
            for j, _ in item_time_list:
                if i == j:
                    continue
                related_items[j] = related_items.get(j, 0) + co_click_weight

    # Normalise in place. The previous shallow ``.copy()`` shared the inner
    # dicts anyway, so it never produced an independent result.
    for i, related_items in i2i_sim.items():
        for j, wij in related_items.items():
            related_items[j] = wij / math.sqrt(item_cnt[i] * item_cnt[j])

    # Persist the matrix; the context manager closes (and flushes) the file.
    with open(save_path + 'itemcf_i2i_sim.pkl', 'wb') as sim_file:
        pickle.dump(i2i_sim, sim_file)

    return i2i_sim

In [11]:
# Build (and pickle) the dance-to-dance similarity matrix from the full log.
i2i_sim = cal_itemcf_sim(df=data)

100%|████████████████████████████████████████████████████████████████████████████████████████████| 235156/235156 [00:06<00:00, 38712.79it/s]


In [12]:
# Item-to-item (i2i) recall based on dance similarity
def item_based_recommend(user_id, user_item_time_dict, i2i_sim, sim_item_topk, recall_item_num, topk_items):
    """
        Item-based collaborative-filtering recall for one user.

        user_id: id of the user to recommend for
        user_item_time_dict: dict {user: [(item, value), ...]} holding each user's history;
                             only the item ids are used here
        i2i_sim: dict {item_i: {item_j: similarity}}
        sim_item_topk: int, number of most similar dances taken per history item
        recall_item_num: int, number of dances to return
        topk_items: ordered iterable of popular dance ids used to pad short results

        return: list of (item, score) tuples sorted by score in descending order,
                at most recall_item_num long

        Raises KeyError when user_id is not in user_item_time_dict.
    """

    # Dances the user already interacted with; these are never recommended
    # through the similarity step.
    user_past_items = user_item_time_dict[user_id]
    seen_items = {item for item, _ in user_past_items}

    # Sum the similarity of every candidate over the user's history.
    item_rank = {}
    for i, _ in user_past_items:
        # A history item missing from the matrix simply contributes no candidates.
        neighbours = sorted(i2i_sim.get(i, {}).items(), key=lambda x: x[1], reverse=True)[:sim_item_topk]
        for j, wij in neighbours:
            if j in seen_items:
                continue
            item_rank[j] = item_rank.get(j, 0) + wij

    # Too few candidates: pad with popular dances.
    if len(item_rank) < recall_item_num:
        for i, item in enumerate(topk_items):
            # Membership must be tested against the keys. Testing against
            # ``.items()`` (key/score tuples) never matched, so an already
            # recalled dance had its real score overwritten by a filler score.
            if item in item_rank:
                continue
            # Filler scores start at -100 and decrease with popularity rank, so
            # fillers keep their order and sort below non-negative similarities.
            item_rank[item] = - i - 100
            if len(item_rank) == recall_item_num:
                break

    item_rank = sorted(item_rank.items(), key=lambda x: x[1], reverse=True)[:recall_item_num]

    return item_rank

In [13]:
# Per-user recall results: {user_id: [(dance_id, score), ...]}
user_recall_items_dict = collections.defaultdict(dict)

# Per-user interaction history: {user_id: [(dance_id, lable), ...]}
user_item_time_dict = get_user_item_time_data(data)

# Load the dance similarity matrix. The context manager closes the file handle
# that a bare ``pickle.load(open(...))`` left open.
# NOTE: unpickling can execute arbitrary code - only load files this notebook wrote.
with open(save_path + 'itemcf_i2i_sim.pkl', 'rb') as sim_file:
    i2i_sim = pickle.load(sim_file)

# Number of most similar dances considered per history item
sim_item_topk = 30

# Number of dances recalled per user
recall_item_num = 20

# Popular dances used to pad users with too few candidates
topk_items = get_topk_item(data, k=50)

for user in tqdm(data['user_id'].unique()):
    user_recall_items_dict[user] = item_based_recommend(user, user_item_time_dict, i2i_sim,
                                                        sim_item_topk, recall_item_num, topk_items)

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 235156/235156 [03:37<00:00, 1079.55it/s]


In [14]:
# Flatten the per-user recall dict into one [user, item, score] row per recalled dance
user_item_score_list = [
    [user, item, score]
    for user, items in tqdm(user_recall_items_dict.items())
    for item, score in items
]

recall_df = pd.DataFrame(user_item_score_list, columns=['user_id', 'click_dance_id', 'pred_score'])

100%|████████████████████████████████████████████████████████████████████████████████████████████| 235156/235156 [00:09<00:00, 25674.65it/s]


In [15]:
# Build the submission file
def submit(recall_df, topk=20, model_name=None, save_dir=None):
    """Write each user's top-``topk`` recalled dances to a wide-format CSV.

    Args:
        recall_df: Long-format frame with ``user_id``, ``pred_score`` and one
            item-id column (one row per recalled dance).
        topk: Number of dances kept per user; every user must have at least
            this many rows.
        model_name: Optional file-name prefix. When None the file is named
            after the date only (previously None raised a TypeError).
        save_dir: Optional output directory. When None the module-level
            ``save_path`` prefix is used, exactly as before.

    Raises:
        AssertionError: if some user has fewer than ``topk`` recalled dances.

    The output has one row per user with the columns ``user_id``,
    ``dance_1`` .. ``dance_<topk>`` and is written to
    ``<model_name>_<MM-DD>.csv``.
    """
    ranked = recall_df.sort_values(by=['user_id', 'pred_score'])
    ranked['rank'] = ranked.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')

    # Every user needs at least ``topk`` recalled dances.
    max_rank_per_user = ranked.groupby('user_id')['rank'].max()
    assert max_rank_per_user.min() >= topk, 'every user needs at least topk recalled dances'

    del ranked['pred_score']
    wide = ranked[ranked['rank'] <= topk].set_index(['user_id', 'rank']).unstack(-1).reset_index()

    # After unstack/reset_index the second column level holds '' for user_id
    # and the rank for each dance column. ``rank()`` yields floats, so the old
    # ``isinstance(col, int)`` branch never ran; convert explicitly and build
    # the names for any ``topk`` instead of a hard-coded 1..20 mapping.
    column_names = []
    for col in wide.columns.droplevel(0):
        if isinstance(col, str):
            column_names.append('user_id' if col == '' else col)
        else:
            column_names.append('dance_%d' % int(col))
    wide.columns = column_names

    date_tag = datetime.today().strftime('%m-%d')
    if model_name is None:
        file_name = date_tag + '.csv'
    else:
        file_name = model_name + '_' + date_tag + '.csv'

    if save_dir is None:
        # Unchanged legacy behaviour: ``save_path`` is used as a plain prefix.
        save_name = save_path + file_name
    else:
        save_name = os.path.join(save_dir, file_name)
    wide.to_csv(save_name, index=False, header=True)

In [16]:
# Load the test set and collect its distinct users
tst_click = pd.read_csv('action.csv')
tst_users = tst_click['user_id'].unique()

# Keep only the recall rows that belong to test-set users
tst_recall = recall_df.loc[recall_df['user_id'].isin(tst_users)]



In [17]:
# Display the recall rows kept for the test-set users.
tst_recall

Unnamed: 0,user_id,click_dance_id,pred_score
0,24696418,377803425,0.089549
1,24696418,890301542,0.081032
2,24696418,505595973,0.071809
3,24696418,933653977,0.069412
4,24696418,377119806,0.064263
...,...,...,...
4703115,603294845,976337342,0.021556
4703116,603294845,763088269,0.020440
4703117,603294845,589779739,0.020264
4703118,603294845,550085132,0.019676


In [None]:
# Write the submission file for the ItemCF baseline
submit(
    tst_recall,
    topk=20,
    model_name='itemcf_baseline',
)