baseline  ItemCF

In [1]:
import numpy as np
import pandas as pd
import os
import warnings
from tqdm import tqdm
from pathlib import Path

from collections import defaultdict
import logging
logger = logging.getLogger(__name__)
import math
import pickle
# Directory that holds the competition CSV files; every loader below reads from it.
# NOTE(review): absolute, machine-specific path - adjust before running elsewhere.
data_path=Path(r"C:\Users\Administrator\Desktop\tianchi\data")

# Output directory for intermediate artifacts (kept as a plain string, as before).
save_path=r'C:\Users\Administrator\Desktop\tianchi\data\save'
# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs(save_path, exist_ok=True)

In [2]:
# Memory-saving helper
def reduce_mem(df):
    """Downcast the columns of ``df`` to smaller dtypes and report the saving.

    The frame is modified in place and also returned.

    * ``int*`` columns are cast to the smallest of int8/int16/int32 that can
      hold the column's min and max (left unchanged if none fits).
    * ``float*`` columns are cast to float32.
    * ``object`` columns whose unique/total ratio is below 0.5 become
      ``category``.
    * Every other dtype (bool, unsigned ints, category, datetimes, ...) is
      left untouched, so the function can be called again on its own output.

    :param df: DataFrame to shrink.
    :return: the same DataFrame object.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print(f'Before:{start_mem:.2f} MB')
    n_rows = len(df)
    for col in df.columns:
        col_type = df[col].dtype
        # Not a string (object) column, i.e. a numeric or other typed column
        if col_type != object:
            type_name = str(col_type)
            if type_name.startswith('int'):
                # min/max are only needed (and only safe) for integer columns;
                # computing them for e.g. unordered categoricals would raise.
                c_min = df[col].min()
                c_max = df[col].max()
                for candidate in (np.int8, np.int16, np.int32):
                    limits = np.iinfo(candidate)
                    if c_min >= limits.min and c_max <= limits.max:
                        df[col] = df[col].astype(candidate)
                        break
            elif type_name.startswith('float'):
                df[col] = df[col].astype(np.float32)

        # String (object) column
        else:
            # Guard against an empty frame before dividing by the row count.
            if n_rows and df[col].nunique() / n_rows < 0.5:
                df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    print(f'After:{end_mem:.2f} MB')
    decreased = (1 - end_mem / start_mem) * 100 if start_mem else 0.0
    print(f'Decreased:{decreased:.2f}%')

    return df

#得到一份「用户 × 文章 × 点击时间」的点击日志 DataFrame

In [3]:
# Load the full training click log from data_path and preview its first rows.
train_log=pd.read_csv(data_path / 'train_click_log.csv')
train_log.head()
# Recorded on the author's run: train_log.shape == (1112623, 9)

Unnamed: 0,user_id,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
0,199999,160417,1507029570190,4,1,17,1,13,1
1,199999,5408,1507029571478,4,1,17,1,13,1
2,199999,50823,1507029601478,4,1,17,1,13,1
3,199998,157770,1507029532200,4,1,17,1,25,5
4,199998,96613,1507029671831,4,1,17,1,25,5


In [4]:
# Debug helper: work on a random subset of users
def get_all_click_sample(data_path,sample_nums=10000):
    """Read a click log and keep only the clicks of a random sample of users.

    :param data_path: path of the click-log CSV file (passed straight to
        ``pd.read_csv``); the file must contain ``user_id``,
        ``click_article_id`` and ``click_timestamp`` columns.
    :param sample_nums: number of distinct users to draw without replacement.
        Values larger than the number of users in the file are clamped, so
        the whole file is returned instead of raising ``ValueError``.
    :return: DataFrame with the sampled users' clicks, exact duplicate
        (user, article, timestamp) rows removed. The original row index is
        kept.
    """
    all_click = pd.read_csv(data_path)
    all_user_ids = all_click.user_id.unique()

    # Sample users without replacement; never ask for more users than exist.
    sample_size = min(sample_nums, len(all_user_ids))
    sample_user_ids = np.random.choice(all_user_ids, size=sample_size, replace=False)
    all_click = all_click[all_click['user_id'].isin(sample_user_ids)]

    # Drop duplicated click records
    all_click = all_click.drop_duplicates(['user_id', 'click_article_id', 'click_timestamp'])
    return all_click
def get_all_click_df(data_path, offline=True):
    """Load the click log used for recall, with duplicate clicks removed.

    :param data_path: ``pathlib.Path`` of the directory containing
        ``train_click_log.csv`` (and ``testA_click_log.csv`` when
        ``offline`` is false).
    :param offline: if true, use only the first 20000 training rows;
        otherwise concatenate the first 10000 training rows with the first
        10000 test-A rows.
    :return: DataFrame of clicks with duplicate
        (user_id, click_article_id, click_timestamp) rows dropped and a fresh
        0..n-1 index.
    """
    # nrows reads only the debug-sized prefix instead of loading the whole
    # file and slicing it afterwards.
    if offline:
        all_click = pd.read_csv(data_path / 'train_click_log.csv', nrows=20000)
    else:
        trn_click = pd.read_csv(data_path / 'train_click_log.csv', nrows=10000)
        tst_click = pd.read_csv(data_path / 'testA_click_log.csv', nrows=10000)
        all_click = pd.concat([trn_click, tst_click])
    # drop_duplicates/reset_index return new frames; the result has to be
    # assigned back, otherwise the de-duplication is silently lost.
    all_click = all_click.drop_duplicates(
        ['user_id', 'click_article_id', 'click_timestamp']
    ).reset_index(drop=True)
    return all_click
# offline=False: get_all_click_df combines the train and testA click logs
# (a 10000-row prefix of each).
all_click_df = get_all_click_df(data_path, offline=False)
# Preview the first rows of the combined log.
all_click_df.head()

Unnamed: 0,user_id,click_article_id,click_timestamp,click_environment,click_deviceGroup,click_os,click_country,click_region,click_referrer_type
0,199999,160417,1507029570190,4,1,17,1,13,1
1,199999,5408,1507029571478,4,1,17,1,13,1
2,199999,50823,1507029601478,4,1,17,1,13,1
3,199998,157770,1507029532200,4,1,17,1,25,5
4,199998,96613,1507029671831,4,1,17,1,25,5


In [5]:
# Build each user's click sequence ordered by click time:
# {user1: [(item1, time1), (item2, time2), ...], user2: [(item1, time1), ...], ...}
def get_user_item_time(click_df):
    """Group clicks by user into time-ordered ``(article_id, timestamp)`` lists.

    :param click_df: DataFrame with ``user_id``, ``click_article_id`` and
        ``click_timestamp`` columns.
    :return: dict mapping each user id to a list of
        ``(click_article_id, click_timestamp)`` tuples sorted by ascending
        timestamp. Keys are inserted in ascending user-id order. An empty
        frame yields an empty dict.
    """
    # A stable sort keeps clicks with identical timestamps in their input
    # order, so the result is deterministic.
    ordered = click_df.sort_values('click_timestamp', kind='mergesort')
    clicks_by_user = defaultdict(list)
    # Single pass over the sorted rows instead of a per-group Python apply.
    for user, article, clicked_at in zip(ordered['user_id'],
                                         ordered['click_article_id'],
                                         ordered['click_timestamp']):
        clicks_by_user[user].append((article, clicked_at))
    # Emit users in ascending id order (the order a sorted groupby would give).
    return {user: clicks_by_user[user] for user in sorted(clicks_by_user)}
# The k most-clicked articles in the given click log
def get_item_topk_click(click_df, k):
    """Return the ids of the ``k`` most frequently clicked articles.

    :param click_df: DataFrame with a ``click_article_id`` column.
    :param k: number of article ids to keep.
    :return: pandas Index of article ids, most-clicked first.
    """
    article_ids = click_df['click_article_id']
    # value_counts orders by descending frequency; its index holds the ids.
    ranked_ids = article_ids.value_counts().index
    return ranked_ids[:k]

In [6]:
# ItemCF: item-to-item similarity computation
def itemcf_sim(df):
    """Compute the article-to-article similarity matrix and pickle it.

    For every user, each ordered pair of distinct articles (i, j) in that
    user's click list adds ``1 / log(1 + n_clicks_of_user)`` to the raw
    weight of (i, j), so very active users contribute less per pair. The raw
    weight is then divided by ``sqrt(item_cnt[i] * item_cnt[j])`` so that
    popular articles do not look similar to everything.

    Side effects: prints the first (user, click list) entry as a sanity
    check and writes the result to ``data_path / 'itemcf_i2i_sim.pkl'``
    (``data_path`` is the notebook-level directory).

    :param df: click-log DataFrame accepted by ``get_user_item_time``.
    :return: nested dict ``{item_i: {item_j: similarity}}``.
    """
    user_item_time_dict = get_user_item_time(df)
    i2i_sim = {}
    item_cnt = defaultdict(int)
    # Sanity-check print of one entry; skipped when there are no users.
    if user_item_time_dict:
        print(next(iter(user_item_time_dict.items())))

    for _, item_time_list in tqdm(user_item_time_dict.items(), disable = not logger.isEnabledFor(logging.DEBUG)):
        # Per-user damping factor, hoisted out of the pair loops: users with
        # long click lists should not dominate the similarity.
        user_weight = 1 / math.log(len(item_time_list) + 1)
        for i, _ in item_time_list:
            item_cnt[i] += 1
            related = i2i_sim.setdefault(i, {})
            for j, _ in item_time_list:
                # Only pairs of different articles count as co-occurrence.
                if i == j:
                    continue
                related[j] = related.get(j, 0) + user_weight

    # Normalise by item popularity. A new nested dict is built so the raw
    # co-occurrence weights are not overwritten through a shallow copy.
    i2i_sim_ = {
        i: {
            j: wij / math.sqrt(item_cnt[i] * item_cnt[j])
            for j, wij in related_items.items()
        }
        for i, related_items in i2i_sim.items()
    }

    # Context manager guarantees the pickle file is flushed and closed.
    with open(data_path / 'itemcf_i2i_sim.pkl', 'wb') as sim_file:
        pickle.dump(i2i_sim_, sim_file)
    return i2i_sim_

# Build the item-item similarity matrix from the combined click log; itemcf_sim
# also prints one sample entry and writes itemcf_i2i_sim.pkl under data_path.
i2i_sim = itemcf_sim(all_click_df)

(196381, [(233917, 1507036600306)])


item_rank：字典（dict）类型，用于存储待推荐文章的最终推荐分数
key：文章 ID（待推荐的目标文章）
value：该文章的推荐分数 / 相关度分数
2. 示例数据

```python
#### 用户历史阅读的文章 ID 列表
user_history = [101, 203]

#### 文章相似度矩阵（i2i_sim）：key为基准文章ID，value为该文章与其他文章的相似度字典
i2i_sim = {
    101: {201: 0.9, 202: 0.7, 203: 0.4},  # 文章101与201/202/203的相似度
    203: {201: 0.5, 204: 0.8}             # 文章203与201/204的相似度
}

#### 初始化推荐分数字典
item_rank = {}
```
3. 分数计算逻辑（分步拆解）

第一步：基于历史文章 101 累加分数

遍历 101 的相似文章，跳过用户已读的 203，累加其余文章分数：

```python
#### 处理文章101的相似数据
for sim_article, score in i2i_sim[101].items():
    # 跳过用户已阅读的文章
    if sim_article in user_history:
        continue
    # 初始化分数（不存在则设为0），再累加相似度分数
    item_rank.setdefault(sim_article, 0)
    item_rank[sim_article] += score

#### 此时 item_rank 结果：
#### item_rank = {201: 0.9, 202: 0.7}
```
第二步：基于历史文章 203 累加分数

遍历 203 的相似文章，继续累加分数到已存在的 item_rank 中：

```python
#### 处理文章203的相似数据
for sim_article, score in i2i_sim[203].items():
    if sim_article in user_history:
        continue
    item_rank.setdefault(sim_article, 0)
    item_rank[sim_article] += score

#### 此时 item_rank 最终结果：
#### item_rank = {201: 1.4, 202: 0.7, 204: 0.8}
```

In [7]:
# Recall
def item_based_recommend(user_id, user_item_time_dict, i2i_sim, sim_item_topk, recall_item_num, item_topk_click):
    """Recall candidate articles for one user with item-based CF.

    :param user_id: id of the user to recommend for.
    :param user_item_time_dict: ``{user_id: [(article_id, click_time), ...]}``.
    :param i2i_sim: ``{article_i: {article_j: similarity}}``.
    :param sim_item_topk: how many most-similar articles to take per
        history article.
    :param recall_item_num: number of articles to return.
    :param item_topk_click: iterable of popular article ids (most popular
        first), used to pad the result when similarity alone yields too few
        candidates.
    :return: list of ``(article_id, score)`` sorted by descending score, at
        most ``recall_item_num`` long. Padded popular articles carry negative
        scores (``-position - 100``) so they always rank after
        similarity-based candidates.
    :raises KeyError: if ``user_id`` is not in ``user_item_time_dict``.
    """
    user_hist_items = user_item_time_dict[user_id]
    # The history holds (article, time) tuples; membership tests must use the
    # bare article ids, otherwise already-read articles are never filtered.
    seen_items = {item for item, _ in user_hist_items}

    item_rank = {}
    for i, _ in user_hist_items:
        # Only the top-k neighbours of each history article are considered
        # (keeps the candidate set small). Articles missing from the matrix
        # simply contribute nothing.
        neighbours = sorted(i2i_sim.get(i, {}).items(), key=lambda x: x[1], reverse=True)[:sim_item_topk]
        for j, wij in neighbours:
            # Never recommend what the user has already read.
            if j in seen_items:
                continue
            item_rank[j] = item_rank.get(j, 0) + wij

    # Pad with popular articles when there are not enough candidates.
    if len(item_rank) < recall_item_num:
        for i, item in enumerate(item_topk_click):
            # Skip popular articles that are already candidates so their
            # similarity score is not overwritten by a negative one.
            if item in item_rank:
                continue
            # Negative score marks a padded article and keeps it at the tail.
            item_rank[item] = -i - 100
            if len(item_rank) >= recall_item_num:
                break  # enough candidates collected

    # Final ranking: best recall_item_num candidates by score.
    item_rank = sorted(item_rank.items(), key=lambda x: x[1], reverse=True)[:recall_item_num]

    return item_rank
# item_rank is the recall list for a single user

In [8]:
# Recommend articles to every user with item-based collaborative filtering
user_recall_items_dict=defaultdict(dict)
user_item_time_dict = get_user_item_time(all_click_df)
# Reload the similarity matrix written by itemcf_sim. The context manager closes
# the file handle. NOTE: only unpickle files produced by this notebook -
# pickle.load can execute arbitrary code from untrusted files.
with open(data_path / 'itemcf_i2i_sim.pkl', 'rb') as sim_file:
    i2i_sim=pickle.load(sim_file)
sim_item_topk=10  # number of similar articles taken as candidates per history article
recall_item_num=10  # number of articles finally recalled per user
item_topk_click=get_item_topk_click(all_click_df,k=50)
# k is 50, larger than the final recall size: the author's rationale is that a
# user may already have seen many popular articles, so a small k could leave
# too few popular articles to pad the recall list.

for user in tqdm(all_click_df['user_id'].unique(), disable=not logger.isEnabledFor(logging.DEBUG)):
    user_recall_items_dict[user] = item_based_recommend(user, user_item_time_dict, i2i_sim, sim_item_topk, recall_item_num, item_topk_click)

In [17]:
# Flatten {user: [(item, score), ...]} into one [user, item, score] row per
# recalled article.
user_item_score_list=[]
for user, items in tqdm(user_recall_items_dict.items(), disable=not logger.isEnabledFor(logging.DEBUG)):
    for item, score in items:
        user_item_score_list.append([user, item, score])

# NOTE(review): the column is spelled 'click_artical_id' here, whereas the click
# logs use 'click_article_id' - confirm before joining on it.
recall_df = pd.DataFrame(user_item_score_list, columns=['user_id', 'click_artical_id', 'pred_score'])
recall_df.head()

Unnamed: 0,user_id,click_artical_id,pred_score
0,199999,107301,0.178689
1,199999,50864,0.150742
2,199999,160974,0.144116
3,199999,50383,0.135146
4,199999,158536,0.110413


In [24]:
from datetime import datetime
def submit(recall_df, topk=5, model_name=None):
    """Write the top-``topk`` recalled articles per user as a submission CSV.

    The output has one row per user with columns ``user_id``,
    ``article_1`` ... ``article_<topk>`` and is saved to
    ``data_path / '<model_name>_<MM-DD>.csv'`` (``data_path`` is the
    notebook-level directory).

    :param recall_df: DataFrame with ``user_id``, ``pred_score`` and one
        further column holding the recalled article id. The caller's frame
        is not modified.
    :param topk: number of articles to keep per user.
    :param model_name: prefix of the output file name; ``None`` falls back
        to ``'submit'`` instead of failing on string concatenation.
    :raises AssertionError: if any user has fewer than ``topk`` recalled
        articles (or ``recall_df`` is empty).
    """
    # sort_values returns a new frame, so adding the rank column below does
    # not touch the caller's data.
    recall_df = recall_df.sort_values(by=['user_id', 'pred_score'])
    recall_df['rank'] = recall_df.groupby(['user_id'])['pred_score'].rank(ascending=False, method='first')

    # Every user must have at least topk articles. Aggregating the selected
    # column avoids the deprecated groupby.apply over the grouping columns.
    max_rank_per_user = recall_df.groupby('user_id')['rank'].max()
    assert max_rank_per_user.min() >= topk, 'every user needs at least topk recalled articles'

    top_rows = (
        recall_df[recall_df['rank'] <= topk]
        .drop(columns='pred_score')
        # rank() yields floats; integer ranks give clean 1..topk column labels
        .assign(rank=lambda frame: frame['rank'].astype(int))
    )
    # Pivot to one row per user with one column per rank position.
    wide = top_rows.set_index(['user_id', 'rank']).unstack(-1).reset_index()
    wide.columns = [
        label if isinstance(label, str) else int(label)
        for label in wide.columns.droplevel(0)
    ]
    # After droplevel the user_id column is labelled ''; rank columns are 1..topk.
    column_names = {'': 'user_id'}
    column_names.update({position: f'article_{position}' for position in range(1, topk + 1)})
    wide = wide.rename(columns=column_names)

    file_prefix = model_name if model_name is not None else 'submit'
    save_name = data_path / (file_prefix + '_' + datetime.today().strftime('%m-%d') + '.csv')
    wide.to_csv(save_name, index=False, header=True)

In [25]:
# Keep only recall rows for users that appear in the testA click log
# (the first 10000 rows of the file, as loaded here).
tst_click=pd.read_csv(data_path/'testA_click_log.csv')[:10000]
tst_users=tst_click['user_id'].unique()
tst_recall = recall_df[recall_df['user_id'].isin(tst_users)]
tst_recall.head()

Unnamed: 0,user_id,click_artical_id,pred_score
36190,249999,300470,0.293846
36191,249999,162300,0.277295
36192,249999,158536,0.269468
36193,249999,16129,0.248123
36194,249999,202557,0.173329


In [26]:
# Generate the submission file (original note: not executed here)
submit(tst_recall, topk=5, model_name='itemcf_baseline')

  tmp = recall_df.groupby('user_id').apply(lambda x:x['rank'].max())  #tmp → 每个用户的最大 rank（文章数量）
