## Description:
搭建MIIND模型，完成新闻推荐数据集下的召回实验

In [9]:
import os
import time
import pickle
import random
from datetime import datetime
from collections import defaultdict

import numpy as np
import pandas as pd

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import LabelEncoder

# 从utils里面导入函数
from utils import gen_data_set, gen_model_input, train_mind_model
from utils import get_embeddings, get_mind_recall_res

import warnings
warnings.filterwarnings('ignore')

## 导入数据，划分呢数据集

In [2]:
data_path = '../data_process'
data = pd.read_csv(os.path.join(data_path, 'train_data.csv'), index_col=0, parse_dates=['expo_time'])

In [3]:
# 选择出需要用到的列
use_cols = ['user_id', 'article_id', 'expo_time', 'net_status', 'exop_position', 'duration', 'device', 'city', 'age', 'gender', 'click']
data_new = data[use_cols]

## 划分测试集和训练集 

In [4]:
# 按照用户分组，然后把最后一个item拿出来
click_df = data_new[data_new['click']==1]

In [5]:
def get_hist_and_last_click(all_click):
    all_click = all_click.sort_values(by=['user_id', 'expo_time'])
    click_last_df = all_click.groupby('user_id').tail(1)
    
    # 如果用户只有一个点击，hist为空了，会导致训练的时候这个用户不可见，此时默认泄露一下
    def hist_func(user_df):
        if len(user_df) == 1:
            return user_df
        else:
            return user_df[:-1]

    click_hist_df = all_click.groupby('user_id').apply(hist_func).reset_index(drop=True)

    return click_hist_df, click_last_df

In [6]:
user_click_hist_df, user_click_last_df = get_hist_and_last_click(click_df)

In [7]:
user_click_hist_df.head()

Unnamed: 0,user_id,article_id,expo_time,net_status,exop_position,duration,device,city,age,gender,click
0,17340,464481478,2021-06-30 20:34:47,2,21,27,iPhoneX,上海,A_30_39,male,1
1,17340,465148736,2021-07-02 19:35:03,5,23,49,iPhoneX,上海,A_30_39,male,1
2,17340,464707540,2021-07-02 19:47:06,5,25,174,iPhoneX,上海,A_30_39,male,1
3,17340,464993414,2021-07-02 19:47:06,5,27,11,iPhoneX,上海,A_30_39,male,1
4,17340,465115022,2021-07-02 20:01:34,5,41,14,iPhoneX,上海,A_30_39,male,1


## MIND召回

In [10]:
def mind_recall(data, topk=100, embedding_dim=8, his_seq_maxlen=50, negsample=0,
                      batch_size=64, epochs=1, verbose=1, validation_split=0.0):
    """通过MIND模型，计算用户向量和文章向量
    param: data: 用户日志数据
    topk: 对于每个用户，召回多少篇文章
    """
    user_id_raw = data[['user_id']].drop_duplicates('user_id')
    doc_id_raw = data[['article_id']].drop_duplicates('article_id')
    
    # 类别数据编码   
    base_features = ['user_id', 'article_id', 'city', 'age', 'gender']
    feature_max_idx = {}
    for f in base_features:
        lbe = LabelEncoder()
        data[f] = lbe.fit_transform(data[f])
        feature_max_idx[f] = data[f].max() + 1
        
    # 构建用户id词典和doc的id词典，方便从用户idx找到原始的id
    user_id_enc = data[['user_id']].drop_duplicates('user_id')
    doc_id_enc = data[['article_id']].drop_duplicates('article_id')
    user_idx_2_rawid = dict(zip(user_id_enc['user_id'], user_id_raw['user_id']))
    doc_idx_2_rawid = dict(zip(doc_id_enc['article_id'], doc_id_raw['article_id']))
    
    # 保存下每篇文章的被点击数量， 方便后面高热文章的打压
    doc_clicked_count_df = data.groupby('article_id')['click'].apply(lambda x: x.count()).reset_index()
    doc_clicked_count_dict = dict(zip(doc_clicked_count_df['article_id'], doc_clicked_count_df['click']))
    
    if os.path.exists('data/train_model_input.pkl'):
        train_model_input = pickle.load(open('data/train_model_input.pkl', 'rb'))
        train_label = np.load('data/train_label.npy')
        test_model_input = pickle.load(open('data/test_model_input.pkl', 'rb'))
        test_label = np.load('data/test_label.npy')
    else: 
        train_set, test_set = gen_data_set(data, doc_clicked_count_dict, negsample, control_users=False)

        # 构造MIND模型的输入
        train_model_input, train_label = gen_model_input(train_set, his_seq_maxlen)
        test_model_input, test_label = gen_model_input(test_set, his_seq_maxlen)
    
        # 保存一份输入直接，要不然每次都得这么构造输入太慢了
        pickle.dump(train_model_input, open('data/train_model_input.pkl', 'wb'))
        pickle.dump(test_model_input, open('data/test_model_input.pkl', 'wb'))
        np.save('data/train_label.npy', train_label)
        np.save('data/test_label.npy', test_label)
    
    # 构建模型并完成训练
    model = train_mind_model(train_model_input, train_label, embedding_dim, feature_max_idx, his_seq_maxlen, batch_size, epochs, verbose, validation_split)
    
    # 获得用户embedding和doc的embedding， 并进行保存
    user_embs, doc_embs = get_embeddings(model, test_model_input, user_idx_2_rawid, doc_idx_2_rawid)
    
    # MIND模型这里有k个兴趣向量，所以要分开进行召回
    user_embs1 = user_embs[:, 0, :]
    user_embs2 = user_embs[:, 1, :]
    
    # 对每个用户，拿到召回结果并返回回来
    user_recall_items_dict1 = get_mind_recall_res(user_embs1, doc_embs, user_idx_2_rawid, doc_idx_2_rawid, topk)
    user_recall_items_dict2 = get_mind_recall_res(user_embs1, doc_embs, user_idx_2_rawid, doc_idx_2_rawid, topk)
    
    # 合并，当然我这里没有去重
#     user_recall_items_dict = defaultdict(list)
#     for user in user_recall_items_dict1:
#         user_recall_items_dict[user] = user_recall_items_dict1[user] + user_recall_items_dict2[user]
    
    return user_recall_items_dict1, user_recall_items_dict2

In [11]:
user_recall_doc_dict1, user_recall_doc_dict2 = mind_recall(user_click_hist_df, negsample=0)

Train on 476201 samples


In [12]:
len(user_recall_doc_dict1), len(user_recall_doc_dict2)

(20000, 20000)

## 定义MIND模型

In [13]:
# 依次评估召回的前10, 20, 30, 40, 50个文章中的击中率
def metrics_recall(user_recall_items_dict, trn_last_click_df, topk=100):
    last_click_item_dict = dict(zip(trn_last_click_df['user_id'], trn_last_click_df['article_id']))
    user_num = len(user_recall_items_dict)
    
    for k in range(50, topk+1, 50):
        hit_num = 0
        for user, item_list in user_recall_items_dict.items():
            if user in last_click_item_dict:
                # 获取前k个召回的结果
                tmp_recall_items = [x[0] for x in user_recall_items_dict[user][:k]]
                if last_click_item_dict[user] in set(tmp_recall_items):
                    hit_num += 1
        
        hit_rate = round(hit_num * 1.0 / user_num, 5)
        print(' topk: ', k, ' : ', 'hit_num: ', hit_num, 'hit_rate: ', hit_rate, 'user_num : ', user_num)

In [14]:
metrics_recall(user_recall_doc_dict1, user_click_last_df, topk=200)

 topk:  50  :  hit_num:  28 hit_rate:  0.0014 user_num :  20000
 topk:  100  :  hit_num:  67 hit_rate:  0.00335 user_num :  20000
 topk:  150  :  hit_num:  67 hit_rate:  0.00335 user_num :  20000
 topk:  200  :  hit_num:  67 hit_rate:  0.00335 user_num :  20000


In [15]:
metrics_recall(user_recall_doc_dict2, user_click_last_df, topk=200)

 topk:  50  :  hit_num:  21 hit_rate:  0.00105 user_num :  20000
 topk:  100  :  hit_num:  64 hit_rate:  0.0032 user_num :  20000
 topk:  150  :  hit_num:  64 hit_rate:  0.0032 user_num :  20000
 topk:  200  :  hit_num:  64 hit_rate:  0.0032 user_num :  20000
