## Description:
这是 MMOE（Multi-gate Mixture-of-Experts）多任务模型的一个 demo：用同一个模型同时预测阅读时长（duration，回归任务）和是否点击（click，二分类任务）。

In [13]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import random

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import roc_auc_score, mean_squared_error, mean_absolute_error
from deepctr.feature_column import SparseFeat, VarLenSparseFeat, DenseFeat
from deepctr.feature_column import get_feature_names

from MMOE import MMOE

import tensorflow as tf

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the training table; the first CSV column is used as the index and
# 'expo_time' is parsed into datetimes.
data_path = '../data_process'
train_csv_path = os.path.join(data_path, 'train_data.csv')
data = pd.read_csv(
    train_csv_path,
    index_col=0,
    parse_dates=['expo_time'],
)

In [3]:
# Keep only the columns that are used as features, labels or split key.
use_cols = [
    'user_id',
    'article_id',
    'expo_time',
    'net_status',
    'exop_position',
    'duration',
    'device',
    'city',
    'age',
    'gender',
    'img_num',
    'cat_1',
    'click',
]
data_new = data[use_cols]

In [4]:
# data_new is still too large to train on a local machine, so keep only the
# rows that belong to a random subset of users.
N_SAMPLED_USERS = 1000  # upper bound on the number of users kept
SAMPLE_SEED = 2021      # fixed seed so Restart & Run All selects the same users

users = set(data_new['user_id'])
# random.sample() requires a sequence: passing a set was deprecated in
# Python 3.9 and raises TypeError from 3.11. Sorting also makes the seeded
# draw independent of set iteration order. min() avoids the ValueError that
# random.sample() raises when fewer users are available than requested.
sampled_users = random.Random(SAMPLE_SEED).sample(
    sorted(users), min(N_SAMPLED_USERS, len(users))
)
# .copy() makes the subset an independent frame, so the column assignments in
# later cells do not operate on a slice of the original data.
data_new = data_new[data_new['user_id'].isin(sampled_users)].copy()

In [24]:
# Preview the first rows of the sampled frame.
# NOTE(review): this cell carries execution count 24, later than the cells
# below it, so the saved output presumably reflects the frame after
# preprocessing - TODO confirm by re-running the notebook top to bottom.
data_new.head()

Unnamed: 0,user_id,article_id,expo_time,net_status,exop_position,duration,device,city,age,gender,img_num,cat_1,click
10661,60,2174,2021-06-30 13:36:57,0,17,0,174,237,1,1,0.0,15,0
10662,60,4458,2021-06-30 13:36:57,0,21,0,174,237,1,1,0.033149,1,0
10663,60,4037,2021-06-30 13:40:23,0,24,0,174,237,1,1,0.033149,12,0
10664,60,3109,2021-06-30 13:36:57,0,14,0,174,237,1,1,0.038674,13,0
10665,60,14125,2021-07-03 06:10:46,0,7,0,174,237,1,1,0.027624,13,0


## 数据预处理

In [5]:
# Clean up img_num
def transform(x):
    """Convert one raw ``img_num`` cell to a number.

    Parameters
    ----------
    x : str or number
        Raw cell value. The literal string '上海' is a known bad value in this
        column and is mapped to 0.

    Returns
    -------
    int or float
        0 for the known bad value, otherwise ``float(x)``. NaN passes through
        unchanged as NaN.

    Raises
    ------
    ValueError
        If ``x`` is a string that is not a valid number.
    TypeError
        If ``x`` is of a type ``float()`` cannot convert (e.g. ``None``).
    """
    if x == '上海':
        return 0
    # float() parses numeric strings and also accepts ints / numpy scalars.
    # The previous eval() call executed arbitrary text coming from the CSV and
    # failed on non-float numbers because eval() only accepts strings.
    return float(x)
# Apply the element-wise cleaner to every value of the img_num column.
data_new['img_num'] = data_new['img_num'].apply(transform)

In [6]:
# Distinct user / article ids as they are before label encoding.
user_id_raw = data_new[['user_id']].drop_duplicates('user_id')
doc_id_raw = data_new[['article_id']].drop_duplicates('article_id')

# Basic preprocessing: sparse (categorical) columns are label-encoded,
# dense (numeric) columns are min-max scaled.
sparse_features = [
    'user_id', 'article_id', 'net_status', 'exop_position', 'device', 'city', 'age', 'gender', 'cat_1'
]
dense_features = [
    'img_num'
]

# Work on an independent frame so the assignments below never write into a
# slice of another DataFrame.
data_new = data_new.copy()

# Fill missing values.
# The sentinel must have the same type as the column: writing the string '-1'
# into a numeric column yields a mixed str/number column, which LabelEncoder
# rejects with a TypeError. Columns without missing values are left unchanged.
for feat in sparse_features:
    missing_token = -1 if pd.api.types.is_numeric_dtype(data_new[feat]) else '-1'
    data_new[feat] = data_new[feat].fillna(missing_token)
data_new[dense_features] = data_new[dense_features].fillna(0)

# Scale dense features into [0, 1].
mms = MinMaxScaler(feature_range=(0, 1))
data_new[dense_features] = mms.fit_transform(data_new[dense_features])

# Label-encode every sparse feature and record its vocabulary size
# (largest encoded index + 1), which the embedding layers need later.
feature_max_idx = {}
for feat in sparse_features:
    lbe = LabelEncoder()
    data_new[feat] = lbe.fit_transform(data_new[feat])
    feature_max_idx[feat] = data_new[feat].max() + 1

# Disabled: build user / doc dictionaries to map an encoded index back to the raw id.
# user_id_enc = data[['user_id']].drop_duplicates('user_id')
# doc_id_enc = data[['article_id']].drop_duplicates('article_id')
# user_idx_2_rawid = dict(zip(user_id_enc['user_id'], user_id_raw['user_id']))
# doc_idx_2_rawid = dict(zip(doc_id_enc['article_id'], doc_id_raw['article_id']))

In [7]:
# Split by exposure time: rows before the cut-off date are used for training,
# rows on or after it for testing.
split_date = '2021-07-06'
expo_time = data_new['expo_time']
train_data = data_new[expo_time < split_date]
test_data = data_new[expo_time >= split_date]

## 特征封装

In [10]:
# One 4-dimensional embedding column per sparse feature, sized by its vocabulary.
sparse_feature_columns = [
    SparseFeat(feat, feature_max_idx[feat], embedding_dim=4)
    for feat in sparse_features
]
# One scalar (dimension 1) column per dense feature.
dense_feature_columns = [DenseFeat(feat, 1) for feat in dense_features]

fixlen_feature_columns = sparse_feature_columns + dense_feature_columns

In [11]:
# Split into DNN and linear features.
# Only the DNN side is defined here: it reuses every fixed-length feature
# column (this is an alias of the same list object, not a copy).
dnn_features_columns = fixlen_feature_columns

In [14]:
# Names of the model input features derived from the feature columns; they are
# used below as the keys of the model input dicts.
feature_names = get_feature_names(dnn_features_columns)

In [15]:
# AttributeError: 'numpy.dtype[int64]' object has no attribute 'base_dtype'
# Keras needs the inputs to be declared as Keras tensors here; feeding other
# inputs such as numpy arrays does not work.
def _as_keras_inputs(frame):
    """Return ``{feature name: Keras constant tensor}`` for every model feature in ``frame``."""
    return {name: tf.keras.backend.constant(frame[name]) for name in feature_names}


train_model_input = _as_keras_inputs(train_data)
test_model_input = _as_keras_inputs(test_data)

## 模型训练和预测

In [16]:
# Two-task MMOE model: 'duration' is declared as a regression task and 'click'
# as a binary task (assumes MMOE pairs task_types with task_names by position
# - TODO confirm against the MMOE module).
model = MMOE(
    dnn_features_columns,
    tower_dnn_hidden_units=[],
    task_types=['regression', 'binary'],
    task_names=['duration', 'click'],
)

# Per-task losses; the combined loss weights click (0.98) far more heavily
# than duration (0.02).
model.compile(
    "adam",
    loss={"duration": "mean_squared_error", "click": "binary_crossentropy"},
    loss_weights={"duration": 0.02, "click": 0.98},
    metrics={"duration": "mae", "click": "binary_crossentropy"},
)

In [17]:
# Training labels for both tasks, wrapped as Keras constant tensors.
label_duration, label_click = (
    tf.keras.backend.constant(train_data[task].values)
    for task in ('duration', 'click')
)

In [20]:
# Labels are passed as a list in the order [duration, click].
train_labels = [label_duration, label_click]
# 10 epochs with batches of 128; 20% of the training samples are held out for
# validation.
history = model.fit(
    train_model_input,
    train_labels,
    batch_size=128,
    epochs=10,
    verbose=1,
    validation_split=0.2,
)

Train on 123209 samples, validate on 30803 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [21]:
# Predict both tasks on the test inputs. The evaluation cells below use
# pred_ans[0] as the duration predictions and pred_ans[1] as the click
# predictions.
pred_ans = model.predict(test_model_input, batch_size=256)

In [22]:
# ROC AUC of the click predictions on the test set, rounded to 4 decimals.
click_auc = roc_auc_score(test_data['click'], pred_ans[1])
print("test click AUC", round(click_auc, 4))

test click AUC 0.647


In [23]:
# Mean absolute error of the duration predictions on the test set, rounded to 4 decimals.
duration_mae = mean_absolute_error(test_data['duration'], pred_ans[0])
print("test duration", round(duration_mae, 4))

test duration 48.4138
