## Description:
这个是sharedBottom模型的demo, 尝试在中级API的基础上，加一些loss优化的思路， 这次是DWA

In [1]:
import os
import math
import numpy as np
import pandas as pd
from tqdm import tqdm
import random

from tqdm import tqdm

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import roc_auc_score, mean_squared_error, mean_absolute_error
from deepctr.feature_column import SparseFeat, VarLenSparseFeat, DenseFeat
from deepctr.feature_column import get_feature_names
from SharedBottom import SharedBottom

import tensorflow as tf

import warnings
warnings.filterwarnings('ignore')

config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True      # TensorFlow按需分配显存
config.gpu_options.per_process_gpu_memory_fraction = 0.5  # 指定显存分配比例
tf.compat.v1.keras.backend.set_session(tf.compat.v1.Session(config=config))

# tf.config.experimental_run_functions_eagerly(True)

DeepCTR version 0.9.0 detected. Your version is 0.8.2.
Use `pip install -U deepctr` to upgrade.Changelog: https://github.com/shenweichen/DeepCTR/releases/tag/v0.9.0








In [2]:
data_path = '../data_process'
data = pd.read_csv(os.path.join(data_path, 'train_data.csv'), index_col=0, parse_dates=['expo_time'])

In [3]:
# 选择出需要用到的列
use_cols = ['user_id', 'article_id', 'expo_time', 'net_status', 'exop_position', 'duration', 'device', 'city', 'age', 'gender', 'img_num', 'cat_1', 'click']
data_new = data[use_cols]

In [4]:
# 由于这个data_new的数据量还是太大， 我电脑训练不动， 所以这里再进行一波抽样
users = set(data_new['user_id'])
sampled_users = random.sample(users, 1000)
data_new = data_new[data_new['user_id'].isin(sampled_users)]

## 数据预处理

In [5]:
# 处理img_num
def transform(x):
    if x == '上海':
        return 0
    elif isinstance(x, float):
        return float(x)
    else:
        return float(eval(x))
data_new['img_num'] = data_new['img_num'].apply(lambda x: transform(x))

In [6]:
user_id_raw = data_new[['user_id']].drop_duplicates('user_id')
doc_id_raw = data_new[['article_id']].drop_duplicates('article_id')

# 简单数据预处理
sparse_features = [
    'user_id', 'article_id', 'net_status', 'exop_position', 'device', 'city', 'age', 'gender', 'cat_1'
]
dense_features = [
    'img_num'
]

# 填充缺失值
data_new[sparse_features] = data_new[sparse_features].fillna('-1')
data_new[dense_features] = data_new[dense_features].fillna(0)

# 归一化
mms = MinMaxScaler(feature_range=(0, 1))
data_new[dense_features] = mms.fit_transform(data_new[dense_features])

feature_max_idx = {}
for feat in sparse_features:
    lbe = LabelEncoder()
    data_new[feat] = lbe.fit_transform(data_new[feat])
    feature_max_idx[feat] = data_new[feat].max() + 1000

# 构建用户id词典和doc的id词典，方便从用户idx找到原始的id
# user_id_enc = data[['user_id']].drop_duplicates('user_id')
# doc_id_enc = data[['article_id']].drop_duplicates('article_id')
# user_idx_2_rawid = dict(zip(user_id_enc['user_id'], user_id_raw['user_id']))
# doc_idx_2_rawid = dict(zip(doc_id_enc['article_id'], doc_id_raw['article_id']))

In [7]:
# 划分数据集  这里按照曝光时间划分
train_data = data_new[data_new['expo_time'] < '2021-07-03']
test_data = data_new[data_new['expo_time'] >= '2021-07-06']

## 特征封装

In [8]:
sparse_feature_columns = [SparseFeat(feat, feature_max_idx[feat], embedding_dim=4) for feat in sparse_features]
Dense_feature_columns = [DenseFeat(feat, 1) for feat in dense_features]

In [9]:
# 划分dnn和linear特征
dnn_features_columns = sparse_feature_columns + Dense_feature_columns
lhuc_feature_columns = sparse_feature_columns

In [10]:
feature_names = get_feature_names(dnn_features_columns)

In [11]:
# AttributeError: 'numpy.dtype[int64]' object has no attribute 'base_dtype' 
# Keras需要把输入声明为Keras张量，其他的比如numpy张量作为输入不好使
train_model_input = {name: tf.keras.backend.constant(train_data[name]) for name in feature_names}
test_model_input = {name: tf.keras.backend.constant(test_data[name]) for name in feature_names}

## 模型搭建

In [12]:
model = SharedBottom(dnn_features_columns, lhuc_feature_columns, tower_dnn_hidden_units=[], task_types=['regression', 'binary'], 
             task_names=['duration', 'click'])

In [13]:
# model.summary()

In [14]:
# 绘制模型结构  
# GraphViz's executables not found  
# 安装软件，配置环境变量即可  https://graphviz.org/download/
# from tensorflow import keras
# keras.utils.plot_model(model, to_file='./shared_bottom.png', show_shapes=True)

## 模型的训练和预测

In [15]:
label_duration = tf.keras.backend.constant(train_data['duration'].values)
label_click = tf.keras.backend.constant(train_data['click'].values)

In [16]:
# 构建数据管道
batch_size = 128
train_ds = tf.data.Dataset.from_tensor_slices((train_model_input, (label_duration, label_click))).shuffle(buffer_size=100).batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)

In [17]:
# 模型训练这里，需要用到底层的训练脚本，这里不能用高层keras的API
optimizer = tf.keras.optimizers.Adam(learning_rate=0.005)

train_loss = tf.keras.metrics.Mean(name='train_loss')
train_reg_loss = tf.keras.metrics.Mean(name='train_reg_loss')
train_bin_loss = tf.keras.metrics.Mean(name='train_bin_loss')
loss_func = {"binary": tf.keras.losses.binary_crossentropy, "regression": tf.keras.losses.mean_squared_error}

In [18]:
@tf.function
def train_step(features, labels, task_types, weight=[0.02, 0.98]):
    losses = []
    
    with tf.GradientTape() as tape:
        # 遍历每个任务
        for i, task_type in enumerate(task_types):
            out = model(features, training=True)
            task_loss = loss_func[task_types[i]](out[i], labels[i])

            losses.append(weight[i] * task_loss)
            
        loss = tf.add_n(losses)
        gradients = tape.gradient(loss, model.trainable_variables)
    
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    train_loss.update_state(loss)
    train_reg_loss.update_state(losses[0])
    train_bin_loss.update_state(losses[1])
    return loss, losses[0], losses[1]

In [21]:
epochs = 10
K = 2
T = 2
batch_nums = math.ceil(train_data.shape[0] / batch_size)

task_types = ["regression", "binary"]

# 这里的task_weight 就不用tf.Variables了，因为不用梯度更新
task_weight = np.zeros([2, epochs], dtype=np.float32)
avg_cost = np.zeros([epochs, 2], dtype=np.float32)  # reg_loss, bin_loss

dynamic_weight_average = True

for epoch in tqdm(range(epochs)):
    
    # 累加损失
    # acc_cost = np.zeros(2, dtype=np.float32)
    
    # 如果使用动态加权平均，注意依然是epoch层面, 更新权重
    if dynamic_weight_average:
        # 初始化
        if epoch == 0 or epoch == 1:
            w_1 = 1.0
            w_2 = 1.0
            task_weight[0, epoch] = K*np.exp(w_1/T) / (np.exp(w_1/T) + np.exp(w_2/T))
            task_weight[1, epoch] = K*np.exp(w_2/T) / (np.exp(w_1/T) + np.exp(w_2/T))
        else: 
            # 获取每个任务的loss下降比率
            w_1 = avg_cost[epoch-1, 0] / avg_cost[epoch-2, 0]
            w_2 = avg_cost[epoch-1, 1] / avg_cost[epoch-2, 1]
            # 修改权重
            task_weight[0, epoch] = K*np.exp(w_1/T) / (np.exp(w_1/T) + np.exp(w_2/T))
            task_weight[1, epoch] = K*np.exp(w_2/T) / (np.exp(w_1/T) + np.exp(w_2/T))
    else:
        task_weight[0, epoch], task_weight[1, epoch] = 1.0, 1.0
    
    train_loss.reset_states()
    train_reg_loss.reset_states()
    train_bin_loss.reset_states()

    for feature, labels in train_ds:
        loss, loss_reg, loss_bin = train_step(feature, labels, task_types, task_weight[:,  epoch])
        
#         acc_cost[0] += sum(loss_reg) / batch_size
#         acc_cost[1] += sum(loss_bin) / batch_size
        
    # 更新avg_cost train_reg_loss.result算的就是平均损失， 每个batch的平均损失之和/batch_num
    avg_cost[epoch, 0] = train_reg_loss.result()
    avg_cost[epoch, 1] = train_bin_loss.result()
    # print(avg_cost[epoch], acc_cost/batch_nums)   这两个等价， 但差距就是有的不够batch_size个的，后面这个会按照batch_size个计算，所以不如前面这个精准
    
    template = 'Epoch {}, Loss: {} - regression_loss: {} - binary_loss:{}, loss_weight: {}-{}'
    print(template.format(epoch, train_loss.result(), 
                          train_reg_loss.result(),
                          train_bin_loss.result(), task_weight[0, epoch], task_weight[1, epoch]))

 10%|████████▎                                                                          | 1/10 [00:22<03:22, 22.54s/it]

Epoch 0, Loss: 6638.009765625 - regression_loss: 6635.50830078125 - binary_loss:2.4948813915252686, loss_weight: 1.0-1.0


 20%|████████████████▌                                                                  | 2/10 [00:33<02:06, 15.79s/it]

Epoch 1, Loss: 6536.70849609375 - regression_loss: 6534.333984375 - binary_loss:2.371436595916748, loss_weight: 1.0-1.0


 30%|████████████████████████▉                                                          | 3/10 [00:44<01:34, 13.47s/it]

Epoch 2, Loss: 6560.888671875 - regression_loss: 6558.53466796875 - binary_loss:2.3511602878570557, loss_weight: 1.0085577964782715-0.9914422631263733


 40%|█████████████████████████████████▏                                                 | 4/10 [00:55<01:14, 12.45s/it]

Epoch 3, Loss: 6507.11328125 - regression_loss: 6504.74609375 - binary_loss:2.3642385005950928, loss_weight: 1.003063440322876-0.996936559677124


 50%|█████████████████████████████████████████▌                                         | 5/10 [01:06<00:59, 11.94s/it]

Epoch 4, Loss: 6453.0849609375 - regression_loss: 6450.70703125 - binary_loss:2.379669189453125, loss_weight: 0.9965590834617615-1.0034409761428833


 60%|█████████████████████████████████████████████████▊                                 | 6/10 [01:17<00:46, 11.56s/it]

Epoch 5, Loss: 6442.40771484375 - regression_loss: 6440.0302734375 - binary_loss:2.38036847114563, loss_weight: 0.99629145860672-1.0037086009979248


 70%|██████████████████████████████████████████████████████████                         | 7/10 [01:27<00:33, 11.33s/it]

Epoch 6, Loss: 6460.50537109375 - regression_loss: 6458.13720703125 - binary_loss:2.3726625442504883, loss_weight: 0.999512791633606-1.000487208366394


 80%|██████████████████████████████████████████████████████████████████▍                | 8/10 [01:38<00:22, 11.19s/it]

Epoch 7, Loss: 6465.93701171875 - regression_loss: 6463.568359375 - binary_loss:2.368056297302246, loss_weight: 1.0015122890472412-0.9984877705574036


 90%|██████████████████████████████████████████████████████████████████████████▋        | 9/10 [01:49<00:11, 11.07s/it]

Epoch 8, Loss: 6456.97314453125 - regression_loss: 6454.60400390625 - binary_loss:2.3698625564575195, loss_weight: 1.0006955862045288-0.9993044137954712


100%|██████████████████████████████████████████████████████████████████████████████████| 10/10 [02:00<00:00, 12.03s/it]

Epoch 9, Loss: 6449.74609375 - regression_loss: 6447.37109375 - binary_loss:2.372725009918213, loss_weight: 0.9994626045227051-1.000537395477295





In [22]:
pred_ans = model.predict(test_model_input, batch_size=256)

In [23]:
print("test click AUC", round(roc_auc_score(test_data['click'], pred_ans[1]), 4))

test click AUC 0.491


In [24]:
print("test duration", round(mean_absolute_error(test_data['duration'], pred_ans[0]), 4))

test duration 39.7805
