## Description:
这个是sharedBottom模型的demo, 尝试在中级API的基础上，加一些loss优化的思路， 这次是DTP

In [1]:
import os
import math
import numpy as np
import pandas as pd
from tqdm import tqdm
import random

from tqdm import tqdm

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import roc_auc_score, mean_squared_error, mean_absolute_error
from deepctr.feature_column import SparseFeat, VarLenSparseFeat, DenseFeat
from deepctr.feature_column import get_feature_names
from SharedBottom import SharedBottom

import tensorflow as tf
from tensorflow.keras import backend as K

import warnings
warnings.filterwarnings('ignore')

config = tf.compat.v1.ConfigProto()
config.gpu_options.allow_growth = True      # TensorFlow按需分配显存
config.gpu_options.per_process_gpu_memory_fraction = 0.5  # 指定显存分配比例
tf.compat.v1.keras.backend.set_session(tf.compat.v1.Session(config=config))

# tf.config.experimental_run_functions_eagerly(True)

DeepCTR version 0.9.0 detected. Your version is 0.8.2.
Use `pip install -U deepctr` to upgrade.Changelog: https://github.com/shenweichen/DeepCTR/releases/tag/v0.9.0








In [2]:
data_path = '../data_process'
data = pd.read_csv(os.path.join(data_path, 'train_data.csv'), index_col=0, parse_dates=['expo_time'])

In [3]:
# 选择出需要用到的列
use_cols = ['user_id', 'article_id', 'expo_time', 'net_status', 'exop_position', 'duration', 'device', 'city', 'age', 'gender', 'img_num', 'cat_1', 'click']
data_new = data[use_cols]

In [4]:
# 由于这个data_new的数据量还是太大， 我电脑训练不动， 所以这里再进行一波抽样
users = set(data_new['user_id'])
sampled_users = random.sample(users, 1000)
data_new = data_new[data_new['user_id'].isin(sampled_users)]

## 数据预处理

In [5]:
# 处理img_num
def transform(x):
    if x == '上海':
        return 0
    elif isinstance(x, float):
        return float(x)
    else:
        return float(eval(x))
data_new['img_num'] = data_new['img_num'].apply(lambda x: transform(x))

In [6]:
user_id_raw = data_new[['user_id']].drop_duplicates('user_id')
doc_id_raw = data_new[['article_id']].drop_duplicates('article_id')

# 简单数据预处理
sparse_features = [
    'user_id', 'article_id', 'net_status', 'exop_position', 'device', 'city', 'age', 'gender', 'cat_1'
]
dense_features = [
    'img_num'
]

# 填充缺失值
data_new[sparse_features] = data_new[sparse_features].fillna('-1')
data_new[dense_features] = data_new[dense_features].fillna(0)

# 归一化
mms = MinMaxScaler(feature_range=(0, 1))
data_new[dense_features] = mms.fit_transform(data_new[dense_features])

feature_max_idx = {}
for feat in sparse_features:
    lbe = LabelEncoder()
    data_new[feat] = lbe.fit_transform(data_new[feat])
    feature_max_idx[feat] = data_new[feat].max() + 1000

# 构建用户id词典和doc的id词典，方便从用户idx找到原始的id
# user_id_enc = data[['user_id']].drop_duplicates('user_id')
# doc_id_enc = data[['article_id']].drop_duplicates('article_id')
# user_idx_2_rawid = dict(zip(user_id_enc['user_id'], user_id_raw['user_id']))
# doc_idx_2_rawid = dict(zip(doc_id_enc['article_id'], doc_id_raw['article_id']))

In [7]:
# 这里需要单独把duration标签处理下， 映射到0-1之间
duration_mms= MinMaxScaler()
data_new['duration'] = duration_mms.fit_transform(data_new['duration'].values.reshape(-1, 1))

In [8]:
# 划分数据集  这里按照曝光时间划分
train_data = data_new[data_new['expo_time'] < '2021-07-03']
test_data = data_new[data_new['expo_time'] >= '2021-07-06']

## 特征封装

In [9]:
sparse_feature_columns = [SparseFeat(feat, feature_max_idx[feat], embedding_dim=4) for feat in sparse_features]
Dense_feature_columns = [DenseFeat(feat, 1) for feat in dense_features]

In [10]:
# 划分dnn和linear特征
dnn_features_columns = sparse_feature_columns + Dense_feature_columns
lhuc_feature_columns = sparse_feature_columns

In [11]:
feature_names = get_feature_names(dnn_features_columns)

In [12]:
# AttributeError: 'numpy.dtype[int64]' object has no attribute 'base_dtype' 
# Keras需要把输入声明为Keras张量，其他的比如numpy张量作为输入不好使
train_model_input = {name: tf.keras.backend.constant(train_data[name]) for name in feature_names}
test_model_input = {name: tf.keras.backend.constant(test_data[name]) for name in feature_names}

## 模型搭建

In [13]:
model = SharedBottom(dnn_features_columns, lhuc_feature_columns, tower_dnn_hidden_units=[], task_types=['regression', 'binary'], 
             task_names=['duration', 'click'])

In [14]:
# model.summary()

In [15]:
# 绘制模型结构  
# GraphViz's executables not found  
# 安装软件，配置环境变量即可  https://graphviz.org/download/
# from tensorflow import keras
# keras.utils.plot_model(model, to_file='./shared_bottom.png', show_shapes=True)

## 模型的训练和预测

In [16]:
label_duration = tf.keras.backend.constant(train_data['duration'].values)
label_click = tf.keras.backend.constant(train_data['click'].values)

In [17]:
# 构建数据管道
batch_size = 128
train_ds = tf.data.Dataset.from_tensor_slices((train_model_input, (label_duration, label_click))).shuffle(buffer_size=100).batch(batch_size).prefetch(tf.data.experimental.AUTOTUNE)

In [18]:
# 模型训练这里，需要用到底层的训练脚本，这里不能用高层keras的API
optimizer = tf.keras.optimizers.Adam(learning_rate=0.005)

In [19]:
# 这里为了让每个任务又可比性，还是用loss值进行评估
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_reg_loss = tf.keras.metrics.Mean(name='train_reg_loss')
train_bin_loss = tf.keras.metrics.Mean(name='train_bin_loss')

In [20]:
# loss fuc这里需要自定义focal_loss
def focal_loss_binary(y_true, y_pred, gamma=2., alpha=.25):
    """
     Multi-labels Focal loss formula:
            FL = -alpha * (z-p)^gamma * log(p) -(1-alpha) * p^gamma * log(1-p)
                 ,which alpha = 0.25, gamma = 2, p = sigmoid(x), z = target_tensor.
    """
    # 这里是过滤，对于正样本，y_true等于1的位置保留y_pred，为0的地方置为1， 因为log1=0，负样本保留为0的地方，log1-0=0
    pt_1 = tf.where(tf.equal(y_true, 1), y_pred, tf.ones_like(y_pred))
    pt_0 = tf.where(tf.equal(y_true, 0), y_pred, tf.zeros_like(y_pred))
    
    return  -K.mean(alpha * K.pow(1.-pt_1, gamma) * K.log(pt_1+K.epsilon())) - K.mean((1-alpha) * K.pow(pt_0, gamma) * K.log(1.-pt_0+K.epsilon()))

def focal_loss_reg(y_true, y_pred, gamma=2.):
    
    mse = 1 / 2 * K.pow(y_true-y_pred, 2)
    # 保证在0-1之间
    mse = tf.nn.sigmoid(mse)
    return -K.mean(K.pow(1.-mse, gamma) * K.log(mse))

def focal_task_weight(k, r):
    # 保证在0-1之间
    k = tf.nn.sigmoid(k)
    return -K.pow(1-k, r) * K.log(k)

In [21]:
loss_func = {"binary": focal_loss_binary, "regression": focal_loss_reg}

In [22]:
@tf.function
def train_step(features, labels, task_types, weight=[1.0, 1.0], gamma_0=1.0):
    losses = []
    
    with tf.GradientTape() as tape:
        # 遍历每个任务
        for i, task_type in enumerate(task_types):
            out = model(features, training=True)
            task_loss = loss_func[task_types[i]](out[i], labels[i], gamma_0)

            losses.append(weight[i] * task_loss)
            
        loss = tf.add_n(losses)
        gradients = tape.gradient(loss, model.trainable_variables)
    
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    train_loss.update_state(loss)
    train_reg_loss.update_state(losses[0])
    train_bin_loss.update_state(losses[1])
    return loss, losses[0], losses[1]

In [23]:
epochs = 2
batch_nums = math.ceil(train_data.shape[0] / batch_size)

task_types = ["regression", "binary"]
task_weight = [1.0, 1.0]
task_gamma = [1.0, 1.0]

for epoch in tqdm(range(epochs)):
    
    train_loss.reset_states()
    train_reg_loss.reset_states()
    train_bin_loss.reset_states()

    for feature, labels in train_ds:
        loss, loss_reg, loss_bin = train_step(feature, labels, task_types, task_weight)
    
    # 更新task weight FL(kt,rt)
    task_weight = [focal_task_weight(train_reg_loss.result(), task_gamma[0]), focal_task_weight(train_bin_loss.result(), task_gamma[1])]

    template = 'Epoch {}, Loss: {} - regression_loss: {} - binary_loss:{}, loss_weight: {}-{}'
    print(template.format(epoch, train_loss.result(), 
                          train_reg_loss.result(),
                          train_bin_loss.result(), task_weight[0], task_weight[1]))

  0%|                                                                                            | 0/2 [00:00<?, ?it/s]











 50%|██████████████████████████████████████████                                          | 1/2 [00:19<00:19, 19.44s/it]

Epoch 0, Loss: 3.3285491466522217 - regression_loss: 0.00014336765161715448 - binary_loss:3.3284056186676025, loss_weight: 0.3465128540992737-0.0012190319830551744






100%|████████████████████████████████████████████████████████████████████████████████████| 2/2 [00:34<00:00, 17.06s/it]

Epoch 1, Loss: 0.004154697526246309 - regression_loss: 0.0 - binary_loss:0.004154697526246309, loss_weight: 0.3465735912322998-0.34481820464134216





In [24]:
pred_ans = model.predict(test_model_input, batch_size=256)

In [25]:
print("test click AUC", round(roc_auc_score(test_data['click'], pred_ans[1]), 4))

test click AUC 0.4885


In [33]:
print("test duration", round(mean_absolute_error(duration_mms.inverse_transform(test_data['duration'].values.reshape(-1, 1)), 
                                                 duration_mms.inverse_transform(pred_ans[0]).reshape(-1, 1)), 4))

test duration 21146.9607


这种一个回归一个分类的时候，回归这个不行直接。 不能这么玩。