## Description:
这是一个 SharedBottom 多任务模型的 demo：底层共享双线性特征交互与 lhuc_net，上层为每个任务（阅读时长 duration 回归、点击 click 二分类）各接一个独立的输出塔。

In [1]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import random

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.metrics import roc_auc_score, mean_squared_error, mean_absolute_error
from deepctr.feature_column import SparseFeat, VarLenSparseFeat, DenseFeat
from deepctr.feature_column import get_feature_names

import tensorflow as tf

import warnings
warnings.filterwarnings('ignore')

# TF1-compat session setup: let TensorFlow grow GPU memory on demand and set the
# per-process GPU memory fraction to 0.5, then register the session with Keras.
config = tf.compat.v1.ConfigProto(
    gpu_options=tf.compat.v1.GPUOptions(
        allow_growth=True,
        per_process_gpu_memory_fraction=0.5,
    )
)
tf.compat.v1.keras.backend.set_session(tf.compat.v1.Session(config=config))

DeepCTR version 0.9.0 detected. Your version is 0.8.2.
Use `pip install -U deepctr` to upgrade.Changelog: https://github.com/shenweichen/DeepCTR/releases/tag/v0.9.0








In [2]:
# Load the training table: the first CSV column becomes the index and
# `expo_time` is parsed as a datetime column.
data_path = '../data_process'
data = pd.read_csv(
    os.path.join(data_path, 'train_data.csv'),
    index_col=0,
    parse_dates=['expo_time'],
)

In [3]:
# Keep only the columns used below: the feature columns, the exposure
# timestamp (used for the train/test split) and the two labels
# (`duration`, `click`).
use_cols = [
    'user_id', 'article_id', 'expo_time', 'net_status', 'exop_position',
    'duration', 'device', 'city', 'age', 'gender', 'img_num', 'cat_1',
    'click',
]
data_new = data[use_cols]

In [4]:
# data_new is still too large to train on a laptop, so keep only the rows that
# belong to (at most) 1000 randomly chosen users.
# random.sample() no longer accepts a set (deprecated in Python 3.9, TypeError
# since 3.11), so the population is passed as a sorted list; together with the
# fixed seed this makes the draw reproducible after a kernel restart.
# min(...) guards against a ValueError when fewer than 1000 users exist.
random.seed(2021)
users = set(data_new['user_id'])
sampled_users = random.sample(sorted(users), min(1000, len(users)))
data_new = data_new[data_new['user_id'].isin(sampled_users)]

In [24]:
# Preview the first rows of the sampled table.
# NOTE(review): this cell carries execution count In [24], i.e. it was run after
# later cells, so the rendered output presumably shows already-preprocessed
# values rather than the raw sample -- re-run in order to confirm.
data_new.head()

Unnamed: 0,user_id,article_id,expo_time,net_status,exop_position,duration,device,city,age,gender,img_num,cat_1,click
10661,60,2174,2021-06-30 13:36:57,0,17,0,174,237,1,1,0.0,15,0
10662,60,4458,2021-06-30 13:36:57,0,21,0,174,237,1,1,0.033149,1,0
10663,60,4037,2021-06-30 13:40:23,0,24,0,174,237,1,1,0.033149,12,0
10664,60,3109,2021-06-30 13:36:57,0,14,0,174,237,1,1,0.038674,13,0
10665,60,14125,2021-07-03 06:10:46,0,7,0,174,237,1,1,0.027624,13,0


## 数据预处理

In [5]:
# Clean the raw `img_num` column.
def transform(x):
    """Convert one raw ``img_num`` cell into a number.

    Args:
        x: Raw cell value, either already numeric or a string.

    Returns:
        ``0`` for the stray value ``'上海'``; otherwise the value as a ``float``.

    Raises:
        ValueError / SyntaxError: if a string is neither accepted by
            ``float()`` nor a Python literal.

    Notes:
        The original implementation called ``eval`` on strings read from the
        data file, which executes arbitrary code and fails on plain ``int``
        inputs. Parsing now goes through ``float()`` first and falls back to
        ``ast.literal_eval`` for other literal spellings (e.g. ``'0x10'``).
        Arithmetic expressions are intentionally no longer evaluated.
    """
    import ast  # local import so the cell does not depend on the import cell

    if x == '上海':
        return 0
    try:
        # Handles float/int (including numpy scalars) and ordinary numeric strings.
        return float(x)
    except (TypeError, ValueError):
        # Remaining literal spellings; never executes code.
        return float(ast.literal_eval(x))
# Run every `img_num` cell through transform() and write the result back.
data_new['img_num'] = data_new['img_num'].apply(transform)

In [6]:
# Snapshot the distinct user / article ids before they are label-encoded.
user_id_raw = data_new[['user_id']].drop_duplicates('user_id')
doc_id_raw = data_new[['article_id']].drop_duplicates('article_id')

# Basic preprocessing: declare which columns are categorical and which are continuous.
sparse_features = [
    'user_id', 'article_id', 'net_status', 'exop_position', 'device', 'city', 'age', 'gender', 'cat_1'
]
dense_features = [
    'img_num'
]

# data_new is a filtered slice of a larger frame; take an explicit copy so the
# column assignments below write to this frame (no SettingWithCopy ambiguity).
data_new = data_new.copy()

# Fill missing values.
data_new[sparse_features] = data_new[sparse_features].fillna('-1')
data_new[dense_features] = data_new[dense_features].fillna(0)

# Scale the dense features into [0, 1].
# NOTE(review): the scaler is fitted before the train/test split made further
# down, so test rows take part in the min/max statistics.
mms = MinMaxScaler(feature_range=(0, 1))
data_new[dense_features] = mms.fit_transform(data_new[dense_features])

feature_max_idx = {}
for feat in sparse_features:
    lbe = LabelEncoder()
    # Cast to str first: after fillna('-1') a numeric column with gaps would mix
    # numbers and the string '-1', which LabelEncoder refuses to sort/encode.
    data_new[feat] = lbe.fit_transform(data_new[feat].astype(str))
    # LabelEncoder emits ids 0..n_classes-1, so the number of classes is the
    # exact vocabulary size (the former `max() + 1000` only padded every
    # embedding table with 999 unused rows).
    feature_max_idx[feat] = len(lbe.classes_)

# Build user-id / doc-id dictionaries so the raw id can be recovered from the
# encoded index (currently disabled).
# user_id_enc = data[['user_id']].drop_duplicates('user_id')
# doc_id_enc = data[['article_id']].drop_duplicates('article_id')
# user_idx_2_rawid = dict(zip(user_id_enc['user_id'], user_id_raw['user_id']))
# doc_idx_2_rawid = dict(zip(doc_id_enc['article_id'], doc_id_raw['article_id']))

In [7]:
# Time-based split on the exposure timestamp: rows before 2021-07-06 are used
# for training, rows from that date onwards for testing.
train_data = data_new.loc[data_new['expo_time'] < '2021-07-06']
test_data = data_new.loc[data_new['expo_time'] >= '2021-07-06']

## 特征封装

In [8]:
# One SparseFeat (4-dim embedding, vocabulary size taken from feature_max_idx)
# per categorical feature, and one 1-dim DenseFeat per continuous feature.
sparse_feature_columns = []
for feat in sparse_features:
    sparse_feature_columns.append(SparseFeat(feat, feature_max_idx[feat], embedding_dim=4))

Dense_feature_columns = []
for feat in dense_features:
    Dense_feature_columns.append(DenseFeat(feat, 1))

In [9]:
# Feature groups: the DNN side consumes sparse + dense columns, while the LHUC
# side reuses the sparse columns (same list object, not a copy).
dnn_features_columns = [*sparse_feature_columns, *Dense_feature_columns]
lhuc_feature_columns = sparse_feature_columns

In [10]:
# Names of all model input features, as reported by DeepCTR for the DNN columns;
# used below as the keys of the model-input dicts.
feature_names = get_feature_names(dnn_features_columns)

In [11]:
# Feeding raw numpy/pandas columns raised
#   AttributeError: 'numpy.dtype[int64]' object has no attribute 'base_dtype'
# in this setup, so every input column is wrapped as a Keras backend constant
# before being handed to the model.
train_model_input = dict(
    (name, tf.keras.backend.constant(train_data[name])) for name in feature_names
)
test_model_input = dict(
    (name, tf.keras.backend.constant(test_data[name])) for name in feature_names
)

## 模型搭建

In [12]:
import tensorflow as tf
from tensorflow.keras.layers import *
from tensorflow.keras.models import *
from tensorflow.python.keras.models import Model
from tensorflow.python.keras.initializers import Zeros, glorot_normal
from tensorflow.python.keras.regularizers import l2

from collections import OrderedDict
import itertools
from deepctr.feature_column import SparseFeat, VarLenSparseFeat, DenseFeat

In [13]:
from ShareBottom import DNN, build_input_layers, build_embedding_layers, concat_embedding_list, combined_dnn_input, concat_func, PredictionLayer

## Lhuc_net:
lhuc_net（LHUC，Learning Hidden Unit Contributions）：源自语音识别领域的模型，核心思想是做说话人自适应。其关键做法是在 DNN 网络中为每个说话人学习一组特定的隐层单元贡献（hidden unit contribution），从而提升针对不同说话人的识别效果。

In [14]:
def lhuc_net(name, nn_inputs, lhuc_inputs, nn_hidden_units=(128, 64, ), lhuc_units=(32, ), 
             dnn_activation='relu', l2_reg_dnn=0, dnn_dropout=0, dnn_use_bn=False, scale_last=True, seed=2021):
    """Stack of DNN layers whose inputs are gated by an LHUC side network.

    Before every hidden layer, ``lhuc_inputs`` is passed through its own small
    ``DNN`` tower followed by a sigmoid ``Dense`` layer that has as many units
    as the current representation. The representation is multiplied
    element-wise by that gate and by 2.0 (so each factor lies in (0, 2)) and
    then fed to the hidden layer.

    Args:
        name: Prefix used to name the DNN sub-layers of this net.
        nn_inputs: Tensor to be transformed, e.g. concatenated embeddings or the
            output of another module. Its size on axis 1 must be static.
        lhuc_inputs: Tensor feeding every gate tower (the personalisation
            features).
        nn_hidden_units: Width of each gated hidden layer.
        lhuc_units: Hidden widths of each gate tower.
        dnn_activation, l2_reg_dnn, dnn_dropout, dnn_use_bn, seed: Passed
            positionally / by keyword to every ``DNN`` block built here.
        scale_last: If true, the final output is gated once more.

    Returns:
        The output tensor of the last hidden layer (gated again when
        ``scale_last`` is true).
    """

    def _gate_for(target, idx):
        # Gate tower: lhuc_inputs -> DNN(lhuc_units) -> sigmoid Dense sized to
        # match axis 1 of the tensor that will be scaled.
        tower_out = DNN(lhuc_units, dnn_activation, l2_reg_dnn, dnn_dropout, dnn_use_bn,
                        seed=seed, name="{}_lhuc_{}".format(name, idx))(lhuc_inputs)
        return Dense(int(target.shape[1]), activation='sigmoid')(tower_out)

    hidden_state = nn_inputs
    for depth, width in enumerate(nn_hidden_units):
        gate = _gate_for(hidden_state, depth)
        # One single-layer DNN block per entry of nn_hidden_units, applied to
        # the gated representation.
        dense_block = DNN((width, ), dnn_activation, l2_reg_dnn, dnn_dropout, dnn_use_bn,
                          seed=seed, name="{}_layer_{}".format(name, depth))
        hidden_state = dense_block(hidden_state * gate * 2.0)

    if scale_last:
        # Optional extra gate on the final output; it is indexed right after
        # the per-layer gates.
        gate = _gate_for(hidden_state, len(nn_hidden_units))
        hidden_state = hidden_state * gate * 2.0

    return hidden_state

上面我觉得关键是 lhuc_slot 用的特征是什么，为什么这样可以对 fm_embedding 或者 bias_embedding 进行加权操作。需要搞明白这里面用到的特征之间的制约性或者相关性。
* lhuc_feature: 主要是用户id， doc_id，doc_类别， doc_字数， doc_作者等拼接， 这些都是用户和item的强烈代表特征， 这个拼接的embedding代表的是用户对于item的兴趣偏好
* 待交互的特征：
    * bias_nn_inputs: 这里一般是原始的特征embedding拼接起来，代表特征的原始信息
    * 其他模块输出，比如fm的输出: 这个是能产生交互的特征embedding，代表的是重要的特征交互信息

所以，lhuc_net主要是在原始信息或者是特征交互信息过DNN的每一层时，先对DNN每一层的输出的每个维度，根据用户对于item的兴趣偏好进行加权，来提升每一层DNN输出的不同维度的贡献程度，从而体现用户的个性化信息(相比于不加lhuc_net)，此外，还能进行降维，毕竟通过个性化进行了一波选择嘛。有点像Fibinet那里的se模块，不过那个是对每个embedding进行加权筛选，而这里是对DNN输出(可以看成一个embedding)的每个维度进行加权。

所以我感觉这个lhuc_net的思路也是非常不错的，相当于在原来的基础上，通过用户对于视频的兴趣偏好，对embedding的各个维度进行加权，提升不同维度的贡献程度。相当于只提取了更加重要的一些维度信息，既节省了计算量，又避免了维度冗余。

In [15]:
class BilinearInteraction(Layer):
    """Pairwise bilinear interaction over a stack of field embeddings.

    For every field pair ``(i, j)`` with ``i < j`` the layer computes
    ``(v_i . W) * v_j`` (matrix product followed by a Hadamard product).
    Which ``W`` is used depends on ``bilinear_type``:

    * ``"all"``: one matrix shared by every pair;
    * ``"each"``: one matrix per left-hand field ``i``;
    * ``"interaction"``: one matrix per pair.

    Input:  tensor of shape ``(batch, field_num, embed_dim)``.
    Output: tensor of shape ``(batch, field_num * (field_num - 1) / 2, embed_dim)``.

    Args:
        bilinear_type: ``"all"``, ``"each"`` or ``"interaction"``.
        seed: Seed handed to the ``glorot_normal`` initializer of every matrix.
        **kwargs: Forwarded to the base ``Layer``.

    Raises:
        ValueError: on an unknown ``bilinear_type``, or (in ``build``) when the
            input has fewer than two fields.
    """

    _BILINEAR_TYPES = ("all", "each", "interaction")

    def __init__(self, bilinear_type="interaction", seed=2022, **kwargs):
        # Fail early with a clear message; previously an unknown type only
        # surfaced as a NameError inside call().
        if bilinear_type not in self._BILINEAR_TYPES:
            raise ValueError("bilinear_type must be one of {}, {} is illegal".format(
                self._BILINEAR_TYPES, bilinear_type))
        super(BilinearInteraction, self).__init__(**kwargs)
        self.bilinear_type = bilinear_type
        self.seed = seed

    def build(self, input_shape):
        # input_shape: [None, field_num, embed_dim]
        self.field_size = int(input_shape[1])
        self.embedding_size = int(input_shape[-1])
        if self.field_size < 2:
            raise ValueError("BilinearInteraction needs at least 2 fields, got {}".format(self.field_size))

        # NOTE(review): every matrix uses an initializer built with the same
        # seed; depending on the TF version this may make all matrices start
        # from identical values -- confirm if distinct initialisation matters.
        if self.bilinear_type == 'all':  # a single W shared by all pairs
            self.W = self.add_weight(shape=(self.embedding_size, self.embedding_size), 
                                     initializer=glorot_normal(seed=self.seed), name="bilinear_weight")
        elif self.bilinear_type == "each":  # one W per left-hand field (the last field never appears on the left)
            self.W_list = [self.add_weight(shape=(self.embedding_size, self.embedding_size), initializer=glorot_normal(
                seed=self.seed), name="bilinear_weight" + str(i)) for i in range(self.field_size-1)]
        elif self.bilinear_type == "interaction":  # one W per field pair
            self.W_list = [self.add_weight(shape=(self.embedding_size, self.embedding_size), initializer=glorot_normal(
                seed=self.seed), name="bilinear_weight" + str(i) + '_' + str(j)) for i, j in
                           itertools.combinations(range(self.field_size), 2)]
        super(BilinearInteraction, self).build(input_shape)  # Be sure to call this somewhere!
    
    def call(self, inputs):
        # inputs: [None, field_num, embed_dim]
        # Split along the field axis into field_num tensors of shape
        # (None, 1, embed_dim); the singleton field axis is kept.
        inputs = tf.split(inputs, self.field_size, axis=1)
        n = len(inputs)
        
        if self.bilinear_type == "all":
            # (None, 1, embed_dim) x (embed_dim, embed_dim) -> (None, 1, embed_dim)
            vidots = [tf.tensordot(inputs[i], self.W, axes=(-1, 0)) for i in range(n)]
            p = [tf.multiply(vidots[i], inputs[j]) for i, j in itertools.combinations(range(n), 2)]  # Hadamard product
        elif self.bilinear_type == "each":
            vidots = [tf.tensordot(inputs[i], self.W_list[i], axes=(-1, 0)) for i in range(n - 1)]
            # With 3 fields the pairs are [(0,1), (0,2), (1,2)]: the projected
            # left field is multiplied element-wise with the raw right field.
            p = [tf.multiply(vidots[i], inputs[j]) for i, j in itertools.combinations(range(n), 2)]
        else:  # "interaction" (validated in __init__)
            # combinations(inputs, 2) walks the pairs in the same order in which
            # the per-pair matrices were created in build().
            p = [tf.multiply(tf.tensordot(v[0], w, axes=(-1, 0)), v[1])
                 for v, w in zip(itertools.combinations(inputs, 2), self.W_list)]
        
        # tf.concat also accepts a single-element list (the 2-field case),
        # which the Keras Concatenate layer used previously rejects; it also
        # avoids instantiating a new layer on every call.
        output = tf.concat(p, axis=1)
        return output

    def get_config(self):
        # Expose the constructor arguments so the layer can be serialised and re-created.
        config = super(BilinearInteraction, self).get_config()
        config.update({"bilinear_type": self.bilinear_type, "seed": self.seed})
        return config

In [16]:
def SharedBottom(dnn_feature_columns, lhuc_feature_columns, bottom_dnn_hidden_units=(256, 128), tower_dnn_hidden_units=(64, ), 
                l2_reg_embedding=0.00001, l2_reg_dnn=0, seed=2021, dnn_dropout=0, dnn_activation='relu',
                dnn_use_bn=False, task_types=('binary', 'binary'), task_names=('ctr', 'ctcvr'), bilinear_type='interaction'):
    """Build a multi-task model with a shared bottom and one tower per task.

    The shared bottom concatenates two LHUC-gated branches: one over the
    bilinear interactions of the sparse embeddings, one over the combined
    sparse + dense input. Each task then gets its own ``DNN`` tower, a
    bias-free 1-unit ``Dense`` logit and a ``PredictionLayer``.

    Args:
        dnn_feature_columns: Feature columns for the inputs / embeddings.
        lhuc_feature_columns: Feature columns whose embeddings drive the LHUC gates.
        bottom_dnn_hidden_units: Currently not used by this function
            (NOTE(review): the LHUC branches use lhuc_net's own default widths).
        tower_dnn_hidden_units: Hidden widths of every task tower.
        l2_reg_embedding: Currently not used by this function.
        l2_reg_dnn, dnn_dropout, dnn_activation, dnn_use_bn: Hyper-parameters of
            the task towers and of both LHUC branches.
        seed: Seed forwarded to both LHUC branches. The task towers keep their
            fixed seed 2022.
        task_types: Per task, ``'binary'`` or ``'regression'``.
        task_names: Per task, the name of its output layer.
        bilinear_type: Passed to ``BilinearInteraction``.

    Returns:
        A Keras ``Model`` with one output per task, in ``task_names`` order.

    Raises:
        ValueError: if ``task_types`` and ``task_names`` differ in length, or a
            task type is not ``'binary'`` / ``'regression'``.
    """
    # zip() below would silently drop the surplus entries of the longer sequence.
    if len(task_types) != len(task_names):
        raise ValueError("task_types and task_names must have the same length, got {} and {}".format(
            len(task_types), len(task_names)))
    
    # Validate the task types.
    for task_type in task_types:
        if task_type not in ['binary', 'regression']:
            raise ValueError("task must be binary or regression, {} is illegal".format(task_type))
    
    # Build the Input layers; their list form becomes the model inputs.
    input_layer_dict = build_input_layers(dnn_feature_columns)
    input_layers = list(input_layer_dict.values())
    
    # Separate sparse and dense columns, they are handled differently below.
    sparse_feature_columns = list(filter(lambda x: isinstance(x, SparseFeat), dnn_feature_columns))
    dense_feature_columns = list(filter(lambda x: isinstance(x, DenseFeat), dnn_feature_columns))
    
    # Dense inputs are used as-is.
    dnn_dense_input = [input_layer_dict[fc.name] for fc in dense_feature_columns]
    
    # Embedding layers for the feature columns.
    embedding_layer_dict = build_embedding_layers(dnn_feature_columns)
    # Per-field sparse embeddings, requested unflattened (flatten=False): the
    # list is concatenated on axis 1 for the bilinear layer further down.
    dnn_sparse_embed_input = concat_embedding_list(sparse_feature_columns, input_layer_dict, embedding_layer_dict, flatten=False)
    
    # Merge the sparse embeddings with the dense inputs ("raw information" branch).
    bias_input = combined_dnn_input(dnn_sparse_embed_input, dnn_dense_input)
    
    # Bilinear interaction over the stacked sparse embeddings.
    bilinear_out = BilinearInteraction(bilinear_type=bilinear_type)(Concatenate(axis=1)(dnn_sparse_embed_input))
    
    # Flattened embeddings of the LHUC feature columns, concatenated into one tensor.
    lhuc_input = concat_embedding_list(lhuc_feature_columns, input_layer_dict, embedding_layer_dict, flatten=True)
    lhuc_input = concat_func(lhuc_input)
    
    # Forward the DNN hyper-parameters to both LHUC branches. They used to be
    # silently ignored there; the defaults of this function equal lhuc_net's
    # defaults, so default behaviour is unchanged.
    lhuc_kwargs = dict(dnn_activation=dnn_activation, l2_reg_dnn=l2_reg_dnn, dnn_dropout=dnn_dropout,
                       dnn_use_bn=dnn_use_bn, seed=seed)
    
    # Interaction branch: flattened bilinear output gated by lhuc_input.
    bilinear_out_flatt = Flatten()(bilinear_out)
    bilinear_lhuc_out = lhuc_net("bilinear_lhuc", bilinear_out_flatt, lhuc_input, **lhuc_kwargs)
    
    # Raw-information branch: bias_input gated by lhuc_input.
    bias_lhuc_out = lhuc_net("bias_lhuc", bias_input, lhuc_input, **lhuc_kwargs)
    
    # Shared-bottom output: interaction and raw branches side by side.
    sb_out = Concatenate(axis=-1)([bilinear_lhuc_out, bias_lhuc_out])
    
    # One independent tower per task.
    task_outputs = []
    for task_type, task_name in zip(task_types, task_names):
        tower_output = DNN(tower_dnn_hidden_units, dnn_activation, l2_reg_dnn, dnn_dropout, dnn_use_bn, seed=2022, name='tower_'+task_name)(sb_out)
        logit = Dense(1, use_bias=False, activation=None)(tower_output)
        output = PredictionLayer(task_type, name=task_name)(logit)
        task_outputs.append(output)
    
    model = Model(inputs=input_layers, outputs=task_outputs)
    return model

In [17]:
# Two-task model: `duration` is a regression task, `click` a binary task.
# The task towers get no hidden layers (empty tower_dnn_hidden_units).
model = SharedBottom(
    dnn_feature_columns=dnn_features_columns,
    lhuc_feature_columns=lhuc_feature_columns,
    tower_dnn_hidden_units=[],
    task_types=['regression', 'binary'],
    task_names=['duration', 'click'],
)

In [52]:
# model.summary()

## 模型的训练和预测

In [18]:
# Adam optimiser; per-output losses keyed by output-layer name. The total loss
# weights the duration MSE by 0.02 and the click cross-entropy by 0.98.
model.compile(
    optimizer="adam",
    loss={"duration": "mean_squared_error", "click": "binary_crossentropy"},
    loss_weights={"duration": 0.02, "click": 0.98},
    metrics={"duration": "mae", "click": "binary_crossentropy"},
)

In [19]:
# Training labels for both tasks, wrapped as Keras backend constants like the inputs.
label_duration, label_click = (
    tf.keras.backend.constant(train_data[label_col].values)
    for label_col in ('duration', 'click')
)

In [28]:
# Train for 10 epochs with batch size 128; 20% of the training rows are held
# out for validation. Labels are listed in the same order as the model outputs.
history = model.fit(
    x=train_model_input,
    y=[label_duration, label_click],
    batch_size=128,
    epochs=10,
    verbose=1,
    validation_split=0.2,
)

Train on 124189 samples, validate on 31048 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [29]:
# Predict on the test inputs; pred_ans[0] is the duration output and
# pred_ans[1] the click output (same order as the model outputs).
pred_ans = model.predict(x=test_model_input, batch_size=256)

In [30]:
# ROC-AUC of the click head on the test split, rounded to 4 decimals.
click_auc = roc_auc_score(test_data['click'], pred_ans[1])
print("test click AUC", round(click_auc, 4))

test click AUC 0.6216


In [31]:
# Mean absolute error of the duration head on the test split, rounded to 4 decimals.
duration_mae = mean_absolute_error(test_data['duration'], pred_ans[0])
print("test duration", round(duration_mae, 4))

test duration 52.0987
