# 一、数据包导入及数据查看

In [1]:
import numpy as np
import pandas as pd
from tensorflow.keras.layers import *
import tensorflow.keras.backend as K
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.utils import plot_model
%matplotlib inline

In [2]:
# Read the raw sample table from disk, then display its (rows, columns) shape.
csv_path = './data/random_data.csv'
data = pd.read_csv(csv_path)
data.shape

(1500000, 17)

In [3]:
# Preview the first five rows (5 is also head()'s default).
data.head(n=5)

Unnamed: 0,I1,I2,I3,I4,I5,I6,I7,I8,I9,I10,I11,I12,I13,I14,C1,C2,label
0,4110010518477440242,1896312013845627886,6717084795003240529,1080695977093114212,4111614179005464985,8798042721559307282,5895132605950582744,6084180829892317120,7697383445739955609,4884422954281974949,8336763477901001868,6242285785126424193,7900538574498788796,134740763003655426,128.0,2996580352,0
1,8549798872825571966,1896312013845627886,5931133767701417382,1080695977093114212,6220860133300853649,8798042721559307282,7906599642295772887,8074429082947521887,2016335991759856420,8321822893668424639,6374970459031672275,2638981520939704579,1276775102817461451,7804158046819701007,0.0,3960238080,0
2,4110010518477440242,1896312013845627886,6738940029233988884,1080695977093114212,4111614179005464985,4661809481046063301,4860738004335111677,9151208213879479282,287166139677673559,3794272402836699782,4828615104526241983,6674460745934457691,1276775102817461451,134740763003655426,102.0,5837828096,1
3,754307468715170673,1896312013845627886,4744813893973488598,1080695977093114212,4111614179005464985,8798042721559307282,3365621371127355288,8074429082947521887,8049740328582698059,9094510792016618155,1867504036779671592,3042470253253863991,1276775102817461451,134740763003655426,92.0,3950256128,0
4,2116136986973194273,1896312013845627886,7734665466258756703,1080695977093114212,4417047100345135239,8798042721559307282,5895132605950582744,4821330781467424349,5661661104604751783,3794272402836699782,7803311518978882459,4436990931532908761,1276775102817461451,134740763003655426,46.0,7796359168,0


In [4]:
# Column labels as a NumPy array; the feature groups below are picked from it.
cols = data.columns.to_numpy()

# 二、数据预处理

## 2.1 定义特征组

In [5]:
# Columns whose name starts with "I" are treated as dense (numeric) features,
# those starting with "C" as sparse (categorical) features.
dense_feats = [name for name in cols if name[0] == "I"]
sparse_feats = [name for name in cols if name[0] == "C"]

## 2.2 处理dense特征

In [6]:
def process_dense_feats(data, feats):
    """Log-scale the dense feature columns.

    Missing values are filled with 0.0 first. Every value x > -1 is then
    replaced by log(1 + x); every other value is replaced by -1.

    Args:
        data: DataFrame containing at least the columns named in ``feats``.
            It is not modified.
        feats: list of dense column names to transform.

    Returns:
        A new DataFrame holding only the ``feats`` columns, transformed.
    """
    # Selecting with a list already produces a new frame, so the input frame
    # does not need to be copied in full beforehand.
    d = data[feats].fillna(0.0)
    for f in feats:
        # Work in float64 so very large integer values cannot overflow on +1.
        col = d[f].astype("float64")
        valid = col > -1
        # Vectorized replacement for the per-row Python lambda; invalid entries
        # are masked to 0 before the log so no warning/NaN is produced for them.
        d[f] = np.where(valid, np.log1p(col.where(valid, 0.0)), -1.0)

    return d

# Apply the dense-feature transform to every "I*" column.
data_dense = process_dense_feats(data=data, feats=dense_feats)

## 2.3 处理sparse特征

In [7]:
from sklearn.preprocessing import LabelEncoder

In [8]:
def process_sparse_feats(data, feats):
    """Integer-encode the sparse (categorical) feature columns.

    Each column is converted to text, missing values are mapped to the
    sentinel category "-1", and the resulting strings are label-encoded
    to consecutive integer ids starting at 0.

    Args:
        data: DataFrame containing at least the columns named in ``feats``.
            It is not modified.
        feats: list of sparse column names to encode.

    Returns:
        A new DataFrame holding only the ``feats`` columns, integer-encoded.
    """
    # Selecting with a list yields a new frame; copy() makes the independence
    # from ``data`` explicit without duplicating unrelated columns.
    d = data[feats].copy()
    for f in feats:
        col = d[f]
        # Encode on uniform strings: filling a numeric column with the string
        # "-1" would leave mixed str/number values, which LabelEncoder rejects.
        as_text = col.astype(str).mask(col.isna(), "-1")
        label_encoder = LabelEncoder()
        d[f] = label_encoder.fit_transform(as_text)

    return d

In [9]:
# Encode the sparse columns, then place dense and sparse features side by
# side and append the target column.
data_sparse = process_sparse_feats(data=data, feats=sparse_feats)
total_data = pd.concat([data_dense, data_sparse], axis="columns")
total_data = total_data.assign(label=data['label'])

# 三、模型构建与训练
## 3.1 一阶特征
### 1）dense特征

In [10]:
# One scalar Keras Input per dense feature, named after its column.
dense_inputs = [Input(shape=(1,), name=feat_name) for feat_name in dense_feats]
# Join the inputs side by side so a single Dense layer can consume them.
concat_dense_inputs = Concatenate(axis=1)(dense_inputs)  # (batch, len(dense_feats))
# A 1-unit Dense layer is a learned weighted sum of the dense features.
fst_order_dense_layer = Dense(units=1)(concat_dense_inputs)  # (batch, 1)

### 2）sparse特征

In [11]:
# Every sparse feature gets its own Input so that the second-order
# interaction terms can be built per feature later on.
sparse_inputs = [Input(shape=(1,), name=feat_name) for feat_name in sparse_feats]

# First-order term: a 1-dimensional embedding per category plays the role of
# that category's scalar weight w_i.
sparse_1d_embed = []
for feat_name, feat_input in zip(sparse_feats, sparse_inputs):
    n_categories = total_data[feat_name].nunique()
    weight_lookup = Embedding(
        input_dim=n_categories,
        output_dim=1,
        # L2 penalty on the embedding table to limit overfitting
        embeddings_regularizer=tf.keras.regularizers.l2(0.5),
    )(feat_input)
    # The lookup result is (batch, 1, 1); flatten it to (batch, 1) so the
    # per-feature weights can be added together.
    sparse_1d_embed.append(Flatten()(weight_lookup))
# Sum the looked-up weights w_i over all sparse features.
fst_order_sparse_layer = Add()(sparse_1d_embed)

### 3）Linear部分合并

In [12]:
# Linear (first-order) part = dense weighted sum + sparse weight sum.
first_order_terms = [fst_order_dense_layer, fst_order_sparse_layer]
linear_part = Add()(first_order_terms)

## 3.2 二阶特征

In [13]:
# Dimension of the latent vectors used for pairwise interactions.
k = 8

# Second-order crosses are built from the sparse features only: each
# feature is mapped to a k-dimensional, L2-regularized embedding.
sparse_kd_embed = []
for feat_name, feat_input in zip(sparse_feats, sparse_inputs):
    n_categories = total_data[feat_name].nunique()
    latent_vec = Embedding(
        input_dim=n_categories,
        output_dim=k,
        embeddings_regularizer=tf.keras.regularizers.l2(0.7),
    )(feat_input)  # (batch, 1, k)
    sparse_kd_embed.append(latent_vec)

In [14]:
# Step 1: stack the per-feature embeddings along axis 1, giving an (n, k)
# matrix per sample (n = number of sparse features, k = embedding size).
stack_features = Concatenate(axis=1)
concat_sparse_kd_embed = stack_features(sparse_kd_embed)  # (batch, n, k)

In [15]:
# Step 2: sum over the feature axis first, then square element-wise.
sum_over_features = Lambda(lambda emb: K.sum(emb, axis=1))
sum_kd_embed = sum_over_features(concat_sparse_kd_embed)  # (batch, k)
square_sum_kd_embed = Multiply()([sum_kd_embed] * 2)  # (batch, k)

In [16]:
# Step 3: square element-wise first, then sum over the feature axis.
square_kd_embed = Multiply()([concat_sparse_kd_embed] * 2)  # (batch, n, k)
sum_squares = Lambda(lambda emb: K.sum(emb, axis=1))
sum_square_kd_embed = sum_squares(square_kd_embed)  # (batch, k)

In [17]:
# Step 4: (square-of-sum - sum-of-squares) / 2, then reduce over the k
# embedding dimensions to get one interaction score per sample.
sub = Subtract()([square_sum_kd_embed, sum_square_kd_embed])  # (batch, k)
halve = Lambda(lambda t: t * 0.5)
sub = halve(sub)  # (batch, k)
reduce_to_scalar = Lambda(lambda t: K.sum(t, axis=1, keepdims=True))
snd_order_sparse_layer = reduce_to_scalar(sub)  # (batch, 1)

## 3.3 DNN部分

In [18]:
# The deep part consumes all k-dim sparse embeddings as one flat vector.
flatten_layer = Flatten()
flatten_sparse_embed = flatten_layer(concat_sparse_kd_embed)  # (batch, n*k)

In [19]:
# Three 256-unit ReLU layers, each followed by dropout with a decreasing
# rate (0.5 -> 0.3 -> 0.1).
fc_layer = flatten_sparse_embed
for drop_rate in (0.5, 0.3, 0.1):
    dropout = Dropout(drop_rate)
    hidden = Dense(256, activation='relu')(fc_layer)
    fc_layer = dropout(hidden)  # (batch, 256)

In [20]:
# Project the last hidden layer to a single (pre-sigmoid) score.
dnn_head = Dense(units=1)
fc_layer_output = dnn_head(fc_layer)  # (batch, 1)

## 3.4 输出结果

In [21]:
# Final score = linear part + pairwise-interaction part + DNN part,
# squashed to a probability with a sigmoid.
summed_score = Add()([linear_part, snd_order_sparse_layer, fc_layer_output])
output_layer = Activation("sigmoid")(summed_score)

## 3.5 模型编译

In [22]:
# Inputs are ordered dense first, then sparse; the arrays passed to fit()
# must follow the same order.
model = Model(inputs=dense_inputs + sparse_inputs, outputs=output_layer)

In [23]:
# Print the layer-by-layer table (output shapes, parameter counts, connections).
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
C1 (InputLayer)                 [(None, 1)]          0                                            
__________________________________________________________________________________________________
C2 (InputLayer)                 [(None, 1)]          0                                            
__________________________________________________________________________________________________
embedding_2 (Embedding)         (None, 1, 8)         30184       C1[0][0]                         
__________________________________________________________________________________________________
embedding_3 (Embedding)         (None, 1, 8)         169272      C2[0][0]                         
______________________________________________________________________________________________

In [24]:
# Binary classification: optimize log loss with Adam and also report
# log loss and AUC as metrics.
auc_metric = tf.keras.metrics.AUC(name='auc')
model.compile(
    loss="binary_crossentropy",
    optimizer="adam",
    metrics=["binary_crossentropy", auc_metric],
)

## 3.6 模型训练

In [25]:
# Sequential hold-out split by row label. .loc slicing is inclusive on both
# ends, hence the "- 1" on the training upper bound.
split_label = 1200000
train_data = total_data.loc[:split_label - 1]
valid_data = total_data.loc[split_label:]

In [26]:
# One array per model input, in the same order as the Input layers.
train_dense_x = [train_data[col].to_numpy() for col in dense_feats]
train_sparse_x = [train_data[col].to_numpy() for col in sparse_feats]

In [27]:
# Targets wrapped in a one-element list (one entry for the single model output).
train_label = [train_data['label'].to_numpy()]

In [28]:
# Validation arrays, built the same way as the training arrays.
val_dense_x = [valid_data[col].to_numpy() for col in dense_feats]
val_sparse_x = [valid_data[col].to_numpy() for col in sparse_feats]

In [29]:
# Validation targets, wrapped in a one-element list like train_label.
val_label = [valid_data['label'].to_numpy()]

In [30]:
# Train for 5 epochs with mini-batches of 256 and evaluate on the hold-out
# set after every epoch. Inputs are passed dense-first, matching the Model.
model.fit(
    x=train_dense_x + train_sparse_x,
    y=train_label,
    batch_size=256,
    epochs=5,
    validation_data=(val_dense_x + val_sparse_x, val_label),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x242912c20d0>