In [1]:
import numpy as np
import pandas as pd
from tensorflow.keras.layers import *
import tensorflow.keras.backend as K
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.utils import plot_model
%matplotlib inline

# 加载数据

In [2]:
data = pd.read_csv('../data/criteo_sampled_data.csv')

In [3]:
data.shape

(600000, 40)

In [4]:
data.head()

Unnamed: 0,label,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,0,1.0,1,5.0,0.0,1382.0,4.0,15.0,2.0,181.0,...,e5ba7672,f54016b9,21ddcdc9,b1252a9d,07b5194c,,3a171ecb,c5c50484,e8b83407,9727dd16
1,0,2.0,0,44.0,1.0,102.0,8.0,2.0,2.0,4.0,...,07c540c4,b04e4670,21ddcdc9,5840adea,60f6221e,,3a171ecb,43f13e8b,e8b83407,731c3655
2,0,2.0,0,1.0,14.0,767.0,89.0,4.0,2.0,245.0,...,8efede7f,3412118d,,,e587c466,ad3062eb,3a171ecb,3b183c5c,,
3,0,,893,,,4392.0,,0.0,0.0,0.0,...,1e88c74f,74ef3502,,,6b3a5ca6,,3a171ecb,9117a34a,,
4,0,3.0,-1,,0.0,2.0,0.0,3.0,0.0,0.0,...,1e88c74f,26b3c7a7,,,21c9516a,,32c7478e,b34f3128,,


In [5]:
cols = data.columns.values
print(cols)

['label' 'I1' 'I2' 'I3' 'I4' 'I5' 'I6' 'I7' 'I8' 'I9' 'I10' 'I11' 'I12'
 'I13' 'C1' 'C2' 'C3' 'C4' 'C5' 'C6' 'C7' 'C8' 'C9' 'C10' 'C11' 'C12'
 'C13' 'C14' 'C15' 'C16' 'C17' 'C18' 'C19' 'C20' 'C21' 'C22' 'C23' 'C24'
 'C25' 'C26']


# 数据预处理

## 定义特征组

In [6]:
# Split the columns by the first letter of their name: "I*" columns form the
# dense feature group and "C*" columns the sparse one ('label' matches neither).
dense_feats = [name for name in cols if name[0] == "I"]
sparse_feats = [name for name in cols if name[0] == "C"]
print(dense_feats, sparse_feats, sep="\n")

['I1', 'I2', 'I3', 'I4', 'I5', 'I6', 'I7', 'I8', 'I9', 'I10', 'I11', 'I12', 'I13']
['C1', 'C2', 'C3', 'C4', 'C5', 'C6', 'C7', 'C8', 'C9', 'C10', 'C11', 'C12', 'C13', 'C14', 'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21', 'C22', 'C23', 'C24', 'C25', 'C26']


## 处理dense特征

In [7]:
def process_dense_feats(data, feats):
    """Fill missing values and log-scale the dense (numeric) feature columns.

    Missing values become 0.0. Every value ``x > -1`` is mapped to
    ``log(x + 1)``; any value ``<= -1`` is mapped to the sentinel ``-1``,
    because the logarithm is undefined there.

    Args:
        data: DataFrame that contains at least the columns in ``feats``.
            It is not modified.
        feats: List of numeric column names to transform.

    Returns:
        A new float DataFrame holding only the ``feats`` columns, with the
        same index as ``data``.
    """
    # Selecting with a list of columns already yields a new frame, so no
    # explicit copy of the (much wider) input is needed.
    filled = data[feats].fillna(0.0)
    values = filled.to_numpy(dtype=float)

    # Vectorised over the whole block instead of a per-element Python lambda.
    # The log is evaluated only where it is defined, so no NaN/-inf warnings.
    in_domain = values > -1
    transformed = np.full(values.shape, -1.0)
    transformed[in_domain] = np.log(values[in_domain] + 1)

    return pd.DataFrame(transformed, index=filled.index, columns=filled.columns)

In [8]:
data_dense = process_dense_feats(data, dense_feats)

In [9]:
data_dense.head()

Unnamed: 0,I1,I2,I3,I4,I5,I6,I7,I8,I9,I10,I11,I12,I13
0,0.693147,0.693147,1.791759,0.0,7.23201,1.609438,2.772589,1.098612,5.204007,0.693147,1.098612,0.0,1.098612
1,1.098612,0.0,3.806662,0.693147,4.634729,2.197225,1.098612,1.098612,1.609438,0.693147,0.693147,0.0,1.609438
2,1.098612,0.0,0.693147,2.70805,6.64379,4.49981,1.609438,1.098612,5.505332,0.693147,1.386294,1.386294,3.828641
3,0.0,6.795706,0.0,0.0,8.387768,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,1.386294,-1.0,0.0,0.0,1.098612,0.0,1.386294,0.0,0.0,0.693147,0.693147,0.0,0.0


## 处理sparse特征

In [10]:
from sklearn.preprocessing import LabelEncoder

In [11]:
def process_sparse_feats(data, feats):
    """Integer-encode the sparse (categorical) feature columns.

    Missing values are replaced by the string "-1" so that they form a
    category of their own; every column is then label-encoded independently
    with a fresh ``LabelEncoder``.

    Returns a new DataFrame holding only the ``feats`` columns; ``data`` is
    left untouched.
    """
    encoded = data[feats].fillna("-1")
    for column in feats:
        encoded[column] = LabelEncoder().fit_transform(encoded[column])
    return encoded

In [12]:
data_sparse = process_sparse_feats(data, sparse_feats)
data_sparse.head()

Unnamed: 0,C1,C2,C3,C4,C5,C6,C7,C8,C9,C10,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,470,261,203952,41641,38,6,8961,63,2,16515,...,9,3439,213,3,4954,0,3,24768,52,14364
1,470,498,90258,22218,38,13,5957,19,2,4195,...,0,2465,213,1,60664,0,3,8432,52,10835
2,170,24,2223,65253,38,6,8067,19,2,5767,...,6,738,0,0,143786,9,3,7344,0,0
3,470,93,137623,15635,38,13,1935,19,2,23623,...,1,1648,0,0,67107,0,3,18107,0,0
4,612,368,162265,83638,38,2,7067,19,2,8071,...,1,556,0,0,21257,0,2,22439,0,0


In [13]:
total_data = pd.concat([data_dense, data_sparse], axis=1)

In [14]:
total_data['label'] = data['label']

In [15]:
total_data.head()

Unnamed: 0,I1,I2,I3,I4,I5,I6,I7,I8,I9,I10,...,C18,C19,C20,C21,C22,C23,C24,C25,C26,label
0,0.693147,0.693147,1.791759,0.0,7.23201,1.609438,2.772589,1.098612,5.204007,0.693147,...,3439,213,3,4954,0,3,24768,52,14364,0
1,1.098612,0.0,3.806662,0.693147,4.634729,2.197225,1.098612,1.098612,1.609438,0.693147,...,2465,213,1,60664,0,3,8432,52,10835,0
2,1.098612,0.0,0.693147,2.70805,6.64379,4.49981,1.609438,1.098612,5.505332,0.693147,...,738,0,0,143786,9,3,7344,0,0,0
3,0.0,6.795706,0.0,0.0,8.387768,0.0,0.0,0.0,0.0,0.0,...,1648,0,0,67107,0,3,18107,0,0,0
4,1.386294,-1.0,0.0,0.0,1.098612,0.0,1.386294,0.0,0.0,0.693147,...,556,0,0,21257,0,2,22439,0,0,0


# 模型构建与训练

## 一阶特征

### dense特征

In [16]:
# One scalar Input per dense feature, named after its column.
dense_inputs = [Input(shape=(1,), name=feat) for feat in dense_feats]
# Concatenate the scalar inputs into one tensor so a Dense layer can consume them.
concat_dense_inputs = Concatenate(axis=1)(dense_inputs)  # ?, 13
# A single-unit Dense layer acts as a learned weighted sum of the dense values.
fst_order_dense_layer = Dense(1)(concat_dense_inputs)  # ?, 1

In [17]:
dense_inputs

[<tf.Tensor 'I1:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I2:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I3:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I4:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I5:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I6:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I7:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I8:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I9:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I10:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I11:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I12:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I13:0' shape=(None, 1) dtype=float32>]

In [19]:
concat_dense_inputs

<tf.Tensor 'concatenate/concat:0' shape=(None, 13) dtype=float32>

In [21]:
fst_order_dense_layer

<tf.Tensor 'dense/BiasAdd:0' shape=(None, 1) dtype=float32>

### sparse特征

In [22]:
# Build a separate Input for every sparse feature so that the same inputs
# can be reused later when constructing the second-order cross features.
sparse_inputs = []
for f in sparse_feats:
    _input = Input([1], name=f)
    sparse_inputs.append(_input)
    
# First-order weights: a 1-dimensional embedding per sparse feature, i.e. one
# scalar weight w_i for every encoded category id.
sparse_1d_embed = []
for i, _input in enumerate(sparse_inputs):
    f = sparse_feats[i]
    # Embedding table size = number of distinct encoded ids in this column.
    voc_size = total_data[f].nunique()
    # Use L2 regularisation on the embedding weights to limit overfitting.
    reg = tf.keras.regularizers.l2(0.5)
    _embed = Embedding(voc_size, 1, embeddings_regularizer=reg)(_input)
    # The Embedding output is (?, 1, 1); Flatten drops the extra axis so each
    # per-feature weight is (?, 1) and can be summed or fed to a Dense layer.
    _embed = Flatten()(_embed)
    sparse_1d_embed.append(_embed)
# Sum the looked-up weights w_i over all sparse features.
fst_order_sparse_layer = Add()(sparse_1d_embed)
fst_order_sparse_layer

<tf.Tensor 'add/add_24:0' shape=(None, 1) dtype=float32>

In [25]:
sparse_inputs

[<tf.Tensor 'C1:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'C2:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'C3:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'C4:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'C5:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'C6:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'C7:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'C8:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'C9:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'C10:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'C11:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'C12:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'C13:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'C14:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'C15:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'C16:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'C17:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'C18:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'C19:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'C20:0' s

In [23]:
sparse_1d_embed

[<tf.Tensor 'flatten/Reshape:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'flatten_1/Reshape:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'flatten_2/Reshape:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'flatten_3/Reshape:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'flatten_4/Reshape:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'flatten_5/Reshape:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'flatten_6/Reshape:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'flatten_7/Reshape:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'flatten_8/Reshape:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'flatten_9/Reshape:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'flatten_10/Reshape:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'flatten_11/Reshape:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'flatten_12/Reshape:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'flatten_13/Reshape:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'flatten_14/Reshape:0' shape=(None, 1) dtype=float32>,
 <tf.Te

In [24]:
fst_order_sparse_layer

<tf.Tensor 'add/add_24:0' shape=(None, 1) dtype=float32>

### Linear部分合并 

In [24]:
# First-order (linear) term: dense weighted sum plus the summed sparse weights.
linear_part = Add()([fst_order_dense_layer, fst_order_sparse_layer])
linear_part

<tf.Tensor 'add_1/add:0' shape=(None, 1) dtype=float32>

## 二阶特征

In [27]:
# embedding size
k = 8

# Only the sparse features take part in the second-order crosses.
# Each sparse feature gets its own k-dimensional embedding table.
sparse_kd_embed = []
for i, _input in enumerate(sparse_inputs):
    f = sparse_feats[i]
    # Embedding table size = number of distinct encoded ids in this column.
    voc_size = total_data[f].nunique()
    reg = tf.keras.regularizers.l2(0.7)
    _embed = Embedding(voc_size, k, embeddings_regularizer=reg)(_input)  # ?, 1, k
    sparse_kd_embed.append(_embed)
sparse_kd_embed

[<tf.Tensor 'embedding_52/embedding_lookup/Identity_1:0' shape=(None, 1, 8) dtype=float32>,
 <tf.Tensor 'embedding_53/embedding_lookup/Identity_1:0' shape=(None, 1, 8) dtype=float32>,
 <tf.Tensor 'embedding_54/embedding_lookup/Identity_1:0' shape=(None, 1, 8) dtype=float32>,
 <tf.Tensor 'embedding_55/embedding_lookup/Identity_1:0' shape=(None, 1, 8) dtype=float32>,
 <tf.Tensor 'embedding_56/embedding_lookup/Identity_1:0' shape=(None, 1, 8) dtype=float32>,
 <tf.Tensor 'embedding_57/embedding_lookup/Identity_1:0' shape=(None, 1, 8) dtype=float32>,
 <tf.Tensor 'embedding_58/embedding_lookup/Identity_1:0' shape=(None, 1, 8) dtype=float32>,
 <tf.Tensor 'embedding_59/embedding_lookup/Identity_1:0' shape=(None, 1, 8) dtype=float32>,
 <tf.Tensor 'embedding_60/embedding_lookup/Identity_1:0' shape=(None, 1, 8) dtype=float32>,
 <tf.Tensor 'embedding_61/embedding_lookup/Identity_1:0' shape=(None, 1, 8) dtype=float32>,
 <tf.Tensor 'embedding_62/embedding_lookup/Identity_1:0' shape=(None, 1, 8) dtyp

In [28]:
# 1. Concatenate all sparse embeddings along axis 1, giving an (n, k) matrix per
#    sample, where n is the number of sparse features and k the embedding size.
concat_sparse_kd_embed = Concatenate(axis=1)(sparse_kd_embed)  # ?, n, k
concat_sparse_kd_embed

<tf.Tensor 'concatenate_1/concat:0' shape=(None, 26, 8) dtype=float32>

In [29]:
# 2. Square of the sum: sum the n embeddings first, then square element-wise.
sum_kd_embed = Lambda(lambda x: K.sum(x, axis=1))(concat_sparse_kd_embed)  # ?, k
square_sum_kd_embed = Multiply()([sum_kd_embed, sum_kd_embed])  # ?, k
square_sum_kd_embed

<tf.Tensor 'multiply/mul:0' shape=(None, 8) dtype=float32>

In [30]:
# 3. Sum of squares: square every embedding element-wise first, then sum over the n features.
square_kd_embed = Multiply()([concat_sparse_kd_embed, concat_sparse_kd_embed]) # ?, n, k
sum_square_kd_embed = Lambda(lambda x: K.sum(x, axis=1))(square_kd_embed)  # ?, k
sum_square_kd_embed

<tf.Tensor 'lambda_1/Sum:0' shape=(None, 8) dtype=float32>

In [31]:
# 4. Subtract and halve: 0.5 * ((sum v)^2 - sum(v^2)) equals the sum of all
#    pairwise products v_i * v_j (i < j) within each embedding dimension.
sub = Subtract()([square_sum_kd_embed, sum_square_kd_embed])  # ?, k
sub = Lambda(lambda x: x*0.5)(sub)  # ?, k
# Collapse the k dimensions into a single second-order value per sample.
snd_order_sparse_layer = Lambda(lambda x: K.sum(x, axis=1, keepdims=True))(sub)  # ?, 1
snd_order_sparse_layer

<tf.Tensor 'lambda_3/Sum:0' shape=(None, 1) dtype=float32>

## DNN部分

In [31]:
# DNN input: flatten the (n, k) embedding matrix into one n*k vector per sample.
# This is the same embedding tensor that feeds the second-order term.
flatten_sparse_embed = Flatten()(concat_sparse_kd_embed)  # ?, n*k
flatten_sparse_embed

<tf.Tensor 'flatten_26/Reshape:0' shape=(None, 208) dtype=float32>

In [32]:
# Three 256-unit ReLU layers, each followed by Dropout with a decreasing rate.
fc_layer = flatten_sparse_embed
for drop_rate in (0.5, 0.3, 0.1):
    # Instantiate Dropout before Dense, mirroring the nested-call evaluation order.
    dropout = Dropout(drop_rate)
    fc_layer = dropout(Dense(256, activation='relu')(fc_layer))  # ?, 256

In [33]:
# Project the last hidden layer to a single value (no activation here; the
# sigmoid is applied after the three model parts are summed).
fc_layer_output = Dense(1)(fc_layer)  # ?, 1
fc_layer_output

<tf.Tensor 'dense_4/BiasAdd:0' shape=(None, 1) dtype=float32>

## 输出结果

In [34]:
# Sum the three parts (first-order linear, second-order FM, DNN) and squash
# the result with a sigmoid so the model outputs a value in (0, 1).
output_layer = Add()([linear_part, snd_order_sparse_layer, fc_layer_output])
output_layer = Activation("sigmoid")(output_layer)
output_layer

<tf.Tensor 'activation/Sigmoid:0' shape=(None, 1) dtype=float32>

## 编译模型

In [35]:
dense_inputs

[<tf.Tensor 'I1:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I2:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I3:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I4:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I5:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I6:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I7:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I8:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I9:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I10:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I11:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I12:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I13:0' shape=(None, 1) dtype=float32>]

In [36]:
sparse_inputs

[<tf.Tensor 'C1:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'C2:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'C3:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'C4:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'C5:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'C6:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'C7:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'C8:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'C9:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'C10:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'C11:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'C12:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'C13:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'C14:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'C15:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'C16:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'C17:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'C18:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'C19:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'C20:0' s

In [37]:
# Inputs are the dense Inputs followed by the sparse Inputs; the arrays passed
# to fit() below are concatenated in the same dense-then-sparse order.
model = Model(dense_inputs+sparse_inputs, output_layer)

In [38]:
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
C1 (InputLayer)                 [(None, 1)]          0                                            
__________________________________________________________________________________________________
C2 (InputLayer)                 [(None, 1)]          0                                            
__________________________________________________________________________________________________
C3 (InputLayer)                 [(None, 1)]          0                                            
__________________________________________________________________________________________________
C4 (InputLayer)                 [(None, 1)]          0                                            
_______________________________________________________________________________________

In [39]:
# Optimise binary cross-entropy with Adam. Cross-entropy is also tracked as a
# metric next to AUC: the reported `loss` additionally includes the embedding
# L2 penalties, whereas the metric is the plain cross-entropy.
model.compile(optimizer="adam", 
              loss="binary_crossentropy", 
              metrics=["binary_crossentropy", tf.keras.metrics.AUC(name='auc')])

## 训练

In [40]:
# Hold-out split by row position: the first 500,000 rows are used for
# training and all remaining rows for validation.
# Positional slicing (`iloc`) is used instead of label slicing (`loc`) so the
# split does not silently depend on the frame having a default 0..N-1 index.
n_train = 500000
train_data = total_data.iloc[:n_train]
valid_data = total_data.iloc[n_train:]

In [41]:
train_data.head()

Unnamed: 0,I1,I2,I3,I4,I5,I6,I7,I8,I9,I10,...,C18,C19,C20,C21,C22,C23,C24,C25,C26,label
0,0.693147,0.693147,1.791759,0.0,7.23201,1.609438,2.772589,1.098612,5.204007,0.693147,...,3439,213,3,4954,0,3,24768,52,14364,0
1,1.098612,0.0,3.806662,0.693147,4.634729,2.197225,1.098612,1.098612,1.609438,0.693147,...,2465,213,1,60664,0,3,8432,52,10835,0
2,1.098612,0.0,0.693147,2.70805,6.64379,4.49981,1.609438,1.098612,5.505332,0.693147,...,738,0,0,143786,9,3,7344,0,0,0
3,0.0,6.795706,0.0,0.0,8.387768,0.0,0.0,0.0,0.0,0.0,...,1648,0,0,67107,0,3,18107,0,0,0
4,1.386294,-1.0,0.0,0.0,1.098612,0.0,1.386294,0.0,0.0,0.693147,...,556,0,0,21257,0,2,22439,0,0,0


In [42]:
# One 1-D array per feature column, in the same order as the feature lists
# that were used to create the model's Input layers.
train_dense_x = [train_data[f].values for f in dense_feats]
train_sparse_x = [train_data[f].values for f in sparse_feats]

In [43]:
train_dense_x[:1]

[array([0.69314718, 1.09861229, 1.09861229, ..., 0.        , 0.69314718,
        0.        ])]

In [44]:
# Training targets: the label column as an array, wrapped in a one-element list.
train_label = [train_data['label'].values]
train_label

[array([0, 0, 0, ..., 0, 0, 1])]

In [45]:
# Validation inputs, built the same way as the training inputs:
# one 1-D array per feature column, ordered like the feature lists.
val_dense_x = [valid_data[f].values for f in dense_feats]
val_sparse_x = [valid_data[f].values for f in sparse_feats]

In [46]:
# Validation targets, wrapped in a one-element list like the training targets.
val_label = [valid_data['label'].values]

In [47]:
# Train for 5 epochs with mini-batches of 256 rows. The input list is the
# dense arrays followed by the sparse arrays, matching the order given to
# Model(...). The held-out rows are passed as validation data.
model.fit(train_dense_x+train_sparse_x, 
          train_label, epochs=5, batch_size=256,
          validation_data=(val_dense_x+val_sparse_x, val_label),
         )

Epoch 1/5


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f6d67c43b50>