In [1]:
import numpy as np
import pandas as pd
from tensorflow.keras.models import Model
from tensorflow.keras.layers import *
import tensorflow.keras.backend as K
import matplotlib.pyplot as plt
%matplotlib inline

# 加载数据

In [2]:
data = pd.read_csv('criteo_sampled_data.csv')

In [3]:
data.head()

Unnamed: 0,label,I1,I2,I3,I4,I5,I6,I7,I8,I9,...,C17,C18,C19,C20,C21,C22,C23,C24,C25,C26
0,0,1.0,1,5.0,0.0,1382.0,4.0,15.0,2.0,181.0,...,e5ba7672,f54016b9,21ddcdc9,b1252a9d,07b5194c,,3a171ecb,c5c50484,e8b83407,9727dd16
1,0,2.0,0,44.0,1.0,102.0,8.0,2.0,2.0,4.0,...,07c540c4,b04e4670,21ddcdc9,5840adea,60f6221e,,3a171ecb,43f13e8b,e8b83407,731c3655
2,0,2.0,0,1.0,14.0,767.0,89.0,4.0,2.0,245.0,...,8efede7f,3412118d,,,e587c466,ad3062eb,3a171ecb,3b183c5c,,
3,0,,893,,,4392.0,,0.0,0.0,0.0,...,1e88c74f,74ef3502,,,6b3a5ca6,,3a171ecb,9117a34a,,
4,0,3.0,-1,,0.0,2.0,0.0,3.0,0.0,0.0,...,1e88c74f,26b3c7a7,,,21c9516a,,32c7478e,b34f3128,,


In [4]:
cols = data.columns.values

# 数据预处理

## 定义特征组

In [5]:
# Criteo naming convention: integer (dense) columns are I1..I13 and
# categorical (sparse) columns are C1..C26, so split on the leading letter.
dense_feats = [name for name in cols if name.startswith("I")]
sparse_feats = [name for name in cols if name.startswith("C")]

## 处理dense特征

In [6]:
def process_dense_feats(data, feats):
    """
    Return a log-scaled copy of the dense (numeric) feature columns.

    Missing values are filled with 0.0 first. Every value ``x > -1`` is then
    replaced by ``log(x + 1)``; values ``<= -1`` are replaced by the sentinel
    -1, because the logarithm is not defined for them.

    @param data: DataFrame that contains at least the columns in ``feats``
    @param feats: list of numeric column names to transform
    @return: new float64 DataFrame holding only ``feats``, indexed like
             ``data``; ``data`` itself is not modified
    """
    # Selecting the columns already yields a new frame, so no full-frame copy
    # of ``data`` is needed.
    selected = data[feats].fillna(0.0)
    values = selected.to_numpy(dtype="float64")

    # Vectorised equivalent of ``log(x + 1) if x > -1 else -1``: only the
    # entries inside the log's domain are passed to np.log, so no warnings
    # are raised for the others.
    in_domain = values > -1
    scaled = np.full(values.shape, -1.0)
    scaled[in_domain] = np.log(values[in_domain] + 1)

    return pd.DataFrame(scaled, index=selected.index, columns=selected.columns)

In [7]:
data_dense = process_dense_feats(data, dense_feats)

## 处理sparse特征

In [8]:
from sklearn.preprocessing import LabelEncoder

In [9]:
def process_sparse_feats(data, feats):
    """
    Integer-encode the sparse (categorical) feature columns.

    Missing values are replaced by the string "-1", then every column is
    encoded independently with its own LabelEncoder.

    @param data: DataFrame that contains at least the columns in ``feats``
    @param feats: list of categorical column names to encode
    @return: new DataFrame holding only ``feats`` with integer codes;
             ``data`` itself is not modified
    """
    encoded = data[feats].fillna("-1")
    for col in feats:
        # A fresh encoder per column: codes are only meaningful within a column.
        encoded[col] = LabelEncoder().fit_transform(encoded[col])
    return encoded

In [10]:
data_sparse = process_sparse_feats(data, sparse_feats)

In [11]:
total_data = pd.concat([data_dense, data_sparse], axis=1)

In [12]:
total_data['label'] = data['label']

# 模型构建与训练

## 输入层

### dense特征

In [13]:
import tensorflow as tf

In [14]:
# One scalar Keras input per dense feature, named after its column.
dense_inputs = [Input([1], name=feat_name) for feat_name in dense_feats]

In [15]:
dense_inputs

[<tf.Tensor 'I1:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I2:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I3:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I4:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I5:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I6:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I7:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I8:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I9:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I10:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I11:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I12:0' shape=(None, 1) dtype=float32>,
 <tf.Tensor 'I13:0' shape=(None, 1) dtype=float32>]

In [16]:
concat_dense_inputs = Concatenate(axis=1)(dense_inputs)

In [17]:
concat_dense_inputs

<tf.Tensor 'concatenate/Identity:0' shape=(None, 13) dtype=float32>

### sparse特征

In [18]:
# One scalar Keras input per sparse feature, named after its column.
sparse_inputs = [Input([1], name=feat_name) for feat_name in sparse_feats]

In [19]:
k = 8

In [20]:
# Build a k-dimensional embedding for every sparse feature.
# sparse_inputs was created from sparse_feats in the same order, so the two
# lists are paired directly instead of recovering the name from the tensor.
sparse_kd_embed = []
for feat_name, feat_input in zip(sparse_feats, sparse_inputs):
    # Vocabulary size = number of distinct encoded ids in this column.
    voc_size = total_data[feat_name].nunique()
    flatten = Flatten()
    embedding = Embedding(voc_size, k, embeddings_regularizer=tf.keras.regularizers.l2(0.7))
    # Embedding output is (batch, 1, k); flatten it to (batch, k).
    sparse_kd_embed.append(flatten(embedding(feat_input)))

In [21]:
sparse_kd_embed

[<tf.Tensor 'flatten/Identity:0' shape=(None, 8) dtype=float32>,
 <tf.Tensor 'flatten_1/Identity:0' shape=(None, 8) dtype=float32>,
 <tf.Tensor 'flatten_2/Identity:0' shape=(None, 8) dtype=float32>,
 <tf.Tensor 'flatten_3/Identity:0' shape=(None, 8) dtype=float32>,
 <tf.Tensor 'flatten_4/Identity:0' shape=(None, 8) dtype=float32>,
 <tf.Tensor 'flatten_5/Identity:0' shape=(None, 8) dtype=float32>,
 <tf.Tensor 'flatten_6/Identity:0' shape=(None, 8) dtype=float32>,
 <tf.Tensor 'flatten_7/Identity:0' shape=(None, 8) dtype=float32>,
 <tf.Tensor 'flatten_8/Identity:0' shape=(None, 8) dtype=float32>,
 <tf.Tensor 'flatten_9/Identity:0' shape=(None, 8) dtype=float32>,
 <tf.Tensor 'flatten_10/Identity:0' shape=(None, 8) dtype=float32>,
 <tf.Tensor 'flatten_11/Identity:0' shape=(None, 8) dtype=float32>,
 <tf.Tensor 'flatten_12/Identity:0' shape=(None, 8) dtype=float32>,
 <tf.Tensor 'flatten_13/Identity:0' shape=(None, 8) dtype=float32>,
 <tf.Tensor 'flatten_14/Identity:0' shape=(None, 8) dtype=fl

In [22]:
concat_sparse_inputs = Concatenate(axis=1)(sparse_kd_embed)

In [23]:
concat_sparse_inputs

<tf.Tensor 'concatenate_1/Identity:0' shape=(None, 208) dtype=float32>

### 所有输入embedding

In [None]:
embed_inputs = Concatenate(axis=1)([concat_sparse_inputs, concat_dense_inputs])

In [None]:
embed_inputs

<tf.Tensor 'concatenate_2/Identity:0' shape=(None, 221) dtype=float32>

## Cross Network

In [None]:
class CrossLayer(tf.keras.layers.Layer):
    """
    One cross layer of a Deep & Cross Network: ``x0 * (xl . w) + b + xl``.

    ``w`` and ``b`` are created with ``add_weight`` so that Keras registers
    them as trainable weights of every model that uses this layer. Variables
    created with a bare ``tf.Variable`` outside a layer are not tracked by a
    functional ``Model`` and would therefore never be updated by ``fit``.

    The layer is called on a list ``[x0, xl]``; both tensors are expected to
    be 2-D ``(batch, embed_dim)`` as in this notebook.
    """

    def build(self, input_shape):
        # input_shape is [shape of x0, shape of xl]; w and b follow xl's last axis.
        embed_dim = int(input_shape[1][-1])
        self.w = self.add_weight(
            name="w",
            shape=(embed_dim,),
            initializer=tf.keras.initializers.TruncatedNormal(stddev=0.01),
            trainable=True,
        )
        self.b = self.add_weight(
            name="b",
            shape=(embed_dim,),
            initializer="zeros",
            trainable=True,
        )

    def call(self, inputs):
        x0, xl = inputs
        # (batch, dim) . (dim,) -> (batch,); add a trailing axis so the
        # per-sample scalar broadcasts across the features of x0.
        xl_w = tf.expand_dims(tf.tensordot(xl, self.w, axes=1), axis=-1)
        return x0 * xl_w + self.b + xl


def cross_layer(x0, xl):
    """
    Apply a single cross layer.

    Each call creates a new ``CrossLayer`` with its own weights.

    @param x0: the feature embeddings (input of the cross network)
    @param xl: the output of the previous cross layer (``x0`` for the first)
    @return: tensor with the same shape as ``xl``
    """
    return CrossLayer()([x0, xl])

In [None]:
def build_cross_layer(x0, num_layer=3):
    """
    Stack several cross layers on top of each other.

    @param x0: embeddings of all features; fed to every cross layer
    @param num_layer: number of cross layers to stack
    @return: output of the last cross layer (``x0`` itself if none are built)
    """
    # The first layer receives x0 as both inputs.
    output = x0
    for _ in range(num_layer):
        output = cross_layer(x0, output)
    return output

In [None]:
# cross net
cross_layer_output = build_cross_layer(embed_inputs, 3)

In [None]:
cross_layer_output

<tf.Tensor 'add_5:0' shape=(None, 221) dtype=float32>

## DNN部分

In [None]:
# Deep tower: three Dense(128, relu) layers, each followed by dropout with a
# decreasing rate (0.5 -> 0.3 -> 0.1).
drop_1 = Dropout(0.5)
hidden_1 = Dense(128, activation='relu')(embed_inputs)
fc_layer = drop_1(hidden_1)

drop_2 = Dropout(0.3)
hidden_2 = Dense(128, activation='relu')(fc_layer)
fc_layer = drop_2(hidden_2)

drop_3 = Dropout(0.1)
hidden_3 = Dense(128, activation='relu')(fc_layer)
fc_layer_output = drop_3(hidden_3)

In [None]:
fc_layer_output

<tf.Tensor 'dropout_2/Identity:0' shape=(None, 128) dtype=float32>

## 输出结果

In [None]:
cross_layer_output

<tf.Tensor 'add_5:0' shape=(None, 221) dtype=float32>

In [None]:
fc_layer_output

<tf.Tensor 'dropout_2/Identity:0' shape=(None, 128) dtype=float32>

In [None]:
stack_layer = Concatenate()([cross_layer_output, fc_layer_output])

In [None]:
output_layer = Dense(1, activation='sigmoid', use_bias=True)(stack_layer)

## 编译模型

In [None]:
model = Model(dense_inputs+sparse_inputs, output_layer)

In [None]:
model.summary()

Model: "model"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
C1 (InputLayer)                 [(None, 1)]          0                                            
__________________________________________________________________________________________________
C2 (InputLayer)                 [(None, 1)]          0                                            
__________________________________________________________________________________________________
C3 (InputLayer)                 [(None, 1)]          0                                            
__________________________________________________________________________________________________
C4 (InputLayer)                 [(None, 1)]          0                                            
______________________________________________________________________________________________

In [None]:
# Track the log loss and the AUC during training in addition to the loss.
eval_metrics = ["binary_crossentropy", tf.keras.metrics.AUC(name='auc')]
model.compile(
    optimizer="adam",
    loss="binary_crossentropy",
    metrics=eval_metrics,
)

## 训练

In [None]:
# Number of leading rows used for training; the rest is the validation set.
TRAIN_SIZE = 500000
# .loc slicing is label-based and inclusive, hence the "- 1" on the upper bound.
train_data = total_data.loc[:TRAIN_SIZE - 1]
valid_data = total_data.loc[TRAIN_SIZE:]

In [None]:
# One array per model input, in the same order as the feature lists.
train_dense_x = [train_data[col].to_numpy() for col in dense_feats]
train_sparse_x = [train_data[col].to_numpy() for col in sparse_feats]

In [None]:
train_label = [train_data['label'].values]

In [None]:
# One array per model input, in the same order as the feature lists.
val_dense_x = [valid_data[col].to_numpy() for col in dense_feats]
val_sparse_x = [valid_data[col].to_numpy() for col in sparse_feats]

In [None]:
val_label = [valid_data['label'].values]

In [None]:
# Input order must match Model(dense_inputs + sparse_inputs, ...): dense first.
train_x = train_dense_x + train_sparse_x
val_x = val_dense_x + val_sparse_x
model.fit(
    train_x,
    train_label,
    epochs=5,
    batch_size=128,
    validation_data=(val_x, val_label),
)

Train on 500000 samples, validate on 100000 samples
Epoch 1/5


  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "
  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


