In [1]:
import os
import numpy as np
import pandas as pd
from tensorflow.keras.layers import *
import tensorflow.keras.backend as K
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.utils import plot_model
from tensorflow.keras.callbacks import *
from sklearn.preprocessing import LabelEncoder
# from tensorflow.keras.constraints import *

%matplotlib inline

# 准备数据

In [2]:
# Directory that holds the sampled Criteo CSV.
path = '/disk/share/criteo/'
data = pd.read_csv(path + 'criteo_sampled_data.csv')
cols = data.columns.values

# Columns are split by the first character of their name:
# "I..." names are collected as dense features, "C..." names as sparse ones.
dense_feats = list(filter(lambda name: name[0] == "I", cols))
sparse_feats = list(filter(lambda name: name[0] == "C", cols))

def process_dense_feats(data, feats):
    """Return a log-transformed copy of the dense feature columns.

    Missing values are replaced with 0.0 first. Every value ``x > -1`` is then
    mapped to ``log(x + 1)``; every other value is mapped to ``-1``.

    Args:
        data: DataFrame containing at least the columns named in ``feats``.
        feats: List of dense column names to transform.

    Returns:
        A new DataFrame holding only the ``feats`` columns, transformed.
        ``data`` itself is not modified.
    """
    # Selecting with a list and calling fillna already produce a new frame,
    # so there is no need to copy the whole input (other columns included).
    d = data[feats].fillna(0.0)
    for f in feats:
        col = d[f].astype(float)
        valid = col > -1
        # Swap out-of-range entries for 0 before the log so np.log never sees
        # a non-positive argument, then overwrite those slots with -1.
        d[f] = np.log(col.where(valid, 0.0) + 1).where(valid, -1.0)

    return d

# Transform the dense ("I...") columns; the result contains only those columns.
data_dense = process_dense_feats(data, dense_feats)

# Filled in by process_sparse_feats: feature name -> number of distinct
# encoded values + 1. Read later when sizing the embedding tables.
vocab_sizes = {}
def process_sparse_feats(data, feats):
    """Label-encode the sparse feature columns.

    Missing values are replaced with the string "-1" and each column is then
    mapped to integer codes with a fresh ``LabelEncoder``.

    Side effect: for every feature ``f`` the module-level ``vocab_sizes[f]``
    is set to the number of distinct codes in that column plus one.

    Args:
        data: DataFrame containing at least the columns named in ``feats``.
        feats: List of sparse column names to encode.

    Returns:
        A new DataFrame holding only the ``feats`` columns, integer-encoded.
        ``data`` itself is not modified.
    """
    # Selecting with a list and calling fillna already produce a new frame,
    # so there is no need to copy the whole input (other columns included).
    d = data[feats].fillna("-1")
    for f in feats:
        label_encoder = LabelEncoder()
        d[f] = label_encoder.fit_transform(d[f])
        vocab_sizes[f] = d[f].nunique() + 1
    return d

# Encode the sparse columns, then put them side by side with the dense ones.
data_sparse = process_sparse_feats(data, sparse_feats)
total_data = pd.concat((data_dense, data_sparse), axis='columns')

# Carry the target column over from the raw frame.
total_data['label'] = data['label']

# 自定义层

In [3]:
class SparseEmbedding(Layer):
    """Embeds each sparse feature and concatenates the results along axis 1.

    One ``Embedding`` table is created per entry of ``sparse_feats``; the
    i-th table is applied to the i-th input tensor passed to ``call``.

    Args:
        sparse_feats: Ordered sparse feature names.
        vocab_sizes: Mapping from feature name to the ``input_dim`` of its
            embedding table.
        embed_dims: Output dimension shared by every embedding table.
        **kwargs: Forwarded to ``Layer.__init__`` (for example ``name``).
    """

    def __init__(self, sparse_feats, vocab_sizes, embed_dims=8, **kwargs):
        super().__init__(**kwargs)
        self.sparse_feats = sparse_feats
        self.vocab_sizes = vocab_sizes
        self.embed_dims = embed_dims

        # Embedding tables for the sparse features, in feature order.
        # (An l2(0.5) embeddings_regularizer was tried here and left disabled.)
        self.sparse_embeds_mat = [
            Embedding(input_dim=self.vocab_sizes[feat],
                      output_dim=self.embed_dims,
                      name=f'{feat}_emb')
            for feat in self.sparse_feats
        ]

    def call(self, sparse_inputs):
        """Look up every input in its table and join the embeddings.

        Args:
            sparse_inputs: Indexable collection of tensors; element ``i`` is
                fed to the i-th embedding table.

        Returns:
            The per-feature embeddings concatenated along axis 1.
        """
        sparse_embeds = [
            emb_mat(sparse_inputs[idx])
            for idx, emb_mat in enumerate(self.sparse_embeds_mat)
        ]
        # tf.concat avoids building a new Concatenate layer on every call
        # and also accepts a single-feature list.
        return tf.concat(sparse_embeds, axis=1)

In [4]:
class single_cross_layer(Layer):
    """One cross step: ``x_next = x0 * (xl . w) + xl + b``.

    Called on a pair ``[x0, xl]`` of tensors whose last dimension has the same
    size ``d``. Weights ``w`` and ``b`` both have shape ``[d, 1]``.

    Args:
        **kwargs: Forwarded to ``Layer.__init__`` (for example ``name``).
    """

    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def build(self, input_shape):
        # input_shape is the list [shape(x0), shape(xl)]; the feature size is
        # read from the last dimension of x0.
        embed_dims = input_shape[0][-1]
        self.w = self.add_weight(shape=[embed_dims, 1], name='w')
        self.b = self.add_weight(shape=[embed_dims, 1], name='b')

    def call(self, inputs):
        """Apply the cross step.

        Args:
            inputs: Pair ``[x0, xl]``.

        Returns:
            Tensor with the same shape as ``xl``.
        """
        x0, xl = inputs
        # (x0 xl^T) w == x0 (xl^T w): project xl onto w first to get one scalar
        # per sample, so the d x d outer product is never materialised.
        xl_w = tf.matmul(xl, self.w)
        # b is stored as [d, 1]; drop the trailing axis so it broadcasts over
        # the feature dimension.
        bias = tf.squeeze(self.b, axis=-1)
        return x0 * xl_w + xl + bias

In [5]:
class cross_layer(Layer):
    """Stack of ``cross_layer_num`` ``single_cross_layer`` steps.

    Every step receives the original input ``x0`` together with the output of
    the previous step (the first step receives ``x0`` twice).

    Args:
        cross_layer_num: Number of cross steps to stack. With 0 the layer
            returns its input unchanged.
        **kwargs: Forwarded to ``Layer.__init__`` (for example ``name``).
    """

    def __init__(self, cross_layer_num, **kwargs):
        super().__init__(**kwargs)
        self.cross_layer_num = cross_layer_num
        self.cross_layers = [single_cross_layer() for _ in range(cross_layer_num)]

    def call(self, inputs):
        """Run the input through every cross step in order.

        Args:
            inputs: The tensor used as ``x0`` by every step.

        Returns:
            Output of the last cross step, or ``inputs`` if there are none.
        """
        x0 = inputs
        # Seeding xl with x0 makes the first step see [x0, x0] and keeps an
        # empty stack from raising IndexError.
        xl = x0
        for layer in self.cross_layers:
            xl = layer([x0, xl])
        return xl
    

In [6]:
class DNN(Layer):
    """Feed-forward tower: flatten, ReLU hidden layers, linear projection.

    Args:
        hid_units: Sizes of the ReLU hidden layers, in order.
        use_dropout: Whether to apply the shared ``Dropout(0.3)`` layer.
        output_unit: Size of the final linear ``Dense`` layer.
        **kwargs: Forwarded to ``Layer.__init__`` (for example ``name``).
    """

    def __init__(self, hid_units=(256, 256, 256), use_dropout=True,
                 output_unit=16, **kwargs):
        super().__init__(**kwargs)
        # Copy so the layer never shares (or mutates) the caller's sequence or
        # a default object reused across instances.
        self.hid_units = list(hid_units)
        self.use_dropout = use_dropout
        self.output_unit = output_unit
        self.Dropout = Dropout(0.3)
        # Created once here instead of on every call.
        self.flatten = Flatten()
        self.dense_layers = [Dense(unit, activation='relu') for unit in self.hid_units]
        self.dense_layers.append(Dense(self.output_unit))

    def call(self, concat_sparse_embeds, training=None):
        """Flatten the input and run it through the dense stack.

        Args:
            concat_sparse_embeds: Input tensor; flattened before the first
                dense layer.
            training: Forwarded to ``Dropout`` so it is only active while
                training.

        Returns:
            Output of the final ``Dense(output_unit)`` layer (after dropout
            when ``use_dropout`` is set).
        """
        x = self.flatten(concat_sparse_embeds)

        x = self.dense_layers[0](x)
        # NOTE(review): dropout follows every dense layer except the first,
        # including the final projection. Placement is kept as it was;
        # confirm this is intended.
        for dense in self.dense_layers[1:]:
            x = dense(x)
            if self.use_dropout:
                x = self.Dropout(x, training=training)
        return x

# 构建模型 (keras函数式)

In [7]:
class DCN:
    """Builds a Deep & Cross Network as a Keras functional model.

    The constructor creates the input placeholders and the shared layers;
    ``build_model`` wires them together.

    Args:
        dense_feats: Names of the continuous features (one input each).
        sparse_feats: Names of the categorical features (one input each).
        vocab_sizes: Mapping from categorical feature name to the size of its
            embedding table.
        embed_dims: Embedding dimension used for every categorical feature.
        cross_layer_num: Number of stacked cross steps.
    """

    def __init__(self, dense_feats, sparse_feats, vocab_sizes,
                 embed_dims=8, cross_layer_num=3):

        # Continuous features: one scalar input per feature.
        self.dense_inputs = [Input(shape=(1,), name=feat) for feat in dense_feats]

        # Categorical features: one id per feature. Declared as integers so
        # large ids are not rounded by a float32 input before the lookup.
        self.sparse_inputs = [Input(shape=(1,), name=feat, dtype='int32')
                              for feat in sparse_feats]

        # Use the caller's embed_dims rather than a hard-coded 8.
        self.SparseEmbedding = SparseEmbedding(sparse_feats, vocab_sizes,
                                               embed_dims=embed_dims)

        self.cross_layer = cross_layer(cross_layer_num)

        self.DNN = DNN()
        self.dense = Dense(1, activation='sigmoid')

    def build_model(self):
        """Wire inputs, cross branch and deep branch into a ``Model``.

        Returns:
            A ``Model`` whose inputs are ``[dense_inputs, sparse_inputs]``
            (two lists of per-feature inputs) and whose output is a single
            sigmoid unit.
        """
        all_inputs = [self.dense_inputs, self.sparse_inputs]

        concat_dense_inputs = Concatenate(axis=1)(self.dense_inputs)

        concat_sparse_embeds = self.SparseEmbedding(self.sparse_inputs)
        flatten_sparse_embeds = Flatten()(concat_sparse_embeds)

        # Cross branch sees the flattened embeddings plus the raw dense values.
        concat_inputs = Concatenate(axis=1)([flatten_sparse_embeds, concat_dense_inputs])
        cross_output = self.cross_layer(concat_inputs)

        # Deep branch sees only the embeddings.
        fc_layer_output = self.DNN(concat_sparse_embeds)

        # Output part: join both branches and map to one probability.
        concat_layer = Concatenate()([cross_output, fc_layer_output])
        output = self.dense(concat_layer)

        model = Model(inputs=all_inputs, outputs=output)
        return model

    # Backward-compatible alias for the original, misspelled method name.
    bulid_model = build_model

In [19]:
# .loc slices by label and includes the end label: labels up to 499999 go to
# training, labels from 500000 on go to validation.
# Assumes total_data still carries a 0..n-1 integer index (TODO confirm).
train_data = total_data.loc[:500000-1]
valid_data = total_data.loc[500000:]

# One array per feature, in the same order as the model's inputs.
train_dense_x_all = [train_data[f].values for f in dense_feats]
train_sparse_x_all = [train_data[f].values for f in sparse_feats]
train_label_all = train_data[['label']].values

val_dense_x_all = [valid_data[f].values for f in dense_feats]
val_sparse_x_all = [valid_data[f].values for f in sparse_feats]
val_label_all = valid_data[['label']].values


model = DCN(dense_feats, sparse_feats, vocab_sizes).bulid_model()
model.compile(optimizer='rmsprop', loss='binary_crossentropy', 
              metrics=['binary_crossentropy', 'AUC']) # tf.keras.metrics.AUC()

os.makedirs('checkpoints', exist_ok=True)
checkpoints = ModelCheckpoint('checkpoints/model.h5', monitor='val_auc', 
                              mode='max', save_weights_only=True)# save_best_only=True
# AUC is higher-is-better: state mode='max' explicitly (as the checkpoint
# callback does) instead of relying on the 'auto' heuristic, which on some
# Keras versions treats an unrecognised metric name as something to minimise.
early_stopping = EarlyStopping(monitor='val_auc', mode='max',
                               min_delta=0.0001, patience=2)
def scheduler(epoch):
    """Learning-rate schedule: constant at first, then exponential decay.

    Args:
        epoch: Epoch index passed in by ``LearningRateScheduler``.

    Returns:
        A Python ``float``: 0.001 while ``epoch < 10``, afterwards
        ``0.001 * exp(0.1 * (10 - epoch))``.
    """
    thred = 10
    if epoch < thred:
        return 0.001
    # Return a plain float rather than a tf.Tensor: the scheduler callback
    # expects the schedule function to produce a float learning rate.
    return float(0.001 * np.exp(0.1 * (thred - epoch)))
# Wrap the epoch -> learning-rate function in a Keras callback.
lr_schedule = LearningRateScheduler(scheduler)
callbacks = [early_stopping, lr_schedule, checkpoints]

# x is a two-element list: the per-feature dense arrays, then the per-feature
# sparse arrays. The validation tuple uses the same layout.
model.fit(
    x=[train_dense_x_all, train_sparse_x_all],
    y=train_label_all,
    validation_data=([val_dense_x_all, val_sparse_x_all], val_label_all),
    batch_size=256,
    epochs=3,
    callbacks=callbacks,
)

Epoch 1/3
Instructions for updating:
If using Keras pass *_constraint arguments to layers.
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x7f5d7f7d48d0>

In [16]:
# # 加载模型
# model = DCN(dense_feats, sparse_feats, vocab_sizes).bulid_model()
# model.load_weights('checkpoints/model.h5')