In [1]:
import os
import numpy as np
import pandas as pd
from tensorflow.keras.layers import *
import tensorflow.keras.backend as K
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.utils import plot_model
from tensorflow.keras.callbacks import *
from sklearn.preprocessing import LabelEncoder
# from tensorflow.keras.constraints import *

%matplotlib inline

# 准备数据

In [2]:
# Directory holding the sampled Criteo CSV. The trailing slash matters because
# the file name is appended by plain string concatenation.
path = '/disk/share/criteo/'
data = pd.read_csv(path + 'criteo_sampled_data.csv')
cols = data.columns.values

# Columns are split by the first character of their name: "I..." columns are
# handled as dense (numeric) features and "C..." columns as sparse
# (categorical) features by the preprocessing functions below.
dense_feats = [name for name in cols if name[0] == "I"]
sparse_feats = [name for name in cols if name[0] == "C"]

def process_dense_feats(data, feats):
    """Return a log-scaled copy of the dense feature columns.

    Missing values are replaced by 0.0 first. Every value ``x > -1`` is then
    mapped to ``log(x + 1)``; values ``<= -1`` (where that logarithm is
    undefined) are mapped to the sentinel ``-1``.

    Args:
        data: DataFrame containing at least the columns listed in ``feats``.
        feats: List of column names to transform.

    Returns:
        A new DataFrame holding only the ``feats`` columns, transformed.
        ``data`` itself is not modified.
    """
    # Selecting with a list of columns already yields a new frame, so copying
    # the whole input frame beforehand is unnecessary.
    d = data[feats].fillna(0.0)
    for f in feats:
        col = d[f]
        valid = col > -1
        # Invalid entries are neutralised to 0 before log1p so that no
        # NaN / -inf warnings are raised; np.where overwrites them with -1.
        d[f] = np.where(valid, np.log1p(col.where(valid, 0)), -1.0)

    return d

# Log-scale the dense ("I*") columns; the result contains only those columns.
data_dense = process_dense_feats(data=data, feats=dense_feats)

# Filled in by process_sparse_feats: feature name -> embedding vocabulary size.
vocab_sizes = {}

def process_sparse_feats(data, feats):
    """Label-encode the sparse feature columns.

    Missing values are replaced by the string "-1" so that they form a
    category of their own; each column is then mapped to integer ids
    ``0 .. k-1`` with a fresh ``LabelEncoder``.

    Side effect: for every feature in ``feats`` the module-level
    ``vocab_sizes`` dict receives ``k + 1`` (number of distinct values plus
    one spare id).

    Args:
        data: DataFrame containing at least the columns listed in ``feats``.
        feats: List of column names to encode.

    Returns:
        A new DataFrame holding only the ``feats`` columns, integer-encoded.
        ``data`` itself is not modified.
    """
    # Selecting with a list of columns already yields a new frame, so copying
    # the whole input frame beforehand is unnecessary.
    d = data[feats].fillna("-1")
    for f in feats:
        label_encoder = LabelEncoder()
        d[f] = label_encoder.fit_transform(d[f])
        # Every fitted class occurs at least once in the column, so the class
        # count equals the number of distinct ids without a second pass.
        # NOTE(review): the extra +1 is presumably headroom for an unseen /
        # padding id — confirm.
        vocab_sizes[f] = len(label_encoder.classes_) + 1
    return d

# Encode the sparse ("C*") columns, then place dense and sparse features side
# by side and append the `label` column from the raw frame as the last column.
data_sparse = process_sparse_feats(data=data, feats=sparse_feats)
total_data = pd.concat([data_dense, data_sparse], axis=1).assign(label=data['label'])

# 自定义层

In [3]:
class SparseEmbedding(Layer):
    """Embed every sparse feature and join the embeddings along axis 1.

    One ``Embedding`` table is created per feature, in the order given by
    ``sparse_feats``.

    Args:
        sparse_feats: Ordered list of sparse feature names.
        vocab_sizes: Mapping from feature name to the ``input_dim`` of its
            embedding table.
        embed_dims: Size of each embedding vector.
        **kwargs: Standard Keras ``Layer`` keyword arguments (e.g. ``name``).
    """

    def __init__(self, sparse_feats, vocab_sizes, embed_dims=8, **kwargs):
        super().__init__(**kwargs)
        self.sparse_feats = sparse_feats
        self.vocab_sizes = vocab_sizes
        self.embed_dims = embed_dims

        # Embedding tables for the categorical features, one per feature.
        self.sparse_embeds_mat = [
            Embedding(input_dim=self.vocab_sizes[feat],
                      output_dim=self.embed_dims,
                      name=f'{feat}_emb')
            for feat in self.sparse_feats
        ]

    def call(self, sparse_inputs):
        """Look up each input in its own table and concatenate on axis 1.

        Args:
            sparse_inputs: Sequence of id tensors, positionally aligned with
                ``sparse_feats``. Assumes each is (batch, 1) as built by the
                model's ``Input(shape=1)`` — TODO confirm for other callers.

        Returns:
            The per-feature embeddings concatenated along axis 1.
        """
        sparse_embeds = [
            emb_mat(sparse_inputs[idx])
            for idx, emb_mat in enumerate(self.sparse_embeds_mat)
        ]
        # tf.concat instead of building a new Concatenate layer on every call;
        # it also accepts a single-feature list.
        return tf.concat(sparse_embeds, axis=1)

In [48]:
class DenseEmbedding(Layer):
    """Turn each dense scalar feature into a vector by scaling a learned row.

    Every dense feature owns one trainable weight of shape
    ``[1, embed_dims]``; the feature value multiplies that weight.

    Args:
        dense_feats: Ordered list of dense feature names (used for weight
            names and to decide how many weights to create).
        embed_dims: Length of each feature's embedding vector.
        **kwargs: Standard Keras ``Layer`` keyword arguments (e.g. ``name``).
    """

    def __init__(self, dense_feats, embed_dims=8, **kwargs):
        super().__init__(**kwargs)
        self.embed_dims = embed_dims

        # One [1, embed_dims] weight per dense feature, in feature order.
        self.dense_embs = []
        for feat in dense_feats:
            dense_emb = self.add_weight(shape=[1, self.embed_dims],
                                        name=f'dense_emb_{feat}')
            self.dense_embs.append(dense_emb)

    def call(self, dense_inputs):
        """Scale each feature's weight by its input and concatenate on axis 1.

        Args:
            dense_inputs: Sequence of tensors, positionally aligned with the
                weights created in ``__init__``. Assumes each is (batch, 1)
                so the product broadcasts to (batch, embed_dims) — TODO
                confirm for other callers.

        Returns:
            The scaled embeddings, each given a new axis 1 and concatenated
            along that axis.
        """
        scaled_embs = []
        for i, dense_input in enumerate(dense_inputs):
            dense_emb = dense_input * self.dense_embs[i]
            scaled_embs.append(tf.expand_dims(dense_emb, axis=1))

        # tf.concat instead of building a new Concatenate layer on every call.
        return tf.concat(scaled_embs, axis=1)


In [49]:
class attention_cross_layer(Layer):
    """Multi-head attention over the feature axis of an embedding tensor.

    Each head projects the input with its own query matrix ``Wq`` and value
    matrix ``Wv`` and feeds ``[query, value]`` to a Keras ``Attention`` layer
    (the key defaults to the value there). Head outputs are concatenated on
    the last axis.

    Args:
        n_heads: Number of attention heads; must be at least 1.
        att_dim: Output width of every head.
        **kwargs: Standard Keras ``Layer`` keyword arguments (e.g. ``name``).

    Raises:
        ValueError: If ``n_heads`` is smaller than 1.
    """

    def __init__(self, n_heads=6, att_dim=8, **kwargs):
        super().__init__(**kwargs)
        if n_heads < 1:
            raise ValueError(f'n_heads must be >= 1, got {n_heads}')
        self.n_heads = n_heads
        self.att_dim = att_dim
        # Created once here rather than per head inside call(). With default
        # arguments the Attention layer holds no weights, so a single instance
        # can serve every head.
        self.attention = Attention()

    def build(self, input_shape):
        """Create the per-head projection matrices of shape [emb_dim, att_dim]."""
        emb_dim = input_shape[-1]
        self.Wq = []
        self.Wv = []
        shape = [emb_dim, self.att_dim]
        # Creation order (Wq_0, Wv_0, Wq_1, Wv_1, ...) is kept as-is so that
        # weight ordering stays compatible with previously saved weights.
        for i in range(self.n_heads):
            self.Wq.append(self.add_weight(shape=shape, name=f'Wq_{i}'))
            self.Wv.append(self.add_weight(shape=shape, name=f'Wv_{i}'))

    def call(self, embeds):  # ?,n,d
        """Apply every head to ``embeds`` and join the results.

        Returns:
            A tensor whose last axis has size ``n_heads * att_dim``.
        """
        heads = []
        for w_q, w_v in zip(self.Wq, self.Wv):
            emb_q = tf.matmul(embeds, w_q)  # ?,n,att_dim
            emb_v = tf.matmul(embeds, w_v)  # ?,n,att_dim
            heads.append(self.attention([emb_q, emb_v]))
        if len(heads) > 1:
            return tf.concat(heads, axis=-1)
        return heads[0]

# 构建模型 (keras函数式)

In [52]:
class AutoInt:
    """Builder for an AutoInt-style model using the Keras functional API.

    The constructor creates the input placeholders and all layers;
    ``build_model`` wires them together and returns a ``Model``.

    Args:
        dense_feats: Ordered list of dense feature names (one input each).
        sparse_feats: Ordered list of sparse feature names (one input each).
        vocab_sizes: Mapping from sparse feature name to vocabulary size.
        embed_dims: Embedding width used for both dense and sparse features.
        cross_layer_num: Currently unused; kept so existing callers that pass
            it keep working.
        n_atten_layers: Number of stacked ``attention_cross_layer`` blocks.
    """

    def __init__(self, dense_feats, sparse_feats, vocab_sizes,
                 embed_dims=8, cross_layer_num=3, n_atten_layers=2):

        # Continuous features: one scalar input per feature.
        # `shape` is given as a tuple, which is what Keras documents.
        self.dense_inputs = [Input(shape=(1,), name=feat) for feat in dense_feats]

        # Categorical features: one integer id per feature. An integer dtype
        # avoids routing ids through float32, which cannot represent every
        # integer above 2**24 exactly.
        self.sparse_inputs = [Input(shape=(1,), name=feat, dtype='int32')
                              for feat in sparse_feats]

        # Honour the embed_dims argument (it was previously hard-coded to 8).
        self.SparseEmbedding = SparseEmbedding(sparse_feats, vocab_sizes,
                                               embed_dims=embed_dims)
        self.DenseEmbedding = DenseEmbedding(dense_feats, embed_dims=embed_dims)

        self.atten_cross_layers = [attention_cross_layer()
                                   for _ in range(n_atten_layers)]

        self.dense = Dense(1, activation='sigmoid')

    def build_model(self):
        """Connect inputs, embeddings, attention blocks and the output layer.

        Returns:
            A Keras ``Model`` whose inputs are the nested list
            ``[dense_inputs, sparse_inputs]`` and whose output is the
            sigmoid unit applied to the flattened attention output.
        """
        all_inputs = [self.dense_inputs, self.sparse_inputs]

        concat_sparse_embeds = self.SparseEmbedding(self.sparse_inputs)
        concat_dense_embeds = self.DenseEmbedding(self.dense_inputs)

        # Sparse embeddings come first, then dense ones, along the feature axis.
        concat_embeds = Concatenate(axis=1)([concat_sparse_embeds, concat_dense_embeds])

        atten_output = concat_embeds
        for layer in self.atten_cross_layers:
            atten_output = layer(atten_output)

        # Output head.
        output = self.dense(Flatten()(atten_output))

        return Model(inputs=all_inputs, outputs=output)

    # Original (misspelled) method name, kept so existing callers still work.
    bulid_model = build_model

In [53]:
# Positional split: the first n_train rows are used for training, the rest
# for validation. iloc is used so the split does not depend on index labels.
n_train = 500000
train_data = total_data.iloc[:n_train]
valid_data = total_data.iloc[n_train:]

# The model takes one array per feature, grouped as [dense..., sparse...].
train_dense_x_all = [train_data[f].values for f in dense_feats]
train_sparse_x_all = [train_data[f].values for f in sparse_feats]
train_label_all = train_data[['label']].values

val_dense_x_all = [valid_data[f].values for f in dense_feats]
val_sparse_x_all = [valid_data[f].values for f in sparse_feats]
val_label_all = valid_data[['label']].values


model = AutoInt(dense_feats, sparse_feats, vocab_sizes).bulid_model()
model.compile(optimizer='rmsprop', loss='binary_crossentropy',
              metrics=['binary_crossentropy', 'AUC'])  # tf.keras.metrics.AUC()

os.makedirs('checkpoints', exist_ok=True)
# save_best_only is not enabled, so the weights file is rewritten every epoch
# and monitor/mode have no effect on what gets saved.
checkpoints = ModelCheckpoint('checkpoints/model.h5', monitor='val_auc',
                              mode='max', save_weights_only=True)
# AUC must increase to count as an improvement. The direction is stated
# explicitly instead of relying on 'auto' inference from the metric name,
# which is not consistent across Keras versions.
early_stopping = EarlyStopping(monitor='val_auc', min_delta=0.0001, patience=2,
                               mode='max')
def scheduler(epoch):
    """Learning-rate schedule: constant, then exponential decay.

    Args:
        epoch: Zero-based epoch index supplied by ``LearningRateScheduler``.

    Returns:
        A Python float: 0.001 for the first 10 epochs, afterwards
        ``0.001 * exp(0.1 * (10 - epoch))``.
    """
    thred = 10
    if epoch < thred:
        return 0.001
    # Return a plain float rather than a tf.Tensor: LearningRateScheduler
    # documents a float return value for the schedule function.
    return float(0.001 * np.exp(0.1 * (thred - epoch)))
# Callbacks are passed in this order: early stopping, learning-rate schedule,
# checkpoint writer.
lr_schedule = LearningRateScheduler(scheduler)
callbacks = [early_stopping, lr_schedule, checkpoints]

# Inputs use the same nesting as the model's inputs: [dense arrays, sparse arrays].
model.fit(
    [train_dense_x_all, train_sparse_x_all],
    train_label_all,
    batch_size=256,
    epochs=3,
    validation_data=([val_dense_x_all, val_sparse_x_all], val_label_all),
    callbacks=callbacks,
)

Epoch 1/3
Epoch 2/3
 107/1954 [>.............................] - ETA: 54s - loss: 0.4650 - binary_crossentropy: 0.4650 - auc: 0.7802

KeyboardInterrupt: 