In [9]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader,TensorDataset
import torch.nn.functional as F
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
import numpy as np
from sklearn.metrics import roc_auc_score, accuracy_score
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')

In [10]:
class DeepCrossing(nn.Module):
    def __init__(self, feature_columns, hidden_units, dropout=0., embed_dim=10, output_dim=1):
        super(DeepCrossing, self).__init__()
        self.dense_feature_cols, self.sparse_feature_cols = feature_columns

        self.embed_layers = nn.ModuleDict({
            'embed_' + str(i): nn.Embedding(num_embeddings=feat['feat_num'], embedding_dim=feat['embed_dim']) for i, feat in enumerate(self.sparse_feature_cols)
        })
        
        # 统计embedding层的输出维度
        embed_dim_sum = sum([feat['embed_dim'] for feat in self.sparse_feature_cols])
        
        # stack layers的总维度
        dim_stack = len(self.dense_feature_cols) + embed_dim_sum
        
        # 残差层，可能会有多层
        self.res_layers = nn.ModuleList([
            Residual_block(unit, dim_stack) for unit in hidden_units
        ])

        # dropout层
        self.res_dropout = nn.Dropout(dropout)
        
        # 线性层
        self.linear = nn.Linear(dim_stack, output_dim)

    def forward(self, x):
        dense_inputs, sparse_inputs = x[:, :13], x[:, 13:]
        sparse_inputs = sparse_inputs.long()  # 需要转成长张量，这个是embedding的输入格式要求
        sparse_embeds = [self.embed_layers['embed_'+str(i)](sparse_inputs[:, i]) for i in range(sparse_inputs.shape[1])]
        sparse_embed = torch.cat(sparse_embeds, dim=-1)
        
        stack = torch.cat([sparse_embed, dense_inputs], dim=-1)
        r = stack
        for res in self.res_layers:
            r = res(r)
        r = self.res_dropout(r)
        outputs = torch.sigmoid(self.linear(r))
        outputs = outputs.squeeze(-1)
        return outputs

class Residual_block(nn.Module):
    def __init__(self, hidden_unit, dim_stack):
        super(Residual_block, self).__init__()
        self.linear1 = nn.Linear(dim_stack, hidden_unit)
        self.linear2 = nn.Linear(hidden_unit, dim_stack)
        self.relu = nn.ReLU()
    def forward(self, x):
        orig_x = x.clone()
        x = self.linear1(x)
        x = self.linear2(x)
        output = self.relu(x + orig_x)
        return output

[[{'feat': 'I1'}, {'feat': 'I2'}, {'feat': 'I3'}, {'feat': 'I4'}, {'feat': 'I5'}, {'feat': 'I6'}, {'feat': 'I7'}, {'feat': 'I8'}, {'feat': 'I9'}, {'feat': 'I10'}, {'feat': 'I11'}, {'feat': 'I12'}, {'feat': 'I13'}], [{'feat': 'C1', 'feat_num': 79, 'embed_dim': 8}, {'feat': 'C2', 'feat_num': 252, 'embed_dim': 8}, {'feat': 'C3', 'feat_num': 1293, 'embed_dim': 8}, {'feat': 'C4', 'feat_num': 1043, 'embed_dim': 8}, {'feat': 'C5', 'feat_num': 30, 'embed_dim': 8}, {'feat': 'C6', 'feat_num': 7, 'embed_dim': 8}, {'feat': 'C7', 'feat_num': 1164, 'embed_dim': 8}, {'feat': 'C8', 'feat_num': 39, 'embed_dim': 8}, {'feat': 'C9', 'feat_num': 2, 'embed_dim': 8}, {'feat': 'C10', 'feat_num': 908, 'embed_dim': 8}, {'feat': 'C11', 'feat_num': 926, 'embed_dim': 8}, {'feat': 'C12', 'feat_num': 1239, 'embed_dim': 8}, {'feat': 'C13', 'feat_num': 824, 'embed_dim': 8}, {'feat': 'C14', 'feat_num': 20, 'embed_dim': 8}, {'feat': 'C15', 'feat_num': 819, 'embed_dim': 8}, {'feat': 'C16', 'feat_num': 1159, 'embed_dim': 

In [13]:
hidden_units = [256, 128, 64]
batch_size = 64
num_workers = 4
lr = 1e-4
epochs = 20

model = DeepCrossing(feature_columns, hidden_units)
model

DeepCrossing(
  (embed_layers): ModuleDict(
    (embed_0): Embedding(79, 8)
    (embed_1): Embedding(252, 8)
    (embed_2): Embedding(1293, 8)
    (embed_3): Embedding(1043, 8)
    (embed_4): Embedding(30, 8)
    (embed_5): Embedding(7, 8)
    (embed_6): Embedding(1164, 8)
    (embed_7): Embedding(39, 8)
    (embed_8): Embedding(2, 8)
    (embed_9): Embedding(908, 8)
    (embed_10): Embedding(926, 8)
    (embed_11): Embedding(1239, 8)
    (embed_12): Embedding(824, 8)
    (embed_13): Embedding(20, 8)
    (embed_14): Embedding(819, 8)
    (embed_15): Embedding(1159, 8)
    (embed_16): Embedding(9, 8)
    (embed_17): Embedding(534, 8)
    (embed_18): Embedding(201, 8)
    (embed_19): Embedding(4, 8)
    (embed_20): Embedding(1204, 8)
    (embed_21): Embedding(7, 8)
    (embed_22): Embedding(12, 8)
    (embed_23): Embedding(729, 8)
    (embed_24): Embedding(33, 8)
    (embed_25): Embedding(554, 8)
  )
  (res_layers): ModuleList(
    (0): Residual_block(
      (linear1): Linear(in_features

In [14]:
hidden_units = [256, 128, 64, 32]
model = DeepCrossing(feature_columns, hidden_units)
model

DeepCrossing(
  (embed_layers): ModuleDict(
    (embed_0): Embedding(79, 8)
    (embed_1): Embedding(252, 8)
    (embed_2): Embedding(1293, 8)
    (embed_3): Embedding(1043, 8)
    (embed_4): Embedding(30, 8)
    (embed_5): Embedding(7, 8)
    (embed_6): Embedding(1164, 8)
    (embed_7): Embedding(39, 8)
    (embed_8): Embedding(2, 8)
    (embed_9): Embedding(908, 8)
    (embed_10): Embedding(926, 8)
    (embed_11): Embedding(1239, 8)
    (embed_12): Embedding(824, 8)
    (embed_13): Embedding(20, 8)
    (embed_14): Embedding(819, 8)
    (embed_15): Embedding(1159, 8)
    (embed_16): Embedding(9, 8)
    (embed_17): Embedding(534, 8)
    (embed_18): Embedding(201, 8)
    (embed_19): Embedding(4, 8)
    (embed_20): Embedding(1204, 8)
    (embed_21): Embedding(7, 8)
    (embed_22): Embedding(12, 8)
    (embed_23): Embedding(729, 8)
    (embed_24): Embedding(33, 8)
    (embed_25): Embedding(554, 8)
  )
  (res_layers): ModuleList(
    (0): Residual_block(
      (linear1): Linear(in_features

In [15]:
def auc(y_pred, y_true):
    pred = y_pred.data
    y = y_true.data
    return roc_auc_score(y, pred)

loss_func = nn.BCELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.0001)
metric_func = auc
metric_name = 'auc'
epochs = 10
log_step_freq = 10

dfhistory = pd.DataFrame(columns=['epoch', 'loss', metric_name, 'val_loss', 'val_'+metric_name])

print('start_training.........')

print('========'*8)

for epoch in range(1, epochs+1):
    
    # 训练阶段
    model.train()
    loss_sum = 0.0
    metric_sum = 0.0
    step = 1
    

    
    for step, (features, labels) in enumerate(dl_train, 1):
        # 梯度清零
        optimizer.zero_grad()
        labels = labels.view(-1)
        
        # 正向传播
        predictions = model(features);
        loss = loss_func(predictions, labels)
        try:
            metric = metric_func(predictions, labels)
        except ValueError:
            pass
        
        # 反向传播
        loss.backward()
        optimizer.step()
        
        # 打印batch级别日志
        loss_sum += loss.item()
        metric_sum += metric.item()
        if step % log_step_freq == 0:
            print(("[step=%d] loss: %.3f, " + metric_name + ": %.3f") % (step, loss_sum/step, metric_sum/step));
    
    # 验证阶段
    model.eval()
    val_loss_sum = 0.0
    val_metric_sum = 0.0
    val_step = 1
    
    for val_step, (features, labels) in enumerate(dl_val, 1):
        labels = labels.view(-1)
        with torch.no_grad():
            predictions = model(features)
            val_loss = loss_func(predictions, labels)
            try:
                val_metric = metric_func(predictions, labels)
            except ValueError:
                pass
        
        val_loss_sum += val_loss.item()
        val_metric_sum += val_metric.item()
    
    # 记录日志
    info = (epoch, loss_sum/step, metric_sum/step, val_loss_sum/val_step, val_metric_sum/val_step)
    dfhistory.loc[epoch-1] = info
    
    # 打印日志
    print(("\nEPOCH=%d, loss=%.3f, " + metric_name + " = %.3f, val_loss=%.3f, " + "val_" + metric_name + " = %.3f") %info)
    print('\n' + '=========='* 8)
    
print('Finished Training')

start_training.........
[step=10] loss: 0.601, auc: 0.450
[step=20] loss: 0.549, auc: 0.479
[step=30] loss: 0.554, auc: 0.474
[step=40] loss: 0.539, auc: 0.488

EPOCH=1, loss=0.537, auc = 0.497, val_loss=0.475, val_auc = 0.634

[step=10] loss: 0.463, auc: 0.638
[step=20] loss: 0.483, auc: 0.589
[step=30] loss: 0.483, auc: 0.608
[step=40] loss: 0.490, auc: 0.606

EPOCH=2, loss=0.493, auc = 0.611, val_loss=0.465, val_auc = 0.669

[step=10] loss: 0.474, auc: 0.717
[step=20] loss: 0.470, auc: 0.710
[step=30] loss: 0.450, auc: 0.713
[step=40] loss: 0.469, auc: 0.711

EPOCH=3, loss=0.472, auc = 0.709, val_loss=0.458, val_auc = 0.691

[step=10] loss: 0.448, auc: 0.780
[step=20] loss: 0.455, auc: 0.755
[step=30] loss: 0.445, auc: 0.761
[step=40] loss: 0.453, auc: 0.751

EPOCH=4, loss=0.450, auc = 0.753, val_loss=0.451, val_auc = 0.676

[step=10] loss: 0.388, auc: 0.832
[step=20] loss: 0.424, auc: 0.797
[step=30] loss: 0.423, auc: 0.813
[step=40] loss: 0.420, auc: 0.812

EPOCH=5, loss=0.431, au

In [16]:
# 预测
from sklearn.metrics import accuracy_score
test_x = test.drop('label', axis=1).values
test_y = test['label'].values
y_pred_probs = model(torch.tensor(test_x).float())
y_pred = torch.where(y_pred_probs>0.5, torch.ones_like(y_pred_probs), torch.zeros_like(y_pred_probs))

test_auc = roc_auc_score(test_y, y_pred_probs.data.numpy())
test_acc = accuracy_score(test_y, y_pred.data.numpy())
print('test_auc:%.3f test_acc:%.3f'%(test_auc, test_acc))

test_auc:0.568 test_acc:0.730
