In [1]:
import torch
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split

from torch.utils.data import DataLoader, Dataset, TensorDataset
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from sklearn.metrics import roc_auc_score, accuracy_score
import datetime
import warnings
warnings.filterwarnings('ignore')

In [2]:
class FM(nn.Module):
    def __init__(self, emb_dim, feat_num):
        super(FM, self).__init__()
        self.emb_dim = emb_dim
        #定义三个矩阵，一个是全局偏置，一个是一阶权重矩阵，一个是二阶交叉矩阵
        self.w0 = nn.Parameter(torch.rand([1,]))
        self.w1 = nn.Parameter(torch.rand([feat_num, 1]))
        self.w2 = nn.Parameter(torch.rand([feat_num, emb_dim]))
    def forward(self, x):
        #x的维度是(batch_size, feat_num)
        #一阶交叉
        first_order = self.w0 + torch.mm(x, self.w1) #(batch_size, 1)
        #二阶交叉
        second_order = 0.5 * torch.sum(torch.pow(torch.mm(x, self.w2), 2) - torch.mm(torch.pow(x, 2), torch.pow(self.w2, 2)), dim=1, keepdim=True)
        return torch.sigmoid(first_order + second_order)

In [8]:
data = pd.read_csv('./data/Criteo_sample.txt')
train_set, test_set = train_test_split(data, test_size=0.2, random_state=2020)

train_label = train_set['label']
del train_set['label']
test_label = test_set['label']
del test_set['label']
data_df = pd.concat((train_set, test_set))


sparse_feas = [col for col in data_df.columns if col[0] == 'C']
dense_feas = [col for col in data_df.columns if col[0] == 'I']

data_df[sparse_feas] = data_df[sparse_feas].fillna('-1')
data_df.drop(columns=dense_feas, inplace=True)

def sparseFeature(feat, feat_num, embed_dim=4):
    return {'feat':feat, 'feat_num':feat_num, 'embed_dim':embed_dim}
def denseFeature(feat):
    return {'feat':feat}
embed_dim = 8
feature_columns = [[sparseFeature(feat, len(data_df[feat].unique()), embed_dim=embed_dim) for feat in sparse_feas]]

for feat in sparse_feas:
    le = LabelEncoder()
    data_df[feat] = le.fit_transform(data_df[feat])

train = data_df[:train_set.shape[0]]
test = data_df[train_set.shape[0]:]

train['label'] = train_label
test['label'] = test_label

train_set, val_set = train_test_split(train, test_size=0.1, random_state=2024)

train_set.reset_index(drop=True, inplace=True)
val_set.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)
print(feature_columns)
print(train_set.shape, val_set.shape, test.shape)

[[{'feat': 'C1', 'feat_num': 79, 'embed_dim': 8}, {'feat': 'C2', 'feat_num': 252, 'embed_dim': 8}, {'feat': 'C3', 'feat_num': 1293, 'embed_dim': 8}, {'feat': 'C4', 'feat_num': 1043, 'embed_dim': 8}, {'feat': 'C5', 'feat_num': 30, 'embed_dim': 8}, {'feat': 'C6', 'feat_num': 7, 'embed_dim': 8}, {'feat': 'C7', 'feat_num': 1164, 'embed_dim': 8}, {'feat': 'C8', 'feat_num': 39, 'embed_dim': 8}, {'feat': 'C9', 'feat_num': 2, 'embed_dim': 8}, {'feat': 'C10', 'feat_num': 908, 'embed_dim': 8}, {'feat': 'C11', 'feat_num': 926, 'embed_dim': 8}, {'feat': 'C12', 'feat_num': 1239, 'embed_dim': 8}, {'feat': 'C13', 'feat_num': 824, 'embed_dim': 8}, {'feat': 'C14', 'feat_num': 20, 'embed_dim': 8}, {'feat': 'C15', 'feat_num': 819, 'embed_dim': 8}, {'feat': 'C16', 'feat_num': 1159, 'embed_dim': 8}, {'feat': 'C17', 'feat_num': 9, 'embed_dim': 8}, {'feat': 'C18', 'feat_num': 534, 'embed_dim': 8}, {'feat': 'C19', 'feat_num': 201, 'embed_dim': 8}, {'feat': 'C20', 'feat_num': 4, 'embed_dim': 8}, {'feat': 'C21'

In [9]:
emb_dim = 4
feat_num = train_set.shape[1] - 1
model = FM(emb_dim, feat_num)
model

FM()

In [10]:
trn_x = train_set.drop('label', axis=1).values
trn_y = train_set['label'].values
val_x = val_set.drop('label', axis=1).values
val_y = val_set['label'].values
dl_train_dataset = TensorDataset(torch.tensor(trn_x).float(), torch.tensor(trn_y).float())
dl_val_dataset = TensorDataset(torch.tensor(val_x).float(), torch.tensor(val_y).float())

dl_train = DataLoader(dl_train_dataset, shuffle=True, batch_size=32)
dl_val = DataLoader(dl_val_dataset, shuffle=True, batch_size=32)

In [11]:
def auc(y_pred, y_true):
    pred = y_pred.data
    y = y_true.data
    return roc_auc_score(y, pred)

loss_func = nn.BCELoss()
optimizer = torch.optim.Adam(params=model.parameters(), lr=0.0001)
metric_func = auc
metric_name = 'auc'
epochs = 10
log_step_freq = 10

dfhistory = pd.DataFrame(columns=['epoch', 'loss', metric_name, 'val_loss', 'val_'+metric_name])

print('start_training.........')

print('========'*8)

for epoch in range(1, epochs+1):
    
    # 训练阶段
    model.train()
    loss_sum = 0.0
    metric_sum = 0.0
    step = 1
    

    
    for step, (features, labels) in enumerate(dl_train, 1):
        # 梯度清零
        optimizer.zero_grad()
        labels = labels.view(-1, 1)
        
        # 正向传播
        predictions = model(features);
        loss = loss_func(predictions, labels)
        try:
            metric = metric_func(predictions, labels)
        except ValueError:
            pass
        
        # 反向传播
        loss.backward()
        optimizer.step()
        
        # 打印batch级别日志
        loss_sum += loss.item()
        metric_sum += metric.item()
        if step % log_step_freq == 0:
            print(("[step=%d] loss: %.3f, " + metric_name + ": %.3f") % (step, loss_sum/step, metric_sum/step));
    
    # 验证阶段
    model.eval()
    val_loss_sum = 0.0
    val_metric_sum = 0.0
    val_step = 1
    
    for val_step, (features, labels) in enumerate(dl_val, 1):
        labels = labels.view(-1, 1)
        with torch.no_grad():
            predictions = model(features)
            val_loss = loss_func(predictions, labels)
            try:
                val_metric = metric_func(predictions, labels)
            except ValueError:
                pass
        
        val_loss_sum += val_loss.item()
        val_metric_sum += val_metric.item()
    
    # 记录日志
    info = (epoch, loss_sum/step, metric_sum/step, val_loss_sum/val_step, val_metric_sum/val_step)
    dfhistory.loc[epoch-1] = info
    
    # 打印日志
    print(("\nEPOCH=%d, loss=%.3f, " + metric_name + " = %.3f, val_loss=%.3f, " + "val_" + metric_name + " = %.3f") %info)
    print('\n' + '=========='* 8)
    
print('Finished Training')

start_training.........
[step=10] loss: 81.875, auc: 0.500
[step=20] loss: 80.156, auc: 0.500
[step=30] loss: 78.958, auc: 0.500
[step=40] loss: 79.297, auc: 0.500

EPOCH=1, loss=79.433, auc = 0.500, val_loss=80.000, val_auc = 0.500

[step=10] loss: 80.000, auc: 0.500
[step=20] loss: 78.750, auc: 0.500
[step=30] loss: 78.646, auc: 0.500
[step=40] loss: 79.688, auc: 0.500

EPOCH=2, loss=79.422, auc = 0.500, val_loss=80.000, val_auc = 0.500

[step=10] loss: 75.938, auc: 0.500
[step=20] loss: 78.594, auc: 0.500
[step=30] loss: 79.896, auc: 0.500
[step=40] loss: 79.609, auc: 0.500

EPOCH=3, loss=79.427, auc = 0.500, val_loss=80.000, val_auc = 0.500

[step=10] loss: 79.688, auc: 0.500
[step=20] loss: 80.469, auc: 0.500
[step=30] loss: 80.000, auc: 0.500
[step=40] loss: 79.141, auc: 0.500

EPOCH=4, loss=79.422, auc = 0.500, val_loss=80.000, val_auc = 0.500

[step=10] loss: 81.250, auc: 0.500
[step=20] loss: 80.156, auc: 0.500
[step=30] loss: 78.750, auc: 0.500
[step=40] loss: 78.750, auc: 0.

In [12]:
# 预测
from sklearn.metrics import accuracy_score
test_x = test.drop('label', axis=1).values
test_y = test['label'].values
y_pred_probs = model(torch.tensor(test_x).float())
y_pred = torch.where(y_pred_probs>0.5, torch.ones_like(y_pred_probs), torch.zeros_like(y_pred_probs))

test_auc = roc_auc_score(test_y, y_pred_probs.data.numpy())
test_acc = accuracy_score(test_y, y_pred.data.numpy())
print('test_auc:%.3f test_acc:%.3f'%(test_auc, test_acc))

test_auc:0.500 test_acc:0.225
