In [1]:
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader

class CriteoDataset(Dataset):
    """Map-style dataset over a Criteo-format CSV file.

    The CSV must contain a ``label`` column, the numerical columns
    ``I1``..``I13`` and the categorical columns ``C1``..``C26``.  The whole
    file is read and preprocessed eagerly in the constructor.

    Args:
        file_path: Path (or anything ``pandas.read_csv`` accepts) of the CSV.
        transform: Optional callable applied to each sample dict before it is
            returned from ``__getitem__``.

    Attributes:
        data: The preprocessed DataFrame (missing values filled, categorical
            columns replaced by integer codes).
        categorical_columns / numerical_columns / label_column: Column names.
        num_classes: Number of distinct codes per categorical column, in the
            order of ``categorical_columns``.
        features: DataFrame of numerical columns followed by categorical codes.
        labels: Series holding the label column.

    Note:
        Category codes are derived from the values present in *this* file, so
        two different files generally produce different code assignments and
        different ``num_classes``.
    """

    def __init__(self, file_path, transform=None):
        self.data = pd.read_csv(file_path)
        self.transform = transform
        self._preprocess_data()

    def _preprocess_data(self):
        """Fill missing values, encode categoricals and cache float32 arrays."""
        # Handle missing values: every NaN (in any column) becomes 0.
        self.data = self.data.fillna(0)

        # Encode the categorical features as integer codes.
        self.categorical_columns = [f'C{i}' for i in range(1, 27)]
        self.numerical_columns = [f'I{i}' for i in range(1, 14)]
        self.label_column = 'label'

        for col in self.categorical_columns:
            self.data[col] = self.data[col].astype('category').cat.codes

        self.num_classes = [self.data[col].nunique() for col in self.categorical_columns]

        self.features = self.data[self.numerical_columns + self.categorical_columns]
        self.labels = self.data[self.label_column]

        # Materialize the float32 views once.  Doing the pandas row lookup and
        # dtype cast inside __getitem__ repeats this work for every sample of
        # every epoch.
        self._feature_array = self.features.to_numpy(dtype='float32')
        self._label_array = self.labels.to_numpy(dtype='float32')

    def __len__(self):
        """Return the number of rows in the dataset."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return ``{'features': float32 tensor, 'label': float32 tensor}``.

        ``features`` holds the numerical values followed by the categorical
        codes (stored as floats).  The tensors are copies, so a transform may
        modify them in place without affecting the dataset.
        """
        sample = {
            'features': torch.tensor(self._feature_array[idx]),
            'label': torch.tensor(self._label_array[idx], dtype=torch.float32),
        }

        if self.transform:
            sample = self.transform(sample)

        return sample


  from pandas.core.computation.check import NUMEXPR_INSTALLED


In [2]:
import torch.nn as nn

class WideAndDeepModel(nn.Module):
    """Wide & Deep network over numerical and categorical features.

    The wide part is a single linear layer on the numerical features.  The
    deep part embeds every categorical feature, concatenates the embeddings
    with the numerical features and feeds them through an MLP.  The two
    outputs are summed; no sigmoid is applied.

    Args:
        num_numerical_features: Number of leading numerical columns in the
            input tensor.
        num_classes: Sequence with the vocabulary size of each categorical
            feature (one embedding table is created per entry).
        embedding_dim: Size of every embedding vector.
    """

    def __init__(self, num_numerical_features, num_classes, embedding_dim):
        super(WideAndDeepModel, self).__init__()

        # Remember where the numerical block ends so forward() can split the
        # input without assuming a fixed column count.
        self.num_numerical_features = num_numerical_features

        # Wide part
        self.wide = nn.Linear(num_numerical_features, 1)

        # Deep part
        self.embeddings = nn.ModuleList([nn.Embedding(num, embedding_dim) for num in num_classes])
        self.deep = nn.Sequential(
            nn.Linear(len(num_classes) * embedding_dim + num_numerical_features, 128),
            nn.ReLU(),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Linear(32, 1)
        )

    def forward(self, x):
        """Compute the combined wide + deep output.

        Args:
            x: 2-D float tensor whose first ``num_numerical_features`` columns
                are numerical values and whose remaining columns are
                categorical codes (converted to ``long`` for the embedding
                lookup).

        Returns:
            Tensor of shape ``(batch, 1)``.
        """
        split = self.num_numerical_features
        numerical_features = x[:, :split]
        categorical_features = x[:, split:].long()

        wide_output = self.wide(numerical_features)

        embeddings = [
            self.embeddings[i](categorical_features[:, i])
            for i in range(categorical_features.size(1))
        ]
        deep_input = torch.cat(embeddings + [numerical_features], dim=1)
        deep_output = self.deep(deep_input)

        return wide_output + deep_output


In [3]:
import torch.optim as optim
from sklearn.metrics import roc_auc_score

def train_model(model, dataloader, num_epochs=10, learning_rate=0.001):
    """Train ``model`` with Adam and binary cross-entropy on logits.

    Args:
        model: Module mapping a feature batch to one logit per sample.
        dataloader: Iterable yielding dicts with a ``'features'`` tensor and a
            ``'label'`` tensor.
        num_epochs: Number of passes over ``dataloader``.
        learning_rate: Learning rate passed to Adam.

    Prints the mean batch loss after every epoch (``nan`` if the loader
    yielded no batches).
    """
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    criterion = nn.BCEWithLogitsLoss()

    model.train()
    for epoch in range(num_epochs):
        running_loss = 0.0
        num_batches = 0
        for batch in dataloader:
            inputs = batch['features']
            labels = batch['label'].float().reshape(-1)

            optimizer.zero_grad()
            # Flatten to (batch,) explicitly: a bare squeeze() collapses a
            # single-sample batch to a 0-dim tensor, which no longer matches
            # the label shape and makes the loss raise.
            outputs = model(inputs).reshape(-1)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            num_batches += 1

        # Average over the batches actually seen; avoids dividing by zero
        # when the loader is empty.
        mean_loss = running_loss / num_batches if num_batches else float('nan')
        print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {mean_loss:.4f}')
        
def evaluate_model(model, dataloader):
    """Score ``model`` on ``dataloader`` and report the ROC AUC.

    Args:
        model: Module mapping a feature batch to one score per sample.
        dataloader: Iterable yielding dicts with a ``'features'`` tensor and a
            ``'label'`` tensor.

    Returns:
        The AUC computed by ``roc_auc_score`` over all batches (also printed).
        Errors raised by ``roc_auc_score`` (for example when the collected
        labels are not usable for AUC) propagate to the caller.
    """
    model.eval()
    all_labels = []
    all_outputs = []

    with torch.no_grad():
        for batch in dataloader:
            inputs = batch['features']
            labels = batch['label']

            # Flatten to (batch,) explicitly: a bare squeeze() turns a
            # single-sample batch into a 0-dim tensor that cannot be iterated
            # by list.extend().
            outputs = model(inputs).reshape(-1)
            # Move to CPU before converting so the function also works when
            # the tensors live on another device.
            all_labels.extend(labels.reshape(-1).cpu().tolist())
            all_outputs.extend(outputs.cpu().tolist())

    auc_score = roc_auc_score(all_labels, all_outputs)
    print(f'AUC Score: {auc_score:.4f}')
    return auc_score


In [5]:
import argparse

def main(argv=None):
    """Command-line entry point: train or evaluate the Wide & Deep model.

    Args:
        argv: Optional list of argument strings.  When ``None`` the arguments
            are taken from ``sys.argv``.  Passing an explicit list, e.g.
            ``main(['--action', 'train', '--data_path', 'train.csv'])``, lets
            the function be called from an interactive session whose
            ``sys.argv`` holds unrelated launcher arguments.
    """
    parser = argparse.ArgumentParser(description='Wide & Deep Model with PyTorch')
    parser.add_argument('--action', type=str, required=True, choices=['train', 'evaluate'],
                        help='Action to perform: train, evaluate')
    parser.add_argument('--data_path', type=str, required=True, help='Path to the Criteo dataset CSV file')
    parser.add_argument('--batch_size', type=int, default=32, help='Batch size for training/evaluation')
    parser.add_argument('--num_epochs', type=int, default=10, help='Number of epochs for training')
    parser.add_argument('--learning_rate', type=float, default=0.001, help='Learning rate for optimizer')
    parser.add_argument('--embedding_dim', type=int, default=8, help='Embedding dimension for categorical features')
    parser.add_argument('--model_path', type=str, default='wide_and_deep_model.pth',
                        help='File the model weights are saved to / loaded from')

    args = parser.parse_args(argv)

    dataset = CriteoDataset(file_path=args.data_path)
    # Shuffling only matters for training; evaluation keeps the file order.
    dataloader = DataLoader(dataset, batch_size=args.batch_size, shuffle=(args.action == 'train'))

    num_numerical_features = len(dataset.numerical_columns)
    model = WideAndDeepModel(num_numerical_features, dataset.num_classes, args.embedding_dim)

    if args.action == 'train':
        train_model(model, dataloader, num_epochs=args.num_epochs, learning_rate=args.learning_rate)
        torch.save(model.state_dict(), args.model_path)
    elif args.action == 'evaluate':
        # The embedding tables are sized from dataset.num_classes, so the
        # checkpoint only loads when the evaluation data yields the same
        # per-column vocabulary sizes as the data used for training.
        # NOTE(review): torch.load unpickles the file; only load checkpoints
        # from a trusted source.
        model.load_state_dict(torch.load(args.model_path, map_location='cpu'))
        evaluate_model(model, dataloader)


if __name__ == "__main__":
    main()


usage: ipykernel_launcher.py [-h] --action {train,evaluate} --data_path
                             DATA_PATH [--batch_size BATCH_SIZE]
                             [--num_epochs NUM_EPOCHS]
                             [--learning_rate LEARNING_RATE]
                             [--embedding_dim EMBEDDING_DIM]
ipykernel_launcher.py: error: the following arguments are required: --action, --data_path


SystemExit: 2

In [None]:
# 训练模型
! python wide_and_deep.py --action train --data_path "./data/criteo_train.csv"
# 评估模型
! python your_script.py --action evaluate --data_path path_to_criteo_dataset.csv
