In [7]:
import os
import gc
import joblib
import pandas as pd
import numpy as np
from sklearn import metrics,preprocessing
from torch import nn
from torch.nn import functional as F
import torch

In [2]:
import torch
import torch.nn as nn

class MultiCategoryEmbedding(nn.Module):
    def __init__(self, num_categories_list, embedding_dim_list):
        super(MultiCategoryEmbedding, self).__init__()

        # 使用 nn.ModuleList 创建嵌入层列表
        self.embedding_list = nn.ModuleList([
            nn.Embedding(num_categories, embedding_dim)
            for num_categories, embedding_dim in zip(num_categories_list, embedding_dim_list)
        ])

    def forward(self, category_tensors):
        # 对每个分类变量分别进行嵌入
        embedded_values_list = [embedding(category_tensor) for embedding, category_tensor in zip(self.embedding_list, category_tensors)]

        # 拼接所有嵌入结果
        combined_embedding = torch.cat(embedded_values_list, dim=1)
        return combined_embedding

In [3]:
class block1(nn.Module):
    def __init__(self,inputs,outputs):
        super().__init__()       
        #self.net = nn.Sequential(nn.Dropout(0.5),nn.BatchNorm1d(inputs),nn.Linear(inputs,outputs),nn.ReLU())
        self.net = nn.Sequential(nn.BatchNorm1d(inputs),nn.Linear(inputs,outputs),nn.ReLU())
    def forward(self,X):
        return self.net(X)

In [4]:
def create_model(data,catcols):
    num_categories_list = []
    embedding_dim_list = []
    for c in catcols:
        num_unique_values = int(data[c].nunique())
        embed_dim = int(min(np.ceil(num_unique_values/2),50))
        #embed_dim = int(np.ceil(num_unique_values/2))
        num_categories_list.append(num_unique_values)
        embedding_dim_list.append(embed_dim)
    net = nn.Sequential()
    net.add_module('1',MultiCategoryEmbedding(num_categories_list,embedding_dim_list))
    net.add_module('20',block1(sum(embedding_dim_list),300))
    net.add_module('21',block1(300,300))
    #net.add_module('22',block1(300,300))
    net.add_module('23',block1(300,100))
    net.add_module('24',block1(100,1))
    return net

In [8]:
from torch.utils.data import TensorDataset, DataLoader
def train(net,train_features,train_labels,num_epochs,learning_rate,weight_decay,batch_size):
    train_set = TensorDataset(train_features,train_labels)
    train_iter = DataLoader(train_set,batch_size=batch_size,shuffle=True)
    optimizer = torch.optim.Adam(net.parameters(),lr=learning_rate)
    loss = nn.CrossEntropyLoss()
    try:
        for epoch in range(num_epochs):
            total_loss = 0.0
            for X,y in train_iter:
                optimizer.zero_grad()
                l = loss(net(X.T).reshape(y.shape),y)
                l.backward()
                optimizer.step()
                total_loss += l.item()
            avg_loss = total_loss / len(train_iter)
            #print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {avg_loss}')
    except Exception as e:
        print(f"An error occurred during training: {e}")
    return net

In [9]:
fold = 0
df = pd.read_csv("cat_train_folds.csv")
features = [f for f in df.columns if f not in ("id", "target", "kfold")]
for col in features:
    df.loc[:, col] = df[col].astype(str).fillna("NONE")

for feat in features:
    lbl_enc = preprocessing.LabelEncoder()
    df.loc[:, feat] = lbl_enc.fit_transform(df[feat].values)

df_train = df[df.kfold != fold].reset_index(drop=True)
df_valid = df[df.kfold == fold].reset_index(drop=True)

xtrain = torch.tensor(df_train[features].values.astype('int64'))
xvalid = torch.tensor(df_valid[features].values.astype('int64'))
ytrain = torch.tensor(df_train.target.values.astype('float32'))
yvalid = torch.tensor(df_valid.target.values.astype('float32'))

net = create_model(df, features)
net = train(net, xtrain, ytrain, 3, 0.01, 1,1024)

y = net(xvalid.T).detach()
print(metrics.roc_auc_score(yvalid,y.reshape(yvalid.shape)))
y1 = net(xtrain.T).detach()
print(metrics.roc_auc_score(ytrain,y1.reshape(ytrain.shape)))

0.7844051416394257
0.8117096094752007


In [6]:
def run(fold):
    df = pd.read_csv("cat_train_folds.csv")
    features = [f for f in df.columns if f not in ("id", "target", "kfold")]
    for col in features:
        df.loc[:, col] = df[col].astype(str).fillna("NONE")

    for feat in features:
        lbl_enc = preprocessing.LabelEncoder()
        df.loc[:, feat] = lbl_enc.fit_transform(df[feat].values)

    df_train = df[df.kfold != fold].reset_index(drop=True)
    df_valid = df[df.kfold == fold].reset_index(drop=True)

    xtrain = torch.tensor(df_train[features].values.astype('int64'))
    xvalid = torch.tensor(df_valid[features].values.astype('int64'))
    ytrain = torch.tensor(df_train.target.values.astype('float32'))
    yvalid = torch.tensor(df_valid.target.values.astype('float32'))

    net = create_model(df, features)
    net = train(net, xtrain, ytrain, 3, 0.01, 1,1024)

    y = net(xvalid.T).detach()
    print(metrics.roc_auc_score(yvalid,y.reshape(yvalid.shape)))
    #y1 = net(xtrain.T).detach()
    #print(metrics.roc_auc_score(ytrain,y1.reshape(ytrain.shape)))
if __name__=="__main__":
    for fold in range(5):
        run(fold)

0.783257693618115
0.7826237332696006
0.7807748535337764
0.782462469058828
0.7824744391722005
