In [1]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
from sklearn.metrics import roc_auc_score, log_loss
from torch import nn
from tqdm import tqdm
from collections import defaultdict

In [2]:
# Hyper-parameters and runtime settings read by the cells below.
config = dict(
    epoch=10,
    batch_size=128,
    lr=0.01,
    # Prefer the first CUDA device when one is visible, otherwise run on CPU.
    device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu"),
    # File that the training loop writes the best model weights to.
    weight_path='nps_for revisit.bin',
)

In [7]:
df = pd.read_excel('df_revisit.xlsx')
# Feature columns are renamed '1', '2', ...; the last column is the label.
df.columns = [str(i) for i in range(1, len(df.columns))] + ['target']
for col in df.columns:
    # Integer-encode every column (label included). sort=True makes the codes
    # follow the sorted order of the values instead of the order in which they
    # happen to appear in the file, so a 0/1 label keeps its meaning and the
    # encoding is reproducible when rows are reordered.
    codes, categories = pd.factorize(df[col], sort=True)
    # factorize marks missing values with -1. Give them one extra trailing
    # code so every entry is a valid non-negative index; the previous
    # unique()/nunique() pairing silently dropped one real category whenever
    # a column contained NaN.
    df[col] = np.where(codes < 0, len(categories), codes)

In [9]:
# Vocabulary size of every *input* column, in the form EmbeddingLayer expects:
# {column_name: {'size': number_of_distinct_codes}}.
size_dict = defaultdict(dict)
for col in df.columns:
    # The label must not get an embedding table: EmbeddingLayer embeds every
    # key of size_dict, so including 'target' would feed the label to the
    # model as an input feature (label leakage).
    if col == 'target':
        continue
    size_dict[col]['size'] = df[col].nunique()

In [10]:
class NPSDataset(Dataset):
    """Row-wise view of an encoded DataFrame for use with a DataLoader.

    Each item is a dict keyed by column name. Every column is returned as a
    0-dim int64 tensor, except 'target', which is returned as a 0-dim float32
    tensor.
    """

    def __init__(self, df):
        """Keep a reference to `df`; it must contain a 'target' column."""
        self.df = df

    def __getitem__(self, idx):
        """Return the row at position `idx` as a dict of 0-dim tensors."""
        data = dict()
        for col in self.df.columns:
            # Build the tensor straight from the stored value. Going through
            # torch.Tensor([...]) first would create a float32 tensor, which
            # cannot represent integers above 2**24 exactly and would
            # silently change large category codes.
            data[col] = torch.tensor(self.df[col].iloc[idx]).long()

        # The label replaces the integer entry written by the loop above and
        # is stored as float32.
        data['target'] = torch.tensor(self.df['target'].iloc[idx]).float()

        return data

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.df)

In [38]:
##Embedding 
class EmbeddingLayer(nn.Module):
    """One nn.Embedding table per key of `size_dict`, applied key by key.

    `size_dict` maps a feature name to a dict with a 'size' entry (the number
    of rows in that feature's table). All tables share `embedding_dim`.
    """

    def __init__(self, size_dict, embedding_dim):
        super().__init__()
        self.size_dict = size_dict
        self.embedding_dim = embedding_dim
        self.embedding_layer = nn.ModuleDict()

        # Register the tables in the iteration order of size_dict.
        for name in size_dict:
            table = nn.Embedding(self.size_dict[name]['size'], self.embedding_dim)
            self.embedding_layer[name] = table

    def forward(self, x):
        """Embed x[name] for every feature and stack the results along dim 1.

        Each input is reshaped to (-1, 1) before the lookup, so a batch of B
        rows yields a tensor of shape (B, n_features, 1, embedding_dim).
        """
        per_feature = [
            self.embedding_layer[name](x[name].long().view(-1, 1))
            for name in self.size_dict
        ]
        return torch.stack(per_feature, dim=1)

In [55]:
class MLP(nn.Module):
    """Feed-forward head: input_dim -> 128 -> 64 -> 32 -> 1 with a sigmoid output.

    Every hidden stage is Linear -> BatchNorm1d -> ReLU -> Dropout(droprate).
    """

    def __init__(self, input_dim, droprate):
        super().__init__()
        widths = (input_dim, 128, 64, 32)

        stages = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            stages += [
                nn.Linear(fan_in, fan_out),
                nn.BatchNorm1d(fan_out),
                nn.ReLU(),
                nn.Dropout(droprate),
            ]
        # Output stage: a single unit squashed into (0, 1).
        stages += [nn.Linear(widths[-1], 1), nn.Sigmoid()]

        self.dnn = nn.Sequential(*stages)

    def forward(self, x):
        """Run `x` through the stacked layers."""
        return self.dnn(x)

In [56]:
class myModel(nn.Module):
    """Embeds every feature listed in `size_dict` and scores the result with an MLP."""

    def __init__(self, size_dict, embedding_dim = 32, droprate = 0.1):
        super().__init__()
        self.embedding_dim = embedding_dim
        self.droprate = droprate

        self.embedding_layer = EmbeddingLayer(size_dict, embedding_dim=embedding_dim)

        # The MLP consumes all feature embeddings concatenated side by side.
        self.input_dim = len(size_dict) * embedding_dim
        self.dnn = MLP(input_dim=self.input_dim, droprate=droprate)

    def forward(self, x):
        """Return the MLP output for the batch dict `x`."""
        embedded = self.embedding_layer(x)
        # Collapse everything after the batch dimension into one vector per row.
        flat = embedded.flatten(start_dim=1)
        return self.dnn(flat)

In [49]:
def train_model(model, train_loader, optimizer, device, metric_list = ('roc_auc_score', 'log_loss'), criterion = None):
    """Train `model` for one pass over `train_loader` and return epoch metrics.

    Args:
        model: module called with the batch dict; its output is squeezed on
            the last dimension and compared against batch['target'].
        train_loader: iterable of dicts of tensors, each with a 'target' key.
        optimizer: optimizer stepped once per batch.
        device: device every tensor of the batch is moved to.
        metric_list: names of the metrics to report. 'log_loss' selects
            sklearn's log_loss; any other name is computed with roc_auc_score.
        criterion: loss callable taking (prediction, label). Defaults to
            nn.BCELoss() so the function no longer depends on a module-level
            `criterion` variable existing before it is called.

    Returns:
        dict mapping each requested metric name to its value over the
        predictions collected while training (i.e. produced in train mode,
        before each optimizer step).
    """
    loss_fn = nn.BCELoss() if criterion is None else criterion

    model.train()
    pred_list = []
    label_list = []
    pbar = tqdm(train_loader)
    for data in pbar:
        for key in data.keys():
            data[key] = data[key].to(device)

        model.zero_grad()
        output = model(data).squeeze(-1)
        label = data['target']
        loss = loss_fn(output, label)

        loss.backward()
        optimizer.step()

        # reshape(-1) keeps a 1-D array even for a batch of one element; a
        # plain squeeze would give a 0-d array that list.extend cannot iterate.
        pred_list.extend(output.detach().reshape(-1).cpu().numpy())
        label_list.extend(label.detach().reshape(-1).cpu().numpy())
        # .item() shows the scalar value instead of the tensor repr.
        pbar.set_description("train Loss {}".format(loss.item()))

    loss_dict = {}
    for metric in metric_list:
        if metric == 'log_loss':
            loss_dict[metric] = log_loss(label_list, pred_list)
        else:
            loss_dict[metric] = roc_auc_score(label_list, pred_list)

    return loss_dict

In [60]:
def valid_model(model, valid_loader, device, metric_list = ('roc_auc_score', 'log_loss'), criterion = None):
    """Evaluate `model` on `valid_loader` and return the requested metrics.

    Args:
        model: module called with the batch dict; its output is squeezed on
            the last dimension and compared against batch['target'].
        valid_loader: iterable of dicts of tensors, each with a 'target' key.
        device: device every tensor of the batch is moved to.
        metric_list: names of the metrics to report. 'log_loss' selects
            sklearn's log_loss; any other name is computed with roc_auc_score.
        criterion: loss callable taking (prediction, label), used only for the
            progress-bar text. Defaults to nn.BCELoss() so the function no
            longer depends on a module-level `criterion` variable.

    Returns:
        dict mapping each requested metric name to its value over all batches.
    """
    loss_fn = nn.BCELoss() if criterion is None else criterion

    model.eval()
    pred_list = []
    label_list = []
    pbar = tqdm(valid_loader)
    # No gradients are needed for evaluation; disabling autograd avoids
    # building a graph for every batch.
    with torch.no_grad():
        for data in pbar:
            for key in data.keys():
                data[key] = data[key].to(device)

            output = model(data).squeeze(-1)
            label = data['target']
            loss = loss_fn(output, label)

            # reshape(-1) keeps a 1-D array even for a batch of one element; a
            # plain squeeze would give a 0-d array that list.extend cannot
            # iterate.
            pred_list.extend(output.reshape(-1).cpu().numpy())
            label_list.extend(label.reshape(-1).cpu().numpy())
            pbar.set_description("valid Loss {}".format(loss.item()))

    loss_dict = {}
    for metric in metric_list:
        if metric == 'log_loss':
            loss_dict[metric] = log_loss(label_list, pred_list)
        else:
            loss_dict[metric] = roc_auc_score(label_list, pred_list)

    return loss_dict

In [26]:
# Positional 80/20 split: the first 80% of the rows are used for training and
# the remaining 20% for validation. (The two names were previously swapped,
# which trained on the small slice and validated on the large one.)
split = int(len(df) * 0.8)
train_df = df[:split].reset_index(drop=True)
valid_df = df[split:].reset_index(drop=True)

In [17]:
# Wrap each split so a DataLoader can index it row by row.
train_dataset, valid_dataset = NPSDataset(train_df), NPSDataset(valid_df)

In [18]:
# Training batches are reshuffled every epoch; validation keeps row order.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=config['batch_size'],
    shuffle=True,
)
valid_loader = DataLoader(
    dataset=valid_dataset,
    batch_size=config['batch_size'],
    shuffle=False,
)

In [57]:
# Build the network with its default embedding width and dropout rate.
model = myModel(size_dict)

In [31]:
def set_device(gpu):
    """Resolve `gpu` to a torch.device, falling back to CPU without CUDA.

    Args:
        gpu: a CUDA device index (int or digit string such as '0'), a
            torch.device, or a device string such as 'cuda:0' or 'cpu'.

    Returns:
        The requested device, or torch.device('cpu') when a CUDA device is
        requested but CUDA is not available.

    Raises:
        RuntimeError: if a non-numeric string is not a valid device string.
    """
    # Device objects and device strings are taken as they are. Formatting
    # them into f'cuda:{gpu}' would produce an invalid name like
    # 'cuda:cuda:0'.
    if isinstance(gpu, (torch.device, str)) and not str(gpu).isdigit():
        requested = torch.device(gpu)
        if requested.type == 'cuda' and not torch.cuda.is_available():
            return torch.device('cpu')
        return requested

    # Plain index: same behaviour as before.
    if torch.cuda.is_available():
        return torch.device(f'cuda:{gpu}')
    return torch.device('cpu')

In [None]:
# Scratch check: push one training batch through an embedding layer and two
# linear layers and print the tensor shape after each step.
embedding_layer = EmbeddingLayer(size_dict, embedding_dim = 32)
# The flattened embedding has one embedding_dim-wide slice per feature, so the
# first linear layer's input width is derived from size_dict instead of being
# hard-coded to 160 (which only matches exactly five entries).
l1 = nn.Linear(len(size_dict) * embedding_layer.embedding_dim, 128)
l2 = nn.Linear(128, 64)
for batch in train_loader:
    feature_embed = embedding_layer(batch)
    print(feature_embed.shape)
    feature_embed = torch.flatten(feature_embed, start_dim=1)
    print(feature_embed.shape)
    output = l1(feature_embed)
    print(output.shape)
    output = l2(output)
    print(output.shape)
    # Only the first batch is needed.
    break

In [None]:
# config['device'] is already a torch.device, so it is used directly. Passing
# it through set_device() would format it into 'cuda:cuda:0' and fail on a
# CUDA machine.
device = config['device']
# The batches are moved to `device` inside train_model/valid_model, so the
# model's parameters have to live there too. This is done before the optimizer
# is created so it tracks the moved parameters.
model = model.to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=config['lr'], weight_decay = 0.001)
criterion = nn.BCELoss()

# Any finite validation loss beats the initial value.
best_valid_loss = float('inf')

for epoch in range(config['epoch']):
    train_metric = train_model(model, train_loader, optimizer = optimizer, device = device)

    valid_metric = valid_model(model, valid_loader, device = device)

    # Checkpoint whenever the validation log-loss improves.
    valid_loss = valid_metric['log_loss']
    if valid_loss < best_valid_loss:
        torch.save(model.state_dict(), config['weight_path'])
        best_valid_loss = valid_loss
        print('model_saved', valid_loss)

    print(f'train metric {train_metric}')
    print(f'valid metric {valid_metric}')