In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import StratifiedKFold
from tqdm.auto import tqdm, trange
from sklearn.preprocessing import LabelEncoder
import pickle
import torch
from torch import nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset, Dataset, Subset
import torch_optimizer as optim
import os
import gc

In [2]:
# Output directory for this run: fold checkpoints, OOF predictions and the submission
# file are all written below it.
experiments_path = './experiments/amex_transformer_finetune_v9/'
# exist_ok=True keeps the cell idempotent on re-run and avoids the check-then-create race
# of a separate os.path.exists() test.
os.makedirs(experiments_path, exist_ok=True)

In [3]:
# Directory of the pre-training run; the fine-tuning loop loads "model.pt" from here
# as the starting weights of every fold.
pretrain_path = './experiments/amex_transformer_pretrain_v4/'

In [4]:
# Pre-built NumPy arrays for this run.
# NOTE(review): presumably all four arrays are aligned along their first axis — confirm.
X = np.load("./data/X2.npy")  # model input features (train and test together)
y = np.load("./data/y.npy")  # targets; only the is_train part is used later
customer_list = np.load("./data/customer_list.npy")  # identifiers, later used to index the OOF and submission outputs
is_train = np.load("./data/is_train.npy")  # mask: [is_train] selects the training part, [~is_train] the test part

In [5]:
# Only the header of the raw training CSV is needed, so a single data row is parsed.
# NOTE(review): columns[2:] skips the first two remaining columns — presumably
# non-feature columns; confirm against the CSV schema.
features = pd.read_csv("./data/train_data.csv",index_col=0,nrows=1).columns[2:]
cat_features = ['B_30', 'B_38', 'D_114', 'D_116', 'D_117', 'D_120', 'D_126', 'D_63', 'D_64', 'D_66', 'D_68']
dense_features = [col for col in features if col not in cat_features]
# Canonical column order: categorical columns first, then every dense column.
features = cat_features + dense_features

# features_group[<first letter of column name>][<'cat' | 'dense'>] -> list of positions
# in `features`. setdefault creates the nested containers on first use, which replaces
# the hand-written "is the key already there?" branching while producing the same dict
# (same keys, same insertion order, same index lists).
features_group = {}
for i, col in enumerate(features):
    kind = 'cat' if col in cat_features else 'dense'
    features_group.setdefault(col[0], {}).setdefault(kind, []).append(i)

In [6]:
# Report how many columns fall into every (group letter, kind) bucket.
for group_key, kinds in features_group.items():
    for kind_key, col_indices in kinds.items():
        print(group_key, kind_key, len(col_indices))

B cat 2
B dense 38
D cat 9
D dense 87
P dense 3
R dense 28
S dense 21


In [7]:
# Largest non-NaN value in each of the first 11 feature columns, plus one. The displayed
# values equal the hard-coded `n_cat` embedding sizes used by AmexModel.
# NOTE(review): assumes the last axis of X follows the `features` order, i.e. the 11
# categorical columns come first — confirm against how X2.npy was built.
np.nanmax(X[...,:11].reshape(-1,11),0)+1

array([4., 8., 3., 3., 8., 3., 4., 6., 5., 3., 8.], dtype=float32)

In [8]:
# Training portion: the entries selected by the is_train mask.
X_tr = X[is_train]
y_tr = y[is_train]
train_customer = customer_list[is_train]

# Test portion: the complement of the mask (no targets are kept for it).
X_test = X[~is_train]
test_customer = customer_list[~is_train]

# The combined arrays are no longer needed once the splits exist; drop the names and
# trigger a collection so the memory can be reclaimed before training.
del X,y,customer_list,is_train
gc.collect()

63

In [9]:
BATCH_SIZE = 1024  # mini-batch size of the train / validation / test DataLoaders
EPOCHS = 5  # training epochs per fold
GROUPS = 5  # number of cross-validation folds; also the divisor when averaging test predictions
# NOTE(review): hard-coded to the second CUDA device; moving the model or data to it
# will fail on hosts that do not expose a 'cuda:1' device.
device = torch.device('cuda:1')
# Stratified, shuffled folds with a fixed seed so the split itself is repeatable.
kf = StratifiedKFold(GROUPS, shuffle=True, random_state=42)
# Features and targets wrapped as tensors; per-fold views are taken with Subset.
dataset = TensorDataset(torch.Tensor(X_tr), torch.Tensor(y_tr))
# Test set has inputs only, so each batch is a 1-tuple.
test_dataset = TensorDataset(torch.Tensor(X_test))

In [10]:
# Hyper-parameters consumed by AmexModel and by the optimizer set-up in the training loop.
param = {'d_model': 768,  # width of the group projections and of the transformer
         'emb_dim': 4,  # size of every categorical embedding
         'n_layers': 6,  # number of transformer encoder layers
         'n_heads': 4,  # exponent: the model uses 2**n_heads attention heads (16 here)
         'activation': 'relu',  # activation of the input/output MLP layers: 'relu' | 'gelu' | 'mish'
         'transformer_act': 'relu',  # activation passed to nn.TransformerEncoderLayer
         'use_cls': False,  # if True a learned CLS token is appended to the sequence
         'input_norm': False,  # selects BatchNorm1d (True) or Identity (False) for the model's `norm` module
         'input_layers': 0,  # extra hidden layers stacked after the input projection
         'tanh_scale': 0.3450840441113073,  # initial alpha of the TanhEstimator
         'input_dropout': 0.10803114983077852,  # dropout before the input projection
         'hidden_dropout': 0.23771720623629086,  # dropout before hidden Linear layers
         'final_dropout': 0.1803505437404746,  # dropout before the last Linear of a head
         'transformer_dropout': 0.24608225028381883,  # dropout inside the encoder layers
         'output_layers': 'mlp2',  # shape of the `fc` head: 'linear' | 'mlp' | 'mlp2'
         'pe_std': 0.7232348735328199,  # std of the normal init of the positional embedding
         'optimizer': 'Lamb',  # 'Lamb' | 'AdamW' | 'Ranger'
         'lr': 0.001,  # learning rate handed to the optimizer
         'weight_decay': 0.051712665649902206,  # weight decay handed to the optimizer
         'optimizer_alpha': 0.07931217763278503,  # optimizer betas[0] = 1 - optimizer_alpha
         'optimizer_beta': 0.0070573347096303885}  # optimizer betas[1] = 1 - optimizer_beta

In [11]:
class TanhEstimator(nn.Module):
    """Element-wise learnable squashing layer computing ``tanh(alpha * x + beta)``.

    Args:
        inp_size: size of the learnable ``alpha`` / ``beta`` vectors.
        tanh_scale: initial value of every ``alpha`` entry (``beta`` starts at zero).
    """

    def __init__(self, inp_size, tanh_scale=0.1):
        super().__init__()
        initial_scale = torch.ones(inp_size) * tanh_scale
        initial_shift = torch.zeros(inp_size)
        # Registration order (alpha, then beta) is kept as in the original layer.
        self.alpha = nn.Parameter(initial_scale)
        self.beta = nn.Parameter(initial_shift)

    def forward(self, inp):
        """Apply the affine transform followed by tanh to ``inp``."""
        return (inp * self.alpha + self.beta).tanh()

class AmexModel(nn.Module):
    """Transformer encoder over a customer's sequence of statements, producing one logit.

    Every feature group (keyed by the first letter of the column name) gets its own
    input network: categorical columns of the group are embedded, concatenated after
    the group's dense columns, and projected to ``d_model``. The group outputs are
    concatenated, batch-normalised, projected to ``d_model``, offset by a learned
    positional embedding and passed through a pre-norm transformer encoder. The
    representation of the last sequence position is fed to ``new_fc``.

    ``dense_norm``, ``norm``, ``fc`` and ``ae_fc`` are constructed but not used by
    ``forward``. NOTE(review): presumably they are kept so that checkpoints from the
    pre-training run still match parameter names — confirm before removing.

    Args:
        params: hyper-parameter dict. Keys read: ``d_model``, ``emb_dim``, ``n_layers``,
            ``n_heads`` (exponent, the encoder uses ``2**n_heads`` heads),
            ``activation``, ``transformer_act``, ``use_cls``, ``input_norm``,
            ``input_layers``, ``tanh_scale``, ``input_dropout``, ``hidden_dropout``,
            ``final_dropout``, ``transformer_dropout``, ``output_layers``, ``pe_std``.
        feature_groups: optional mapping ``{group: {'cat': [col, ...], 'dense': [col, ...]}}``
            of column positions on the last input axis. Every group must have a
            ``'dense'`` entry, and ``'cat'`` positions index into ``n_cat``. When
            omitted, the notebook-level ``features_group`` is used.

    Raises:
        ValueError: if ``params['activation']`` is not ``'relu'``, ``'gelu'`` or ``'mish'``.
    """

    def __init__(self, params, feature_groups=None):
        super().__init__()
        d_model = params['d_model']
        emb_dim = params['emb_dim']
        n_layers = params['n_layers']
        # 'n_heads' is stored as an exponent, not as the head count itself.
        n_heads = 2**params['n_heads']

        activation_types = {'relu': nn.ReLU, 'gelu': nn.GELU, 'mish': nn.Mish}
        if params['activation'] not in activation_types:
            # Fail here with a clear message instead of an UnboundLocalError further down.
            raise ValueError(f"Unsupported activation: {params['activation']!r}")
        # One shared (stateless) activation instance, reused by every MLP below.
        activation = activation_types[params['activation']]()

        self.use_cls = params['use_cls']

        # Embedding table size of every categorical column, indexed by column position.
        self.n_cat = [4, 8, 3, 3, 8, 3, 4, 6, 5, 3, 8]
        self.n_dense = 177
        self.features_group = features_group if feature_groups is None else feature_groups
        self.inp_emb = nn.ModuleDict()
        for key1 in self.features_group:
            self.inp_emb[key1] = nn.ModuleDict()
            for key2 in self.features_group[key1]:
                if key2 == 'cat':
                    self.inp_emb[key1][key2] = nn.ModuleList()
                    for i in self.features_group[key1][key2]:
                        self.inp_emb[key1][key2].append(nn.Embedding(self.n_cat[i],emb_dim))
                else:
                    # Input width = dense columns + one embedding per categorical column.
                    d = len(self.features_group[key1][key2])
                    if 'cat' in self.features_group[key1]:
                        d += len(self.features_group[key1]['cat']) * emb_dim
                    self.inp_emb[key1][key2] = nn.Sequential(nn.Linear(d, d_model),
                                                             nn.Dropout(0.1),
                                                             nn.Mish(),
                                                             nn.Linear(d_model,d_model))

        if params['input_norm']:
            self.norm = nn.BatchNorm1d(self.n_dense)
        else:
            self.norm = nn.Identity()

        self.dense_norm = TanhEstimator(self.n_dense, params['tanh_scale'])
        self.post_norm = nn.BatchNorm1d(len(self.features_group) * d_model)

        # Build the projection as a plain list first so the attribute is only ever a Module.
        proj_layers = [nn.Dropout(params['input_dropout']),
                       nn.Linear(len(self.features_group) * d_model, d_model)]
        for _ in range(params['input_layers']):
            proj_layers.extend([nn.Dropout(params['hidden_dropout']), activation, nn.Linear(d_model, d_model)])
        self.proj = nn.Sequential(*proj_layers)

        self.transformer = nn.TransformerEncoder(nn.TransformerEncoderLayer(d_model,n_heads,4*d_model,
                                                                            params['transformer_dropout'],
                                                                            activation=params['transformer_act'],
                                                                            norm_first=True,batch_first=True)
                                                 ,n_layers)

        # An unrecognised 'output_layers' value creates no `fc` head (forward does not use it).
        if params['output_layers'] == 'linear':
            self.fc = nn.Sequential(nn.Dropout(params['final_dropout']), nn.Linear(d_model, 1))
        elif params['output_layers'] == 'mlp':
            self.fc = nn.Sequential(nn.Dropout(params['hidden_dropout']), nn.Linear(d_model, d_model), activation,
                                    nn.Dropout(params['final_dropout']), nn.Linear(d_model, 1))
        elif params['output_layers'] == 'mlp2':
            self.fc = nn.Sequential(nn.Dropout(params['hidden_dropout']), nn.Linear(d_model, 4*d_model), activation,
                                    nn.Dropout(params['final_dropout']), nn.Linear(4*d_model, 1))

        self.ae_fc = nn.Sequential(nn.Dropout(0.2), nn.Linear(d_model, 1095))
        # Head actually used by forward().
        self.new_fc = nn.Sequential(nn.Dropout(params['final_dropout']), nn.Linear(d_model, 1))

        # Learned positional embedding for a fixed sequence length of 13.
        self.pe = nn.Parameter(torch.empty([13,d_model]))
        nn.init.normal_(self.pe, std=params['pe_std'])

        self.cls = nn.Parameter(torch.empty(d_model))
        nn.init.normal_(self.cls)

    def forward(self, inp):
        """Return one logit per sequence.

        Args:
            inp: float tensor of shape ``(batch, 13, n_features)``; NaN marks missing
                values. The tensor is NOT modified.

        Returns:
            Tensor of shape ``(batch,)`` with raw (pre-sigmoid) logits.
        """
        nan_mask = torch.isnan(inp)
        # A position whose every feature is NaN is treated as padding by attention.
        missing_nodes_mask = torch.all(nan_mask, dim=-1)
        # Out-of-place fill so the caller's tensor keeps its NaNs.
        inp = inp.masked_fill(nan_mask, 0)

        # Dense-feature normalisation (dense_norm / norm) is not applied here.
        X = []
        for key1 in self.features_group:
            group = self.features_group[key1]
            group_inp = inp[..., group['dense']]
            if 'cat' in group:
                # Dense columns first, then one embedding per categorical column.
                parts = [group_inp]
                for i, idx in enumerate(group['cat']):
                    parts.append(self.inp_emb[key1]['cat'][i](inp[..., idx].long()))
                group_inp = torch.cat(parts, dim=-1)
            X.append(self.inp_emb[key1]['dense'](group_inp))
        X = torch.cat(X,dim=-1)
        # BatchNorm1d normalises dim 1, so move the channel axis there and back.
        X = X.permute(0,2,1)
        X = self.post_norm(X)
        X = X.permute(0,2,1)
        X = self.proj(X)
        X = X + self.pe

        if self.use_cls:
            # CLS token goes last, so the X[:, -1] read-out below picks it up.
            X = torch.cat([X,self.cls.reshape(1,1,-1).repeat(len(X),1,1)],dim=1)
            mask = torch.cat([missing_nodes_mask,torch.zeros([len(X),1],device=X.device).bool()],dim=1)
            X = self.transformer(X, src_key_padding_mask=mask)
        else:
            X = self.transformer(X, src_key_padding_mask=missing_nodes_mask)

        X = X[:,-1]

        y = self.new_fc(X).squeeze(-1)
        return y

In [12]:
# Binary cross-entropy computed on raw logits; used by train_one_epoch and evaluation.
criterion = nn.BCEWithLogitsLoss()

def amex_metric(y_true, y_pred):
    """Compute the competition score from labels and prediction scores.

    Negatives (label 0) carry a weight of 20, positives a weight of 1.

    Args:
        y_true: 1-D sequence of 0/1 labels.
        y_pred: 1-D sequence of scores, higher meaning more likely positive.

    Returns:
        ``(score, normalized_gini, top_four)`` where ``top_four`` is the share of all
        positives found in the top 4% of cumulative weight, ``normalized_gini`` is
        the weighted Gini of the prediction ordering divided by that of the label
        ordering, and ``score`` is the mean of the two.
    """
    pairs = np.array([y_true, y_pred]).T  # column 0: label, column 1: score

    def _ranked_by(column):
        # Rows sorted by the given column, largest first.
        return pairs[pairs[:, column].argsort()[::-1]]

    def _weighted_gini(column):
        ranked = _ranked_by(column)
        weight = np.where(ranked[:, 0] == 0, 20, 1)
        weight_random = np.cumsum(weight / np.sum(weight))
        total_pos = np.sum(ranked[:, 0] * weight)
        cum_pos_found = np.cumsum(ranked[:, 0] * weight)
        lorentz = cum_pos_found / total_pos
        return np.sum((lorentz - weight_random) * weight)

    # Capture rate of positives inside the top 4% of cumulative weight.
    by_score = _ranked_by(1)
    weights = np.where(by_score[:, 0] == 0, 20, 1)
    cutoff = int(0.04 * np.sum(weights))
    captured = by_score[np.cumsum(weights) <= cutoff]
    top_four = np.sum(captured[:, 0]) / np.sum(by_score[:, 0])

    # Gini of the model ordering, normalised by the Gini of the perfect ordering.
    gini_pred = _weighted_gini(1)
    gini_true = _weighted_gini(0)
    normalized_gini = gini_pred / gini_true

    score = 0.5 * (normalized_gini + top_four)
    return score, normalized_gini, top_four

def train_one_epoch(model, optimizer, scheduler, train_dataloader, device = torch.device('cpu')):
    """Run one optimisation pass over ``train_dataloader``.

    The scheduler is stepped after every batch, not once per epoch.

    Args:
        model: module mapping a feature batch to logits.
        optimizer: optimizer over the model parameters.
        scheduler: learning-rate scheduler, stepped per batch.
        train_dataloader: iterable yielding ``(features, targets)`` batches.
        device: device the batches are moved to.

    Returns:
        Sample-weighted mean of the per-batch training loss.
    """
    model.train()
    loss_sum = 0
    n_seen = 0
    for batch_x, batch_y in train_dataloader:
        batch_x, batch_y = batch_x.to(device), batch_y.to(device)

        optimizer.zero_grad()
        batch_loss = criterion(model(batch_x), batch_y)
        batch_loss.backward()
        optimizer.step()
        scheduler.step()

        # Weight each batch by its size so the result is a per-sample mean.
        n_batch = len(batch_y)
        loss_sum += batch_loss.item() * n_batch
        n_seen += n_batch
    return loss_sum / n_seen

def evaluation(model, val_dataloader, device = torch.device('cpu')):
    """Score ``model`` on a labelled loader without computing gradients.

    Args:
        model: module mapping a feature batch to logits.
        val_dataloader: iterable yielding ``(features, targets)`` batches.
        device: device the batches are moved to.

    Returns:
        ``(mean_loss, score, gini, top4, predictions, labels)``: the sample-weighted
        mean loss, the three values from ``amex_metric``, and the concatenated
        logits and targets as NumPy arrays.
    """
    model.eval()
    loss_sum, n_seen = 0, 0
    pred_chunks, label_chunks = [], []
    with torch.no_grad():
        for batch_x, batch_y in val_dataloader:
            batch_x, batch_y = batch_x.to(device), batch_y.to(device)
            batch_pred = model(batch_x)
            n_batch = len(batch_y)
            loss_sum += criterion(batch_pred, batch_y).item() * n_batch
            n_seen += n_batch
            # Keep results on the CPU so they can be turned into NumPy arrays.
            pred_chunks.append(batch_pred.cpu())
            label_chunks.append(batch_y.cpu())
    mean_loss = loss_sum / n_seen
    predictions = torch.cat(pred_chunks, dim=0).numpy()
    labels = torch.cat(label_chunks, dim=0).numpy()
    score, gini, top4 = amex_metric(labels, predictions)
    return mean_loss, score, gini, top4, predictions, labels

def predict(model, test_dataloader, device = torch.device('cpu')):
    """Run ``model`` over an unlabelled loader and return its outputs.

    Args:
        model: module mapping a feature batch to outputs.
        test_dataloader: iterable whose batches are sequences with the feature
            tensor at position 0.
        device: device the batches are moved to.

    Returns:
        NumPy array with the model outputs concatenated along the first axis.
    """
    model.eval()
    with torch.no_grad():
        outputs = [model(batch[0].to(device)).cpu() for batch in test_dataloader]
    return torch.cat(outputs, dim=0).numpy()

In [13]:
# Out-of-fold logits for the training rows and fold-averaged logits for the test rows.
oof = np.zeros(len(y_tr))
test_pred = np.zeros(len(X_test))
test_loader = DataLoader(test_dataset, batch_size=BATCH_SIZE, num_workers=0,shuffle=False,drop_last=False)

# The pretrained checkpoint is identical for every fold, so read it from disk once.
# It is kept on the CPU; load_state_dict copies the values onto the model's device.
pretrain_model_path = pretrain_path + "model.pt"
pretrained_state = torch.load(pretrain_model_path, map_location='cpu')

# Keyword arguments shared by all supported optimizers.
optimizer_kwargs = dict(lr=param['lr'],
                        weight_decay=param['weight_decay'],
                        betas=(1 - param['optimizer_alpha'], 1 - param['optimizer_beta']))

for fold, (train_index, val_index) in enumerate(kf.split(X_tr, y_tr)):
    print("fold:",fold)
    model_path = experiments_path + f"model_fold{fold}.pt"

    train_dataset = Subset(dataset, train_index)
    val_dataset = Subset(dataset, val_index)

    train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, num_workers=0,shuffle=True,drop_last=True)
    val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, num_workers=0,shuffle=False,drop_last=False)

    model = AmexModel(param).to(device)
    # strict=False: parameters missing from the checkpoint keep their fresh initialisation.
    model.load_state_dict(pretrained_state, strict=False)

    if param['optimizer'] == 'Lamb':
        optimizer = optim.Lamb(model.parameters(), **optimizer_kwargs)
    elif param['optimizer'] == 'AdamW':
        optimizer = torch.optim.AdamW(model.parameters(), **optimizer_kwargs)
    elif param['optimizer'] == 'Ranger':
        optimizer = optim.Ranger(model.parameters(), **optimizer_kwargs)
    else:
        # Without this branch `optimizer` would be unbound and fail later with a NameError.
        raise ValueError(f"Unsupported optimizer: {param['optimizer']!r}")

    # The scheduler is stepped per batch; one cosine cycle spans the whole fold.
    scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, EPOCHS*len(train_loader))

    best_score = -1.0

    for epoch in trange(EPOCHS):
        train_loss = train_one_epoch(model, optimizer, scheduler, train_loader, device)
        val_loss, val_score, val_gini, val_top4, val_pred, val_label = evaluation(model, val_loader, device)
        # Keep the checkpoint and OOF predictions of the best-scoring epoch only.
        if val_score > best_score:
            best_score = val_score
            oof[val_index] = val_pred
            torch.save(model.state_dict(), model_path)
        print(f"epoch {epoch}")
        print(f"train_loss {train_loss}")
        print(f"val_loss {val_loss}")
        print(f"val_score {val_score}")
        print(f"val_gini {val_gini}")
        print(f"val_top4 {val_top4}")
        print(f"best_score {best_score}")

    # Predict the test set with the best checkpoint of this fold and average over folds.
    model.load_state_dict(torch.load(model_path,map_location=device))
    test_pred += predict(model, test_loader, device) / GROUPS

cv_score, cv_gini, cv_top4 = amex_metric(y_tr, oof)
print(cv_score, cv_gini, cv_top4)

fold: 0


  0%|          | 0/5 [00:00<?, ?it/s]

epoch 0
train_loss 0.2514926267985525
val_loss 0.217480701352056
val_score 0.7944007272603442
val_gini 0.923312913394437
val_top4 0.6654885411262512
best_score 0.7944007272603442
epoch 1
train_loss 0.218040412447972
val_loss 0.213697245594278
val_score 0.7993720207472248
val_gini 0.9259341426411476
val_top4 0.672809898853302
best_score 0.7993720207472248
epoch 2
train_loss 0.21267165930411003
val_loss 0.2134722718809497
val_score 0.8014101349511946
val_gini 0.926055126031082
val_top4 0.6767651438713074
best_score 0.8014101349511946
epoch 3
train_loss 0.20800826090840654
val_loss 0.21262237957863805
val_score 0.8026332240431242
val_gini 0.926692005222784
val_top4 0.6785744428634644
best_score 0.8026332240431242
epoch 4
train_loss 0.2049480298710935
val_loss 0.212965349057335
val_score 0.8024430725301754
val_gini 0.9267324513843559
val_top4 0.6781536936759949
best_score 0.8026332240431242
fold: 1


  0%|          | 0/5 [00:00<?, ?it/s]

epoch 0
train_loss 0.25008610190625963
val_loss 0.22147220764730674
val_score 0.7847598245457437
val_gini 0.9203990798623614
val_top4 0.649120569229126
best_score 0.7847598245457437
epoch 1
train_loss 0.21746256406413778
val_loss 0.21893976183773306
val_score 0.7880609104493607
val_gini 0.92275144049711
val_top4 0.6533703804016113
best_score 0.7880609104493607
epoch 2
train_loss 0.2119448023884656
val_loss 0.21778856052846518
val_score 0.7914496391368477
val_gini 0.9233856855536637
val_top4 0.6595135927200317
best_score 0.7914496391368477
epoch 3
train_loss 0.20716276186305052
val_loss 0.21792994728815898
val_score 0.7918758596488561
val_gini 0.9235648921149424
val_top4 0.6601868271827698
best_score 0.7918758596488561
epoch 4
train_loss 0.20449584493590467
val_loss 0.21784341874143193
val_score 0.7916095244019545
val_gini 0.9235792134462429
val_top4 0.659639835357666
best_score 0.7918758596488561
fold: 2


  0%|          | 0/5 [00:00<?, ?it/s]

epoch 0
train_loss 0.24995888174412637
val_loss 0.2211523208113587
val_score 0.7898502468928821
val_gini 0.9211967229575172
val_top4 0.6585037708282471
best_score 0.7898502468928821
epoch 1
train_loss 0.21794605671360506
val_loss 0.21883061438100465
val_score 0.7908507138191777
val_gini 0.922314077555958
val_top4 0.6593873500823975
best_score 0.7908507138191777
epoch 2
train_loss 0.21197987826509848
val_loss 0.2171115383528388
val_score 0.7917567374657918
val_gini 0.9235791330240823
val_top4 0.6599343419075012
best_score 0.7917567374657918
epoch 3
train_loss 0.20725361474232967
val_loss 0.21712437809278645
val_score 0.7925831270958735
val_gini 0.9237592007211355
val_top4 0.6614070534706116
best_score 0.7925831270958735
epoch 4
train_loss 0.2047139591011921
val_loss 0.21766199212261023
val_score 0.7928146835041087
val_gini 0.9238436452293477
val_top4 0.6617857217788696
best_score 0.7928146835041087
fold: 3


  0%|          | 0/5 [00:00<?, ?it/s]

epoch 0
train_loss 0.24436584343956835
val_loss 0.2227645065399299
val_score 0.7827368633287728
val_gini 0.9200287374531387
val_top4 0.6454449892044067
best_score 0.7827368633287728
epoch 1
train_loss 0.21764123964243096
val_loss 0.2198459197257308
val_score 0.7898255941795339
val_gini 0.9223820681426982
val_top4 0.6572691202163696
best_score 0.7898255941795339
epoch 2
train_loss 0.2114167541431981
val_loss 0.21761635335843396
val_score 0.7913909940006061
val_gini 0.9232406195167152
val_top4 0.6595413684844971
best_score 0.7913909940006061
epoch 3
train_loss 0.20667496702358043
val_loss 0.2179135637519089
val_score 0.7928991494504984
val_gini 0.9233956094440573
val_top4 0.6624026894569397
best_score 0.7928991494504984
epoch 4
train_loss 0.2038167330472829
val_loss 0.21808636129150513
val_score 0.7919174859436326
val_gini 0.9233678836647615
val_top4 0.6604670882225037
best_score 0.7928991494504984
fold: 4


  0%|          | 0/5 [00:00<?, ?it/s]

epoch 0
train_loss 0.2522133374846847
val_loss 0.21878064356796081
val_score 0.7912643230685048
val_gini 0.9225665284650432
val_top4 0.6599621176719666
best_score 0.7912643230685048
epoch 1
train_loss 0.2175909398571073
val_loss 0.21632504233075453
val_score 0.7948967738124564
val_gini 0.9241928901618391
val_top4 0.6656006574630737
best_score 0.7948967738124564
epoch 2
train_loss 0.2120465229176942
val_loss 0.2150374452747124
val_score 0.7945410077889815
val_gini 0.9251644740740568
val_top4 0.6639175415039062
best_score 0.7948967738124564
epoch 3
train_loss 0.2070863041714583
val_loss 0.21571723355979597
val_score 0.7954540126103411
val_gini 0.9252652868783969
val_top4 0.6656427383422852
best_score 0.7954540126103411
epoch 4
train_loss 0.20410489773783605
val_loss 0.21568108656676246
val_score 0.7945852276254273
val_gini 0.925042509350891
val_top4 0.6641279458999634
best_score 0.7954540126103411
0.7947352850070364 0.9244926186894691 0.6649779513246036


In [14]:
# Persist the out-of-fold predictions, indexed by the training customers, next to the checkpoints.
pd.Series(oof, index=train_customer).to_csv(experiments_path + "oof.csv")

In [15]:
# Submission template; its first column becomes the index used to place predictions.
sub = pd.read_csv("./data/sample_submission.csv",index_col=0)

In [16]:
# Write the fold-averaged test predictions into the rows matching each test customer.
# The values are the model's raw logits; no sigmoid is applied anywhere in predict().
sub.loc[test_customer,'prediction'] = test_pred

In [17]:
# Display the submission frame for a quick visual check of the filled-in predictions.
sub

Unnamed: 0_level_0,prediction
customer_ID,Unnamed: 1_level_1
00000469ba478561f23a92a868bd366de6f6527a684c9a2e78fb826dcac3b9b7,-4.000736
00001bf2e77ff879fab36aa4fac689b9ba411dae63ae397d4263dafa1daedef5,-8.055628
0000210045da4f81e5f122c6bde5c2a617d03eef67f82c5e400fc98e7bd43ce8,-3.018232
00003b41e58ede33b8daf61ab56d9952f17c9ad1c3976cf6e56734528702d694,-1.410013
00004b22eaeeeb0ec976890c1d9bfc14fd9427e98c4ee9a4693dd914fca22557,1.981464
...,...
ffff952c631f2c911b8a2a8ca56ea6e656309a83d2f64c5d60460dba6dedc41e,-5.183532
ffffcf5df59e5e0bba2a5ac4578a34e2b5aa64a1546cd3a4f0ca3de613b0b2ad,1.422638
ffffd61f098cc056dbd7d2a21380c4804bbfe60856f475cb095d2443a68030f1,-0.188733
ffffddef1fc3643ea179c93245b68dca0f36941cd83977822e8b356988ca4d07,-0.616730


In [18]:
# Store the final submission alongside the other artefacts of this run.
submission_file = experiments_path + "submission.csv"
sub.to_csv(submission_file)