In [1]:
import pandas as pd
import numpy as np
import os
from pathlib import Path

from datetime import datetime, timedelta
import time

import gc
import copy

import pyarrow.parquet as pq
import pyarrow as pa

 
from dateutil.relativedelta import relativedelta
from sklearn.preprocessing import StandardScaler, MinMaxScaler

from sklearn.metrics import mean_squared_error, roc_auc_score
from sklearn.model_selection import StratifiedKFold, KFold

In [2]:
pd.options.display.max_rows = 100
pd.options.display.max_columns = 100

import warnings
warnings.filterwarnings("ignore")

import pytorch_lightning as pl
random_seed=1234
pl.seed_everything(random_seed)

Global seed set to 1234


1234

In [3]:
%%time
# train_file = r'/kaggle/input/amex-agg-data-rev2/agg_train_all_rev2_rev.parquet'
train_file = 'amex/agg_v3/agg_train_all_small.parquet'
df=pd.read_parquet(train_file, engine='pyarrow')

Wall time: 16.8 s


In [1]:
feats = ['B_10__last', 'B_10__mean', 'B_11__last', 'B_11__last__log', 'B_11__mean', 'B_11__mean__log', 'B_11__min', 'B_16__last', 'B_16__max', 'B_18__last', 'B_18__last__log', 'B_18__mean__log', 'B_18__min', 'B_19__last', 'B_1__last', 'B_1__max', 'B_1__mean', 'B_1__min', 'B_20__last', 'B_20__max', 'B_20__mean', 'B_22__last', 'B_22__last__log', 'B_22__max', 'B_22__mean', 'B_22__mean__log', 'B_23__last', 'B_23__last__log', 'B_23__mean', 'B_23__min', 'B_26__last__log', 'B_28__last', 'B_2__last', 'B_2__mean', 'B_2__min', 'B_30=0.0', 'B_30=1.0', 'B_30__nunique', 'B_33__last', 'B_33__max', 'B_33__mean', 'B_33__min', 'B_37__last', 'B_37__max', 'B_37__mean', 'B_38__last', 'B_3__last', 'B_3__last__log', 'B_3__max', 'B_3__mean', 'B_3__mean__log', 'B_3__min', 'B_40__last', 'B_40__last__log', 'B_40__mean__log', 'B_40__min', 'B_4__last', 'B_4__last__log', 'B_4__max', 'B_4__mean__log', 'B_5__last__log', 'B_6__last', 'B_6__min', 'B_7__last', 'B_7__max', 'B_7__mean', 'B_7__min', 'B_8__last', 'B_8__min', 'B_9__last', 'B_9__last__log', 'B_9__max', 'B_9__mean', 'B_9__mean__log', 'B_9__min', 'D_112__last', 'D_39__last', 'D_39__last__log', 'D_39__max', 'D_41__last', 'D_41__last__log', 'D_41__max', 'D_42__last', 'D_42__max', 'D_42__mean', 'D_42__min', 'D_43__last', 'D_43__max', 'D_43__mean', 'D_43__mean__log', 'D_44__last', 'D_44__last__log', 'D_44__max', 'D_44__mean', 'D_44__mean__log', 'D_44__min', 'D_45__last', 'D_45__last__log', 'D_45__max', 'D_45__mean', 'D_45__mean__log', 'D_45__min', 'D_48__last', 'D_48__max', 'D_48__mean', 'D_48__min', 'D_52__last', 'D_52__max', 'D_52__mean', 'D_52__min', 'D_53__max', 'D_55__last', 'D_58__last', 'D_58__min', 'D_61__last', 'D_61__max', 'D_61__mean', 'D_61__min', 'D_62__last', 'D_62__max', 'D_62__mean', 'D_62__min', 'D_70__max', 'D_74__last', 'D_74__max', 'D_74__mean', 'D_75__last', 'D_75__max', 'D_75__mean', 'D_77__last', 'D_77__max', 'D_77__mean', 'D_77__min', 'D_78__max', 'D_78__mean', 'D_84__mean', 'P_2__last', 'P_2__max', 'P_2__mean', 'P_2__min', 'P_3__last', 'P_3__mean', 'P_3__min', 'R_10__max', 'R_10__mean', 'R_1__last', 'R_1__max', 'R_1__mean', 'R_2__last', 'R_2__max', 'R_2__mean', 'R_3__max', 'R_3__mean', 'R_3__min', 'R_4__last', 'R_4__max', 'R_4__mean', 'R_5__last', 'R_5__max', 'R_5__mean', 'R_6__max', 'R_6__mean', 'R_7__mean', 'R_8__mean', 'S_15__max', 'S_15__mean', 'S_22__last', 'S_23__last', 'S_25__mean', 'S_25__min', 'S_3__last', 'S_3__max', 'S_3__mean', 'S_3__min', 'S_7__last', 'S_7__max', 'S_7__mean', 'S_8__last', 'S_8__mean', 'S_8__min']


len(feats)

180

In [5]:
na_cnt = df[feats].isna().sum()
display(na_cnt[na_cnt>0])
del na_cnt
gc.collect()

Series([], dtype: int64)

43

## define autoencoder-mlp model

In [6]:
#https://pytorch-forecasting.readthedocs.io/en/latest/_modules/pytorch_forecasting/models/mlp/submodules.html#FullyConnectedModule
#https://www.kaggle.com/c/jane-street-market-prediction/discussion/224348
#https://www.kaggle.com/code/gogo827jz/jane-street-supervised-autoencoder-mlp/notebook?scriptVersionId=73762661

import torch
from torch import nn
import numpy as np


class AE_MLP(nn.Module):
    def __init__(
        self,
        input_size: int,
        output_size: int,
        hidden_sizes: list,
        dropouts: list,
    ):
        super().__init__()
        self.input_size = input_size
        self.output_size = output_size
        self.hidden_sizes = hidden_sizes
        self.dropouts = dropouts
        
        #----normalize input data--------------
        self.bn0 = nn.BatchNorm1d(input_size)
        
        #---encoder layer----------------
        self.encoder = nn.Sequential(nn.Linear(input_size, hidden_sizes[0]), 
                                     nn.BatchNorm1d(hidden_sizes[0]), 
                                     nn.SiLU()
                                    )
        #---decoder layer----------------
        self.decoder = nn.Sequential(nn.Dropout(dropouts[0]), 
                                     nn.Linear(hidden_sizes[0], input_size) 
                                    )
        #----AE output layer-------------
        self.ae_out = nn.Sequential(nn.Linear(input_size, hidden_sizes[1]), 
                                    nn.BatchNorm1d(hidden_sizes[1]), 
                                    nn.SiLU(), 
                                    nn.Dropout(dropouts[1]), 
                                    nn.Linear(hidden_sizes[1], output_size),
                                    nn.Sigmoid(), #for binary classification loss function BCELoss
                                    )
        #---MLP--------------------------
                
        # input layer
        size2 = input_size+hidden_sizes[0]
        module_list = [nn.BatchNorm1d(size2), 
                       nn.Dropout(dropouts[2]), 
                       nn.Linear(size2, hidden_sizes[2]), 
                       nn.BatchNorm1d(hidden_sizes[2]), 
                       nn.SiLU(), 
                       nn.Dropout(dropouts[2])]
    
        # hidden layers
        for i in range(3, len(hidden_sizes)):
            module_list.extend([nn.Linear(hidden_sizes[i-1], hidden_sizes[i]), 
                                nn.BatchNorm1d(hidden_sizes[i]), 
                                nn.SiLU(), 
                                nn.Dropout(dropouts[i])]
                              )
        # output layer
        module_list.extend([nn.Linear(hidden_sizes[-1], output_size), 
                           nn.Sigmoid()])

        self.mlp = nn.Sequential(*module_list)
        
    def forward(self, x):
        x0 = self.bn0(x)
        encoder = self.encoder(x0)
        decoder = self.decoder(encoder)
        out_ae = self.ae_out(decoder)
        
        #x0 shape is n*m - n samples, m features
        #encoder is n*k - n samples, k features
        #x1 is n*(k+m) - n samples, (k+m) features
        #if x0 is n*w*m with w as the width for 3d array, the output will be n*w*(k+m)
        x1 = torch.cat((x0, encoder), dim = -1) #
        out = self.mlp(x1)

        return decoder, out_ae, out


In [7]:
import torch
from torch.utils.data import (Dataset, DataLoader)
  

class TS_Data(Dataset):
    
    def __init__(self, X, y): 
        
        features = torch.FloatTensor(X)
        targets = torch.FloatTensor(y)
        
        self.features = features
        self.targets = targets
        
        self.n_samples = X.shape[0]
        self.n_features = X.shape[1]
        
    def __len__(self):
        return self.n_samples

    def __getitem__(self, idx):
        

        x = self.features[idx]
        y = self.targets[idx]
        
        return x, y
    

def load_data(X, y, batch_size, n_workers=0, shuffle=False):
    data = TS_Data(X, y)
    
    loader = DataLoader(data, batch_size=batch_size, num_workers=n_workers, shuffle=shuffle)
    
    return loader

## hyperopt parameters

In [8]:
learn_rates = np.concatenate((np.arange(0.00001, 0.0001, 0.00001),  
                           np.arange(0.0001, 0.001, 0.0001), 
                           np.arange(0.001, 0.01, 0.001), 
                           np.arange(0.01, 0.05, 0.01)
                          ), 
                          axis=0)
hidden_sizes=[32, 48, 64, 96, 128, 256, 448, 512, 896, 1024] 
encoder_outputsize = range(5, 101)
dropouts = np.round(np.arange(0.001, 0.501, 0.001), 4)

len(learn_rates), len(hidden_sizes), len(dropouts)

(31, 10, 500)

In [9]:
from hyperopt import hp
import numpy as np
space  = { 
             'batch_size': hp.choice('batch_size', [128*i for i in [1, 2, 4, 8, 16, 20, 32, 40, 80, 100]]),
             'num_epochs':hp.choice('num_epochs', [50, 60, 100, 150, 200]),
             'learning_rate':hp.choice('learning_rate', learn_rates),
             'hidden_size1':hp.choice('hidden_size1', encoder_outputsize),
             'hidden_size2':hp.choice('hidden_size2', hidden_sizes),
             'hidden_size3':hp.choice('hidden_size3', hidden_sizes),
             'hidden_size4':hp.choice('hidden_size4', hidden_sizes),
             'hidden_size5':hp.choice('hidden_size5', hidden_sizes),
             'hidden_size6':hp.choice('hidden_size6', hidden_sizes),
             'dropout1':  hp.choice('dropout1', dropouts), 
             'dropout2':  hp.choice('dropout2', dropouts), 
             'dropout3':  hp.choice('dropout3', dropouts), 
             'dropout4':  hp.choice('dropout4', dropouts), 
             'dropout5':  hp.choice('dropout5', dropouts), 
             'dropout6':  hp.choice('dropout6', dropouts), 
    
            }                  

### Train

In [10]:
# @yunchonggan's fast metric implementation
# From https://www.kaggle.com/competitions/amex-default-prediction/discussion/328020
# https://www.kaggle.com/code/ambrosm/amex-lightgbm-quickstart
def amex_metric(y_true: np.array, y_pred: np.array) -> float:

    # count of positives and negatives
    n_pos = y_true.sum()
    n_neg = y_true.shape[0] - n_pos

    # sorting by descring prediction values
    indices = np.argsort(y_pred)[::-1]
    preds, target = y_pred[indices], y_true[indices]

    # filter the top 4% by cumulative row weights
    weight = 20.0 - target * 19.0
    cum_norm_weight = (weight / weight.sum()).cumsum()
    four_pct_filter = cum_norm_weight <= 0.04

    # default rate captured at 4%
    d = target[four_pct_filter].sum() / n_pos

    # weighted gini coefficient
    lorentz = (target / n_pos).cumsum()
    gini = ((lorentz - cum_norm_weight) * weight).sum()

    # max weighted gini coefficient
    gini_max = 10 * n_neg * (1 - 19 / (n_pos + 20 * n_neg))

    # normalized weighted gini coefficient
    g = gini / gini_max

    return 0.5 * (g + d)


In [11]:
X = df[feats]
y = df[['target']]

In [12]:
del df
gc.collect()

63

In [13]:
skf = KFold(n_splits=3)

In [14]:
print(skf)

for train_index, test_index in skf.split(X, y):
#     print("TRAIN:", train_index, "TEST:", test_index)
#     print("TRAIN:", len(train_index), "TEST:", len(test_index), len(test_index)/(len(test_index)+len(train_index)))
    print(y.iloc[test_index]['target'].value_counts()/len(test_index)) 
    print(y.iloc[train_index]['target'].value_counts()/len(train_index))

KFold(n_splits=3, random_state=None, shuffle=False)
0    0.740565
1    0.259435
Name: target, dtype: float64
0    0.741317
1    0.258683
Name: target, dtype: float64
0    0.739552
1    0.260448
Name: target, dtype: float64
0    0.741824
1    0.258176
Name: target, dtype: float64
0    0.743082
1    0.256918
Name: target, dtype: float64
0    0.740059
1    0.259941
Name: target, dtype: float64


In [15]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cuda')

In [16]:
import numpy as np
import torch
from tqdm import tqdm
import torch.optim as optim
from torch.nn import CrossEntropyLoss, MSELoss, BCELoss

In [17]:
log_file = 'amex/agg_v3/amex-hyperopt-aemlp-180feats.xlsx'

In [18]:
loss_dict = []

def score(params):
    pl.seed_everything(1)
#     print(params)
    
    learning_rate = params['learning_rate']
    num_epochs = params['num_epochs']
    batch_size = params['batch_size']
    h_sizes = [params[f'hidden_size{i}'] for i in range(1,7)]
    drop_list = [params[f'dropout{i}'] for i in range(1,7)]


    losses = []
    
    for train_index, test_index in skf.split(X, y):
        
        #----start: data prep-------------------------------------
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]

        #----end: data prep-------------------------------------
        
#         print(X_train.shape, X_test.shape)
        
        
        scaler_ = StandardScaler()
        scaler_.fit(X_train)
        # minmax_scaler.fit_transform(X_train[x_cols])

        train_loader = load_data(scaler_.transform(X_train), y_train['target'].values, 
                                 batch_size=batch_size, n_workers=0, shuffle=False)

        test_loader = load_data(scaler_.transform(X_test), y_test['target'].values, 
                                 batch_size=batch_size, n_workers=0, shuffle=False)
        #----end: data prep-------------------------------------


        model = AE_MLP(input_size=len(feats), output_size=1, 
                       hidden_sizes=h_sizes, 
                       dropouts = drop_list)

        model = model.to(device)

        # optimizer = torch.optim.RMSprop(model.parameters(), lr=learning_rate)  
        optimizer = torch.optim.RMSprop([
                {'params': model.encoder.parameters()},
                {'params': model.decoder.parameters()},
                {'params': model.ae_out.parameters()},
                {'params': model.mlp.parameters()},
            ], lr=learning_rate)

        scheduler = torch.optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.1, div_factor=1e3, 
                                                  max_lr=1e-2, epochs=num_epochs, steps_per_epoch=len(train_loader))
        
        decoder_loss = MSELoss()
        out_ae_loss = BCELoss()
        out_loss = BCELoss()


        #------train models--------------------------
        for epoch in range(num_epochs):
            model.train()
            for batch_idx, (features, targets) in enumerate(train_loader):

                features = features.to(device)
                targets = targets.to(device)

                ### FORWARD AND BACK PROP
                decoder, out_ae, out = model(features)
                decoder_cost = decoder_loss(decoder, features)
                out_ae_cost = out_ae_loss(out_ae.squeeze(), targets)  
                out_cost = out_loss(out.squeeze(), targets) #squeeze the n_samples*1 2d array to 1d array of n_samples
                total_cost = (decoder_cost + out_ae_cost + out_cost)/3

                optimizer.zero_grad()

                total_cost.backward()

                ### UPDATE MODEL PARAMETERS
                optimizer.step()

        #-----eval models-------------------------------
        model.eval()

        y_preds = []
        y_trues = []
        with torch.no_grad():
            for features, targets in test_loader:
                features = features.to(device)
                targets = targets.to(device)
                _, _, outputs = model(features)
                y_preds.extend(outputs.squeeze().cpu().numpy())
                y_trues.extend(targets.squeeze().cpu().numpy())

        #-----start: train mlp---------------------------------------
        
        #-----end: train mlp---------------------------------------
#         loss = roc_auc_score(y_trues, y_preds)
#         loss = amex_metric(y_test, 
#                            pd.DataFrame(data={'prediction': y_preds}))
        y_preds = np.array(y_preds)
    
        loss = amex_metric(y_test['target'].values, y_preds)
        losses.append(loss)
        
        
    loss = np.mean(losses)
#     print(loss)
    loss_dict.append({'params': params, 'losses': losses, 'mean_loss': loss})
    if len(loss_dict)%3==0:
        pd.DataFrame(data=loss_dict).to_excel(log_file, index=False)
    return {'loss': -loss, 'status': STATUS_OK}

In [19]:
from hyperopt import hp
from hyperopt import fmin, tpe, hp, STATUS_OK, Trials, anneal, rand
from functools import partial
def optimize(space, evals, cores, trials, optimizer=tpe.suggest, random_state=1234, n_startup_jobs=10):
    algo = partial(optimizer, n_startup_jobs=n_startup_jobs)
    best = fmin(score, space, algo=algo, max_evals=evals, trials = trials)
    print(best)
    return best

In [20]:
cores = 4
n=500
verbose = False
trials = Trials()

In [None]:
best_param = optimize(space,
                      evals = n,
                      optimizer=tpe.suggest,
                      cores = cores,
                      trials = trials, random_state=1234, 
                      n_startup_jobs=10)

  0%|                                                                         | 0/500 [00:00<?, ?it/s, best loss: ?]

Global seed set to 1


  0%|                                         | 1/500 [36:33<304:00:31, 2193.25s/it, best loss: -0.7549057862532833]

Global seed set to 1


  0%|▏                                        | 2/500 [42:35<154:24:20, 1116.19s/it, best loss: -0.7790104946200208]

Global seed set to 1


  1%|▏                                      | 3/500 [1:13:38<201:10:07, 1457.16s/it, best loss: -0.7790104946200208]

Global seed set to 1


  1%|▎                                      | 4/500 [1:37:44<200:09:10, 1452.72s/it, best loss: -0.7790104946200208]

Global seed set to 1


  1%|▍                                      | 5/500 [1:45:16<150:07:46, 1091.85s/it, best loss: -0.7792881270511458]

Global seed set to 1


  1%|▍                                      | 6/500 [2:07:31<161:11:55, 1174.73s/it, best loss: -0.7792881270511458]

Global seed set to 1


  1%|▌                                       | 7/500 [2:14:46<127:45:25, 932.91s/it, best loss: -0.7792881270511458]

Global seed set to 1


  2%|▋                                       | 8/500 [2:26:58<118:42:54, 868.65s/it, best loss: -0.7792881270511458]

Global seed set to 1


  2%|▋                                       | 9/500 [2:38:50<111:49:57, 819.95s/it, best loss: -0.7792881270511458]

Global seed set to 1


  2%|▊                                      | 10/500 [2:56:35<121:52:27, 895.40s/it, best loss: -0.7792881270511458]

Global seed set to 1


  2%|▊                                      | 11/500 [3:04:07<103:11:38, 759.71s/it, best loss: -0.7792881270511458]

Global seed set to 1


  2%|▉                                       | 12/500 [3:10:09<86:36:02, 638.86s/it, best loss: -0.7792881270511458]

Global seed set to 1


  3%|█                                      | 13/500 [3:34:02<118:57:28, 879.36s/it, best loss: -0.7796073492974597]

Global seed set to 1


  3%|█                                     | 14/500 [3:57:54<141:15:56, 1046.41s/it, best loss: -0.7796588107492209]

Global seed set to 1


  3%|█▏                                    | 15/500 [4:21:45<156:35:29, 1162.33s/it, best loss: -0.7796588107492209]

Global seed set to 1


  3%|█▏                                    | 16/500 [4:46:05<168:18:01, 1251.82s/it, best loss: -0.7806770248622671]

Global seed set to 1


  3%|█▎                                    | 17/500 [5:10:17<176:01:22, 1311.97s/it, best loss: -0.7806770248622671]

Global seed set to 1


  4%|█▎                                    | 18/500 [5:34:51<182:12:01, 1360.83s/it, best loss: -0.7806770248622671]

Global seed set to 1


  4%|█▍                                    | 19/500 [5:59:04<185:29:56, 1388.35s/it, best loss: -0.7806770248622671]

Global seed set to 1


In [None]:
pd.DataFrame(data=loss_dict).to_excel(log_file, index=False)