- Use drug_id-aware cross-validation folds
- Ensemble three models: MLP, TabNet and XGBoost
- (based on MLP version 37, TabNet version 40, XGBoost version 20)
- Adjust the ensemble blending weights

In [1]:
!pip install --no-index --find-links /kaggle/input/pytorchtabnet/pytorch_tabnet-2.0.0-py3-none-any.whl pytorch-tabnet

Looking in links: /kaggle/input/pytorchtabnet/pytorch_tabnet-2.0.0-py3-none-any.whl
Processing /kaggle/input/pytorchtabnet/pytorch_tabnet-2.0.0-py3-none-any.whl
Installing collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-2.0.0


In [2]:
#import sys
#!cp ../input/rapids/rapids.0.15.0 /opt/conda/envs/rapids.tar.gz
#!cd /opt/conda/envs/ && tar -xzvf rapids.tar.gz > /dev/null
#sys.path = ["/opt/conda/envs/rapids/lib/python3.7/site-packages"] + sys.path
#sys.path = ["/opt/conda/envs/rapids/lib/python3.7"] + sys.path
#sys.path = ["/opt/conda/envs/rapids/lib"] + sys.path
#!cp /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/

In [3]:
import sys
import os
import pickle
import warnings
import numpy as np
import pandas as pd 
from sklearn import preprocessing
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from tqdm import tqdm_notebook as tqdm
from category_encoders import CountEncoder
from xgboost import XGBClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import KFold, StratifiedKFold
sys.path.append('../input/multilabelstraifier/')
from ml_stratifiers import MultilabelStratifiedKFold
warnings.filterwarnings('ignore')
from scipy.optimize import minimize, fsolve

import time
import torch
import random
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import tensorflow as tf
from torch.nn.modules.loss import _WeightedLoss

from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.tab_model import TabNetRegressor

# Preprocess

In [4]:
# Competition input files; DATA_DIR keeps its trailing slash because later
# cells build paths by plain string concatenation.
DATA_DIR = '/kaggle/input/lish-moa/'
train = pd.read_csv(f"{DATA_DIR}train_features.csv")
targets = pd.read_csv(f"{DATA_DIR}train_targets_scored.csv")
test = pd.read_csv(f"{DATA_DIR}test_features.csv")
drug = pd.read_csv(f"{DATA_DIR}train_drug.csv")

In [5]:
# Column-name lists (plain Python lists, so `c_feats + g_feats` concatenates).
target_feats = [col for col in targets.columns if col != "sig_id"]  # every scored target
g_feats = [col for col in train.columns if "g-" in col]             # columns whose name contains "g-"
c_feats = [col for col in train.columns if "c-" in col]             # columns whose name contains "c-"

In [6]:
# Row labels of control ("ctl_vehicle") vs. non-control samples.
cons_train_index = train.index[train.cp_type != "ctl_vehicle"]
noncons_test_index = test.index[test.cp_type == "ctl_vehicle"]
cons_test_index = test.index[test.cp_type != "ctl_vehicle"]

# Keep only non-control rows and renumber them from 0.
test = test.loc[cons_test_index].reset_index(drop=True)
train = train.loc[cons_train_index].reset_index(drop=True)

# `y` keeps ALL rows (controls included) for the final OOF checks; it must be
# taken before `targets` is filtered, so this cell is not safe to re-run.
y = targets.drop(columns="sig_id").copy()

# NOTE(review): the train-derived index is applied to `targets`, which assumes
# both files list the same rows in the same order - confirm.
targets = targets[targets.index.isin(cons_train_index)].reset_index(drop=True)
fn_targets = targets.drop(columns="sig_id").to_numpy()

In [7]:
# Module-level defaults. NOTE(review): make_fold() takes its own NB_SPLITS/seed
# arguments and modelling_tabnet() defines a local NB_SPLITS, so these globals
# look unused in the visible cells - confirm before removing.
NB_SPLITS = 7
seed = 34

def make_fold(NB_SPLITS, seed):
    """Assign every training row to a CV fold using drug-aware stratification.

    Drugs that occur at most 18 times are assigned to a fold as a whole
    (stratified on their mean target vector); rows of more frequent drugs are
    stratified individually.

    Args:
        NB_SPLITS: number of folds.
        seed: random_state handed to both MultilabelStratifiedKFold splitters.

    Returns:
        int8 array of shape (n_rows, 1) holding the fold id of each row of the
        global `targets` frame.
    """
    # Attach drug_id to every scored signature.
    scored = targets.merge(drug, on='sig_id', how='left')

    # Partition drugs by how often they occur.
    drug_counts = scored.drug_id.value_counts()
    rare_drugs = drug_counts.loc[drug_counts <= 18].index.sort_values()
    common_drugs = drug_counts.loc[drug_counts > 18].index.sort_values()

    # Rare drugs: one row per drug, whole drug goes into a single fold.
    drug_to_fold = {}
    splitter = MultilabelStratifiedKFold(n_splits = NB_SPLITS, shuffle = True, random_state = seed)
    per_drug = scored.groupby('drug_id')[target_feats].mean().loc[rare_drugs]
    for fold_id, (_, val_pos) in enumerate(splitter.split(per_drug, per_drug[target_feats])):
        for drug_name in per_drug.index[val_pos].values:
            drug_to_fold[drug_name] = fold_id

    # Frequent drugs: stratify the individual signatures.
    sig_to_fold = {}
    splitter = MultilabelStratifiedKFold(n_splits = NB_SPLITS, shuffle = True, random_state = seed)
    frequent_rows = scored.loc[scored.drug_id.isin(common_drugs)].reset_index(drop = True)
    for fold_id, (_, val_pos) in enumerate(splitter.split(frequent_rows, frequent_rows[target_feats])):
        for sig in frequent_rows.sig_id[val_pos].values:
            sig_to_fold[sig] = fold_id

    # Drug-level assignment first, then fill the remaining rows by sig_id.
    scored['fold'] = scored.drug_id.map(drug_to_fold)
    unassigned = scored.fold.isna()
    scored.loc[unassigned, 'fold'] = scored.loc[unassigned, 'sig_id'].map(sig_to_fold)
    scored['fold'] = scored['fold'].astype('int8')

    return np.array([scored.fold.values]).reshape(-1, 1)

In [8]:
# One fold assignment per model. The split counts must match the loops that
# consume them: n_folds=7 (MLP), NB_SPLITS=5 (TabNet), NFOLDS=5 (XGB).
mlp_fold = make_fold(7,34)
tab_fold = make_fold(5,14)
xgb_fold = make_fold(5,34)

# Feature engineering

In [9]:
def fe_xgb(df_train, df_test):
    """Feature engineering for the XGBoost model.

    Removes the g-/c- features rejected by a VarianceThreshold(0.8) fitted on
    the training frame, then appends 10 PCA components of the remaining c-
    features and 80 of the remaining g- features. Every transformer is fitted
    on train only and applied to test.

    Args:
        df_train: training features (must contain the global c_feats/g_feats).
        df_test: test features with the same columns.

    Returns:
        (train, test) copies with low-variance columns dropped and the
        `pca-c*` / `pca-g*` columns appended. The inputs are not modified.
    """
    tmp_train = df_train.copy()
    tmp_test = df_test.copy()

    # Fit the selector only; the transformed matrix itself was never used.
    raw_block = tmp_train[c_feats+g_feats]
    select = VarianceThreshold(threshold=0.8)
    select.fit(raw_block.values)
    drop_feats = raw_block.columns[~select.get_support()].tolist()

    tmp_train = tmp_train.drop(columns=drop_feats)
    tmp_test = tmp_test.drop(columns=drop_feats)

    modg_feats = [i for i in tmp_train.columns if "g-" in i]
    modc_feats = [i for i in tmp_train.columns if "c-" in i]

    # (column prefix, source columns, number of components); c- block first so
    # the output column order stays pca-c*, then pca-g*.
    pca_specs = (("pca-c", modc_feats, 10), ("pca-g", modg_feats, 80))
    train_parts, test_parts = [tmp_train], [tmp_test]
    for prefix, cols, n_comp in pca_specs:
        names = [prefix + str(i + 1) for i in range(n_comp)]
        pca = PCA(n_components=n_comp, random_state=42)
        train_parts.append(pd.DataFrame(pca.fit_transform(tmp_train[cols]), columns=names))
        test_parts.append(pd.DataFrame(pca.transform(tmp_test[cols]), columns=names))

    # NOTE(review): the PCA frames carry a fresh 0..n-1 index, so this
    # column-wise concat assumes the inputs were reset_index'ed - confirm.
    tmp_train = pd.concat(train_parts, axis=1)
    tmp_test = pd.concat(test_parts, axis=1)
    return tmp_train, tmp_test
    
def fe_xgb_stats(df, remove_features):
    """Append row-wise summary statistics of the g-, c- and combined features.

    For each of the three column groups ("g", "c", "gc") the sum, mean, std,
    kurtosis and skew across the group's columns are added as
    `<group>_sum`, `<group>_mean`, `<group>_std`, `<group>_kurt`,
    `<group>_skew`.

    Args:
        df: frame containing columns whose names include "g-" / "c-".
        remove_features: accepted for signature compatibility; not used.

    Returns:
        A copy of `df` with the 15 statistic columns appended.
    """
    enriched = df.copy()
    gene_cols = [name for name in enriched.columns if "g-" in name]
    cell_cols = [name for name in enriched.columns if "c-" in name]

    # (column suffix, pandas reducer name) in the order the columns are added
    reducers = (("sum", "sum"), ("mean", "mean"), ("std", "std"),
                ("kurt", "kurtosis"), ("skew", "skew"))
    groups = (("g", gene_cols), ("c", cell_cols), ("gc", cell_cols + gene_cols))

    for prefix, cols in groups:
        for suffix, reducer in reducers:
            enriched[prefix + "_" + suffix] = getattr(enriched[cols], reducer)(axis = 1)
    return enriched

def fe_simple(df, remove_features):
    """Encode `cp_dose` as 0/1 and drop the given columns.

    Args:
        df: frame with a `cp_dose` column holding 'D1' / 'D2'.
        remove_features: column names to drop.

    Returns:
        A new frame; `df` itself is left untouched. Dose values other than
        'D1'/'D2' become NaN (pandas `map` semantics).
    """
    tmp = df.copy()
    # Plain column assignment replaces the column (and its dtype). The former
    # `tmp.loc[:, 'cp_dose'] = ...` writes in place on pandas >= 2.0 and keeps
    # the object dtype, which makes a later `.to_numpy()` return an object array.
    tmp['cp_dose'] = tmp['cp_dose'].map({'D1': 0, 'D2': 1})
    return tmp.drop(columns=remove_features)

def fe_simple2(df):
    """One-hot encode `cp_time` / `cp_dose` and drop `cp_type` / `sig_id`.

    Args:
        df: frame containing `cp_time`, `cp_dose`, `cp_type` and `sig_id`.

    Returns:
        A new, fully numeric frame with the dummy columns appended after the
        remaining original columns.
    """
    # dtype is pinned to uint8: newer pandas defaults to bool dummies, and a
    # float+bool frame converts to an *object* array in `.to_numpy()`.
    # NOTE(review): dummies come from the categories present in `df`, so train
    # and test only get matching columns if both contain every category.
    tmp = pd.get_dummies(df.copy(), columns=['cp_time','cp_dose'], dtype=np.uint8)
    return tmp.drop(columns=["cp_type", "sig_id"])

def fe_mlp(df_train, df_test):
    """Feature engineering for the MLP.

    Steps (all fitted on train, applied to both frames):
      1. drop columns from position 4 onwards that are rejected by
         VarianceThreshold(0.7);
      2. map each remaining g-/c- column to a normal distribution with its own
         QuantileTransformer;
      3. append 10 PCA components of the c- columns and 60 of the g- columns.

    Args:
        df_train: training features. NOTE(review): the `iloc[:, 4:]` slice
            assumes the first four columns are the non-numeric/meta columns
            (sig_id, cp_type, cp_time, cp_dose) - confirm.
        df_test: test features with the same columns.

    Returns:
        (train, test) transformed copies; the inputs are not modified.
    """
    tmp_train = df_train.copy()
    tmp_test = df_test.copy()

    # Fit the selector only; the transformed matrix itself was never used.
    numeric_block = tmp_train.iloc[:,4:]
    select = VarianceThreshold(threshold=0.7)
    select.fit(numeric_block.values)
    drop_feats = numeric_block.columns[~select.get_support()].tolist()

    tmp_train = tmp_train.drop(columns=drop_feats)
    tmp_test = tmp_test.drop(columns=drop_feats)

    modg_feats = [i for i in tmp_train.columns if "g-" in i]
    modc_feats = [i for i in tmp_train.columns if "c-" in i]

    # Per-column rank-gauss style transform.
    for i in modc_feats + modg_feats:
        ss = preprocessing.QuantileTransformer(n_quantiles=1000, random_state=0, output_distribution="normal")
        ss.fit(tmp_train[i].values.reshape(-1,1))
        tmp_train[i] = ss.transform(tmp_train[i].values.reshape(-1,1))
        tmp_test[i] = ss.transform(tmp_test[i].values.reshape(-1,1))

    # (column prefix, source columns, number of components); c- block first so
    # the output column order stays pca-c*, then pca-g*.
    pca_specs = (("pca-c", modc_feats, 10), ("pca-g", modg_feats, 60))
    train_parts, test_parts = [tmp_train], [tmp_test]
    for prefix, cols, n_comp in pca_specs:
        names = [prefix + str(k + 1) for k in range(n_comp)]
        pca = PCA(n_components=n_comp, random_state=42)
        train_parts.append(pd.DataFrame(pca.fit_transform(tmp_train[cols]), columns=names))
        test_parts.append(pd.DataFrame(pca.transform(tmp_test[cols]), columns=names))

    # NOTE(review): the PCA frames carry a fresh 0..n-1 index, so this
    # column-wise concat assumes the inputs were reset_index'ed - confirm.
    tmp_train = pd.concat(train_parts, axis=1)
    tmp_test = pd.concat(test_parts, axis=1)

    return tmp_train, tmp_test

def fe_tabnet(df_train, df_test):
    """Feature engineering for TabNet: robust-scale every g-/c- column.

    Each column gets its own RobustScaler, fitted on the training values and
    applied to both train and test.

    Returns:
        (train, test) scaled copies; the inputs are not modified.
    """
    scaled_train = df_train.copy()
    scaled_test = df_test.copy()

    gene_cols = [name for name in scaled_train.columns if "g-" in name]
    cell_cols = [name for name in scaled_train.columns if "c-" in name]

    for col in cell_cols + gene_cols:
        train_values = scaled_train[col].values.reshape(-1,1)
        test_values = scaled_test[col].values.reshape(-1,1)
        scaler = preprocessing.RobustScaler()
        scaler.fit(train_values)
        scaled_train[col] = scaler.transform(train_values)
        scaled_test[col] = scaler.transform(test_values)

    return scaled_train, scaled_test

def fe_stats(df):
    """Append row-wise kurtosis and skew of the g- and c- feature groups.

    Returns:
        A copy of `df` with `g_kurt`, `g_skew`, `c_kurt`, `c_skew` appended
        (in that order).
    """
    enriched = df.copy()
    gene_cols = [name for name in enriched.columns if "g-" in name]
    cell_cols = [name for name in enriched.columns if "c-" in name]

    for kurt_name, skew_name, cols in (("g_kurt", "g_skew", gene_cols),
                                       ("c_kurt", "c_skew", cell_cols)):
        enriched[kurt_name] = enriched[cols].kurtosis(axis = 1)
        enriched[skew_name] = enriched[cols].skew(axis = 1)
    return enriched

# Non-feature columns dropped by fe_simple() before modelling.
remove_features = ["cp_type" , "sig_id"]

In [10]:
# pytorch mlp -----------------------------------
# Build the MLP inputs: scaling/PCA, row statistics, one-hot encoding, then
# the fold id as the LAST column (modelling_torch reads and strips it).
mlp_train, mlp_test = fe_mlp(train, test)
mlp_train = fe_stats(mlp_train)
mlp_test = fe_stats(mlp_test)
mlp_train = fe_simple2(mlp_train)
mlp_test = fe_simple2(mlp_test)
mlp_train["fold"] = mlp_fold
mlp_train = mlp_train.to_numpy()
mlp_test = mlp_test.to_numpy()

# pytorch tabnet ----------------------------------
# Robust scaling + dose encoding; fold id again appended as the last column.
tab_train, tab_test = fe_tabnet(train, test)
tab_train = fe_simple(tab_train, remove_features)
tab_test = fe_simple(tab_test, remove_features)
tab_train["fold"] = tab_fold

tab_train = tab_train.to_numpy()
tab_test = tab_test.to_numpy()

# xgb-----------------------
# XGB inputs stay as DataFrames; "fold" is the last column of X only.
fn_train = fe_simple(train, remove_features)
fn_test = fe_simple(test, remove_features)

X, X_test = fe_xgb(fn_train, fn_test)
X = fe_xgb_stats(X, remove_features)
X_test = fe_xgb_stats(X_test, remove_features)
X["fold"] = xgb_fold

# Each train matrix should have exactly one more column (the fold id) than its test matrix.
print(mlp_train.shape, mlp_test.shape)
print(tab_train.shape, tab_test.shape)
print(X.shape, X_test.shape)

(21948, 919) (3624, 918)
(21948, 875) (3624, 874)
(21948, 915) (3624, 914)


# 1st mlp

In [11]:
class SmoothCrossEntropyLoss(_WeightedLoss):
    """Binary cross-entropy on logits with label smoothing.

    Targets are smoothed as ``t * (1 - smoothing) + smoothing / n_classes``
    before being passed to ``F.binary_cross_entropy_with_logits``.

    Args:
        weight: optional per-class tensor. NOTE(review): it multiplies the
            *logits* (not the per-class loss) before the BCE - confirm that
            this is intended.
        reduction: 'mean' (default), 'sum' or 'none'; forwarded to the BCE.
        smoothing: label-smoothing strength in [0, 1].
    """

    def __init__(self, weight=None, reduction='mean', smoothing=0.0):
        # The parent constructor stores `weight` and `reduction`.
        super().__init__(weight=weight, reduction=reduction)
        self.smoothing = smoothing

    @staticmethod
    def _smooth(targets, n_classes, smoothing=0.0):
        """Return smoothed targets; no gradient is tracked through them."""
        assert 0 <= smoothing <= 1
        with torch.no_grad():
            # ones_like already lives on the targets' device, so no global
            # `device` lookup is needed here.
            targets = targets * (1 - smoothing) + torch.ones_like(targets) * smoothing / n_classes
        return targets

    def forward(self, inputs, targets):
        """Compute the smoothed BCE between logits `inputs` and `targets`."""
        targets = self._smooth(targets, inputs.shape[1], self.smoothing)

        if self.weight is not None:
            inputs = inputs * self.weight.unsqueeze(0)

        return F.binary_cross_entropy_with_logits(inputs, targets, reduction=self.reduction)

In [12]:
# MLP training configuration (read as globals by modelling_torch).
device = "cuda" if torch.cuda.is_available() else "cpu"
batch_size = 128
n_folds=7  # must match the number of splits used for mlp_fold
train_epochs = 20
smoothing = 0.001  # label-smoothing strength of the training loss
# Predicted probabilities are clipped into [p_min, p_max].
p_min = smoothing
p_max = 1 - smoothing

def mean_log_loss(y_true, y_pred):
    """Average the binary log loss over all target columns.

    One column is scored per entry of the global `target_feats`; `labels=[0,1]`
    keeps the score defined for columns that contain a single class.
    """
    per_target = [
        log_loss(y_true[:, col], y_pred[:, col].astype(float), labels=[0,1])
        for col in range(len(target_feats))
    ]
    return np.mean(per_target)

def seed_everything(seed=1234):
    """Seed Python, NumPy, TensorFlow and PyTorch (CPU + current CUDA device)."""
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Python / NumPy / TensorFlow generators
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

    # PyTorch generators and deterministic cuDNN kernels
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
class MoaModel(nn.Module):
    """Three-layer MLP: (BatchNorm -> Dropout -> weight-normed Linear) x 3.

    The first two blocks are followed by LeakyReLU; the last block returns raw
    logits (no activation is applied in forward).
    """
    def __init__(self, num_columns: int, last_num: int):
        """
        Args:
            num_columns: number of input features.
            last_num: number of outputs (one logit per target).
        """
        super(MoaModel, self).__init__()
        # Block 1: input -> 1024
        self.batch_norm1 = nn.BatchNorm1d(num_columns)
        self.dropout1 = nn.Dropout(0.1)
        self.dense1 = nn.utils.weight_norm(nn.Linear(num_columns, 1024))
        self.relu1 = nn.LeakyReLU()

        # Block 2: 1024 -> 1024
        self.batch_norm2 = nn.BatchNorm1d(1024)
        self.dropout2 = nn.Dropout(0.1)
        self.dense2 = nn.utils.weight_norm(nn.Linear(1024, 1024))
        self.relu2 = nn.LeakyReLU()

        # Block 3 (output): 1024 -> last_num
        self.batch_norm3 = nn.BatchNorm1d(1024)
        self.dropout3 = nn.Dropout(0.1)
        self.dense3 = nn.utils.weight_norm(nn.Linear(1024, last_num))

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return logits for a batch `x` of shape (batch, num_columns)."""
        x = self.batch_norm1(x)
        x = self.dropout1(x)
        x = self.relu1(self.dense1(x))

        x = self.batch_norm2(x)
        x = self.dropout2(x)
        x = self.relu2(self.dense2(x))

        x = self.batch_norm3(x)
        x = self.dropout3(x)
        x = self.dense3(x)

        return x
    
def modelling_torch(tr, target, te, sample_seed, init_num, last_num):
    """Train the MLP on every precomputed fold and predict OOF and test rows.

    Args:
        tr: 2-D training array whose LAST column is the fold id.
        target: 2-D array of training targets, row-aligned with `tr`.
        te: 2-D test array (no fold column).
        sample_seed: seed passed to seed_everything().
        init_num: number of input features of the network.
        last_num: number of output logits of the network.

    Returns:
        (oof, pred_value): out-of-fold probabilities with shape
        (len(tr), n_targets) and fold-averaged test probabilities with shape
        (len(te), n_targets), both clipped to [p_min, p_max].

    Reads the globals n_folds, batch_size, train_epochs, smoothing, p_min,
    p_max and device. The best epoch of each fold is checkpointed to
    'best-model-parameters.pt' in the working directory.
    """
    seed_everything(seed=sample_seed)
    X_train = tr.copy()
    y_train = target.copy()
    X_test = te.copy()
    test_len = X_test.shape[0]

    # Monitoring metric: un-smoothed BCE on clipped probabilities; this (not
    # the smoothed training loss) selects the best epoch.
    metric = lambda inputs, targets : F.binary_cross_entropy((torch.clamp(torch.sigmoid(inputs), p_min, p_max)), targets)

    X_test2 = torch.tensor(X_test, dtype=torch.float32)
    test = torch.utils.data.TensorDataset(X_test2)
    test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)

    oof = np.zeros([len(X_train),y_train.shape[1]])
    oof_targets = np.zeros([len(X_train),y_train.shape[1]])
    pred_value = np.zeros([test_len, y_train.shape[1]])

    scores = []
    for fold in range(n_folds):
        valid_index = X_train[:,-1] == fold
        train_index = X_train[:,-1] != fold
        print("Fold "+str(fold+1))
        X_train2 = torch.tensor(X_train[train_index,:], dtype=torch.float32)
        X_valid2 = torch.tensor(X_train[valid_index,:], dtype=torch.float32)
        # strip the fold-id column
        X_train2 = X_train2[:,:-1]
        X_valid2 = X_valid2[:,:-1]

        y_train2 = torch.tensor(y_train[train_index], dtype=torch.float32)
        y_valid2 = torch.tensor(y_train[valid_index], dtype=torch.float32)

        train = torch.utils.data.TensorDataset(X_train2, y_train2)
        valid = torch.utils.data.TensorDataset(X_valid2, y_valid2)

        train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
        valid_loader = torch.utils.data.DataLoader(valid, batch_size=batch_size, shuffle=False)

        clf = MoaModel(init_num, last_num)
        loss_fn = SmoothCrossEntropyLoss(smoothing=smoothing)

        optimizer = optim.Adam(clf.parameters(), lr = 0.001, weight_decay=1e-5)
        # OneCycleLR is stepped once per batch (see the training loop).
        scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.1, div_factor=1e3,
                                              max_lr=1e-2, epochs=train_epochs, steps_per_epoch=len(train_loader))

        clf.to(device)

        best_val_loss = np.inf
        for epoch in range(train_epochs):
            start_time = time.time()
            clf.train()
            avg_loss = 0.
            sm_avg_loss = 0.

            for x_batch, y_batch in tqdm(train_loader, disable=True):
                x_batch = x_batch.to(device)
                y_batch = y_batch.to(device)
                y_pred = clf(x_batch)
                loss = loss_fn(y_pred, y_batch)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                scheduler.step()
                avg_loss += loss.item() / len(train_loader)
                # detach so the running metric does not keep every batch's
                # autograd graph alive
                sm_avg_loss += metric(y_pred.detach(), y_batch) / len(train_loader)

            clf.eval()
            avg_val_loss = 0.
            sm_avg_val_loss = 0.
            with torch.no_grad():
                for i, (x_batch, y_batch) in enumerate(valid_loader):
                    x_batch = x_batch.to(device)
                    y_batch = y_batch.to(device)
                    y_pred = clf(x_batch).detach()
                    avg_val_loss += loss_fn(y_pred, y_batch).item() / len(valid_loader)
                    sm_avg_val_loss += metric(y_pred, y_batch) / len(valid_loader)

            elapsed_time = time.time() - start_time

            # Checkpoint whenever the validation metric improves; training
            # always runs for the full number of epochs.
            if sm_avg_val_loss < best_val_loss:
                best_val_loss = sm_avg_val_loss
                print('Best: Epoch {} \t loss={:.5f} \t val_loss={:.5f} \t sm_loss={:.5f} \t sm_val_loss={:.5f} \t time={:.2f}s'.format(
                    epoch + 1, avg_loss, avg_val_loss, sm_avg_loss, sm_avg_val_loss, elapsed_time))
                torch.save(clf.state_dict(), 'best-model-parameters.pt')

        # Reload the best checkpoint into a fresh CPU model for inference.
        pred_model = MoaModel(init_num, last_num)
        pred_model.load_state_dict(torch.load('best-model-parameters.pt'))
        pred_model.eval()

        # validation check ----------------
        oof_epoch = np.zeros([X_valid2.size(0), y_train.shape[1]])
        target_epoch = np.zeros([X_valid2.size(0), y_train.shape[1]])
        with torch.no_grad():
            for i, (x_batch, y_batch) in enumerate(valid_loader):
                y_pred = pred_model(x_batch).detach()
                oof_epoch[i * batch_size:(i+1) * batch_size,:] = torch.clamp(torch.sigmoid(y_pred.cpu()), p_min, p_max).numpy()
                target_epoch[i * batch_size:(i+1) * batch_size,:] = y_batch.cpu().numpy()
        fold_score = mean_log_loss(target_epoch, oof_epoch)
        print("Fold {} log loss: {}".format(fold+1, fold_score))
        scores.append(fold_score)
        oof[valid_index,:] = oof_epoch
        oof_targets[valid_index,:] = target_epoch
        #-----------------------------------

        # test prediction --------------
        test_preds = np.zeros([test_len, y_train.shape[1]])
        with torch.no_grad():
            for i, (x_batch,) in enumerate(test_loader):
                y_pred = pred_model(x_batch).detach()
                test_preds[i * batch_size:(i+1) * batch_size, :] = torch.clamp(torch.sigmoid(y_pred.cpu()), p_min, p_max).numpy()
        pred_value += test_preds / n_folds
        # ------------------------------

    # Report the seed this run actually used (the function argument), not a
    # loop variable that happens to exist in the calling cell.
    print("Seed {}".format(sample_seed))
    for i, ele in enumerate(scores):
        print("Fold {} log loss: {}".format(i+1, ele))
    print("Std of log loss: {}".format(np.std(scores)))
    print("Total log loss: {}".format(mean_log_loss(oof_targets, oof)))

    return oof, pred_value

In [13]:
# Seed-average the MLP: every seed retrains all folds; OOF and test
# predictions are averaged with equal weight.
seeds = [0,1,2,3,4]
mlp1_oof = np.zeros([len(mlp_train),fn_targets.shape[1]])
mlp1_test = np.zeros([len(mlp_test),fn_targets.shape[1]])

for seed_ in seeds:
    # mlp_train carries the fold id as its last column, hence shape[1]-1 inputs
    oof, pytorch_pred = modelling_torch(mlp_train, fn_targets, mlp_test, seed_, mlp_train.shape[1]-1, fn_targets.shape[1])
    mlp1_oof += oof / len(seeds)
    mlp1_test += pytorch_pred / len(seeds)

Fold 1
Best: Epoch 1 	 loss=0.41263 	 val_loss=0.02233 	 sm_loss=0.41259 	 sm_val_loss=0.02228 	 time=1.86s
Best: Epoch 2 	 loss=0.02025 	 val_loss=0.02022 	 sm_loss=0.02019 	 sm_val_loss=0.01977 	 time=0.92s
Best: Epoch 3 	 loss=0.01819 	 val_loss=0.01803 	 sm_loss=0.01819 	 sm_val_loss=0.01805 	 time=0.93s
Best: Epoch 5 	 loss=0.01735 	 val_loss=0.01793 	 sm_loss=0.01747 	 sm_val_loss=0.01796 	 time=0.96s
Best: Epoch 6 	 loss=0.01736 	 val_loss=0.01786 	 sm_loss=0.01748 	 sm_val_loss=0.01789 	 time=0.90s
Best: Epoch 8 	 loss=0.01730 	 val_loss=0.01766 	 sm_loss=0.01743 	 sm_val_loss=0.01767 	 time=0.90s
Best: Epoch 11 	 loss=0.01702 	 val_loss=0.01745 	 sm_loss=0.01715 	 sm_val_loss=0.01746 	 time=0.88s
Best: Epoch 13 	 loss=0.01650 	 val_loss=0.01731 	 sm_loss=0.01665 	 sm_val_loss=0.01735 	 time=0.89s
Best: Epoch 14 	 loss=0.01622 	 val_loss=0.01731 	 sm_loss=0.01639 	 sm_val_loss=0.01734 	 time=0.91s
Best: Epoch 15 	 loss=0.01592 	 val_loss=0.01726 	 sm_loss=0.01609 	 sm_val_loss=

In [14]:
# Score against ALL training rows: control rows (removed before training)
# keep an all-zero prediction.
check_mlp = np.zeros([y.shape[0], y.shape[1]])
check_mlp[cons_train_index,:] = mlp1_oof
print('OOF log loss: ', log_loss(np.ravel(y), np.ravel(check_mlp)))

OOF log loss:  0.01567181329219744


In [15]:
# Mean per-target ROC AUC of the MLP OOF predictions (control rows included).
aucs = []
for task_id in range(y.shape[1]):
    aucs.append(roc_auc_score(y_true=y.iloc[:, task_id].values,
                              y_score=check_mlp[:, task_id]))
print(f"Overall AUC : {np.mean(aucs)}")

Overall AUC : 0.6729756555646167


# 1st tabnet

In [16]:
class LogitsLogLoss(Metric):
    """TabNet evaluation metric: mean binary log loss computed from raw logits.

    Registered under the name "logits_ll"; lower is better.
    """

    def __init__(self):
        self._name = "logits_ll"
        self._maximize = False

    def __call__(self, y_true, y_pred):
        """Return the mean log loss of logits `y_pred` against `y_true`."""
        # sigmoid turns the logits into probabilities
        probs = 1 / (1 + np.exp(-y_pred))

        # 1e-15 guards the logarithms against log(0)
        neg_term = (1-y_true)*np.log(1-probs+1e-15)
        pos_term = y_true*np.log(probs+1e-15)
        return np.mean(-(neg_term + pos_term))

In [17]:
# Upper bound on TabNet epochs per fold (passed as max_epochs to model.fit).
MAX_EPOCH=200

def seed_tabnet_everything(seed_value):
    """Seed Python, NumPy and PyTorch; on CUDA also force deterministic cuDNN."""
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed_value)

    if not torch.cuda.is_available():
        return

    # GPU generators plus deterministic (non-benchmarked) cuDNN kernels
    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
        
def modelling_tabnet(tr, target, te, sample_seed):
    """Train TabNet on the 5 precomputed folds and predict OOF and test rows.

    Args:
        tr: 2-D training array whose LAST column is the fold id.
        target: 2-D array of training targets, row-aligned with `tr`.
        te: 2-D test array (no fold column).
        sample_seed: seed for the global RNGs and for TabNet itself.

    Returns:
        (oof_preds, test_preds_all): out-of-fold probabilities with shape
        (len(tr), n_targets) and the per-fold test probabilities stacked to
        shape (n_folds, len(te), n_targets).
    """
    seed_tabnet_everything(sample_seed)
    tabnet_params = dict(n_d=24, n_a=24, n_steps=1, gamma=1.5, seed = sample_seed,
                     lambda_sparse=0, n_independent=1, n_shared=1,
                     optimizer_fn=torch.optim.Adam,
                     optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
                     mask_type='entmax',
                     scheduler_params=dict(mode="min",
                                           patience=5,
                                           min_lr=1e-5,
                                           factor=0.9,),
                     scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
                     verbose=10,
                     )

    def _sigmoid(logits):
        # the regressor is trained with BCE-with-logits, so it outputs logits
        return 1 / (1 + np.exp(-logits))

    test_cv_preds = []

    NB_SPLITS = 5  # must match the number of folds encoded in tr[:, -1]
    n_targets = target.shape[1]
    oof_preds = np.zeros([len(tr), n_targets])
    scores = []
    for fold_nb in range(NB_SPLITS):
        print("FOLDS : ", fold_nb+1)

        ## model
        val_idx = tr[:,-1] == fold_nb
        train_idx = tr[:,-1] != fold_nb
        X_train, y_train = tr[train_idx, :], target[train_idx, :]
        X_val, y_val = tr[val_idx, :], target[val_idx, :]
        # strip the fold-id column
        X_train = np.delete(X_train, -1, 1)
        X_val = np.delete(X_val, -1, 1)
        model = TabNetRegressor(**tabnet_params)

        model.fit(X_train=X_train,
              y_train=y_train,
              eval_set=[(X_val, y_val)],
              eval_name = ["val"],
              eval_metric = ["logits_ll"],
              max_epochs=MAX_EPOCH,
              patience=20, batch_size=512, virtual_batch_size=128,
              num_workers=1, drop_last=False,
              # use binary cross entropy as this is not a regression problem
              loss_fn=torch.nn.functional.binary_cross_entropy_with_logits)

        oof_preds[val_idx,:] = _sigmoid(model.predict(X_val))
        scores.append(np.min(model.history["val_logits_ll"]))

        # preds on test
        test_cv_preds.append(_sigmoid(model.predict(te)))

    test_preds_all = np.stack(test_cv_preds)
    print("OOF log loss:", log_loss(np.ravel(target), np.ravel(np.array(oof_preds))))
    # one AUC per target column (was hard-coded to 206 columns)
    aucs = []
    for task_id in range(n_targets):
        aucs.append(roc_auc_score(y_true=target[:, task_id],y_score=oof_preds[:, task_id]))
    print(f"Overall AUC : {np.mean(aucs)}")
    return oof_preds, test_preds_all

In [18]:
# Seed-average TabNet: OOF predictions are averaged over seeds; test
# predictions are first averaged over folds (axis 0), then over seeds.
tabnet1_oof = np.zeros([len(tab_train),fn_targets.shape[1]])
tabnet1_test = np.zeros([len(tab_test),fn_targets.shape[1]])
seeds = [0,1,2]
for seed_ in seeds:
    oof_preds, test_preds_all = modelling_tabnet(tab_train, fn_targets, tab_test, seed_)
    tabnet1_oof += oof_preds / len(seeds)
    tabnet1_test += test_preds_all.mean(axis=0) / len(seeds)

FOLDS :  1
Device used : cuda
epoch 0  | loss: 0.25196 | val_logits_ll: 0.02888 |  0:00:01s
epoch 10 | loss: 0.01778 | val_logits_ll: 0.01846 |  0:00:11s
epoch 20 | loss: 0.01668 | val_logits_ll: 0.01868 |  0:00:22s
epoch 30 | loss: 0.01609 | val_logits_ll: 0.01892 |  0:00:34s
epoch 40 | loss: 0.01579 | val_logits_ll: 0.01806 |  0:00:45s

Early stopping occured at epoch 41 with best_epoch = 21 and best_val_logits_ll = 0.0179
Best weights from best epoch are automatically used!
FOLDS :  2
Device used : cuda
epoch 0  | loss: 0.25514 | val_logits_ll: 0.02916 |  0:00:01s
epoch 10 | loss: 0.01839 | val_logits_ll: 0.01873 |  0:00:12s
epoch 20 | loss: 0.01696 | val_logits_ll: 0.0206  |  0:00:22s
epoch 30 | loss: 0.01628 | val_logits_ll: 0.01884 |  0:00:33s
epoch 40 | loss: 0.01607 | val_logits_ll: 0.01893 |  0:00:46s
epoch 50 | loss: 0.01582 | val_logits_ll: 0.01794 |  0:00:57s

Early stopping occured at epoch 58 with best_epoch = 38 and best_val_logits_ll = 0.01769
Best weights from best epo

In [19]:
# Score against ALL training rows: control rows (removed before training)
# keep an all-zero prediction.
check_tabnet = np.zeros([y.shape[0], y.shape[1]])
check_tabnet[cons_train_index,:] = tabnet1_oof
print('OOF log loss: ', log_loss(np.ravel(y), np.ravel(check_tabnet)))

OOF log loss:  0.015857856295506397


In [20]:
# Mean per-target ROC AUC of the TabNet OOF predictions (control rows included).
aucs = []
for task_id in range(y.shape[1]):
    aucs.append(roc_auc_score(y_true=y.iloc[:, task_id].values,
                              y_score=check_tabnet[:, task_id]))
print(f"Overall AUC : {np.mean(aucs)}")

Overall AUC : 0.6673952550681482


# 1st xgb

In [21]:
# One XGBClassifier per target column, wrapped in a single-step Pipeline.
# NOTE(review): tree_method='gpu_hist' presumably needs a GPU runtime - confirm.
classifier = MultiOutputClassifier(XGBClassifier(tree_method='gpu_hist')) 

clf = Pipeline([('classify', classifier)
               ])

# Hyper-parameters addressed through the pipeline ('classify') and the
# MultiOutputClassifier ('estimator') prefixes.
params = {'classify__estimator__gamma': 3.6975,
          'classify__estimator__learning_rate': 0.0803,
          'classify__estimator__max_delta_step': 2.0706,
          'classify__estimator__max_depth': 10,
          'classify__estimator__min_child_weight': 25.5800,
          'classify__estimator__n_estimators': 100,
         }

clf.set_params(**params)

Pipeline(steps=[('classify',
                 MultiOutputClassifier(estimator=XGBClassifier(base_score=None,
                                                               booster=None,
                                                               colsample_bylevel=None,
                                                               colsample_bynode=None,
                                                               colsample_bytree=None,
                                                               gamma=3.6975,
                                                               gpu_id=None,
                                                               importance_type='gain',
                                                               interaction_constraints=None,
                                                               learning_rate=0.0803,
                                                               max_delta_step=2.0706,
                                                   

In [22]:
def modelling_xgb(X, y, X_test, seed):
    """Cross-validate the global XGB pipeline `clf` on the precomputed folds.

    Args:
        X: training DataFrame whose LAST column is the fold id.
        y: training targets, row-aligned with `X`; a DataFrame or a 2-D
            ndarray (e.g. `fn_targets`) are both accepted.
        X_test: test features (no fold column).
        seed: currently unused; kept so existing callers keep working.

    Returns:
        (oof_preds, test_preds): out-of-fold and fold-averaged test
        probabilities of the positive class, one column per target.

    Reads the globals `NFOLDS` and `clf`, which must be defined before the call.
    """
    y_arr = np.asarray(y)  # positional view; works for DataFrame and ndarray
    fold_ids = X.iloc[:,-1].to_numpy()

    oof_preds = np.zeros(y_arr.shape)
    test_preds = np.zeros((X_test.shape[0], y_arr.shape[1]))
    oof_losses = []
    for fn in range(NFOLDS):
        print('Starting fold: ', fn)
        val_idx = fold_ids == fn
        trn_idx = fold_ids != fn
        X_train, X_val = X.loc[trn_idx,:].to_numpy(), X.loc[val_idx,:].to_numpy()
        y_train, y_val = y_arr[trn_idx], y_arr[val_idx]
        # strip the fold-id column
        X_train = np.delete(X_train, -1, 1)
        X_val = np.delete(X_val, -1, 1)

        clf.fit(X_train, y_train)
        val_preds = clf.predict_proba(X_val) # list of preds per class
        val_preds = np.array(val_preds)[:,:,1].T # take the positive class
        oof_preds[val_idx] = val_preds

        loss = log_loss(np.ravel(y_val), np.ravel(val_preds))
        print(loss)
        oof_losses.append(loss)
        preds = clf.predict_proba(X_test)
        preds = np.array(preds)[:,:,1].T # take the positive class
        test_preds += preds / NFOLDS

    print(oof_losses)
    print('Mean OOF loss across folds', np.mean(oof_losses))
    print('STD OOF loss across folds', np.std(oof_losses))
    return oof_preds, test_preds

In [23]:
# XGB cross-validation, written out inline (the loop body mirrors
# modelling_xgb, but indexes the ndarray targets `fn_targets` directly).
NFOLDS=5  # must match the number of splits used for xgb_fold
xgb1_oof = np.zeros((X.shape[0], fn_targets.shape[1]))
xgb1_test = np.zeros((X_test.shape[0], fn_targets.shape[1]))
for fn in range(NFOLDS):
    print('Starting fold: ', fn)
    # the last column of X is the fold id
    val_idx = X.iloc[:,-1] == fn
    trn_idx = X.iloc[:,-1] != fn
    X_train, X_val = X.loc[trn_idx,:].to_numpy(), X.loc[val_idx,:].to_numpy()
    y_train, y_val = fn_targets[trn_idx,:], fn_targets[val_idx,:]
    # strip the fold-id column before fitting
    X_train = np.delete(X_train, -1, 1)
    X_val = np.delete(X_val, -1, 1)
    
    clf.fit(X_train, y_train)
        
    val_preds = clf.predict_proba(X_val) # list of preds per class
    val_preds = np.array(val_preds)[:,:,1].T # take the positive class
    xgb1_oof[val_idx] = val_preds
    loss = log_loss(np.ravel(y_val), np.ravel(val_preds))
    print(loss)
        
    # test predictions are averaged over the folds
    preds = clf.predict_proba(X_test)
    preds = np.array(preds)[:,:,1].T # take the positive class
    xgb1_test += preds / NFOLDS

Starting fold:  0
0.01802298004472023
Starting fold:  1
0.018461632031383953
Starting fold:  2
0.01771447134739482
Starting fold:  3
0.0180343302189189
Starting fold:  4
0.01803448574623002


In [24]:
# Score against ALL training rows: control rows (removed before training)
# keep an all-zero prediction.
check_xgb = np.zeros([y.shape[0], y.shape[1]])
check_xgb[cons_train_index,:] =  xgb1_oof
print('OOF log loss: ', log_loss(np.ravel(y), np.ravel(check_xgb)))

OOF log loss:  0.01663886078344363


# submission

In [25]:
# Persist the full-length OOF matrices (control rows included, as zeros).
pd.DataFrame(check_mlp).to_csv("mlp_oof.csv", index=False)
pd.DataFrame(check_tabnet).to_csv("tab_oof.csv", index=False)
pd.DataFrame(check_xgb).to_csv("xgb_oof.csv", index=False)

In [26]:
# Blended OOF score. The same weights (0.01 / 0.39 / 0.6) are hard-coded again
# for the test blend in the submission cell - keep the two in sync.
check = 0.01 * check_xgb + 0.39 * check_tabnet + 0.6 * check_mlp
print('OOF log loss: ', log_loss(np.ravel(y), np.ravel(check)))

OOF log loss:  0.015545757089245039


In [27]:
# Write the blended predictions for non-control test rows; control rows get 0.
# NOTE(review): cons_test_index / noncons_test_index come from test_features,
# so this assumes sample_submission lists the rows in the same order - confirm.
sub = pd.read_csv(DATA_DIR + 'sample_submission.csv')
sub.loc[cons_test_index,target_feats] =  0.01 * xgb1_test + 0.39 * tabnet1_test + 0.6 * mlp1_test
sub.loc[noncons_test_index,target_feats] = 0
sub.to_csv('submission.csv', index=False)