- increase weight optimization trials
- increase tabnet ensemble
- decrease svm ratio

In [1]:
# Install pytorch-tabnet from the wheel attached under /kaggle/input:
# --no-index keeps pip away from PyPI, --find-links points it at the local wheel.
!pip install --no-index --find-links /kaggle/input/pytorchtabnet/pytorch_tabnet-2.0.0-py3-none-any.whl pytorch-tabnet

Looking in links: /kaggle/input/pytorchtabnet/pytorch_tabnet-2.0.0-py3-none-any.whl
Processing /kaggle/input/pytorchtabnet/pytorch_tabnet-2.0.0-py3-none-any.whl
Installing collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-2.0.0


In [2]:
import sys
# Copy the RAPIDS 0.15.0 archive from the input directory and unpack it under
# /opt/conda/envs (tar output is discarded to keep the cell output short).
!cp ../input/rapids/rapids.0.15.0 /opt/conda/envs/rapids.tar.gz
!cd /opt/conda/envs/ && tar -xzvf rapids.tar.gz > /dev/null
# Prepend the unpacked environment to the import path so its packages are found
# first (cuml is imported from it in the import cell).
sys.path = ["/opt/conda/envs/rapids/lib/python3.7/site-packages"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib/python3.7"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib"] + sys.path
# Copy the environment's libxgboost.so into the base conda lib directory
# (presumably required by the RAPIDS packages at import time -- TODO confirm).
!cp /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/

In [3]:
import os
import warnings
import numpy as np
import pandas as pd 
from sklearn import preprocessing
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from tqdm import tqdm_notebook as tqdm
from category_encoders import CountEncoder
from sklearn.feature_selection import VarianceThreshold
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import KFold, StratifiedKFold
sys.path.append('../input/multilabelstraifier/')
from ml_stratifiers import MultilabelStratifiedKFold
warnings.filterwarnings('ignore')
from scipy.optimize import minimize, fsolve

import time
import torch
import random
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import tensorflow as tf
from torch.nn.modules.loss import _WeightedLoss

from sklearn.linear_model import LogisticRegression
from cuml.svm import SVC, SVR

from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.tab_model import TabNetRegressor

# Preprocess

In [4]:
# Directory holding the competition CSV files.
DATA_DIR = '/kaggle/input/lish-moa/'
# Training features, the scored training targets and the test features.
train = pd.read_csv(DATA_DIR + 'train_features.csv')
targets = pd.read_csv(DATA_DIR + 'train_targets_scored.csv')
test = pd.read_csv(DATA_DIR + 'test_features.csv')

In [5]:
# Column groups selected by name: every target column except the id, and the
# feature columns whose names contain "g-" or "c-".
target_feats = [ i for i in targets.columns if i != "sig_id"]
g_feats = [i for i in train.columns if "g-" in i]
c_feats = [i for i in train.columns if "c-" in i]

In [6]:
# Row labels of the non-control ("cp_type" != "ctl_vehicle") samples in train and
# test, plus the control rows of test (their predictions are set to 0 at
# submission time).
cons_train_index = train[train.cp_type!="ctl_vehicle"].index
noncons_test_index = test[test.cp_type=="ctl_vehicle"].index
cons_test_index = test[test.cp_type!="ctl_vehicle"].index

# Keep only the non-control rows and renumber them 0..n-1.
# NOTE: `train` / `test` are overwritten here, so this cell must run exactly once
# after the loading cell.
test = test[test.index.isin(cons_test_index)].reset_index(drop=True)
train = train[train.index.isin(cons_train_index)].reset_index(drop=True)
# fn_targets: NumPy target matrix restricted to the same non-control rows as `train`.
fn_targets = targets.drop("sig_id", axis=1).copy()
fn_targets = fn_targets[fn_targets.index.isin(cons_train_index)].copy().reset_index(drop=True).to_numpy()
# y: targets for ALL training rows (controls included), used by the OOF checks.
y = targets.drop("sig_id", axis=1).copy()

# Feature engineering

In [7]:
def fe_simple(df, remove_features):
    """Encode ``cp_dose`` as 0/1 and drop the listed columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``cp_dose`` column holding ``'D1'`` / ``'D2'`` and every
        column named in ``remove_features``. The input is left untouched.
    remove_features : list of str
        Names of the columns to remove from the result.

    Returns
    -------
    pandas.DataFrame
        New frame where ``cp_dose`` is mapped through ``{'D1': 0, 'D2': 1}``
        (any other value becomes NaN) and ``remove_features`` are dropped.

    Raises
    ------
    KeyError
        If ``cp_dose`` or one of ``remove_features`` is missing from ``df``.
    """
    dose_codes = df['cp_dose'].map({'D1': 0, 'D2': 1})
    # Replace the column instead of writing through `.loc[:, 'cp_dose']`: on
    # pandas >= 2.0 the `.loc` form sets values in place, which keeps the
    # original object dtype and makes a later `.to_numpy()` return an object
    # array. `assign` swaps the whole column, so it takes the numeric dtype.
    return df.assign(cp_dose=dose_codes).drop(columns=remove_features)

def fe_mlp(df_train, df_test):
    """Build the MLP feature frames: variance filter, quantile transform, PCA.

    Steps (every transformer is fitted on the training frame only and then
    applied to both frames):

    1. Drop the columns from position 4 onwards whose training variance does
       not exceed 0.7 (``VarianceThreshold``).
    2. Map each remaining ``"c-"`` / ``"g-"`` column to a normal distribution
       with its own ``QuantileTransformer``.
    3. Append 10 PCA components of the ``"c-"`` columns (``pca-c1..10``) and
       60 PCA components of the ``"g-"`` columns (``pca-g1..60``).

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Feature frames with identical columns. Neither is modified.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Transformed copies of ``df_train`` and ``df_test``.
    """
    tmp_train = df_train.copy()
    tmp_test = df_test.copy()

    # Only columns from position 4 onwards take part in the variance filter
    # (presumably the first four are sig_id, cp_type, cp_time, cp_dose -- confirm
    # against the input file layout).
    candidates = tmp_train.iloc[:, 4:]
    select = VarianceThreshold(threshold=0.7)
    select.fit(candidates.values)
    drop_feats = list(np.array(candidates.columns)[~select.get_support()])

    tmp_train.drop(drop_feats, axis=1, inplace=True)
    tmp_test.drop(drop_feats, axis=1, inplace=True)

    modg_feats = [i for i in tmp_train.columns if "g-" in i]
    modc_feats = [i for i in tmp_train.columns if "c-" in i]

    # One transformer per column, fitted on train and applied to both frames.
    for i in modc_feats + modg_feats:
        ss = preprocessing.QuantileTransformer(n_quantiles=1000, random_state=0, output_distribution="normal")
        ss.fit(tmp_train[i].values.reshape(-1,1))
        tmp_train[i] = ss.transform(tmp_train[i].values.reshape(-1,1))
        tmp_test[i] = ss.transform(tmp_test[i].values.reshape(-1,1))

    # The PCA frames reuse the index of the frame they are appended to:
    # concat(axis=1) aligns on the index, so a fresh RangeIndex would produce
    # NaN-padded rows whenever the input index is not 0..n-1.
    c_num = 10
    pca_c_cols = ["pca-c"+str(i+1) for i in range(c_num)]
    pca = PCA(n_components=c_num,random_state=42)
    c_train = pd.DataFrame(pca.fit_transform(tmp_train[modc_feats]),
                           columns=pca_c_cols, index=tmp_train.index)
    c_test = pd.DataFrame(pca.transform(tmp_test[modc_feats]),
                          columns=pca_c_cols, index=tmp_test.index)

    g_num = 60
    pca_g_cols = ["pca-g"+str(i+1) for i in range(g_num)]
    pca = PCA(n_components=g_num, random_state=42)
    g_train = pd.DataFrame(pca.fit_transform(tmp_train[modg_feats]),
                           columns=pca_g_cols, index=tmp_train.index)
    g_test = pd.DataFrame(pca.transform(tmp_test[modg_feats]),
                          columns=pca_g_cols, index=tmp_test.index)

    # Column order: original features, then pca-c*, then pca-g*.
    tmp_train = pd.concat([tmp_train, c_train, g_train], axis=1)
    tmp_test = pd.concat([tmp_test, c_test, g_test], axis=1)

    return tmp_train, tmp_test

def fe_mlp2(df):
    """Add per-row summary statistics and one-hot encode ``cp_time`` / ``cp_dose``.

    For the ``"g-"`` columns, the ``"c-"`` columns and both groups together, the
    row-wise sum, mean, standard deviation, kurtosis and skewness are appended
    as ``<prefix>_sum``, ``_mean``, ``_std``, ``_kurt`` and ``_skew`` with
    prefixes ``g``, ``c`` and ``gc``. ``cp_time`` and ``cp_dose`` are then
    replaced by dummy columns, and ``cp_type`` / ``sig_id`` are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing ``sig_id``, ``cp_type``, ``cp_time``, ``cp_dose`` and
        the ``"g-"`` / ``"c-"`` feature columns. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new, fully numeric frame.

    Notes
    -----
    The dummy columns are derived from the values present in ``df`` alone, so
    two frames only end up with the same columns when they contain the same
    ``cp_time`` / ``cp_dose`` categories.
    """
    tmp = df.copy()
    modg_feats = [i for i in tmp.columns if "g-" in i]
    modc_feats = [i for i in tmp.columns if "c-" in i]

    # The column lists are fixed before any statistic is added, so the new
    # columns never feed into later statistics.
    groups = (
        ('g', modg_feats),
        ('c', modc_feats),
        ('gc', modc_feats + modg_feats),
    )
    for prefix, cols in groups:
        values = tmp[cols]
        tmp[prefix + '_sum'] = values.sum(axis = 1)
        tmp[prefix + '_mean'] = values.mean(axis = 1)
        tmp[prefix + '_std'] = values.std(axis = 1)
        tmp[prefix + '_kurt'] = values.kurtosis(axis = 1)
        tmp[prefix + '_skew'] = values.skew(axis = 1)

    # Pin the dummy dtype: pandas >= 2.0 defaults to bool, and a frame mixing
    # bool and float columns converts to an object array in `.to_numpy()`,
    # which cannot be turned into a float tensor. uint8 keeps the frame numeric.
    tmp = pd.get_dummies(tmp, columns=['cp_time','cp_dose'], dtype=np.uint8)
    return tmp.drop(columns=["cp_type", "sig_id"])

def fe_tabnet(df_train, df_test):
    """Build the TabNet feature frames: quantile transform plus a few PCA components.

    Every column listed in the module-level ``c_feats`` / ``g_feats`` is mapped
    to a normal distribution by its own ``QuantileTransformer`` (fitted on the
    training frame). One PCA component of the ``c_feats`` columns (``pca-c1``)
    and ten of the ``g_feats`` columns (``pca-g1..10``) are then appended.

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Feature frames containing all ``c_feats`` and ``g_feats`` columns.
        Neither is modified.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Transformed copies of ``df_train`` and ``df_test``.
    """
    tmp_train = df_train.copy()
    tmp_test = df_test.copy()

    # One transformer per column, fitted on train and applied to both frames.
    for i in c_feats + g_feats:
        ss = preprocessing.QuantileTransformer(n_quantiles=100, random_state=0, output_distribution="normal")
        ss.fit(tmp_train[i].values.reshape(-1,1))
        tmp_train[i] = ss.transform(tmp_train[i].values.reshape(-1,1))
        tmp_test[i] = ss.transform(tmp_test[i].values.reshape(-1,1))

    # The PCA frames reuse the index of the frame they are appended to:
    # concat(axis=1) aligns on the index, so a fresh RangeIndex would produce
    # NaN-padded rows whenever the input index is not 0..n-1.
    c_num = 1
    pca_c_cols = ["pca-c"+str(i+1) for i in range(c_num)]
    pca = PCA(n_components=c_num,random_state=42)
    c_train = pd.DataFrame(pca.fit_transform(tmp_train[c_feats]),
                           columns=pca_c_cols, index=tmp_train.index)
    c_test = pd.DataFrame(pca.transform(tmp_test[c_feats]),
                          columns=pca_c_cols, index=tmp_test.index)

    g_num = 10
    pca_g_cols = ["pca-g"+str(i+1) for i in range(g_num)]
    pca = PCA(n_components=g_num, random_state=42)
    g_train = pd.DataFrame(pca.fit_transform(tmp_train[g_feats]),
                           columns=pca_g_cols, index=tmp_train.index)
    g_test = pd.DataFrame(pca.transform(tmp_test[g_feats]),
                          columns=pca_g_cols, index=tmp_test.index)

    # Column order: original features, then pca-c*, then pca-g*.
    tmp_train = pd.concat([tmp_train, c_train, g_train], axis=1)
    tmp_test = pd.concat([tmp_test, c_test, g_test], axis=1)

    return tmp_train, tmp_test

# Columns that fe_simple drops when building the TabNet and SVM inputs.
remove_features = ["cp_type" , "sig_id"]

In [8]:
# pytorch mlp -----------------------------------
# fe_mlp followed by fe_mlp2, converted to NumPy arrays for the MLP.
mlp_train, mlp_test = fe_mlp(train, test)
mlp_train = fe_mlp2(mlp_train).to_numpy()
mlp_test = fe_mlp2(mlp_test).to_numpy()

# pytorch tabnet ----------------------------------
# fe_tabnet features with cp_dose encoded and the remove_features columns dropped.
tab_train, tab_test = fe_tabnet(train, test)
tab_train = fe_simple(tab_train, remove_features)
tab_test = fe_simple(tab_test, remove_features)

tab_train = tab_train.to_numpy()
tab_test = tab_test.to_numpy()

# svm-----------------------
# Untransformed features (cp_dose encoded), standardized with statistics
# fitted on the training rows only.
fn_train = fe_simple(train, remove_features)
fn_test = fe_simple(test, remove_features)

ss = preprocessing.StandardScaler()
fn_train= ss.fit_transform(fn_train)
fn_test = ss.transform(fn_test)

# Sanity check: train and test must have the same number of columns per feature set.
print(mlp_train.shape, mlp_test.shape)
print(tab_train.shape, tab_test.shape)
print(fn_train.shape, fn_test.shape)

(21948, 929) (3624, 929)
(21948, 885) (3624, 885)
(21948, 874) (3624, 874)


# 1st mlp

In [9]:
class SmoothCrossEntropyLoss(_WeightedLoss):
    """Binary cross-entropy on logits with label smoothing.

    Targets are smoothed as ``t * (1 - smoothing) + smoothing / n_classes``,
    where ``n_classes`` is the number of columns of the logits, and then passed
    to ``F.binary_cross_entropy_with_logits``.

    Parameters
    ----------
    weight : torch.Tensor or None
        Optional per-column factor. When given, the *logits* are multiplied by
        it (broadcast over the batch dimension) before the loss is computed.
    reduction : str
        Reduction passed to ``binary_cross_entropy_with_logits``
        (``'mean'``, ``'sum'`` or ``'none'``).
    smoothing : float
        Label-smoothing strength in ``[0, 1]``.
    """

    def __init__(self, weight=None, reduction='mean', smoothing=0.0):
        # The parent constructor already stores `weight` (as a buffer) and
        # `reduction`, so only the smoothing factor is kept here.
        super().__init__(weight=weight, reduction=reduction)
        self.smoothing = smoothing

    @staticmethod
    def _smooth(targets, n_classes, smoothing=0.0):
        """Return the smoothed copy of ``targets`` (no gradient is tracked)."""
        assert 0 <= smoothing <= 1
        with torch.no_grad():
            # ones_like already lives on the device of `targets`; the loss no
            # longer reads a module-level `device` defined in a later cell.
            targets = targets * (1 - smoothing) + torch.ones_like(targets) * smoothing / n_classes
        return targets

    def forward(self, inputs, targets):
        """Compute the smoothed BCE-with-logits loss for a batch.

        ``inputs`` are raw logits and ``targets`` the matching 0/1 labels; both
        are indexed as ``(batch, n_classes)``.
        """
        targets = self._smooth(targets, inputs.shape[1], self.smoothing)

        if self.weight is not None:
            inputs = inputs * self.weight.unsqueeze(0)

        # Honor the configured reduction instead of always averaging.
        return F.binary_cross_entropy_with_logits(inputs, targets, reduction=self.reduction)

In [10]:
# Training configuration read as module-level names by the MLP code below.
device = "cuda" if torch.cuda.is_available() else "cpu"
batch_size = 128
n_folds=7
train_epochs = 20
# Label-smoothing strength for the training loss; predicted probabilities are
# additionally clamped to [p_min, p_max].
smoothing = 0.001
p_min = smoothing
p_max = 1 - smoothing

def mean_log_loss(y_true, y_pred):
    """Return the mean of the per-column binary log losses.

    Parameters
    ----------
    y_true : numpy.ndarray
        2-D array of 0/1 labels, one column per target.
    y_pred : numpy.ndarray
        2-D array of predicted probabilities with the same layout.

    Returns
    -------
    float
        Average of ``log_loss`` over the columns of ``y_true``.
    """
    # Iterate over the columns actually passed in rather than the module-level
    # `target_feats`, so the function does not depend on notebook state.
    # labels=[0, 1] keeps columns that contain a single class scoreable.
    metrics = [
        log_loss(y_true[:, i], y_pred[:, i].astype(float), labels=[0,1])
        for i in range(y_true.shape[1])
    ]
    return np.mean(metrics)

def seed_everything(seed=1234): 
    """Seed Python, NumPy, TensorFlow and PyTorch for reproducible MLP runs.

    Parameters
    ----------
    seed : int
        Value used for every random number generator.
    """
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    tf.random.set_seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Same CUDA settings as seed_tabnet_everything: seed every visible GPU and
    # disable the cuDNN autotuner alongside the deterministic flag.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    
class MoaModel(nn.Module):
    """Feed-forward network: three BatchNorm -> Dropout -> weight-normed Linear stages.

    The first two stages map to 1024 units and are followed by LeakyReLU; the
    last stage maps to ``last_num`` outputs and returns raw logits.

    Parameters
    ----------
    num_columns : int
        Number of input features.
    last_num : int
        Number of output units.
    """

    def __init__(self, num_columns, last_num):
        super().__init__()
        hidden_units = 1024
        drop_rate = 0.1

        # Stage 1: input -> hidden
        self.batch_norm1 = nn.BatchNorm1d(num_columns)
        self.dropout1 = nn.Dropout(drop_rate)
        self.dense1 = nn.utils.weight_norm(nn.Linear(num_columns, hidden_units))
        self.relu1 = nn.LeakyReLU()

        # Stage 2: hidden -> hidden
        self.batch_norm2 = nn.BatchNorm1d(hidden_units)
        self.dropout2 = nn.Dropout(drop_rate)
        self.dense2 = nn.utils.weight_norm(nn.Linear(hidden_units, hidden_units))
        self.relu2 = nn.LeakyReLU()

        # Stage 3: hidden -> logits (no activation)
        self.batch_norm3 = nn.BatchNorm1d(hidden_units)
        self.dropout3 = nn.Dropout(drop_rate)
        self.dense3 = nn.utils.weight_norm(nn.Linear(hidden_units, last_num))

    def forward(self, x):
        """Return the logits for a batch ``x`` of input rows."""
        hidden = self.relu1(self.dense1(self.dropout1(self.batch_norm1(x))))
        hidden = self.relu2(self.dense2(self.dropout2(self.batch_norm2(hidden))))
        return self.dense3(self.dropout3(self.batch_norm3(hidden)))
    
def modelling_torch(X_train, y_train, X_test, sample_seed, last_num):
    """Train ``MoaModel`` with multilabel-stratified CV and return OOF / test predictions.

    For every fold a fresh model is trained for ``train_epochs`` epochs with a
    one-cycle learning-rate schedule. The weights of the epoch with the lowest
    validation loss are written to ``best-model-parameters.pt`` in the working
    directory and reloaded to predict the validation rows and the test set.
    Test predictions are averaged over the folds.

    Reads the module-level settings ``n_folds``, ``batch_size``,
    ``train_epochs``, ``smoothing``, ``p_min``, ``p_max`` and ``device``.

    Parameters
    ----------
    X_train : numpy.ndarray
        Training features, one row per sample.
    y_train : numpy.ndarray
        0/1 target matrix with one row per training sample.
    X_test : numpy.ndarray
        Test features with the same columns as ``X_train``.
    sample_seed : int
        Seed passed to ``seed_everything`` before training.
    last_num : int
        Number of output units of the network.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Out-of-fold predictions for the training rows and fold-averaged
        predictions for the test rows, both clamped to ``[p_min, p_max]``.
    """
    seed_everything(seed=sample_seed) 

    test_len = X_test.shape[0]
    n_targets = y_train.shape[1]
    # Size the network from the data passed in, not from the global mlp_train.
    n_features = X_train.shape[1]
    
    mskf=MultilabelStratifiedKFold(n_splits = n_folds, shuffle=True, random_state=224)
    # Monitoring metric: plain BCE on clamped probabilities (no label smoothing).
    metric = lambda inputs, targets : F.binary_cross_entropy((torch.clamp(torch.sigmoid(inputs), p_min, p_max)), targets)

    X_test2 = torch.tensor(X_test, dtype=torch.float32)
    test = torch.utils.data.TensorDataset(X_test2) 
    test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)
    
    oof = np.zeros([len(X_train), n_targets])
    oof_targets = np.zeros([len(X_train), n_targets])
    pred_value = np.zeros([test_len, n_targets])
    for fold, (train_index, valid_index) in enumerate(mskf.split(X_train, y_train)):
        print("Fold "+str(fold+1))
        X_train2 = torch.tensor(X_train[train_index,:], dtype=torch.float32)
        X_valid2 = torch.tensor(X_train[valid_index,:], dtype=torch.float32)

        y_train2 = torch.tensor(y_train[train_index], dtype=torch.float32)
        y_valid2 = torch.tensor(y_train[valid_index], dtype=torch.float32)
        
        train = torch.utils.data.TensorDataset(X_train2, y_train2)
        valid = torch.utils.data.TensorDataset(X_valid2, y_valid2)
        
        train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True) 
        valid_loader = torch.utils.data.DataLoader(valid, batch_size=batch_size, shuffle=False)
            
        clf = MoaModel(n_features, last_num)
        loss_fn = SmoothCrossEntropyLoss(smoothing=smoothing)
        
        optimizer = optim.Adam(clf.parameters(), lr = 0.001, weight_decay=1e-5) 
        scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.1, div_factor=1e3, 
                                              max_lr=1e-2, epochs=train_epochs, steps_per_epoch=len(train_loader))
        
        clf.to(device)
        
        best_val_loss = np.inf
        
        for epoch in range(train_epochs):
            clf.train()
            sm_avg_loss = 0.
            for x_batch, y_batch in train_loader:
                x_batch = x_batch.to(device)
                y_batch = y_batch.to(device)
                y_pred = clf(x_batch) 
                loss = loss_fn(y_pred, y_batch)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                scheduler.step()  # OneCycleLR is stepped once per batch
                # Detach before accumulating so the running average does not
                # keep every batch's autograd graph alive.
                sm_avg_loss += metric(y_pred.detach(), y_batch) / len(train_loader) 
                
            clf.eval()
            sm_avg_val_loss = 0.
            
            with torch.no_grad():
                for x_batch, y_batch in valid_loader: 
                    x_batch = x_batch.to(device)
                    y_batch = y_batch.to(device)
                    y_pred = clf(x_batch)
                    sm_avg_val_loss += metric(y_pred, y_batch) / len(valid_loader)
                            
            # Checkpoint only when the validation loss improves.
            if sm_avg_val_loss < best_val_loss:
                best_val_loss = sm_avg_val_loss
                print('Epoch {} sm_loss={:.5f}  sm_val_loss={:.5f}'.format(epoch + 1, sm_avg_loss, sm_avg_val_loss))
                torch.save(clf.state_dict(), 'best-model-parameters.pt')
                
        # Reload the best checkpoint into a fresh model; it is never moved to
        # `device`, so the predictions below run on the CPU.
        pred_model = MoaModel(n_features, last_num)
        pred_model.load_state_dict(torch.load('best-model-parameters.pt'))         
        pred_model.eval()
        
        with torch.no_grad():
            # validation check ----------------
            oof_epoch = np.zeros([X_valid2.size(0), n_targets])
            target_epoch = np.zeros([X_valid2.size(0), n_targets])
            for i, (x_batch, y_batch) in enumerate(valid_loader): 
                y_pred = pred_model(x_batch)
                oof_epoch[i * batch_size:(i+1) * batch_size,:] = torch.clamp(torch.sigmoid(y_pred.cpu()), p_min, p_max).numpy()
                target_epoch[i * batch_size:(i+1) * batch_size,:] = y_batch.cpu().numpy()
            oof[valid_index,:] = oof_epoch
            oof_targets[valid_index,:] = target_epoch
            #-----------------------------------
            
            # test prediction --------------
            test_preds = np.zeros([test_len, n_targets])
            for i, (x_batch,) in enumerate(test_loader): 
                y_pred = pred_model(x_batch)
                test_preds[i * batch_size:(i+1) * batch_size, :] = torch.clamp(torch.sigmoid(y_pred.cpu()), p_min, p_max).numpy()
            pred_value += test_preds / n_folds
            # ------------------------------
        
    # Report the seed this call actually used (was the global loop variable `seed_`).
    print("Seed {}".format(sample_seed))
    print("Total log loss: {}".format(mean_log_loss(oof_targets, oof)))
    
    return oof, pred_value

In [11]:
# Seed averaging for the MLP: each seed runs the full cross-validation and the
# OOF / test predictions are averaged with equal weight.
seeds = [0,1,2,3,4]
mlp1_oof = np.zeros([len(mlp_train),fn_targets.shape[1]])
mlp1_test = np.zeros([len(mlp_test),fn_targets.shape[1]])

for seed_ in seeds:
    oof, pytorch_pred = modelling_torch(mlp_train, fn_targets, mlp_test, seed_, fn_targets.shape[1])
    mlp1_oof += oof / len(seeds)
    mlp1_test += pytorch_pred / len(seeds)

Fold 1
Epoch 1 sm_loss=0.41417  sm_val_loss=0.02222
Epoch 2 sm_loss=0.02011  sm_val_loss=0.01953
Epoch 3 sm_loss=0.01818  sm_val_loss=0.01902
Epoch 4 sm_loss=0.01763  sm_val_loss=0.01781
Epoch 5 sm_loss=0.01750  sm_val_loss=0.01770
Epoch 6 sm_loss=0.01744  sm_val_loss=0.01759
Epoch 7 sm_loss=0.01743  sm_val_loss=0.01740
Epoch 8 sm_loss=0.01734  sm_val_loss=0.01735
Epoch 11 sm_loss=0.01707  sm_val_loss=0.01709
Epoch 12 sm_loss=0.01687  sm_val_loss=0.01680
Epoch 13 sm_loss=0.01671  sm_val_loss=0.01674
Epoch 14 sm_loss=0.01642  sm_val_loss=0.01673
Epoch 15 sm_loss=0.01609  sm_val_loss=0.01636
Epoch 16 sm_loss=0.01566  sm_val_loss=0.01632
Epoch 17 sm_loss=0.01520  sm_val_loss=0.01608
Epoch 18 sm_loss=0.01475  sm_val_loss=0.01598
Epoch 19 sm_loss=0.01437  sm_val_loss=0.01595
Epoch 20 sm_loss=0.01419  sm_val_loss=0.01592
Fold 2
Epoch 1 sm_loss=0.41399  sm_val_loss=0.02262
Epoch 2 sm_loss=0.02004  sm_val_loss=0.02017
Epoch 4 sm_loss=0.01781  sm_val_loss=0.01759
Epoch 7 sm_loss=0.01742  sm_val

In [12]:
# Score the MLP OOF predictions against ALL training rows: rows outside
# cons_train_index (the control samples) keep a prediction of 0.
check_mlp = np.zeros([targets.shape[0], targets.shape[1]-1])
check_mlp[cons_train_index,:] = mlp1_oof
# Log loss over the flattened (row, target) pairs.
print('OOF log loss: ', log_loss(np.ravel(y), np.ravel(check_mlp)))

OOF log loss:  0.014597014592739207


In [13]:
# ROC AUC per target column of the MLP OOF predictions, then the unweighted mean.
aucs = []
for task_id in range(y.shape[1]):
    aucs.append(roc_auc_score(y_true=y.iloc[:, task_id].values,
                              y_score=check_mlp[:, task_id]))
print(f"Overall AUC : {np.mean(aucs)}")

Overall AUC : 0.8085230460948042


# 1st tabnet

In [14]:
# NOTE: these reassign the module-level smoothing / p_min / p_max that
# modelling_torch reads when it is called, so re-running the MLP cells after
# this one would pick up these values instead of the MLP ones.
# In the TabNet code below they only appear in commented-out clipping lines.
smoothing = 0.00005
p_min = smoothing
p_max = 1 - smoothing

class LogitsLogLoss(Metric):
    """Evaluation metric ``"logits_ll"``: mean binary log loss of raw model outputs.

    The predictions are treated as logits: they are passed through a sigmoid
    and compared with the 0/1 targets. Lower is better (``_maximize = False``).
    """

    def __init__(self):
        self._maximize = False
        self._name = "logits_ll"

    def __call__(self, y_true, y_pred):
        """Return the log loss averaged over every element of ``y_true``."""
        eps = 1e-15  # keeps log() finite when a probability reaches 0 or 1
        probs = 1 / (1 + np.exp(-y_pred))
        negative_part = (1-y_true)*np.log(1-probs+eps)
        positive_part = y_true*np.log(probs+eps)
        return np.mean(-(negative_part + positive_part))

In [15]:
# Upper bound on TabNet training epochs per fold (passed as max_epochs to fit).
MAX_EPOCH=200

def seed_tabnet_everything(seed_value):
    """Seed Python, NumPy and PyTorch (CPU and, when present, CUDA) for TabNet runs.

    Also sets ``PYTHONHASHSEED`` and, on CUDA machines, switches cuDNN to
    deterministic mode with benchmarking disabled.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed_value)
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    # The remaining settings only apply when a GPU is available.
    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
        
def modelling_tabnet(tr, target, te, sample_seed):
    """Train TabNet with 5-fold multilabel-stratified CV and predict OOF / test rows.

    A ``TabNetRegressor`` is fitted per fold with
    ``binary_cross_entropy_with_logits`` as loss, so its raw outputs are logits;
    they are converted to probabilities with a sigmoid before being stored.

    Parameters
    ----------
    tr : numpy.ndarray
        Training features, one row per sample.
    target : numpy.ndarray
        0/1 target matrix with one row per training sample.
    te : numpy.ndarray
        Test features with the same columns as ``tr``.
    sample_seed : int
        Seed for ``seed_tabnet_everything`` and for the TabNet model itself.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Out-of-fold probabilities for the training rows, and the test
        probabilities of every fold stacked along a new first axis
        (one entry per fold).
    """
    seed_tabnet_everything(sample_seed) 
    tabnet_params = dict(n_d=24, n_a=24, n_steps=1, gamma=1.5, seed = sample_seed,
                     lambda_sparse=0, n_independent=1, n_shared=1,
                     optimizer_fn=torch.optim.Adam,
                     optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
                     mask_type='entmax',
                     scheduler_params=dict(mode="min",
                                           patience=5,
                                           min_lr=1e-5,
                                           factor=0.9,),
                     scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
                     verbose=10,
                     )
    test_cv_preds = []
    
    NB_SPLITS = 5
    mskf = MultilabelStratifiedKFold(n_splits=NB_SPLITS, random_state=0, shuffle=True)
    oof_preds = np.zeros([len(tr),target.shape[1]])
    # Split the array that was passed in (`tr`), not the module-level `train`
    # frame, so the fold indices always refer to the rows being modelled.
    for fold_nb, (train_idx, val_idx) in enumerate(mskf.split(tr, target)):
        print("FOLDS : ", fold_nb+1)

        ## model
        X_train, y_train = tr[train_idx, :], target[train_idx, :]
        X_val, y_val = tr[val_idx, :], target[val_idx, :]
        model = TabNetRegressor(**tabnet_params)
        
        model.fit(X_train=X_train,
              y_train=y_train,
              eval_set=[(X_val, y_val)],
              eval_name = ["val"],
              eval_metric = ["logits_ll"],
              max_epochs=MAX_EPOCH,
              patience=20, batch_size=1024, virtual_batch_size=128,
              num_workers=1, drop_last=False,
              # use binary cross entropy as this is not a regression problem
              loss_fn=torch.nn.functional.binary_cross_entropy_with_logits)
        
        # Sigmoid turns the predicted logits into probabilities.
        preds_val = model.predict(X_val)
        oof_preds[val_idx,:] = 1 / (1 + np.exp(-preds_val))
        
        # preds on test
        preds_test = model.predict(te)
        test_cv_preds.append(1 / (1 + np.exp(-preds_test)))

    test_preds_all = np.stack(test_cv_preds)
    return oof_preds, test_preds_all

In [16]:
# Seed averaging for TabNet: per seed, the test predictions are first averaged
# over the folds (axis 0), then all seeds are averaged with equal weight.
tabnet1_oof = np.zeros([len(tab_train),fn_targets.shape[1]])
tabnet1_test = np.zeros([len(tab_test),fn_targets.shape[1]])
seeds = [0,1,2,3,4]
for seed_ in seeds:
    oof_preds, test_preds_all = modelling_tabnet(tab_train, fn_targets, tab_test, seed_)
    tabnet1_oof += oof_preds / len(seeds)
    tabnet1_test += test_preds_all.mean(axis=0) / len(seeds)

FOLDS :  1
Device used : cuda
epoch 0  | loss: 0.4388  | val_logits_ll: 0.15052 |  0:00:01s
epoch 10 | loss: 0.01926 | val_logits_ll: 0.01938 |  0:00:10s
epoch 20 | loss: 0.01741 | val_logits_ll: 0.01861 |  0:00:19s
epoch 30 | loss: 0.01671 | val_logits_ll: 0.01751 |  0:00:29s
epoch 40 | loss: 0.01646 | val_logits_ll: 0.01724 |  0:00:38s
epoch 50 | loss: 0.01589 | val_logits_ll: 0.01728 |  0:00:48s
epoch 60 | loss: 0.01548 | val_logits_ll: 0.01681 |  0:00:57s
epoch 70 | loss: 0.01513 | val_logits_ll: 0.01676 |  0:01:06s

Early stopping occured at epoch 79 with best_epoch = 59 and best_val_logits_ll = 0.01676
Best weights from best epoch are automatically used!
FOLDS :  2
Device used : cuda
epoch 0  | loss: 0.44418 | val_logits_ll: 0.16504 |  0:00:00s
epoch 10 | loss: 0.01906 | val_logits_ll: 0.01926 |  0:00:10s
epoch 20 | loss: 0.01709 | val_logits_ll: 0.02044 |  0:00:20s
epoch 30 | loss: 0.0164  | val_logits_ll: 0.01993 |  0:00:30s
epoch 40 | loss: 0.01579 | val_logits_ll: 0.01746 |  

In [17]:
# Score the TabNet OOF predictions against ALL training rows: rows outside
# cons_train_index (the control samples) keep a prediction of 0.
check_tabnet = np.zeros([targets.shape[0], targets.shape[1]-1])
check_tabnet[cons_train_index,:] = tabnet1_oof
# Log loss over the flattened (row, target) pairs.
print('OOF log loss: ', log_loss(np.ravel(y), np.ravel(check_tabnet)))

OOF log loss:  0.014947202730287188


In [18]:
# ROC AUC per target column of the TabNet OOF predictions, then the unweighted mean.
aucs = []
for task_id in range(y.shape[1]):
    aucs.append(roc_auc_score(y_true=y.iloc[:, task_id].values,
                              y_score=check_tabnet[:, task_id]))
print(f"Overall AUC : {np.mean(aucs)}")

Overall AUC : 0.7962018079735242


# 1st SVM

In [19]:
# Two-stage model fitted independently for every target column:
#   stage 1: an SVC whose decision_function values are collected out-of-fold
#            (svm0_oof) and fold-averaged on the test set (svm0_test);
#   stage 2: a logistic regression on that single decision value, which turns
#            it into a probability (svm1_oof / svm1_test).
N_STARTS = 1   # number of CV repetitions (the fold seed is the repetition index)
N_SPLITS = 5   # folds per repetition

svm0_oof = np.zeros([len(fn_train), fn_targets.shape[1]])
svm0_test = np.zeros([len(fn_test), fn_targets.shape[1]])

svm1_test = np.zeros([len(fn_test),fn_targets.shape[1]])
svm1_oof = np.zeros([fn_targets.shape[0],fn_targets.shape[1]]) 

for ind in tqdm(range(fn_targets.shape[1])):
    ind_target_sum = fn_targets[:, ind].sum()
    # Targets with fewer than N_SPLITS positive rows are skipped entirely, so
    # their OOF and test predictions stay at 0.
    if ind_target_sum >= N_SPLITS:
        # Stage 1: SVC decision values.
        for seed in range(N_STARTS):
            skf = StratifiedKFold(n_splits = N_SPLITS, random_state = seed, shuffle = True)

            for n, (train_index, val_index) in enumerate(skf.split(fn_train, fn_targets[:,ind])):
                
                x_tr, x_val = fn_train[train_index], fn_train[val_index]
                y_tr, y_val = fn_targets[train_index,ind], fn_targets[val_index,ind]

                model = SVC(C = 40, cache_size = 2000)
                model.fit(x_tr, y_tr)
                # Test values are averaged over folds and repetitions; each OOF
                # row is written once per repetition.
                svm0_test[:, ind] += model.decision_function(fn_test) / (N_SPLITS * N_STARTS)
                svm0_oof[val_index, ind] += model.decision_function(x_val) / N_STARTS

        # Stage 2: logistic regression on the stage-1 decision value, using
        # folds built with the same seeds and the same stratification target.
        for seed in range(N_STARTS):
            skf = StratifiedKFold(n_splits = N_SPLITS, random_state = seed, shuffle = True)
            
            for n, (train_index, val_index) in enumerate(skf.split(svm0_oof, fn_targets[:,ind])):

                x_tr, x_val = svm0_oof[train_index, ind].reshape(-1, 1), svm0_oof[val_index, ind].reshape(-1, 1)
                y_tr, y_val = fn_targets[train_index,ind], fn_targets[val_index,ind]

                model = LogisticRegression(C = 35, max_iter = 1000)
                model.fit(x_tr, y_tr)
                # Column 1 of predict_proba is the probability of the positive class.
                svm1_test[:, ind] += model.predict_proba(svm0_test[:, ind].reshape(-1, 1))[:, 1] / (N_SPLITS * N_STARTS)
                svm1_oof[val_index, ind] += model.predict_proba(x_val)[:, 1] / N_STARTS

HBox(children=(FloatProgress(value=0.0, max=206.0), HTML(value='')))




In [20]:
# Score the SVM OOF predictions against ALL training rows: rows outside
# cons_train_index (the control samples) keep a prediction of 0.
check_svm = np.zeros([targets.shape[0], targets.shape[1]-1])
check_svm[cons_train_index,:] = svm1_oof
# Log loss over the flattened (row, target) pairs.
print('OOF log loss: ', log_loss(np.ravel(y), np.ravel(check_svm)))

OOF log loss:  0.015414626688103106


In [21]:
# ROC AUC per target column of the SVM OOF predictions, then the unweighted mean.
aucs = []
for task_id in range(y.shape[1]):
    aucs.append(roc_auc_score(y_true=y.iloc[:, task_id].values,
                              y_score=check_svm[:, task_id]))
print(f"Overall AUC : {np.mean(aucs)}")

Overall AUC : 0.7382211150214084


# submission

- https://www.kaggle.com/ipythonx/optimizing-metrics-out-of-fold-weights-ensemble
- https://www.kaggle.com/hsperr/finding-ensamble-weights
- https://stackoverflow.com/questions/18767657/how-do-i-use-a-minimization-function-in-scipy-with-constraints

In [22]:
%%time

# Per-target blend: the SVM always contributes a fixed share (svm_ratio) and the
# TabNet / MLP weights are optimised on the out-of-fold predictions so that the
# three weights sum to 1.
sub = pd.read_csv(DATA_DIR + 'sample_submission.csv')
# Blended OOF predictions for ALL training rows. The frame starts as float
# zeros, not as a copy of the ground truth: rows without a prediction (the
# control samples) are then scored as 0, as in the per-model checks, and the
# float predictions are not written into integer label columns.
check_score = pd.DataFrame(0.0, index=y.index, columns=target_feats)

np.random.seed(42)

# Models whose weights are optimised; row order here defines the weight order.
# The SVM is deliberately absent: it enters with the fixed weight svm_ratio.
blend_train = np.array([tabnet1_oof, mlp1_oof])    # out of fold prediction
blend_test = np.array([tabnet1_test, mlp1_test])   # test prediction

svm_ratio = 0.05
n_restarts = 60   # random restarts of the optimiser per target

# Loop-invariant optimiser settings: each weight lies in [0, 1] and the
# optimised weights add up to 1 - svm_ratio.
bounds = [(0, 1)] * len(blend_train)
cons = ({'type': 'eq', 'fun': lambda x:  1 - svm_ratio - sum(x)}) 

total_scores = []
for i in range(len(target_feats)):
    def log_loss_func(weights):
        """Log loss of target i for the given TabNet / MLP weights."""
        final_prediction = svm_ratio * svm1_oof[:,i]
        for weight, prediction in zip(weights, blend_train):
            final_prediction += weight * prediction[:,i]
        # labels=[0, 1] keeps the loss defined even for a single-class target.
        return log_loss(np.ravel(fn_targets[:,i]), np.ravel(final_prediction), labels=[0, 1])
    
    best_score = np.inf
    best_weights = [0] * len(blend_train)
    for k in range(n_restarts):
        starting_values = np.random.rand(len(blend_train))
        starting_values /= sum(starting_values)
            
        res = minimize(log_loss_func,
                   starting_values,
                   method='SLSQP',
                   bounds=bounds,
                   constraints = cons) 
    
        if best_score > res["fun"]:
            best_score = res["fun"]
            best_weights = res["x"]
        
    valid_prediction = svm_ratio * svm1_oof[:,i]
    for weight, prediction in zip(best_weights, blend_train):
        valid_prediction += weight * prediction[:,i]
    # Report and record the BEST restart -- the weights actually applied above
    # and below -- rather than whatever the last restart produced.
    print('Score: {}, Weights: {}'.format(round(best_score,8), best_weights))
    total_scores.append(best_score)
    check_score.loc[cons_train_index,target_feats[i]] = valid_prediction
    
    oof_test = svm_ratio * svm1_test[:,i]
    for weight, prediction in zip(best_weights, blend_test):
        oof_test += weight * prediction[:,i]
        
    sub.loc[cons_test_index,target_feats[i]] = oof_test

print("final ensemble oof score:", np.mean(total_scores))
print('OOF log loss: ', log_loss(np.ravel(y), np.ravel(check_score)))

# Control samples in the test set get a prediction of 0 for every target.
sub.loc[noncons_test_index,target_feats] = 0
sub.to_csv('submission.csv', index=False)

Score: 0.00572573, Weights: [0.59828    0.35172001]
Score: 0.00634922, Weights: [0.75974516 0.19025484]
Score: 0.00832809, Weights: [0.59538966 0.35461034]
Score: 0.04687472, Weights: [0.6243622 0.3256378]
Score: 0.06831473, Weights: [0.79922927 0.15077073]
Score: 0.02155863, Weights: [0.12033738 0.82966262]
Score: 0.01595623, Weights: [0.45373937 0.49626063]
Score: 0.02654544, Weights: [0.15884813 0.79115187]
Score: 0.00165483, Weights: [0.16194057 0.78805943]
Score: 0.05406978, Weights: [0.27029074 0.67970926]
Score: 0.078691, Weights: [0.55017053 0.39982947]
Score: 0.01048197, Weights: [0.   0.95]
Score: 0.00193801, Weights: [0.4578818 0.4921182]
Score: 0.00990744, Weights: [0.17473503 0.77526497]
Score: 0.00458215, Weights: [0.82987275 0.12012725]
Score: 0.004676, Weights: [0.46197358 0.48802642]
Score: 0.01476303, Weights: [0.44820914 0.50179086]
Score: 0.02516634, Weights: [0.35062375 0.59937625]
Score: 0.02314643, Weights: [0.87425819 0.07574181]
Score: 0.01186904, Weights: [0.4