- Improved from ensemble version 81.
- Removed RobustScaler from the TabNet pipeline (this slightly improved TabNet).

In [1]:
!pip install --no-index --find-links /kaggle/input/pytorchtabnet/pytorch_tabnet-2.0.0-py3-none-any.whl pytorch-tabnet

Looking in links: /kaggle/input/pytorchtabnet/pytorch_tabnet-2.0.0-py3-none-any.whl
Processing /kaggle/input/pytorchtabnet/pytorch_tabnet-2.0.0-py3-none-any.whl
Installing collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-2.0.0


In [2]:
import sys
import os
import pickle
import warnings
import numpy as np
import pandas as pd 
from sklearn import preprocessing
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from tqdm import tqdm_notebook as tqdm
from category_encoders import CountEncoder
from xgboost import XGBClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import KFold, StratifiedKFold
sys.path.append('../input/multilabelstraifier/')
from ml_stratifiers import MultilabelStratifiedKFold
warnings.filterwarnings('ignore')
from scipy.optimize import minimize, fsolve
from sklearn.compose import make_column_transformer,ColumnTransformer
from sklearn.base import BaseEstimator,TransformerMixin
from sklearn.pipeline import make_pipeline,make_union

import time
import torch
import random
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import tensorflow as tf
from torch.nn.modules.loss import _WeightedLoss

from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.tab_model import TabNetRegressor

#from sklearn.linear_model import LogisticRegression
#from cuml.svm import SVC, SVR

# Preprocess

In [3]:
# Raw competition tables. `drug` is merged onto the targets by sig_id in
# make_fold to obtain the drug_id of every training row.
DATA_DIR = '/kaggle/input/lish-moa/'
train = pd.read_csv(DATA_DIR + 'train_features.csv')
targets = pd.read_csv(DATA_DIR + 'train_targets_scored.csv')
test = pd.read_csv(DATA_DIR + 'test_features.csv')
drug = pd.read_csv(DATA_DIR + 'train_drug.csv')

In [4]:
# Column-name groups: every target column (all but sig_id) and the feature
# columns whose name contains "g-" or "c-".
target_feats = [ i for i in targets.columns if i != "sig_id"]
g_feats = [i for i in train.columns if "g-" in i]
c_feats = [i for i in train.columns if "c-" in i]

In [5]:
# Row positions of treated (non "ctl_vehicle") samples; the indices are taken
# BEFORE train/test are filtered so they refer to the original row order.
cons_train_index = train[train.cp_type!="ctl_vehicle"].index
# Control rows of the test set (not referenced again in the visible part of the notebook).
noncons_test_index = test[test.cp_type=="ctl_vehicle"].index
cons_test_index = test[test.cp_type!="ctl_vehicle"].index

# Keep only treated rows and renumber them from 0.
test = test[test.index.isin(cons_test_index)].reset_index(drop=True)
train = train[train.index.isin(cons_train_index)].reset_index(drop=True)
# `y` is copied before `targets` is filtered, so it still holds every original
# row (controls included); the OOF checks below score against it.
y = targets.drop("sig_id", axis=1).copy()
targets = targets[targets.index.isin(cons_train_index)].reset_index(drop=True)
# Filtered targets as a plain numpy matrix for the model functions.
fn_targets = targets.copy().drop("sig_id", axis=1).to_numpy()

In [6]:
def make_fold(NB_SPLITS, seed):
    """Build a drug-aware fold assignment for the rows of the global `targets`.

    `targets` is left-merged with the global `drug` table on sig_id. Drugs that
    occur at most 18 times are stratified at drug level (every row of such a
    drug lands in the same fold); drugs that occur more than 18 times are
    stratified row by row. Both passes use MultilabelStratifiedKFold with
    `NB_SPLITS` splits, shuffling and `seed` as random state.

    Returns
    -------
    numpy.ndarray
        int8 array of shape (n_rows, 1) holding the fold id of each merged row.
    """
    scored = targets.merge(drug, on='sig_id', how='left')

    # Split the drug ids by how often they occur.
    drug_counts = scored.drug_id.value_counts()
    rare_drugs = drug_counts.loc[drug_counts <= 18].index.sort_values()
    frequent_drugs = drug_counts.loc[drug_counts > 18].index.sort_values()

    # Rare drugs: stratify on the per-drug mean target vector.
    drug_to_fold = {}
    splitter = MultilabelStratifiedKFold(n_splits=NB_SPLITS, shuffle=True, random_state=seed)
    per_drug = scored.groupby('drug_id')[target_feats].mean().loc[rare_drugs]
    for fold_id, (_, val_pos) in enumerate(splitter.split(per_drug, per_drug[target_feats])):
        for drug_name in per_drug.index[val_pos].values:
            drug_to_fold[drug_name] = fold_id

    # Frequent drugs: stratify the individual samples.
    sample_to_fold = {}
    splitter = MultilabelStratifiedKFold(n_splits=NB_SPLITS, shuffle=True, random_state=seed)
    frequent_rows = scored.loc[scored.drug_id.isin(frequent_drugs)].reset_index(drop=True)
    for fold_id, (_, val_pos) in enumerate(splitter.split(frequent_rows, frequent_rows[target_feats])):
        for sig in frequent_rows.sig_id[val_pos].values:
            sample_to_fold[sig] = fold_id

    # Drug-level assignment first, then fill the remaining rows by sig_id.
    scored['fold'] = scored.drug_id.map(drug_to_fold)
    unassigned = scored.fold.isna()
    scored.loc[unassigned, 'fold'] = scored.loc[unassigned, 'sig_id'].map(sample_to_fold)
    scored.fold = scored.fold.astype('int8')

    return np.array([scored.fold.values]).reshape(-1, 1)

In [7]:
# Fold ids for the MLP: 7 splits, seed 34 (7 matches n_folds used by modelling_torch).
mlp_fold = make_fold(7,34)

# Feature engineering

In [8]:
def fe_simple(df, remove_features):
    """Encode cp_dose as 0/1 and drop the given columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing a 'cp_dose' column with values 'D1' / 'D2'.
    remove_features : list of str
        Columns to drop after the encoding.

    Returns
    -------
    pandas.DataFrame
        A new frame; `df` itself is not modified.
    """
    tmp = df.copy()
    # Replace the column outright instead of `tmp.loc[:, 'cp_dose'] = ...`:
    # on recent pandas the .loc form writes into the existing object column,
    # so the dtype stays `object` and a later to_numpy() is an object array.
    tmp['cp_dose'] = tmp['cp_dose'].map({'D1': 0, 'D2': 1})
    return tmp.drop(columns=remove_features)

def fe_simple2(df):
    """One-hot encode cp_time and cp_dose, then drop cp_type and sig_id.

    Returns a new DataFrame; the input frame is left untouched.
    """
    encoded = pd.get_dummies(df.copy(), columns=['cp_time','cp_dose'])
    return encoded.drop(columns=["cp_type", "sig_id"])

def fe_mlp(df_train, df_test):
    """Feature pipeline for the MLP: variance filter, quantile transform, PCA.

    Steps (every transformer is fitted on the training frame only):
      1. Drop columns from position 4 onwards whose variance is <= 0.7.
      2. Map each remaining "g-" / "c-" column to a normal distribution with a
         per-column QuantileTransformer.
      3. Append 10 PCA components of the "c-" columns ("pca-c1".."pca-c10")
         and 60 PCA components of the "g-" columns ("pca-g1".."pca-g60").

    Parameters
    ----------
    df_train, df_test : pandas.DataFrame
        Frames sharing the same column layout.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Transformed copies of the train and test frames.
    """
    tmp_train = df_train.copy()
    tmp_test = df_test.copy()

    # NOTE(review): the positional slice assumes the first four columns are the
    # non-numeric id/treatment columns — confirm against the input schema.
    candidate_cols = np.array(tmp_train.iloc[:, 4:].columns)
    select = VarianceThreshold(threshold=0.7)
    select.fit(tmp_train.iloc[:, 4:].values)
    drop_feats = list(candidate_cols[~select.get_support()])

    tmp_train = tmp_train.drop(columns=drop_feats)
    tmp_test = tmp_test.drop(columns=drop_feats)

    modg_feats = [i for i in tmp_train.columns if "g-" in i]
    modc_feats = [i for i in tmp_train.columns if "c-" in i]

    # One transformer per column, fitted on train and applied to both frames.
    for i in modc_feats + modg_feats:
        ss = preprocessing.QuantileTransformer(n_quantiles=1000, random_state=0, output_distribution="normal")
        ss.fit(tmp_train[i].values.reshape(-1,1))
        tmp_train[i] = ss.transform(tmp_train[i].values.reshape(-1,1))
        tmp_test[i] = ss.transform(tmp_test[i].values.reshape(-1,1))

    train_parts = [tmp_train]
    test_parts = [tmp_test]
    for prefix, cols, n_comp in (("pca-c", modc_feats, 10), ("pca-g", modg_feats, 60)):
        names = [prefix + str(i + 1) for i in range(n_comp)]
        pca = PCA(n_components=n_comp, random_state=42)
        # Reuse the input index so the column-wise concat aligns row by row
        # even when the caller's frame does not carry a 0..n-1 index.
        train_parts.append(pd.DataFrame(pca.fit_transform(tmp_train[cols]),
                                        columns=names, index=tmp_train.index))
        test_parts.append(pd.DataFrame(pca.transform(tmp_test[cols]),
                                       columns=names, index=tmp_test.index))

    tmp_train = pd.concat(train_parts, axis=1)
    tmp_test = pd.concat(test_parts, axis=1)

    return tmp_train, tmp_test

def fe_stats(df):
    """Append row-wise kurtosis and skewness of the "g-" and "c-" columns.

    The new columns are added in the order g_kurt, g_skew, c_kurt, c_skew.
    Returns a new DataFrame; the input is not modified.
    """
    gene_cols = [name for name in df.columns if "g-" in name]
    cell_cols = [name for name in df.columns if "c-" in name]
    gene_block = df[gene_cols]
    cell_block = df[cell_cols]
    return df.copy().assign(
        g_kurt=gene_block.kurtosis(axis = 1),
        g_skew=gene_block.skew(axis = 1),
        c_kurt=cell_block.kurtosis(axis = 1),
        c_skew=cell_block.skew(axis = 1),
    )

# Columns dropped by fe_simple (passed as its `remove_features` argument below).
remove_features = ["cp_type" , "sig_id"]

In [9]:
# Simple encoding (cp_dose -> 0/1, cp_type and sig_id dropped); reused as the TabNet input below.
fn_train = fe_simple(train, remove_features)
fn_test = fe_simple(test, remove_features)

# pytorch mlp -----------------------------------
# Variance filter + quantile transform + PCA, then row statistics and one-hot
# encoding. The fold id is appended as the LAST training column because
# modelling_torch reads it from X_train[:, -1] and strips it before training.
mlp_train, mlp_test = fe_mlp(train, test)
mlp_train = fe_stats(mlp_train)
mlp_test = fe_stats(mlp_test)
mlp_train = fe_simple2(mlp_train)
mlp_test = fe_simple2(mlp_test)
mlp_train["fold"] = mlp_fold
mlp_train = mlp_train.to_numpy()
mlp_test = mlp_test.to_numpy()

# pytorch tabnet ----------------------------------
# TabNet consumes the simply-encoded features as numpy arrays (no scaling step).
tab_train = fn_train.copy()
tab_test = fn_test.copy()
tab_train= tab_train.to_numpy()
tab_test = tab_test.to_numpy()

# The train matrix for the MLP has one extra column (the fold id).
print(mlp_train.shape, mlp_test.shape)
print(tab_train.shape, tab_test.shape)

(21948, 919) (3624, 918)
(21948, 874) (3624, 874)


# 1st mlp

In [10]:
class SmoothCrossEntropyLoss(_WeightedLoss):
    """Binary cross-entropy on logits with label smoothing.

    Parameters
    ----------
    weight : torch.Tensor or None
        Optional per-output factor; when given, the logits are multiplied by
        it (broadcast over the batch dimension) before the loss is computed.
    reduction : str
        Reduction passed to `binary_cross_entropy_with_logits` ('mean' by default).
    smoothing : float
        Smoothing factor in [0, 1]. Targets become
        `t * (1 - smoothing) + smoothing / n_outputs`.
    """

    def __init__(self, weight=None, reduction='mean', smoothing=0.0):
        # The base class stores `weight` (as a buffer) and `reduction`.
        super().__init__(weight=weight, reduction=reduction)
        self.smoothing = smoothing

    @staticmethod
    def _smooth(targets, n_classes, smoothing=0.0):
        """Return the smoothed copy of `targets` (no gradient is tracked)."""
        assert 0 <= smoothing <= 1
        with torch.no_grad():
            # ones_like already lives on the device of `targets`, so no
            # notebook-level `device` variable is needed here.
            targets = targets * (1 - smoothing) + torch.ones_like(targets) * smoothing / n_classes
        return targets

    def forward(self, inputs, targets):
        """Compute the smoothed BCE-with-logits loss for a batch of logits."""
        targets = self._smooth(targets, inputs.shape[1], self.smoothing)

        if self.weight is not None:
            inputs = inputs * self.weight.unsqueeze(0)

        # Honour the configured reduction instead of always averaging.
        return F.binary_cross_entropy_with_logits(inputs, targets, reduction=self.reduction)

In [11]:
# Settings read as globals by modelling_torch.
device = "cuda" if torch.cuda.is_available() else "cpu"
batch_size = 128
n_folds=7
train_epochs = 20
smoothing = 0.001
# Predicted probabilities are clamped to [p_min, p_max] before scoring.
p_min = smoothing
p_max = 1 - smoothing

def mean_log_loss(y_true, y_pred):
    """Average the per-column binary log loss over the target columns.

    One column is scored per entry of the global `target_feats`; columns are
    addressed by position in `y_true` / `y_pred`.
    """
    per_target = [
        log_loss(y_true[:, col], y_pred[:, col].astype(float), labels=[0,1])
        for col in range(len(target_feats))
    ]
    return np.mean(per_target)

def seed_everything(seed=1234):
    """Seed Python, NumPy, TensorFlow and PyTorch (CPU and CUDA) with `seed`.

    Also exports PYTHONHASHSEED and asks cuDNN for deterministic kernels.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (
        random.seed,
        tf.random.set_seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True
    
class MoaModel(nn.Module):
    """Three-stage MLP: (BatchNorm -> Dropout -> weight-normed Linear) x 3.

    The first two stages are followed by LeakyReLU; the last stage returns raw
    logits of width `last_num`.
    """

    def __init__(self, num_columns, last_num):
        super().__init__()
        width = 1024
        drop_p = 0.1

        # Stage 1: input -> hidden
        self.batch_norm1 = nn.BatchNorm1d(num_columns)
        self.dropout1 = nn.Dropout(drop_p)
        self.dense1 = nn.utils.weight_norm(nn.Linear(num_columns, width))
        self.relu1 = nn.LeakyReLU()

        # Stage 2: hidden -> hidden
        self.batch_norm2 = nn.BatchNorm1d(width)
        self.dropout2 = nn.Dropout(drop_p)
        self.dense2 = nn.utils.weight_norm(nn.Linear(width, width))
        self.relu2 = nn.LeakyReLU()

        # Stage 3: hidden -> logits
        self.batch_norm3 = nn.BatchNorm1d(width)
        self.dropout3 = nn.Dropout(drop_p)
        self.dense3 = nn.utils.weight_norm(nn.Linear(width, last_num))

    def forward(self, x):
        """Return the logits for a batch `x`."""
        activated_stages = (
            (self.batch_norm1, self.dropout1, self.dense1, self.relu1),
            (self.batch_norm2, self.dropout2, self.dense2, self.relu2),
        )
        for norm, drop, dense, act in activated_stages:
            x = act(dense(drop(norm(x))))

        return self.dense3(self.dropout3(self.batch_norm3(x)))
    
def modelling_torch(tr, target, te, sample_seed, init_num, last_num):
    """Train MoaModel on pre-assigned folds and return OOF and test predictions.

    Parameters
    ----------
    tr : numpy.ndarray
        Training matrix whose LAST column holds the fold id of each row; the
        remaining columns are the features.
    target : numpy.ndarray
        Target matrix, one row per row of `tr`.
    te : numpy.ndarray
        Test feature matrix (no fold column).
    sample_seed : int
        Seed passed to seed_everything before any model is built.
    init_num : int
        Number of input features of the network.
    last_num : int
        Number of network outputs.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Out-of-fold probabilities for `tr` and the fold-averaged test
        probabilities, both clamped to [p_min, p_max].

    Notes
    -----
    Reads the notebook globals n_folds, batch_size, train_epochs, smoothing,
    p_min, p_max and device. The best weights of each fold are written to
    'best-model-parameters.pt' in the working directory and read back for
    prediction.
    """
    seed_everything(seed=sample_seed)
    X_train = tr.copy()
    y_train = target.copy()
    X_test = te.copy()
    test_len = X_test.shape[0]
    n_targets = y_train.shape[1]

    def metric(inputs, targets):
        # Un-smoothed BCE on clamped probabilities; used for model selection.
        return F.binary_cross_entropy(torch.clamp(torch.sigmoid(inputs), p_min, p_max), targets)

    test_ds = torch.utils.data.TensorDataset(torch.tensor(X_test, dtype=torch.float32))
    test_loader = torch.utils.data.DataLoader(test_ds, batch_size=batch_size, shuffle=False)

    oof = np.zeros([len(X_train), n_targets])
    oof_targets = np.zeros([len(X_train), n_targets])
    pred_value = np.zeros([test_len, n_targets])

    scores = []
    for fold in range(n_folds):
        valid_index = X_train[:,-1] == fold
        train_index = X_train[:,-1] != fold
        print("Fold "+str(fold+1))

        # Strip the trailing fold-id column from the features.
        X_train2 = torch.tensor(X_train[train_index,:], dtype=torch.float32)[:, :-1]
        X_valid2 = torch.tensor(X_train[valid_index,:], dtype=torch.float32)[:, :-1]
        y_train2 = torch.tensor(y_train[train_index], dtype=torch.float32)
        y_valid2 = torch.tensor(y_train[valid_index], dtype=torch.float32)

        train_ds = torch.utils.data.TensorDataset(X_train2, y_train2)
        valid_ds = torch.utils.data.TensorDataset(X_valid2, y_valid2)
        train_loader = torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=True)
        valid_loader = torch.utils.data.DataLoader(valid_ds, batch_size=batch_size, shuffle=False)

        clf = MoaModel(init_num, last_num)
        loss_fn = SmoothCrossEntropyLoss(smoothing=smoothing)

        optimizer = optim.Adam(clf.parameters(), lr = 0.001, weight_decay=1e-5)
        # One-cycle schedule stepped once per batch.
        scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.1, div_factor=1e3,
                                                  max_lr=1e-2, epochs=train_epochs, steps_per_epoch=len(train_loader))

        clf.to(device)

        best_val_loss = np.inf
        for epoch in range(train_epochs):
            start_time = time.time()
            clf.train()
            avg_loss = 0.
            sm_avg_loss = 0.

            for x_batch, y_batch in train_loader:
                x_batch = x_batch.to(device)
                y_batch = y_batch.to(device)
                y_pred = clf(x_batch)
                loss = loss_fn(y_pred, y_batch)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                scheduler.step()
                avg_loss += loss.item() / len(train_loader)
                # Accumulate a plain float so no autograd graph is kept alive.
                with torch.no_grad():
                    sm_avg_loss += metric(y_pred, y_batch).item() / len(train_loader)

            clf.eval()
            avg_val_loss = 0.
            sm_avg_val_loss = 0.
            with torch.no_grad():
                for x_batch, y_batch in valid_loader:
                    x_batch = x_batch.to(device)
                    y_batch = y_batch.to(device)
                    y_pred = clf(x_batch)
                    avg_val_loss += loss_fn(y_pred, y_batch).item() / len(valid_loader)
                    sm_avg_val_loss += metric(y_pred, y_batch).item() / len(valid_loader)

            elapsed_time = time.time() - start_time

            # Checkpoint whenever the un-smoothed validation loss improves.
            # NOTE(review): if no epoch ever improves (e.g. NaN losses) the
            # checkpoint of a previous fold/run would be loaded below.
            if sm_avg_val_loss < best_val_loss:
                best_val_loss = sm_avg_val_loss
                print('Best: Epoch {} \t loss={:.5f} \t val_loss={:.5f} \t sm_loss={:.5f} \t sm_val_loss={:.5f} \t time={:.2f}s'.format(
                    epoch + 1, avg_loss, avg_val_loss, sm_avg_loss, sm_avg_val_loss, elapsed_time))
                torch.save(clf.state_dict(), 'best-model-parameters.pt')

        # Fresh CPU model holding the best weights of this fold.
        pred_model = MoaModel(init_num, last_num)
        pred_model.load_state_dict(torch.load('best-model-parameters.pt'))
        pred_model.eval()

        # validation check ----------------
        oof_epoch = np.zeros([X_valid2.size(0), n_targets])
        target_epoch = np.zeros([X_valid2.size(0), n_targets])
        with torch.no_grad():
            for i, (x_batch, y_batch) in enumerate(valid_loader):
                rows = slice(i * batch_size, (i + 1) * batch_size)
                probs = torch.clamp(torch.sigmoid(pred_model(x_batch).cpu()), p_min, p_max)
                oof_epoch[rows, :] = probs.numpy()
                target_epoch[rows, :] = y_batch.cpu().numpy()
        fold_score = mean_log_loss(target_epoch, oof_epoch)
        print("Fold {} log loss: {}".format(fold+1, fold_score))
        scores.append(fold_score)
        oof[valid_index,:] = oof_epoch
        oof_targets[valid_index,:] = target_epoch
        #-----------------------------------

        # test prediction --------------
        test_preds = np.zeros([test_len, n_targets])
        with torch.no_grad():
            for i, (x_batch,) in enumerate(test_loader):
                rows = slice(i * batch_size, (i + 1) * batch_size)
                probs = torch.clamp(torch.sigmoid(pred_model(x_batch).cpu()), p_min, p_max)
                test_preds[rows, :] = probs.numpy()
        pred_value += test_preds / n_folds
        # ------------------------------

    # Report the seed this call was given (not a loop variable of the caller).
    print("Seed {}".format(sample_seed))
    for i, fold_score in enumerate(scores):
        print("Fold {} log loss: {}".format(i+1, fold_score))
    print("Std of log loss: {}".format(np.std(scores)))
    print("Total log loss: {}".format(mean_log_loss(oof_targets, oof)))

    return oof, pred_value

In [12]:
# Seed averaging: train the MLP once per seed and average the OOF and test
# predictions with equal weight.
seeds = [0,1,2,3,4,5,6,7,8,9]
mlp1_oof = np.zeros([len(mlp_train),fn_targets.shape[1]])
mlp1_test = np.zeros([len(mlp_test),fn_targets.shape[1]])

for seed_ in seeds:
    # mlp_train carries the fold id as its last column, hence shape[1]-1 input features.
    oof, pytorch_pred = modelling_torch(mlp_train, fn_targets, mlp_test, seed_, mlp_train.shape[1]-1, fn_targets.shape[1])
    mlp1_oof += oof / len(seeds)
    mlp1_test += pytorch_pred / len(seeds)

Fold 1
Best: Epoch 1 	 loss=0.41263 	 val_loss=0.02233 	 sm_loss=0.41259 	 sm_val_loss=0.02228 	 time=1.73s
Best: Epoch 2 	 loss=0.02025 	 val_loss=0.02022 	 sm_loss=0.02019 	 sm_val_loss=0.01977 	 time=1.21s
Best: Epoch 3 	 loss=0.01819 	 val_loss=0.01803 	 sm_loss=0.01819 	 sm_val_loss=0.01805 	 time=1.25s
Best: Epoch 5 	 loss=0.01735 	 val_loss=0.01793 	 sm_loss=0.01747 	 sm_val_loss=0.01796 	 time=1.06s
Best: Epoch 6 	 loss=0.01736 	 val_loss=0.01786 	 sm_loss=0.01748 	 sm_val_loss=0.01789 	 time=1.31s
Best: Epoch 8 	 loss=0.01730 	 val_loss=0.01766 	 sm_loss=0.01743 	 sm_val_loss=0.01767 	 time=1.04s
Best: Epoch 11 	 loss=0.01702 	 val_loss=0.01745 	 sm_loss=0.01715 	 sm_val_loss=0.01746 	 time=1.02s
Best: Epoch 13 	 loss=0.01650 	 val_loss=0.01731 	 sm_loss=0.01665 	 sm_val_loss=0.01735 	 time=1.28s
Best: Epoch 14 	 loss=0.01622 	 val_loss=0.01731 	 sm_loss=0.01639 	 sm_val_loss=0.01734 	 time=1.07s
Best: Epoch 15 	 loss=0.01592 	 val_loss=0.01726 	 sm_loss=0.01609 	 sm_val_loss=

In [13]:
# Score against the full target table `y`: rows outside cons_train_index
# (the control rows) keep a prediction of 0.
check_mlp = np.zeros([y.shape[0], y.shape[1]])
check_mlp[cons_train_index,:] = mlp1_oof
print('OOF log loss: ', log_loss(np.ravel(y), np.ravel(check_mlp)))

OOF log loss:  0.015653738290853914


In [14]:
# Mean of the per-target ROC AUC of the MLP OOF predictions, computed over all
# rows of `y` (control rows carry a prediction of 0 in check_mlp).
aucs = []
for task_id in range(y.shape[1]):
    aucs.append(roc_auc_score(y_true=y.iloc[:, task_id].values,
                              y_score=check_mlp[:, task_id]))
print(f"Overall AUC : {np.mean(aucs)}")

Overall AUC : 0.6718415116935446


# 1st tabnet

In [15]:
class LogitsLogLoss(Metric):
    """TabNet evaluation metric: mean binary log loss computed from raw logits.

    Registered under the name "logits_ll"; lower values are better.
    """

    def __init__(self):
        self._name = "logits_ll"
        self._maximize = False

    def __call__(self, y_true, y_pred):
        """Return the mean log loss of logits `y_pred` against labels `y_true`."""
        eps = 1e-15  # keeps log() finite at probabilities of exactly 0 or 1
        probs = 1 / (1 + np.exp(-y_pred))
        negative_term = (1-y_true)*np.log(1-probs+eps)
        positive_term = y_true*np.log(probs+eps)
        return np.mean(-(negative_term + positive_term))

In [16]:
# Upper bound on TabNet training epochs (passed as max_epochs in modelling_tabnet).
MAX_EPOCH=200

def seed_tabnet_everything(seed_value):
    """Seed Python, NumPy and PyTorch and export PYTHONHASHSEED.

    When CUDA is available, the CUDA generators are seeded as well and cuDNN is
    switched to deterministic, non-benchmark mode.
    """
    for apply_seed in (random.seed, np.random.seed, torch.manual_seed):
        apply_seed(seed_value)
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
        
def modelling_tabnet(tr, target, te, sample_seed):
    """Train TabNet with 5-fold multilabel-stratified CV.

    Parameters
    ----------
    tr : numpy.ndarray
        Training feature matrix.
    target : numpy.ndarray
        Target matrix, one row per row of `tr`.
    te : numpy.ndarray
        Test feature matrix.
    sample_seed : int
        Seed for the global RNGs and for the TabNet model.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        Out-of-fold probabilities with the shape of `target`, and the test
        probabilities of every fold stacked along a new first axis.
    """
    seed_tabnet_everything(sample_seed)
    tabnet_params = dict(n_d=12, n_a=12, n_steps=1, gamma=1.3, seed = sample_seed,
                     lambda_sparse=0, optimizer_fn=torch.optim.Adam,
                     optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
                     mask_type='entmax',
                     scheduler_params=dict(mode="min",
                                           patience=5,
                                           min_lr=1e-5,
                                           factor=0.9,),
                     scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
                     verbose=10,
                     )
    test_cv_preds = []

    NB_SPLITS = 5
    # The fold split uses a fixed random_state, so it is the same for every seed.
    mskf = MultilabelStratifiedKFold(n_splits=NB_SPLITS, random_state=0, shuffle=True)
    oof_preds = np.zeros([len(tr),target.shape[1]])
    # Split on the `tr` argument rather than the notebook-level `train`, which
    # is rebound later in the notebook and may then have a different length.
    for fold_nb, (train_idx, val_idx) in enumerate(mskf.split(tr, target)):
        print("FOLDS : ", fold_nb+1)

        ## model
        X_train, y_train = tr[train_idx, :], target[train_idx, :]
        X_val, y_val = tr[val_idx, :], target[val_idx, :]
        model = TabNetRegressor(**tabnet_params)

        model.fit(X_train=X_train,
              y_train=y_train,
              eval_set=[(X_val, y_val)],
              eval_name = ["val"],
              eval_metric = ["logits_ll"],
              max_epochs=MAX_EPOCH,
              patience=20, batch_size=1024, virtual_batch_size=128,
              num_workers=1, drop_last=False,
              # use binary cross entropy as this is not a regression problem
              loss_fn=torch.nn.functional.binary_cross_entropy_with_logits)

        # The regressor outputs logits; apply the sigmoid to get probabilities.
        preds_val = model.predict(X_val)
        preds =  1 / (1 + np.exp(-preds_val))
        oof_preds[val_idx,:] = preds

        # preds on test
        preds_test = model.predict(te)
        test_cv_preds.append(1 / (1 + np.exp(-preds_test)))

    test_preds_all = np.stack(test_cv_preds)
    return oof_preds, test_preds_all

In [17]:
# Seed averaging for TabNet: OOF predictions are averaged over seeds, test
# predictions over folds (axis 0 of test_preds_all) and then over seeds.
tabnet1_oof = np.zeros([len(tab_train),fn_targets.shape[1]])
tabnet1_test = np.zeros([len(tab_test),fn_targets.shape[1]])
seeds = [0,1]
for seed_ in seeds:
    oof_preds, test_preds_all = modelling_tabnet(tab_train, fn_targets, tab_test, seed_)
    tabnet1_oof += oof_preds / len(seeds)
    tabnet1_test += test_preds_all.mean(axis=0) / len(seeds)

FOLDS :  1
Device used : cuda
epoch 0  | loss: 0.56891 | val_logits_ll: 0.30189 |  0:00:01s
epoch 10 | loss: 0.02044 | val_logits_ll: 0.02035 |  0:00:14s
epoch 20 | loss: 0.01882 | val_logits_ll: 0.01888 |  0:00:26s
epoch 30 | loss: 0.01766 | val_logits_ll: 0.0192  |  0:00:38s
epoch 40 | loss: 0.01701 | val_logits_ll: 0.01758 |  0:00:51s
epoch 50 | loss: 0.01697 | val_logits_ll: 0.01728 |  0:01:04s
epoch 60 | loss: 0.01655 | val_logits_ll: 0.01714 |  0:01:16s
epoch 70 | loss: 0.01654 | val_logits_ll: 0.01709 |  0:01:28s
epoch 80 | loss: 0.01625 | val_logits_ll: 0.01701 |  0:01:40s
epoch 90 | loss: 0.01592 | val_logits_ll: 0.01686 |  0:01:52s
epoch 100| loss: 0.0161  | val_logits_ll: 0.01735 |  0:02:06s
epoch 110| loss: 0.01536 | val_logits_ll: 0.01688 |  0:02:18s
epoch 120| loss: 0.01541 | val_logits_ll: 0.01687 |  0:02:30s

Early stopping occured at epoch 122 with best_epoch = 102 and best_val_logits_ll = 0.01667
Best weights from best epoch are automatically used!
FOLDS :  2
Device u

In [18]:
# Same scoring as for the MLP: rows outside cons_train_index keep a prediction of 0.
check_tabnet = np.zeros([y.shape[0], y.shape[1]])
check_tabnet[cons_train_index,:] = tabnet1_oof
print('OOF log loss: ', log_loss(np.ravel(y), np.ravel(check_tabnet)))

OOF log loss:  0.015306827882797544


# 1st dae

In [19]:
class CatIntMapper( BaseEstimator, TransformerMixin ):
    """Replace the values of one column by integers taken from a dictionary.

    The mapped column is moved to the end of the returned frame. Every value
    of the column must be a key of the dictionary, otherwise an AssertionError
    is raised.
    """

    def __init__(self ,col,dicti):
        self.col = col
        self.dicti = dicti

    def fit(self, X, y = None):
        # Stateless: nothing is learned from the data.
        return self

    def _remap(self, X):
        """Return X with `self.col` mapped through `self.dicti` and placed last."""
        source = X[self.col]
        assert  source.isin(self.dicti.keys()).all()
        encoded = source.map(self.dicti).astype(int).rename(self.col)
        remaining = X.drop(self.col,axis=1)
        return pd.concat([remaining, encoded],axis=1)

    def fit_transform( self, X, y = None  ):
        return self._remap(X)

    def transform( self, X):
        return self._remap(X)

class NamedOutTWrapper(BaseEstimator, TransformerMixin):
    """Apply a transformer to selected DataFrame columns and return a DataFrame.

    Parameters
    ----------
    transformer : object
        Object exposing fit / transform / fit_transform.
    columns : list of str
        Columns handed to the transformer.
    inplace : bool
        If True, the selected columns are replaced by the transformed values
        (and moved to the end of the frame). If False, the transformer output
        is appended as new columns named
        ``<TransformerClassName><prefix><i>``.
    prefix : str
        Separator placed between the transformer class name and the output
        column index when `inplace` is False.
    """

    def __init__(self,transformer,columns,inplace=False,prefix='_' ):
        self.transformer = transformer
        # Stored under the constructor argument's name as well, so that
        # BaseEstimator.get_params() / clone() / repr() can find it.
        self.columns = columns
        self.cols = columns
        self.inplace =  inplace
        self.prefix = prefix
        self.transformer_name = self._get_transformer_name()

    def fit(self, X, y = None):
        self.transformer = self.transformer.fit(X[self.cols] , y )
        return self

    def fit_transform( self, X, y = None  ):
        return self._assemble(X, self.transformer.fit_transform(X[self.cols] , y ))

    def transform( self, X):
        return self._assemble(X, self.transformer.transform(X[self.cols]))

    def _assemble(self, X, transformed_columns):
        """Combine X with the transformer output according to `inplace`."""
        if self.inplace:
            # Work on an explicit copy so the caller's frame is never written to.
            out = X[self.cols].copy()
            out[self.cols] = transformed_columns
            return pd.concat([X.drop(self.cols,axis=1),out],axis=1)

        out = pd.DataFrame(index=X.index)
        for i,values in enumerate(transformed_columns.transpose()):
            out[ self.transformer_name + self.prefix + str(i)] = values
        return pd.concat([X,out],axis=1)

    def _get_transformer_name(self):
        # Class name of the wrapped transformer, e.g. "PCA".
        return type(self.transformer).__name__
    
class IdentityTransformer:
    '''Dummy transformer used as a filler: returns its input unchanged.'''

    def fit(self, X, y = None):
        # Nothing to learn.
        return self

    def transform( self, X):
        return X

    def fit_transform( self, X, y = None  ):
        return X
    
class SuppressControls( BaseEstimator, TransformerMixin ):
    """Drop the rows whose cp_type is 'ctl_vehicle', then drop the cp_type column."""

    def __init__(self ):
        pass

    def fit(self, X, y = None):
        # Stateless transformer.
        return self

    def fit_transform( self, X, y = None  ):
        return self.transform(X)

    def transform( self, X):
        is_treated = X['cp_type']!='ctl_vehicle'
        return X.loc[is_treated].drop('cp_type', axis=1)
    
def multifold_indexer(train,target_columns,n_splits=10,random_state=12347,**kwargs):
    """Return a copy of `train` with an integer 'kfold' column.

    The fold ids come from MultilabelStratifiedKFold, stratified on
    `target_columns`; extra keyword arguments are forwarded to the splitter.
    """
    splitter = MultilabelStratifiedKFold(n_splits=n_splits,random_state=random_state,**kwargs)

    indexed = train.copy()
    indexed['kfold'] = 0
    # Fold ids are written by position into the last column.
    for fold_id, (_, val_pos) in enumerate(splitter.split(X=train, y=train[target_columns])):
        indexed.iloc[val_pos, -1] = int(fold_id)

    indexed['kfold'] = indexed['kfold'].astype(int)
    return indexed

class MoADataset:
    """Map-style dataset yielding {'x': features, 'y': targets} float tensors."""

    def __init__(self, features, targets):
        self.features = features
        self.targets = targets

    def __len__(self):
        return self.features.shape[0]

    def __getitem__(self, idx):
        x_row = torch.tensor(self.features[idx, :], dtype=torch.float)
        y_row = torch.tensor(self.targets[idx, :], dtype=torch.float)
        return {'x': x_row, 'y': y_row}
    
class TestDataset:
    """Map-style dataset yielding {'x': features} float tensors (no targets)."""

    def __init__(self, features):
        self.features = features

    def __len__(self):
        return self.features.shape[0]

    def __getitem__(self, idx):
        x_row = torch.tensor(self.features[idx, :], dtype=torch.float)
        return {'x': x_row}
    
class DAE_Model(nn.Module):
    """Autoencoder whose three hidden activations can be used as features.

    Layout: num_features -> hidden_size -> hidden_size2 -> hidden_size ->
    num_features, all layers weight-normed. `num_targets` is accepted but not
    used, and `batch_norm1` is created but not applied in `forward`.
    """

    def __init__(self, num_features, num_targets, hidden_size=1100,hidden_size2=1300):
        super().__init__()
        self.batch_norm1 = nn.BatchNorm1d(num_features)
        self.dense1 = nn.utils.weight_norm(nn.Linear(num_features, hidden_size))

        self.batch_norm2 = nn.BatchNorm1d(hidden_size)
        self.dense2 = nn.utils.weight_norm(nn.Linear(hidden_size, hidden_size2))

        self.batch_norm3 = nn.BatchNorm1d(hidden_size2)
        self.dense3 = nn.utils.weight_norm(nn.Linear(hidden_size2, hidden_size))

        self.dense4 = nn.utils.weight_norm(nn.Linear(hidden_size, num_features))

    def forward(self, x,mode='DAE'):
        """Return the reconstruction (mode == 'DAE') or the three hidden activations."""
        hidden1 = F.relu(self.dense1(x))
        hidden2 = F.relu(self.dense2(self.batch_norm2(hidden1)))
        hidden3 = F.relu(self.dense3(self.batch_norm3(hidden2)))
        reconstruction = self.dense4(hidden3)

        if mode != 'DAE':
            return hidden1, hidden2, hidden3
        return reconstruction
        
class ColumnDropper( BaseEstimator, TransformerMixin ):
    """Transformer that removes a fixed list of columns from a DataFrame."""

    def __init__( self, cols ):
        self.cols=cols

    def fit( self, X, y = None ):
        # Stateless: nothing to learn.
        return self

    def transform( self, X, y = None ):
        # Returns a new frame without the configured columns.
        return X.drop(columns=self.cols)
    
# Feature-name groups by prefix.
GENES = [col for col in train.columns if col.startswith('g-')]
CELLS = [col for col in train.columns if col.startswith('c-')]
        
# Integer encodings of the categorical columns.
map_controls = CatIntMapper('cp_type',{'ctl_vehicle': 0, 'trt_cp': 1})    
map_dose = CatIntMapper('cp_dose',{'D1': 1, 'D2': 0})    
map_time = CatIntMapper('cp_time',{24: 0, 48: 1, 72: 2})  
# Quantile-normalise the gene and cell columns in place.
Rankg_g_tansform = NamedOutTWrapper(preprocessing.QuantileTransformer(n_quantiles=100,random_state=0, output_distribution="normal"),columns= GENES+CELLS,inplace=True)
# The two PCA wrappers are defined here but are not part of transformers_list below.
PCA_g_tansform =  NamedOutTWrapper(PCA(20),columns= GENES,prefix ='_g' )
PCA_c_tansform =  NamedOutTWrapper(PCA(20),columns= CELLS,prefix ='_c' )

CatDropper =ColumnDropper(cols=['cp_type','cp_time','cp_dose'])
# Order of the preprocessing steps: encode categoricals, quantile-transform, drop categoricals.
transformers_list=[map_controls,map_dose,map_time,Rankg_g_tansform,CatDropper]
exp_name = 'test_DAE_0.2_all_together'

def run_inference(X_train,y_train,X_valid,y_valid,X_test,fold, seed,inference_only=False,**kwargs):
    """Build a DAE_Model and extract its hidden-layer features.

    Returns a tuple (oof, predictions): `predictions` are the features of
    `X_test` from infer_features_fn; `oof` is the result for the validation
    loader, or 0 when `inference_only` is True. Extra keyword arguments are
    forwarded to DAE_Model. Reads the globals BATCH_SIZE and DEVICE.

    NOTE(review): `fold` is not used in the body shown here.
    NOTE(review): no weights are loaded into the model in this function as
    shown — confirm whether a load_state_dict call is expected before inference.
    """
    seed_everything(seed)
    if not  inference_only:
        # NOTE(review): train_dataset / trainloader are built but not used below.
        train_dataset = MoADataset(X_train, y_train)
        valid_dataset = MoADataset(X_valid, y_valid)
        trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
        validloader = torch.utils.data.DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

    testdataset = TestDataset(X_test)
    testloader = torch.utils.data.DataLoader(testdataset, batch_size=BATCH_SIZE, shuffle=False)
    
    # X_train is needed for its width even when inference_only is True.
    model = DAE_Model(
        num_features= X_train.shape[1] ,
        num_targets=  X_train.shape[1],
       # hidden_size=hidden_size,
        **kwargs
    )
    
    model.to(DEVICE)
    
    if not  inference_only:
        # NOTE(review): `inference_infer_features_fn` is not defined in the
        # visible part of this file; the helper used for the test set below is
        # `infer_features_fn` — confirm which one is intended.
        oof = inference_infer_features_fn(model, validloader, DEVICE)    
    else:
        oof= 0
    
    predictions = infer_features_fn(model, testloader, DEVICE)
    predictions = predictions
    
    return oof, predictions

# DAE settings. DEVICE and BATCH_SIZE are read by run_inference; the other
# values are not referenced in the visible part of the notebook.
DEVICE = ('cuda' if torch.cuda.is_available() else 'cpu')
EPOCHS = 1000
BATCH_SIZE = 640
LEARNING_RATE = 4e-3
WEIGHT_DECAY = 1e-8
NFOLDS = 10
EARLY_STOPPING_STEPS = 10
EARLY_STOP = False
#num_features=len(feature_cols)
#num_targets=len(target_cols)
# Same values as the defaults of DAE_Model's hidden_size / hidden_size2.
hidden_size=1100
hidden_size2=1300
PATIENCE=10
THRESHOLD = 5e-3

def train_fn(model, optimizer, scheduler, loss_fn, dataloader, device):
    """Run one training epoch and return the mean batch loss.

    Each batch is a dict with keys 'x' and 'y'. Any scheduler other than
    ReduceLROnPlateau is stepped once per batch.
    """
    model.train()
    step_every_batch = scheduler.__class__ != torch.optim.lr_scheduler.ReduceLROnPlateau
    batch_losses = []

    for batch in dataloader:
        optimizer.zero_grad()
        features = batch['x'].to(device)
        labels = batch['y'].to(device)
        loss = loss_fn(model(features), labels)
        loss.backward()
        optimizer.step()

        if step_every_batch:
            scheduler.step()

        batch_losses.append(loss.item())

    return sum(batch_losses) / len(dataloader)

def valid_fn(model, scheduler, loss_fn, dataloader, device):
    """Evaluate the model on a validation loader.

    Each batch is a dict with keys 'x' and 'y'. Returns a tuple
    (mean batch loss, sigmoid of the model outputs as one numpy array). A
    ReduceLROnPlateau scheduler is stepped once with the mean loss.
    """
    model.eval()
    final_loss = 0
    valid_preds = []

    # Evaluation needs no gradients; skipping the autograd graph saves memory.
    with torch.no_grad():
        for data in dataloader:
            inputs, targets = data['x'].to(device), data['y'].to(device)
            outputs = model(inputs)
            loss = loss_fn(outputs, targets)

            final_loss += loss.item()
            valid_preds.append(outputs.sigmoid().detach().cpu().numpy())

    final_loss /= len(dataloader)
    valid_preds = np.concatenate(valid_preds)

    if scheduler.__class__ ==  torch.optim.lr_scheduler.ReduceLROnPlateau:
        scheduler.step(final_loss)

    return final_loss, valid_preds

def inference_fn(model, dataloader, device):
    """Return the sigmoid of the model outputs for every batch, concatenated.

    Each batch is a dict with key 'x'; gradients are disabled during the
    forward pass.
    """
    model.eval()
    batch_probs = []

    for batch in dataloader:
        features = batch['x'].to(device)
        with torch.no_grad():
            logits = model(features)
        batch_probs.append(logits.sigmoid().detach().cpu().numpy())

    return np.concatenate(batch_probs)

def infer_features_fn(model, dataloader, device):
    """Collect the model's intermediate features for every batch.

    The model is called as ``model(x, mode='get_features')`` and is expected to
    return a sequence of tensors; these are concatenated along axis 1 per
    batch, and the batches are then stacked into one numpy array in loader
    order.
    """
    model.eval()

    def _extract(batch):
        # Forward pass only; no autograd graph is recorded.
        with torch.no_grad():
            feature_parts = model(batch['x'].to(device), mode='get_features')
        return torch.cat(feature_parts, axis=1).detach().cpu().numpy()

    return np.concatenate([_extract(batch) for batch in dataloader])

def train_short_form_loader(feature_file,target_file,extra_target_file=None):
    '''Load the training features and targets into one wide frame.

    The targets CSV (optionally inner-merged with ``extra_target_file`` on
    ``sig_id``) is left-joined with the features CSV on ``sig_id``.  The result
    is indexed by ``sig_id``, with target columns first and feature columns
    after; ``cp_type`` and ``cp_dose`` are cast to ``category``.

    Returns:
        (merged DataFrame, list of target column names).
    '''
    feature_frame = pd.read_csv(feature_file)
    target_frame = pd.read_csv(target_file)

    if extra_target_file is not None:
        target_frame = pd.merge(target_frame, pd.read_csv(extra_target_file), on ='sig_id')

    # Every column after the leading sig_id is a target.
    target_names = target_frame.columns[1:].to_list()

    merged = target_frame.merge(feature_frame, how="left", on="sig_id").set_index("sig_id")
    del feature_frame, target_frame

    for categorical_col in ("cp_type", "cp_dose"):
        merged[categorical_col] = merged[categorical_col].astype('category')

    return merged, target_names

def test_short_form_loader(feature_file):
    '''takes the original target and features and creates a train dataset 
    in col long format'''

    train_features = pd.read_csv(feature_file)

    train_melt =  train_features.copy()
    del train_features

    train_melt.set_index("sig_id",inplace=True)

    train_melt["cp_type"]= train_melt["cp_type"].astype('category')
    train_melt["cp_dose"]= train_melt["cp_dose"].astype('category')

    return train_melt 

# Seeds to iterate over in the loop below (a single seed here).
SEED = [0]
train,target_cols = train_short_form_loader('../input/lish-moa/train_features.csv','../input/lish-moa/train_targets_scored.csv')
test = test_short_form_loader("../input/lish-moa/test_features.csv")

# Stack train and test rows into one frame. The test loader returns no target
# columns, so those cells are NaN after the concat and are filled with 0.
train = pd.concat([train,test])
train[target_cols]= train[target_cols].fillna(0)
# From here on `test` is a copy of the combined train+test frame, not the
# test rows alone.
test = train.copy()

def seed_everything(seed=42):
    """Seed Python's ``random``, NumPy and PyTorch (CPU and current CUDA device).

    Also switches cuDNN to deterministic mode.  ``PYTHONHASHSEED`` is
    intentionally left untouched.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)

    torch.backends.cudnn.deterministic = True
    
# Buffers sized for OOF / test predictions; nothing below in this cell writes
# to them.
oof = np.zeros((len(train), len(target_cols)))
predictions = np.zeros((len(test), len(target_cols)))

for seed in SEED:
    
    # multifold_indexer is defined outside this excerpt; the code below relies
    # on it returning the frame with a 'kfold' column.
    train = multifold_indexer(train,target_cols,n_splits=NFOLDS)
    
    for fold in range(NFOLDS):
        train_df = train[train['kfold'] != fold]#.reset_index(drop=True)
        valid_df = train[train['kfold'] == fold]#.reset_index(drop=True)
            
        # Everything that is neither a target nor the fold marker is a feature.
        feature_cols = [col  for col in train_df.columns if not (col in target_cols+['kfold'])]
                
        # A fresh pipeline per fold, fitted on that fold's training rows only.
        pipeline_val = make_pipeline(*transformers_list)
        
        X_train, y_train  = train_df[feature_cols], train_df[target_cols]
        X_valid, y_valid =  valid_df[feature_cols], valid_df[target_cols].values
        
        X_train = pipeline_val.fit_transform(X_train)
        X_train = X_train.values
        X_valid = pipeline_val.transform(X_valid)
        
        valid_index = X_valid.index
        X_valid = X_valid.values
        
        y_train = y_train.values
        
        # `test` is the combined train+test frame here, so features are
        # produced for every row.
        X_test = test[feature_cols]
        X_test = pipeline_val.transform(X_test).values
        
        # run_inference is defined above this excerpt; presumably with
        # inference_only=True it skips training and only runs the model -- verify there.
        pred_ = run_inference(X_train,y_train,X_valid,y_valid,X_test,fold, seed,inference_only=True)    
        
        # Only the first fold is used for feature extraction.
        break
    
# pred_[1] is the second element returned by run_inference (the extracted
# features for X_test), re-attached to the combined frame's sig_id index.
transformed_features = pd.DataFrame(pred_[1],index=test.index)
print(transformed_features.shape)
# Column labels are converted to strings before writing the feather file.
transformed_features.columns = [str(i) for i in range(len(transformed_features.columns))]
# reset_index() turns the sig_id index into a regular column of the saved file.
transformed_features.reset_index().to_feather('./features_0.2_altogether.fth')

(27796, 3500)


In [20]:
class DaeAdder( BaseEstimator, TransformerMixin ):
    """Stateless transformer that appends precomputed features from a feather file.

    The file named by ``filename`` must contain a ``sig_id`` column; it is read
    on every call and left-joined onto the input frame on ``sig_id``.
    """

    def __init__(self,filename):
        self.filename=filename

    def fit(self,X,y=None):
        # Nothing is learned; present only to satisfy the estimator protocol.
        return self

    def transform(self,X):
        """Return ``X`` left-joined with the stored features on ``sig_id``."""
        stored_features = pd.read_feather(self.filename).set_index('sig_id')
        return X.merge(stored_features,how='left', on='sig_id')

    def fit_transform(self,X,y=None):
        """Same result as ``transform``; ``y`` is ignored."""
        return self.transform(X)
    
# Reload the original training frame (targets + features, indexed by sig_id).
train,target_cols = train_short_form_loader('../input/lish-moa/train_features.csv','../input/lish-moa/train_targets_scored.csv')
# Transformer that joins the features written to 'features_0.2_altogether.fth'.
Dae0_2 =DaeAdder(filename='features_0.2_altogether.fth')

# One-off application to `train`; `res` is not referenced again in the code
# visible in this cell.
res = Dae0_2.fit_transform(train)
# CatIntMapper is defined outside this excerpt; presumably it maps the named
# column through the given dict -- verify there.
# Note: map_controls is not included in the transformers_list built further
# down in this cell.
map_controls = CatIntMapper('cp_type',{'ctl_vehicle': 0, 'trt_cp': 1})  

map_dose = CatIntMapper('cp_dose',{'D1': 1, 'D2': 0})    
map_time = CatIntMapper('cp_time',{24: 0, 48: 1, 72: 2})

class SmoothBCEwLogits(_WeightedLoss):
    """Binary cross-entropy on logits with label smoothing.

    Targets are pulled toward 0.5: ``t * (1 - smoothing) + 0.5 * smoothing``.

    Args:
        weight: optional rescaling weight forwarded to
            ``F.binary_cross_entropy_with_logits``.
        reduction: ``'mean'`` (default), ``'sum'``, or any other value to get
            the unreduced element-wise loss.
        smoothing: smoothing factor in ``[0, 1)``.
    """

    def __init__(self, weight=None, reduction='mean', smoothing=0.0):
        # The parent constructor already stores `weight` (as a buffer) and
        # `reduction`, so they are not reassigned here.
        super().__init__(weight=weight, reduction=reduction)
        self.smoothing = smoothing

    @staticmethod
    def _smooth(targets:torch.Tensor, n_labels:int, smoothing=0.0):
        """Return smoothed targets; ``n_labels`` is accepted but unused."""
        assert 0 <= smoothing < 1
        with torch.no_grad():
            targets = targets * (1.0 - smoothing) + 0.5 * smoothing
        return targets

    def forward(self, inputs, targets):
        """Compute the smoothed BCE-with-logits loss with the configured reduction."""
        targets = SmoothBCEwLogits._smooth(targets, inputs.size(-1),
            self.smoothing)
        # Ask for the element-wise loss: the functional defaults to
        # reduction='mean', which would collapse the loss to a scalar and turn
        # the 'sum' branch below (and the unreduced case) into a no-op.
        loss = F.binary_cross_entropy_with_logits(
            inputs, targets, weight=self.weight, reduction='none')

        if  self.reduction == 'sum':
            loss = loss.sum()
        elif  self.reduction == 'mean':
            loss = loss.mean()

        return loss
    
class Model(nn.Module):
    """Three-layer MLP: each stage is BatchNorm -> Dropout -> weight-normed Linear.

    The first two stages are followed by ReLU; the last stage returns raw
    logits of width ``num_targets``.
    """

    def __init__(self, num_features, num_targets, hidden_size1=388,hidden_size2=512,drop_rate1=0.0,drop_rate2=0.3,drop_rate3=0.3):
        super().__init__()
        weight_normed = nn.utils.weight_norm

        # Stage 1: input features -> first hidden layer.
        self.batch_norm1 = nn.BatchNorm1d(num_features)
        self.dropout1 = nn.Dropout(p=drop_rate1)
        self.dense1 = weight_normed(nn.Linear(in_features=num_features, out_features=hidden_size1))

        # Stage 2: first hidden layer -> second hidden layer.
        self.batch_norm2 = nn.BatchNorm1d(hidden_size1)
        self.dropout2 = nn.Dropout(p=drop_rate2)
        self.dense2 = weight_normed(nn.Linear(in_features=hidden_size1, out_features=hidden_size2))

        # Stage 3: second hidden layer -> target logits.
        self.batch_norm3 = nn.BatchNorm1d(hidden_size2)
        self.dropout3 = nn.Dropout(p=drop_rate3)
        self.dense3 = weight_normed(nn.Linear(in_features=hidden_size2, out_features=num_targets))

    def forward(self, x):
        hidden_stages = (
            (self.batch_norm1, self.dropout1, self.dense1),
            (self.batch_norm2, self.dropout2, self.dense2),
        )
        for normalise, drop, project in hidden_stages:
            x = F.relu(project(drop(normalise(x))))

        # Output stage has no activation: callers apply sigmoid themselves.
        return self.dense3(self.dropout3(self.batch_norm3(x)))
    
# Training settings read by run_training / the driver loop below.
# NOTE(review): DEVICE is hard-coded to the first CUDA device; unlike the
# earlier `'cuda' if torch.cuda.is_available() else 'cpu'` setting there is no
# CPU fallback here.
DEVICE =  torch.device('cuda:0')
EPOCHS = 40
BATCH_SIZE = 512
LEARNING_RATE = 4e-3
WEIGHT_DECAY = 1e-5
NFOLDS = 7
EARLY_STOPPING_STEPS = 10
# Early stopping is disabled: run_training only counts non-improving epochs
# when this is True.
EARLY_STOP = False

# Hidden-layer widths passed to Model by run_training.
hidden_size1=2048
hidden_size2=2048

def apply_pipe_together(pipeline,train,test):
    """Fit and apply ``pipeline`` on train and test stacked together, then split back.

    Rows are routed back to train / test by index label, so any label the
    pipeline drops is simply absent from the corresponding output.

    Returns:
        (fitted pipeline, transformed train rows, transformed test rows).
    """
    # TODO: add a warning when the index intersection does not cover every row
    combined = pipeline.fit_transform(pd.concat([train,test]))

    train_part = combined.loc[combined.index.intersection(train.index)]
    test_part = combined.loc[combined.index.intersection(test.index)]

    return pipeline,train_part,test_part

# Preprocessing steps, applied in this order by make_pipeline further down.
# SuppressControls is defined outside this excerpt.
transformers_list=[map_dose,map_time,Dae0_2,SuppressControls()]

def run_training(X_train,y_train,X_valid,y_valid,X_test,fold, seed,verbose=False,**kwargs):
    """Train one ``Model`` on a single fold and predict its validation and test rows.

    Args:
        X_train, y_train: training features / targets (array-likes accepted by
            ``MoADataset``).
        X_valid, y_valid: validation features / targets.
        X_test: test features (array-like accepted by ``TestDataset``).
        fold: fold number, used only in the progress message.
        seed: value passed to ``seed_everything`` before anything is built.
        verbose: print a line each time the validation loss improves.
        **kwargs: forwarded to ``Model`` (e.g. dropout rates).

    Returns:
        (oof, predictions): validation predictions from the epoch with the
        lowest validation loss, and test predictions made with the weights of
        that same epoch.

    Raises:
        RuntimeError: if the validation loss never improved (e.g. it was NaN
            on every epoch, or ``EPOCHS`` is 0), so there is no best epoch.
    """
    seed_everything(seed)

    train_dataset = MoADataset(X_train, y_train)
    valid_dataset = MoADataset(X_valid, y_valid)
    trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    validloader = torch.utils.data.DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

    model = Model(
        num_features= X_train.shape[1] ,
        num_targets=  y_train.shape[1],hidden_size1=hidden_size1,hidden_size2=hidden_size2,
       **kwargs
    )

    model.to(DEVICE)

    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

    # A OneCycleLR schedule was tried previously; the plateau scheduler is
    # stepped once per epoch inside valid_fn.
    scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer,patience=3)

    # Validation uses plain BCE so the reported loss is comparable across
    # runs; training uses a lightly label-smoothed variant.
    loss_val = nn.BCEWithLogitsLoss()
    loss_tr = SmoothBCEwLogits(smoothing =0.001)

    early_stopping_steps = EARLY_STOPPING_STEPS
    early_step = 0

    # TODO: saving the results could be moved to run_k_fold, which does have
    # the index information.
    best_loss = np.inf
    oof = None
    best_state = None

    for epoch in range(EPOCHS):

        train_loss = train_fn(model, optimizer,scheduler, loss_tr, trainloader, DEVICE)
        valid_loss, valid_preds = valid_fn(model,scheduler, loss_val, validloader, DEVICE)

        if valid_loss < best_loss:
            if verbose:
                print(f"FOLD: {fold}, EPOCH: {epoch}, train_loss: {train_loss}, valid_loss: {valid_loss}")

            best_loss = valid_loss
            oof = valid_preds
            # Snapshot the weights that produced `oof` so the test predictions
            # below come from the same epoch instead of the last one.
            best_state = {name: tensor.detach().clone()
                          for name, tensor in model.state_dict().items()}

        elif(EARLY_STOP == True):

            # NOTE: counts every non-improving epoch; the counter is not reset
            # when the loss improves again.
            early_step += 1
            if (early_step >= early_stopping_steps):
                break

    if best_state is None:
        raise RuntimeError(
            f"validation loss never improved for fold {fold}; no out-of-fold predictions available")

    # Restore the best-epoch weights before predicting on the test set.
    model.load_state_dict(best_state)

    #--------------------- PREDICTION---------------------

    testdataset = TestDataset(X_test)
    testloader = torch.utils.data.DataLoader(testdataset, batch_size=BATCH_SIZE, shuffle=False)

    predictions = inference_fn(model, testloader, DEVICE)

    return oof, predictions

def run_k_fold(folds,target_cols,test,transformers_list,NFOLDS, seed,verbose=False,**kwargs):
    """Train one model per fold and collect out-of-fold and averaged test predictions.

    Args:
        folds: training frame containing the target columns and a ``'kfold'``
            column holding each row's fold number.
        target_cols: list of target column names.
        test: test frame; the pipeline output is restricted to the training
            feature columns before prediction.
        transformers_list: steps for a fresh ``make_pipeline`` fitted on each
            fold's training rows.
        NFOLDS: number of folds to iterate over.
        seed, verbose, **kwargs: forwarded to ``run_training``.

    Returns:
        (oof, predictions): DataFrames indexed like ``folds`` / ``test`` with
        one column per target; test predictions are the mean over folds.
    """
    non_feature_cols = target_cols+['kfold']

    # Zero-filled frame with the same index and columns as the targets.
    oof = folds[target_cols].copy() * 0
    predictions = pd.DataFrame(0,columns=target_cols,index=test.index)

    for fold in range(NFOLDS):

        train_df = folds[folds['kfold'] != fold]
        valid_df = folds[folds['kfold'] == fold]

        raw_feature_cols = [col for col in train_df.columns if col not in non_feature_cols]

        pipeline_val = make_pipeline(*transformers_list)

        train_targets = train_df[target_cols]
        y_valid = valid_df[target_cols].values

        train_frame = pipeline_val.fit_transform(train_df[raw_feature_cols],train_targets)
        # The pipeline may add or drop columns, so the feature list is rebuilt
        # from its output and used to align the test frame below.
        feature_cols = [col for col in train_frame.columns if col not in non_feature_cols]

        valid_frame = pipeline_val.transform(valid_df[raw_feature_cols])
        valid_index = valid_frame.index

        test_frame = pipeline_val.transform(test)
        test_index = test_frame.index

        oof_, pred_ = run_training(
            train_frame.values,
            train_targets.values,
            valid_frame.values,
            y_valid,
            test_frame[feature_cols].values,
            fold, seed,verbose,**kwargs)

        oof.loc[valid_index] = oof_
        predictions.loc[test_index] += pred_ / NFOLDS

    return oof, predictions

# Seeds to average over (a single seed here) and extra Model kwargs (none).
SEED = [0]
params={}
train,target_cols = train_short_form_loader('../input/lish-moa/train_features.csv','../input/lish-moa/train_targets_scored.csv')
test = test_short_form_loader("../input/lish-moa/test_features.csv")


# Run the preprocessing once on train and test stacked together, then split
# the result back by index.
pipeline_test = make_pipeline(*transformers_list)
pipeline_test,train , test = apply_pipe_together(pipeline_test,train,test)
#pipeline_test.fit(train)
#test = pipeline_test.transform(test)
#suppresor = SupressControls()
#train = suppresor.fit_transform(train)
#test = suppresor.transform(test)
# The data is already transformed, so the per-fold pipeline built inside
# run_k_fold gets a single step. IdentityTransformer is defined outside this
# excerpt; presumably a pass-through -- verify there.
transformers_list=[IdentityTransformer()]

dae1_oof = np.zeros((len(train), len(target_cols)))
dae1_test = np.zeros((len(test), len(target_cols)))

for seed in SEED:
    folds = multifold_indexer(train,target_cols,n_splits=NFOLDS)
    oof_, predictions_ = run_k_fold(folds,target_cols,test,transformers_list,NFOLDS, seed,verbose=True,**params)
    # run_k_fold returns DataFrames; each seed contributes 1/len(SEED).
    dae1_oof += oof_ / len(SEED)
    dae1_test += predictions_ / len(SEED)
    
# NOTE(review): .to_numpy() implies dae1_test is a DataFrame at this point
# (presumably the in-place add with a DataFrame rebinds the name) -- confirm.
dae1_test = dae1_test.to_numpy()

FOLD: 0, EPOCH: 0, train_loss: 0.4194359861918398, valid_loss: 0.05704883486032486
FOLD: 0, EPOCH: 1, train_loss: 0.03280015118621491, valid_loss: 0.022742184411202158
FOLD: 0, EPOCH: 2, train_loss: 0.024483928104510177, valid_loss: 0.0209354715687888
FOLD: 0, EPOCH: 3, train_loss: 0.02338922642976851, valid_loss: 0.020277188824755803
FOLD: 0, EPOCH: 4, train_loss: 0.02241920996960756, valid_loss: 0.01964901973094259
FOLD: 0, EPOCH: 5, train_loss: 0.021454858669155353, valid_loss: 0.019228234887123108
FOLD: 0, EPOCH: 6, train_loss: 0.020903111510985607, valid_loss: 0.018297184258699417
FOLD: 0, EPOCH: 8, train_loss: 0.02006987396728348, valid_loss: 0.01815445987241609
FOLD: 0, EPOCH: 9, train_loss: 0.01970954123582389, valid_loss: 0.01802694079067026
FOLD: 0, EPOCH: 10, train_loss: 0.01932980739385695, valid_loss: 0.017532807109611376
FOLD: 0, EPOCH: 11, train_loss: 0.018894281641051575, valid_loss: 0.017478506213852336
FOLD: 0, EPOCH: 16, train_loss: 0.016040663335573028, valid_loss: 

In [21]:
# Full-size prediction array, zero everywhere except the rows selected by
# cons_train_index, which receive the OOF predictions.
# `y` and `cons_train_index` are defined outside this excerpt.
check_dae = np.zeros([y.shape[0], y.shape[1]])
check_dae[cons_train_index,:] = dae1_oof
# Log loss computed over all (row, target) cells flattened into one vector.
print('OOF log loss: ', log_loss(np.ravel(y), np.ravel(check_dae)))

OOF log loss:  0.015686236165123876


# submission

In [22]:
# Lower / upper probability bounds (symmetric around 0.5). They are not
# referenced by the remaining cells visible in this excerpt.
p_min = 0.001
p_max = 1 - p_min

In [23]:
# Weighted blend of the three models' OOF predictions (weights sum to 1).
# check_tabnet and check_mlp are defined outside this excerpt.
check = 0.1 * check_dae + 0.2 * check_tabnet + 0.7 * check_mlp  
print('OOF log loss: ', log_loss(np.ravel(y), np.ravel(check)))

OOF log loss:  0.015269522839593815


In [24]:
sub = pd.read_csv(DATA_DIR + 'sample_submission.csv')
# Same blend weights as the OOF check above, applied to the test predictions.
# cons_test_index, noncons_test_index, target_feats, tabnet1_test and
# mlp1_test are defined outside this excerpt.
sub.loc[cons_test_index,target_feats] = 0.1 * dae1_test + 0.2 * tabnet1_test + 0.7 * mlp1_test
# Rows selected by noncons_test_index get all-zero predictions.
sub.loc[noncons_test_index,target_feats] = 0
sub.to_csv('submission.csv', index=False)