**late submission**

- ensemble 
- tabnet ver 47: private 0.01618 public 0.01845 (7fold 5seed)
- mlp ver 49: private 0.01620 public 0.01845  (7fold 5seed)
- tabnet-multi ver 8: private 0.01652 public 0.01866 (no FE, DA by cutmix, 6fold 5seed)
- mlp-multi ver 10: private 0.01641 public 0.01856 (4-layer mlp, 7fold 5seed)
- add postprocess clipping

In [1]:
!pip install --no-index --find-links /kaggle/input/pytorchtabnet/pytorch_tabnet-2.0.0-py3-none-any.whl pytorch-tabnet

Looking in links: /kaggle/input/pytorchtabnet/pytorch_tabnet-2.0.0-py3-none-any.whl
Processing /kaggle/input/pytorchtabnet/pytorch_tabnet-2.0.0-py3-none-any.whl
Installing collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-2.0.0


In [2]:
import sys
import os
import warnings
import numpy as np
import pandas as pd 
from sklearn import preprocessing
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from tqdm import tqdm_notebook as tqdm
from sklearn.utils import check_random_state
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import KFold, StratifiedKFold
sys.path.append('../input/multilabelstraifier/')
from ml_stratifiers import MultilabelStratifiedKFold

import time
import torch
import random
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import tensorflow as tf
from torch.nn.modules.loss import _WeightedLoss

from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.tab_model import TabNetRegressor, TabNetClassifier

warnings.filterwarnings('ignore')

# Preprocess

In [3]:
# Dataset root; file names below are appended directly, so the trailing slash matters.
DATA_DIR = '/kaggle/input/lish-moa/'
# Training feature table (contains sig_id, cp_type, cp_dose and the "g-"/"c-" columns used below).
train = pd.read_csv(DATA_DIR + 'train_features.csv')
# Scored target table, keyed by sig_id.
targets = pd.read_csv(DATA_DIR + 'train_targets_scored.csv')
# Test feature table.
test = pd.read_csv(DATA_DIR + 'test_features.csv')
# sig_id -> drug_id mapping, merged into the targets inside make_fold.
drug = pd.read_csv(DATA_DIR + 'train_drug.csv')

In [4]:
# Column groups reused throughout the notebook.
# Every column of the target table except the sample identifier.
target_feats = [col for col in targets.columns if col != "sig_id"]
# Training columns whose name contains "g-" and "c-" respectively.
g_feats = [col for col in train.columns if "g-" in col]
c_feats = [col for col in train.columns if "c-" in col]

In [5]:
# Row labels split by cp_type: rows marked "ctl_vehicle" versus all other rows.
cons_train_index = train.index[train.cp_type != "ctl_vehicle"]
noncons_test_index = test.index[test.cp_type == "ctl_vehicle"]
cons_test_index = test.index[test.cp_type != "ctl_vehicle"]

In [6]:
def make_string(row):
    """Return the values of ``row`` after its first element as one compact string.

    The array repr of ``row[1:].values`` is taken and every '[', ']', newline
    and space character is deleted, so a row of 0/1 labels becomes a key such
    as ``"0010"``.
    """
    strip_table = str.maketrans('', '', '[]\n ')
    return str(row[1:].values).translate(strip_table)

# Encode every row's complete label vector as a string key (make_string skips
# the first column, sig_id) and number the distinct keys in order of first
# appearance. Each distinct label combination becomes one multiclass id.
targets["target_pair"] = targets.apply(make_string, axis=1)
targetpair_id = dict(enumerate(targets["target_pair"].unique()))          # class id -> key
id_targetpair = {key: class_id for class_id, key in targetpair_id.items()}  # key -> class id
targets["target_pair_num"] = targets["target_pair"].map(id_targetpair)

multiclass_targets = pd.get_dummies(targets["target_pair_num"])

# Row k of classid_target is the label vector of class k, recovered by
# splitting the key string into its characters. Built in one shot instead of
# growing the array with np.vstack inside a loop.
classid_target = np.array(
    [list(targetpair_id[class_id]) for class_id in range(len(targetpair_id))]
).astype(int)
class_num = multiclass_targets.shape[1]

multiclass_targets.shape, classid_target.shape

((23814, 328), (328, 206))

In [7]:
# Keep only the rows whose original label is in cons_test_index / cons_train_index
# (cp_type != "ctl_vehicle") and renumber them from 0.
# NOTE(review): this cell overwrites train/test/targets, so it is not idempotent —
# re-running it would filter the already-renumbered frames with the old labels.
test = test[test.index.isin(cons_test_index)].reset_index(drop=True)
train = train[train.index.isin(cons_train_index)].reset_index(drop=True)
# y is taken BEFORE `targets` is filtered on the next line, so it keeps one row per
# original training sample; only the identifier and helper columns are dropped.
y = targets.drop(["sig_id", "target_pair", "target_pair_num"], axis=1).copy()
targets = targets[targets.index.isin(cons_train_index)].reset_index(drop=True)
# Label matrix of the filtered rows as a NumPy array.
fn_targets = targets.copy().drop(["sig_id", "target_pair", "target_pair_num"], axis=1).to_numpy()

# Class id (target_pair_num) of each filtered row; this rebinds multiclass_targets
# from the one-hot frame built earlier to a Series of ids.
multiclass_targets = targets["target_pair_num"].copy().reset_index(drop=True)
fn_multiclass_targets = multiclass_targets.copy().to_numpy()

In [8]:
# Number of filtered training rows per class id.
tmp = multiclass_targets.value_counts().copy()
# Class ids that occur exactly once; read as a global by cutmix_for_tabular.
minor_class = tmp[tmp==1].index

In [9]:
def make_fold(NB_SPLITS, seed):
    """Assign each row of the global ``targets`` frame to one of ``NB_SPLITS`` folds.

    Drugs with at most 18 rows are assigned as whole drugs (stratified on the
    drug's mean label vector); rows of drugs with more than 18 rows are
    stratified individually. Reads the globals ``targets``, ``drug`` and
    ``target_feats``.

    Returns an int8 array of shape (1, n_rows) holding the fold id per row.
    """
    # Attach drug_id to every target row.
    scored = targets.merge(drug, on='sig_id', how='left')

    # Split the drugs by how many rows they have.
    drug_counts = scored.drug_id.value_counts()
    rare_drugs = drug_counts.loc[drug_counts <= 18].index.sort_values()
    common_drugs = drug_counts.loc[drug_counts > 18].index.sort_values()

    # Drugs with 18 rows or fewer: one fold per drug.
    splitter = MultilabelStratifiedKFold(n_splits=NB_SPLITS, shuffle=True, random_state=seed)
    drug_means = scored.groupby('drug_id')[target_feats].mean().loc[rare_drugs]
    drug_to_fold = {}
    for fold_id, (_, valid_pos) in enumerate(splitter.split(drug_means, drug_means[target_feats])):
        for drug_name in drug_means.index[valid_pos].values:
            drug_to_fold[drug_name] = fold_id

    # Drugs with more than 18 rows: one fold per row (keyed by sig_id).
    splitter = MultilabelStratifiedKFold(n_splits=NB_SPLITS, shuffle=True, random_state=seed)
    common_rows = scored.loc[scored.drug_id.isin(common_drugs)].reset_index(drop=True)
    sig_to_fold = {}
    for fold_id, (_, valid_pos) in enumerate(splitter.split(common_rows, common_rows[target_feats])):
        for sig in common_rows.sig_id[valid_pos].values:
            sig_to_fold[sig] = fold_id

    # Drug-level assignment first; rows still unassigned fall back to the sig_id map.
    scored['fold'] = scored.drug_id.map(drug_to_fold)
    unassigned = scored.fold.isna()
    scored.loc[unassigned, 'fold'] = scored.loc[unassigned, 'sig_id'].map(sig_to_fold)
    scored.fold = scored.fold.astype('int8')

    return np.array([scored.fold.values])

mlp_fold = make_fold(7,34)

In [10]:
mlp_fold

array([[1, 2, 6, ..., 3, 4, 0]], dtype=int8)

# Feature engineering

In [11]:
def fe_simple(df, remove_features):
    """Return a copy of ``df`` with ``cp_dose`` encoded as 0/1 and some columns removed.

    Parameters
    ----------
    df : DataFrame containing a ``cp_dose`` column with values 'D1' / 'D2'.
    remove_features : list of column names to drop.

    Returns
    -------
    A new DataFrame; ``df`` itself is left untouched.
    """
    tmp = df.copy()
    # Replace the column outright. Writing through ``.loc[:, 'cp_dose']`` sets
    # values in place on newer pandas, which keeps the object dtype and makes
    # ``.to_numpy()`` on the result return an object array.
    tmp['cp_dose'] = tmp['cp_dose'].map({'D1': 0, 'D2': 1})
    return tmp.drop(columns=remove_features)

def fe_simple2(df):
    """Return a new frame with ``cp_time`` / ``cp_dose`` one-hot encoded.

    The ``cp_type`` and ``sig_id`` columns are dropped. ``df`` is not modified.

    The dummy columns are forced to ``uint8``: pandas >= 2.0 defaults to
    ``bool``, and a frame that mixes float and bool columns converts to an
    object array in ``.to_numpy()``, which cannot be turned into a float tensor.
    """
    encoded = pd.get_dummies(df, columns=['cp_time', 'cp_dose'], dtype=np.uint8)
    return encoded.drop(columns=["cp_type", "sig_id"])

def _append_pca(df_train, df_test, cols, n_components, prefix):
    """Fit a PCA on ``df_train[cols]`` and append its components to both frames.

    The component frames reuse the index of the frame they are appended to, so
    the column-wise concat lines up row by row whatever index the inputs carry.
    """
    names = [prefix + str(i + 1) for i in range(n_components)]
    pca = PCA(n_components=n_components, random_state=42)
    train_proj = pd.DataFrame(pca.fit_transform(df_train[cols]), columns=names, index=df_train.index)
    test_proj = pd.DataFrame(pca.transform(df_test[cols]), columns=names, index=df_test.index)
    return (pd.concat([df_train, train_proj], axis=1),
            pd.concat([df_test, test_proj], axis=1))


def fe(df_train, df_test, quantiles, th, c_num, g_num):
    """Variance filter, quantile-normalise and PCA-augment the feature frames.

    Every transformer is fitted on ``df_train`` only and then applied to both
    frames. The inputs are not modified.

    Parameters
    ----------
    df_train, df_test : feature frames with the same columns. Columns from
        position 4 onward are treated as the numeric features
        (presumably the first four are sig_id, cp_type, cp_time, cp_dose —
        TODO confirm against the input files).
    quantiles : ``n_quantiles`` of each per-column QuantileTransformer.
    th : variance threshold; features whose training variance does not
        exceed it are dropped from both frames.
    c_num, g_num : number of PCA components computed from the "c-" and "g-"
        columns; appended as ``pca-c1..`` and ``pca-g1..``.

    Returns
    -------
    (train_frame, test_frame) with the transformed and added columns.
    """
    tmp_train = df_train.copy()
    tmp_test = df_test.copy()

    # Low-variance filter; only the support mask is needed, not the transformed array.
    feature_cols = tmp_train.columns[4:]
    select = VarianceThreshold(threshold=th)
    select.fit(tmp_train.iloc[:, 4:].values)
    drop_feats = list(feature_cols[~select.get_support()])
    print(len(drop_feats))

    tmp_train = tmp_train.drop(columns=drop_feats)
    tmp_test = tmp_test.drop(columns=drop_feats)

    modg_feats = [col for col in tmp_train.columns if "g-" in col]
    modc_feats = [col for col in tmp_train.columns if "c-" in col]

    # One transformer per column, mapped to a normal output distribution.
    for col in modc_feats + modg_feats:
        qt = preprocessing.QuantileTransformer(n_quantiles=quantiles, random_state=0,
                                               output_distribution="normal")
        qt.fit(tmp_train[col].values.reshape(-1, 1))
        tmp_train[col] = qt.transform(tmp_train[col].values.reshape(-1, 1)).ravel()
        tmp_test[col] = qt.transform(tmp_test[col].values.reshape(-1, 1)).ravel()

    # PCA inputs are selected by column name, so appending the "c" components
    # first does not affect the "g" projection.
    tmp_train, tmp_test = _append_pca(tmp_train, tmp_test, modc_feats, c_num, "pca-c")
    tmp_train, tmp_test = _append_pca(tmp_train, tmp_test, modg_feats, g_num, "pca-g")

    return tmp_train, tmp_test

def fe_stats(df):
    """Return a copy of ``df`` with row-wise kurtosis and skewness columns added.

    The statistics are computed separately over the columns whose name
    contains "g-" and over those containing "c-", and stored as
    ``g_kurt``, ``g_skew``, ``c_kurt``, ``c_skew`` (in that order).
    """
    tmp = df.copy()
    # Column groups are fixed up front, before any statistic column is added.
    groups = {
        'g': [col for col in tmp.columns if "g-" in col],
        'c': [col for col in tmp.columns if "c-" in col],
    }
    for prefix, cols in groups.items():
        tmp[prefix + '_kurt'] = tmp[cols].kurtosis(axis=1)
        tmp[prefix + '_skew'] = tmp[cols].skew(axis=1)
    return tmp

# Columns dropped by fe_simple before a frame is converted to a NumPy matrix.
remove_features = ["cp_type" , "sig_id"]

In [12]:
def cutmix_for_tabular(x, y=None, alpha=1.0, p=1.0, random_state=None):
    """CutMix-style augmentation for tabular rows, returning only minor-class samples.

    With probability ``p`` (drawn from Python's global ``random``) a Beta(alpha,
    alpha) ratio ``l`` is sampled, each feature column is selected with
    probability ``1 - l``, and the selected columns of every row are replaced
    by those of a randomly permuted partner row. The one-hot labels are mixed
    with the same ratio.

    Parameters
    ----------
    x : 2-D array of features (rows x columns). Not modified.
    y : 2-D one-hot label array aligned with ``x``. Required: the rows to
        return are chosen from its argmax, so ``None`` raises ``ValueError``.
    alpha : Beta distribution parameter for the mixing ratio.
    p : probability of applying the mix at all.
    random_state : seed or RandomState forwarded to ``check_random_state``.

    Returns
    -------
    (x_minor, y_minor) : the (possibly mixed) rows whose argmax class is in the
    global ``minor_class``, and their argmax class ids.
    """
    if y is None:
        # Previously this fell through to np.argmax(None, axis=1) and failed
        # with an unrelated-looking axis error.
        raise ValueError("cutmix_for_tabular requires one-hot labels in `y`")

    x_ = x.copy()
    n, d = x.shape

    if random.random() < p:
        random_state = check_random_state(random_state)
        l = random_state.beta(alpha, alpha)
        mask = random_state.choice([False, True], size=d, p=[l, 1.0 - l])
        mask = np.where(mask)[0]
        shuffle = random_state.choice(n, n, replace=False)
        # Swap the selected columns with those of the permuted partner rows.
        x_[:, mask] = x_[np.ix_(shuffle, mask)]
        y = l * y + (1.0 - l) * y[shuffle]

    # extract minor class
    tmp_index = np.where(np.isin(np.argmax(y, axis=1), minor_class))[0]
    # Kept so the recorded notebook output (selected row indices) stays the same.
    print(tmp_index)

    return x_[tmp_index], np.argmax(y[tmp_index], axis=1)

In [13]:
# mlp -----------------------------------
# fe(...) arguments are (quantiles, variance threshold, #PCA comps for "c-", #PCA comps for "g-").
mlp_train, mlp_test = fe(train, test, 1000, 0.7, 10, 60)
mlp_train = fe_stats(mlp_train)
mlp_test = fe_stats(mlp_test)
mlp_train = fe_simple2(mlp_train)
mlp_test = fe_simple2(mlp_test)
# The fold id is appended as the LAST column of the training matrix only;
# this is why mlp_train ends up one column wider than mlp_test.
mlp_train["fold"] = mlp_fold.reshape(-1,1)
mlp_train = mlp_train.to_numpy()
mlp_test = mlp_test.to_numpy()

# tabnet --------------------------------
# Threshold 0 here; fe_simple encodes cp_dose and drops the columns in remove_features.
tab_train, tab_test = fe(train, test, 100, 0, 1, 10)
tab_train = fe_simple(tab_train, remove_features).to_numpy()
tab_test = fe_simple(tab_test, remove_features).to_numpy()

# multiclass mlp ------------------------
multiclass_mlp_train, multiclass_mlp_test = fe(train, test, 100, 0, 5, 30)
multiclass_mlp_train = fe_stats(multiclass_mlp_train)
multiclass_mlp_test = fe_stats(multiclass_mlp_test)
# The fn_* arrays are the NumPy matrices; the frames without the prefix are kept as DataFrames.
fn_multiclass_mlp_train = fe_simple2(multiclass_mlp_train).to_numpy()
fn_multiclass_mlp_test = fe_simple2(multiclass_mlp_test).to_numpy()

# multiclass tabnet ---------------------
# No fe() step here: only cp_dose encoding and column removal.
multiclass_tab_train = fe_simple(train, remove_features)
multiclass_tab_test = fe_simple(test, remove_features).to_numpy()

# The one-hot labels are the same for every augmentation round, so build them once.
multiclass_onehot = pd.get_dummies(multiclass_targets).values

# Five CutMix rounds (random_state 0..4). Each round returns only the rows whose
# class is in minor_class; the rounds are stacked in order.
cutmix_rounds = [
    cutmix_for_tabular(multiclass_tab_train.values, multiclass_onehot,
                       alpha=1, p=1, random_state=i)
    for i in [0,1,2,3,4]
]
train_mod = np.concatenate([x_aug for x_aug, _ in cutmix_rounds], axis=0)
y_mod = np.concatenate([y_aug for _, y_aug in cutmix_rounds])

# Append the augmented rows below the original training rows.
train_mod = pd.DataFrame(train_mod, columns = multiclass_tab_train.columns)
multiclass_tab_train = pd.concat([multiclass_tab_train, train_mod], axis=0).reset_index(drop=True).to_numpy()

# Append the matching class ids and flatten to a 1-D label array.
y_mod = pd.DataFrame(y_mod)
multiclass_tabnet_targets = pd.concat([multiclass_targets, y_mod]).reset_index(drop=True)
fn_multiclass_tabnet_targets = multiclass_tabnet_targets.copy().to_numpy().reshape(-1,)

print(mlp_train.shape, mlp_test.shape)
print(tab_train.shape, tab_test.shape)
print(fn_multiclass_mlp_train.shape, fn_multiclass_mlp_test.shape)
print(multiclass_tab_train.shape, multiclass_tab_test.shape)

33
0
0
[  599  2348  5784 12816 17718 20062]
[ 6269  6613  9380 10755 13810 14313]
[ 7176  7455  9098 15661 15970 18894]
[  858  9318  9931 18208 19565 20789]
[ 7176  7455  9098 15661 15970 18894]
(21948, 919) (3624, 918)
(21948, 885) (3624, 885)
(21948, 916) (3624, 916)
(21978, 874) (3624, 874)


# 1st mlp

In [14]:
class SmoothCrossEntropyLoss(_WeightedLoss):
    """Binary cross-entropy on logits with label smoothing.

    Targets are smoothed as ``t * (1 - smoothing) + smoothing / n_classes``
    where ``n_classes`` is the number of columns of the logits, then passed to
    ``F.binary_cross_entropy_with_logits``.

    Parameters
    ----------
    weight : optional tensor; when given, the logits are multiplied by it
        (broadcast over the batch dimension) before the loss is computed.
    reduction : reduction passed to the underlying loss ('mean' by default).
    smoothing : smoothing strength in [0, 1].
    """

    def __init__(self, weight=None, reduction='mean', smoothing=0.0):
        # The base class already stores `weight` and `reduction`.
        super().__init__(weight=weight, reduction=reduction)
        self.smoothing = smoothing

    @staticmethod
    def _smooth(targets, n_classes, smoothing=0.0):
        """Return the smoothed copy of ``targets`` (no gradient is tracked)."""
        assert 0 <= smoothing <= 1
        with torch.no_grad():
            # ones_like already lives on the targets' device and dtype, so no
            # notebook-level `device` variable is needed here.
            targets = targets * (1 - smoothing) + torch.ones_like(targets) * smoothing / n_classes
        return targets

    def forward(self, inputs, targets):
        """Compute the smoothed loss for logits ``inputs`` and 0/1 ``targets``."""
        targets = self._smooth(targets, inputs.shape[1], self.smoothing)

        if self.weight is not None:
            inputs = inputs * self.weight.unsqueeze(0)

        # Honour the configured reduction instead of silently using the default.
        return F.binary_cross_entropy_with_logits(inputs, targets, reduction=self.reduction)

In [15]:
# Torch device string: GPU when available, otherwise CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
# Training settings read as globals by modelling_torch.
batch_size = 128
n_folds=7
train_epochs = 20
# Label-smoothing strength; the same value bounds the clamped probabilities.
smoothing = 0.0005
p_min = smoothing
p_max = 1 - smoothing

def mean_log_loss(y_true, y_pred):
    """Average the binary log loss over the target columns.

    One log loss is computed per entry of the global ``target_feats`` (column
    ``i`` of both arrays), always with ``labels=[0, 1]``; the mean is returned.
    """
    per_target = [
        log_loss(y_true[:, col], y_pred[:, col].astype(float), labels=[0,1])
        for col in range(len(target_feats))
    ]
    return np.mean(per_target)

def seed_everything(seed=1234):
    """Seed the random number generators of every library used in this notebook."""
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Python, NumPy and TensorFlow generators.
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

    # PyTorch generators, plus the cuDNN deterministic flag.
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
class MoaModel(nn.Module):
    def __init__(self, num_columns, last_num):
        super(MoaModel, self).__init__()
        self.batch_norm1 = nn.BatchNorm1d(num_columns)
        self.dropout1 = nn.Dropout(0.1)
        self.dense1 = nn.utils.weight_norm(nn.Linear(num_columns, 1024))
        self.relu1 = nn.LeakyReLU()
        
        self.batch_norm2 = nn.BatchNorm1d(1024)
        self.dropout2 = nn.Dropout(0.1)
        self.dense2 = nn.utils.weight_norm(nn.Linear(1024, 1024))
        self.relu2 = nn.LeakyReLU()
        
        self.batch_norm3 = nn.BatchNorm1d(1024)
        self.dropout3 = nn.Dropout(0.1)
        self.dense3 = nn.utils.weight_norm(nn.Linear(1024, last_num))
        
    def forward(self, x):
        x = self.batch_norm1(x)
        x = self.dropout1(x)
        x = self.relu1(self.dense1(x))
        
        x = self.batch_norm2(x)
        x = self.dropout2(x)
        x = self.relu2(self.dense2(x))
        
        x = self.batch_norm3(x)
        x = self.dropout3(x)
        x = self.dense3(x)
        
        return x
    
def modelling_torch(tr, target, te, sample_seed, init_num, last_num, train_epochs):
    """Train one MoaModel per fold and return OOF and fold-averaged test predictions.

    Folds are NOT computed here: the last column of ``tr`` holds the fold id of
    each row and is stripped before the features reach the network. Reads the
    globals ``n_folds``, ``batch_size``, ``smoothing``, ``p_min``, ``p_max``
    and ``device``. The best weights of each fold (lowest clamped validation
    BCE) are checkpointed to 'best-model-parameters.pt' and reloaded for
    prediction.

    Parameters
    ----------
    tr : 2-D array, training features with the fold id as last column.
    target : 2-D array of 0/1 labels aligned with ``tr``.
    te : 2-D array of test features (no fold column).
    sample_seed : seed passed to ``seed_everything``.
    init_num : number of input features of the model (columns of ``tr`` minus one).
    last_num : number of model outputs.
    train_epochs : epochs per fold (also sizes the OneCycleLR schedule).

    Returns
    -------
    (oof, pred_value) : clamped sigmoid probabilities for every training row
    from the fold that held it out, and test probabilities averaged over folds.
    """
    seed_everything(seed=sample_seed)
    X_train = tr.copy()
    y_train = target.copy()
    X_test = te.copy()
    test_len = X_test.shape[0]

    # Monitoring metric: BCE on sigmoid outputs clamped to [p_min, p_max].
    metric = lambda inputs, targets : F.binary_cross_entropy((torch.clamp(torch.sigmoid(inputs), p_min, p_max)), targets)

    X_test2 = torch.tensor(X_test, dtype=torch.float32)
    test = torch.utils.data.TensorDataset(X_test2)
    test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)

    oof = np.zeros([len(X_train), y_train.shape[1]])
    oof_targets = np.zeros([len(X_train), y_train.shape[1]])
    pred_value = np.zeros([test_len, y_train.shape[1]])
    scores = []
    for fold in range(n_folds):
        valid_index = X_train[:, -1] == fold
        train_index = X_train[:, -1] != fold
        print("Fold "+str(fold+1))
        # Drop the trailing fold-id column from the model inputs.
        X_train2 = torch.tensor(X_train[train_index, :], dtype=torch.float32)[:, :-1]
        X_valid2 = torch.tensor(X_train[valid_index, :], dtype=torch.float32)[:, :-1]

        y_train2 = torch.tensor(y_train[train_index], dtype=torch.float32)
        y_valid2 = torch.tensor(y_train[valid_index], dtype=torch.float32)

        train = torch.utils.data.TensorDataset(X_train2, y_train2)
        valid = torch.utils.data.TensorDataset(X_valid2, y_valid2)

        train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
        valid_loader = torch.utils.data.DataLoader(valid, batch_size=batch_size, shuffle=False)

        clf = MoaModel(init_num, last_num)
        loss_fn = SmoothCrossEntropyLoss(smoothing=smoothing)

        optimizer = optim.Adam(clf.parameters(), lr = 0.001, weight_decay=1e-5)
        # Stepped once per batch, hence steps_per_epoch=len(train_loader).
        scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.1, div_factor=1e3,
                                              max_lr=1e-2, epochs=train_epochs, steps_per_epoch=len(train_loader))

        clf.to(device)

        best_val_loss = np.inf
        for epoch in range(train_epochs):
            start_time = time.time()
            clf.train()
            avg_loss = 0.
            sm_avg_loss = 0.
            for x_batch, y_batch in tqdm(train_loader, disable=True):
                x_batch = x_batch.to(device)
                y_batch = y_batch.to(device)
                y_pred = clf(x_batch)
                loss = loss_fn(y_pred, y_batch)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                scheduler.step()
                avg_loss += loss.item() / len(train_loader)
                # Detach so the running metric does not keep every batch's graph alive.
                sm_avg_loss += metric(y_pred.detach(), y_batch) / len(train_loader)

            clf.eval()
            avg_val_loss = 0.
            sm_avg_val_loss = 0.
            with torch.no_grad():
                for x_batch, y_batch in valid_loader:
                    x_batch = x_batch.to(device)
                    y_batch = y_batch.to(device)
                    y_pred = clf(x_batch)
                    avg_val_loss += loss_fn(y_pred, y_batch).item() / len(valid_loader)
                    sm_avg_val_loss += metric(y_pred, y_batch) / len(valid_loader)

            elapsed_time = time.time() - start_time

            # Checkpoint (and log) only the epochs that improve the validation metric.
            if sm_avg_val_loss < best_val_loss:
                best_val_loss = sm_avg_val_loss
                print('Epoch {}   loss={:.5f}   val_loss={:.5f}   sm_loss={:.5f}   sm_val_loss={:.5f}   time={:.2f}s'.format(
                    epoch + 1, avg_loss, avg_val_loss, sm_avg_loss, sm_avg_val_loss, elapsed_time))
                torch.save(clf.state_dict(), 'best-model-parameters.pt')

        # Fresh model with the best checkpoint; it stays on the CPU, matching
        # the loaders, which yield CPU tensors.
        pred_model = MoaModel(init_num, last_num)
        pred_model.load_state_dict(torch.load('best-model-parameters.pt'))
        pred_model.eval()

        # validation check ----------------
        oof_epoch = np.zeros([X_valid2.size(0), y_train.shape[1]])
        target_epoch = np.zeros([X_valid2.size(0), y_train.shape[1]])
        with torch.no_grad():
            for i, (x_batch, y_batch) in enumerate(valid_loader):
                rows = slice(i * batch_size, (i + 1) * batch_size)
                y_pred = pred_model(x_batch)
                oof_epoch[rows, :] = torch.clamp(torch.sigmoid(y_pred.cpu()), p_min, p_max).numpy()
                target_epoch[rows, :] = y_batch.cpu().numpy()
        fold_score = mean_log_loss(target_epoch, oof_epoch)
        print("Fold {} log loss: {}".format(fold+1, fold_score))
        scores.append(fold_score)
        oof[valid_index, :] = oof_epoch
        oof_targets[valid_index, :] = target_epoch
        #-----------------------------------

        # test prediction --------------
        test_preds = np.zeros([test_len, y_train.shape[1]])
        with torch.no_grad():
            for i, (x_batch,) in enumerate(test_loader):
                rows = slice(i * batch_size, (i + 1) * batch_size)
                y_pred = pred_model(x_batch)
                test_preds[rows, :] = torch.clamp(torch.sigmoid(y_pred.cpu()), p_min, p_max).numpy()
        pred_value += test_preds / n_folds
        # ------------------------------

    # Report the seed this call was given rather than a notebook-level loop variable.
    print("Seed {}".format(sample_seed))
    for i, fold_score in enumerate(scores):
        print("Fold {} log loss: {}".format(i+1, fold_score))
    print("Std of log loss: {}".format(np.std(scores)))
    print("Total log loss: {}".format(mean_log_loss(oof_targets, oof)))

    return oof, pred_value

In [16]:
# Seeds whose models are averaged for the first MLP.
seeds = [0,1,2,3,4]
# Accumulators for the seed-averaged OOF and test predictions (rows x targets).
mlp1_oof = np.zeros([len(mlp_train),fn_targets.shape[1]])
mlp1_test = np.zeros([len(mlp_test),fn_targets.shape[1]])

for seed_ in seeds:
    # init_num is one less than the width of mlp_train because its last column is the fold id;
    # the trailing 20 is the number of training epochs.
    oof, pytorch_pred = modelling_torch(mlp_train, fn_targets, mlp_test, seed_, mlp_train.shape[1]-1, fn_targets.shape[1], 20)
    mlp1_oof += oof / len(seeds)
    mlp1_test += pytorch_pred / len(seeds)

Fold 1
Epoch 1   loss=0.41273   val_loss=0.02253   sm_loss=0.41269   sm_val_loss=0.02249   time=1.92s
Epoch 2   loss=0.02052   val_loss=0.02147   sm_loss=0.02042   sm_val_loss=0.02105   time=1.21s
Epoch 3   loss=0.01832   val_loss=0.01819   sm_loss=0.01826   sm_val_loss=0.01818   time=1.20s
Epoch 5   loss=0.01740   val_loss=0.01774   sm_loss=0.01743   sm_val_loss=0.01773   time=1.23s
Epoch 8   loss=0.01728   val_loss=0.01755   sm_loss=0.01730   sm_val_loss=0.01753   time=1.36s
Epoch 11   loss=0.01698   val_loss=0.01750   sm_loss=0.01701   sm_val_loss=0.01746   time=1.22s
Epoch 13   loss=0.01643   val_loss=0.01727   sm_loss=0.01646   sm_val_loss=0.01726   time=1.24s
Epoch 16   loss=0.01542   val_loss=0.01708   sm_loss=0.01546   sm_val_loss=0.01707   time=1.51s
Epoch 18   loss=0.01440   val_loss=0.01707   sm_loss=0.01445   sm_val_loss=0.01704   time=1.27s
Epoch 19   loss=0.01398   val_loss=0.01703   sm_loss=0.01403   sm_val_loss=0.01701   time=1.26s
Epoch 20   loss=0.01379   val_loss=0.0

In [17]:
# Score the OOF predictions against all training rows: rows outside
# cons_train_index were never predicted and keep a prediction of zero.
check_mlp = np.zeros(y.shape)
check_mlp[cons_train_index, :] = mlp1_oof
print('OOF log loss: ', log_loss(np.ravel(y), np.ravel(check_mlp)))

OOF log loss:  0.01563544592517656


In [18]:
# ROC AUC per target column of the MLP OOF predictions, then the mean.
aucs = [
    roc_auc_score(y_true=y.iloc[:, task_id].values,
                  y_score=check_mlp[:, task_id])
    for task_id in range(y.shape[1])
]
print(f"Overall AUC : {np.mean(aucs)}")

Overall AUC : 0.6571753301378248


# 1st tabnet

In [19]:
class LogitsLogLoss(Metric):
    """Mean binary log loss for predictions given as logits.

    Registered under the name "logits_ll"; lower is better
    (``_maximize`` is False).
    """

    def __init__(self):
        self._name = "logits_ll"
        self._maximize = False

    def __call__(self, y_true, y_pred):
        """Return the mean negative log-likelihood of ``y_true`` under sigmoid(``y_pred``)."""
        # Sigmoid turns the raw scores into probabilities.
        probs = 1 / (1 + np.exp(-y_pred))

        # 1e-15 keeps both logarithms finite at probabilities of exactly 0 or 1.
        negative_term = (1 - y_true) * np.log(1 - probs + 1e-15)
        positive_term = y_true * np.log(probs + 1e-15)
        return np.mean(-(negative_term + positive_term))

In [20]:
# Upper bound on TabNet epochs per fold (early stopping usually ends sooner).
MAX_EPOCH=200

def modelling_tabnet(tr, target, te, sample_seed):
    """Train a TabNetRegressor per fold on multilabel targets.

    The regressor is trained with binary cross-entropy on logits and evaluated
    with the "logits_ll" metric; its raw outputs are passed through a sigmoid
    to obtain probabilities. Folds come from a 7-way
    MultilabelStratifiedKFold with a fixed ``random_state=0``.

    Parameters
    ----------
    tr : 2-D array of training features.
    target : 2-D array of 0/1 labels aligned with ``tr``.
    te : 2-D array of test features.
    sample_seed : seed for ``seed_everything`` and for TabNet itself.

    Returns
    -------
    (oof_preds, test_preds_all) : OOF probabilities with the shape of
    ``target``, and test probabilities stacked per fold
    (n_folds x n_test x n_targets).
    """
    seed_everything(sample_seed)
    tabnet_params = dict(n_d=24, n_a=24, n_steps=1, gamma=1.5, seed = sample_seed,
                     lambda_sparse=0, n_independent=1, n_shared=1,
                     optimizer_fn=torch.optim.Adam,
                     optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
                     mask_type='entmax',
                     scheduler_params=dict(mode="min",
                                           patience=5,
                                           min_lr=1e-5,
                                           factor=0.9,),
                     scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
                     verbose=10,
                     )
    test_cv_preds = []

    NB_SPLITS = 7
    mskf = MultilabelStratifiedKFold(n_splits=NB_SPLITS, random_state=0, shuffle=True)
    oof_preds = np.zeros([len(tr), target.shape[1]])
    # Split on the `tr` argument, not on the notebook-level `train` frame.
    for fold_nb, (train_idx, val_idx) in enumerate(mskf.split(tr, target)):
        print("FOLDS : ", fold_nb+1)

        ## model
        X_train, y_train = tr[train_idx, :], target[train_idx, :]
        X_val, y_val = tr[val_idx, :], target[val_idx, :]
        model = TabNetRegressor(**tabnet_params)

        model.fit(X_train=X_train,
              y_train=y_train,
              eval_set=[(X_val, y_val)],
              eval_name = ["val"],
              eval_metric = ["logits_ll"],
              max_epochs=MAX_EPOCH,
              patience=20, batch_size=512, virtual_batch_size=128,
              num_workers=1, drop_last=False,
              # use binary cross entropy as this is not a regression problem
              loss_fn=torch.nn.functional.binary_cross_entropy_with_logits)

        # Apply sigmoid to the raw validation predictions
        preds_val = model.predict(X_val)
        oof_preds[val_idx, :] = 1 / (1 + np.exp(-preds_val))

        # preds on test
        preds_test = model.predict(te)
        test_cv_preds.append(1 / (1 + np.exp(-preds_test)))

    test_preds_all = np.stack(test_cv_preds)
    print("OOF log loss:", log_loss(np.ravel(target), np.ravel(np.array(oof_preds))))
    # One AUC per target column; the column count comes from `target`.
    aucs = [
        roc_auc_score(y_true=target[:, task_id], y_score=oof_preds[:, task_id])
        for task_id in range(target.shape[1])
    ]
    print(f"Overall AUC : {np.mean(aucs)}")
    return oof_preds, test_preds_all

In [21]:
# Accumulators for the seed-averaged TabNet OOF and test predictions (rows x targets).
tabnet1_oof = np.zeros([len(tab_train),fn_targets.shape[1]])
tabnet1_test = np.zeros([len(tab_test),fn_targets.shape[1]])
# This rebinds the notebook-level `seeds` list used by the earlier MLP loop.
seeds = [0,100,2,10,20]
for seed_ in seeds:
    oof_preds, test_preds_all = modelling_tabnet(tab_train, fn_targets, tab_test, seed_)
    tabnet1_oof += oof_preds / len(seeds)
    # test_preds_all is stacked per fold; average over folds first, then over seeds.
    tabnet1_test += test_preds_all.mean(axis=0) / len(seeds)

FOLDS :  1
Device used : cuda
epoch 0  | loss: 0.23313 | val_logits_ll: 0.02635 |  0:00:01s
epoch 10 | loss: 0.01787 | val_logits_ll: 0.02005 |  0:00:15s
epoch 20 | loss: 0.01684 | val_logits_ll: 0.01746 |  0:00:28s
epoch 30 | loss: 0.01651 | val_logits_ll: 0.01718 |  0:00:41s
epoch 40 | loss: 0.01632 | val_logits_ll: 0.01702 |  0:00:54s
epoch 50 | loss: 0.01616 | val_logits_ll: 0.01706 |  0:01:07s
epoch 60 | loss: 0.01613 | val_logits_ll: 0.01693 |  0:01:21s
epoch 70 | loss: 0.01593 | val_logits_ll: 0.01664 |  0:01:34s
epoch 80 | loss: 0.01568 | val_logits_ll: 0.01677 |  0:01:48s
epoch 90 | loss: 0.01547 | val_logits_ll: 0.01678 |  0:02:01s
epoch 100| loss: 0.01516 | val_logits_ll: 0.01684 |  0:02:14s
epoch 110| loss: 0.01501 | val_logits_ll: 0.01663 |  0:02:27s

Early stopping occured at epoch 119 with best_epoch = 99 and best_val_logits_ll = 0.01653
Best weights from best epoch are automatically used!
FOLDS :  2
Device used : cuda
epoch 0  | loss: 0.23281 | val_logits_ll: 0.02708 | 

In [22]:
# Score the TabNet OOF predictions against all training rows: rows outside
# cons_train_index were never predicted and keep a prediction of zero.
check_tabnet = np.zeros(y.shape)
check_tabnet[cons_train_index, :] = tabnet1_oof
print('OOF log loss: ', log_loss(np.ravel(y), np.ravel(check_tabnet)))

OOF log loss:  0.014727178892444778


# 1st multiclass-mlp

In [23]:
# Settings for the multiclass MLP. These rebind the notebook-level globals of
# the same names set for the first MLP (train_epochs changes from 20 to 10).
batch_size = 128
n_folds=7
# Number of consecutive non-improving epochs tolerated before training stops.
# NOTE(review): equal to train_epochs, so the stop condition looks unreachable — confirm intent.
EARLY_STOPPING_STEPS = 10
train_epochs = 10

print(device)

class MoaModel(nn.Module):
    def __init__(self, num_columns, last_num):
        super(MoaModel, self).__init__()
        self.batch_norm1 = nn.BatchNorm1d(num_columns)
        self.dropout1 = nn.Dropout(0.2)
        self.dense1 = nn.utils.weight_norm(nn.Linear(num_columns, 1024))
        self.relu1 = nn.LeakyReLU()
        
        self.batch_norm2 = nn.BatchNorm1d(1024)
        self.dropout2 = nn.Dropout(0.2)
        self.dense2 = nn.utils.weight_norm(nn.Linear(1024, 1024))
        self.relu2 = nn.LeakyReLU()
        
        self.batch_norm3 = nn.BatchNorm1d(1024)
        self.dropout3 = nn.Dropout(0.2)
        self.dense3 = nn.utils.weight_norm(nn.Linear(1024, 1024))
        self.relu3 = nn.LeakyReLU()
        
        self.batch_norm4 = nn.BatchNorm1d(1024)
        self.dropout4 = nn.Dropout(0.2)
        self.dense4 = nn.utils.weight_norm(nn.Linear(1024, last_num))
        
    def forward(self, x):
        x = self.batch_norm1(x)
        x = self.dropout1(x)
        x = self.relu1(self.dense1(x))
        
        x = self.batch_norm2(x)
        x = self.dropout2(x)
        x = self.relu2(self.dense2(x))
        
        x = self.batch_norm3(x)
        x = self.dropout3(x)
        x = self.relu3(self.dense3(x))
        
        x = self.batch_norm4(x)
        x = self.dropout4(x)
        x = self.dense4(x)
        
        return x
    
def modelling_torch(tr, target, te, sample_seed, init_num, last_num):
    """Train one multiclass MoaModel per KFold split; return OOF and test class probabilities.

    Uses ``nn.CrossEntropyLoss`` on integer class ids and a 'n_folds'-way
    shuffled KFold with a fixed ``random_state=224``. Reads the globals
    ``n_folds``, ``batch_size``, ``train_epochs``, ``EARLY_STOPPING_STEPS``
    and ``device``. The best weights of each fold (lowest validation loss) are
    checkpointed to 'best-model-parameters.pt' and reloaded for prediction.
    Defining this function replaces any earlier ``modelling_torch`` in the
    notebook namespace (note the different signature).

    Parameters
    ----------
    tr : 2-D array of training features (every column is a model input).
    target : 1-D array of integer class ids aligned with ``tr``.
    te : 2-D array of test features.
    sample_seed : seed passed to ``seed_everything``.
    init_num : number of model inputs.
    last_num : number of classes (model outputs).

    Returns
    -------
    (oof, pred_value) : softmax probabilities for every training row from the
    fold that held it out, and test probabilities averaged over folds.
    """
    seed_everything(seed=sample_seed)
    X_train = tr.copy()
    y_train = target.copy()
    X_test = te.copy()
    test_len = X_test.shape[0]

    kfold = KFold(n_splits = n_folds, shuffle=True, random_state=224)

    X_test2 = torch.tensor(X_test, dtype=torch.float32)
    test = torch.utils.data.TensorDataset(X_test2)
    test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)

    oof = np.zeros([len(X_train), last_num])
    pred_value = np.zeros([test_len, last_num])
    print(pred_value.shape)
    for fold, (train_index, valid_index) in enumerate(kfold.split(X_train, y_train)):
        print("Seed "+str(sample_seed)+"_Fold "+str(fold+1))
        X_train2 = torch.tensor(X_train[train_index, :], dtype=torch.float32)
        y_train2 = torch.tensor(y_train[train_index], dtype=torch.long)

        X_valid2 = torch.tensor(X_train[valid_index, :], dtype=torch.float32)
        y_valid2 = torch.tensor(y_train[valid_index], dtype=torch.long)

        train = torch.utils.data.TensorDataset(X_train2, y_train2)
        valid = torch.utils.data.TensorDataset(X_valid2, y_valid2)

        train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
        valid_loader = torch.utils.data.DataLoader(valid, batch_size=batch_size, shuffle=False)

        clf = MoaModel(init_num, last_num)
        loss_fn = nn.CrossEntropyLoss()

        optimizer = optim.Adam(clf.parameters(), lr = 0.01, weight_decay=1e-5)
        # Stepped once per batch, hence steps_per_epoch=len(train_loader).
        scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.1, div_factor=1e3,
                                              max_lr=1e-2, epochs=train_epochs, steps_per_epoch=len(train_loader))

        clf.to(device)

        best_val_loss = np.inf
        stop_counts = 0
        for epoch in range(train_epochs):
            start_time = time.time()
            clf.train()
            avg_loss = 0.
            for x_batch, y_batch in tqdm(train_loader, disable=True):
                x_batch = x_batch.to(device)
                y_batch = y_batch.to(device)
                y_pred = clf(x_batch)
                loss = loss_fn(y_pred, y_batch)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                scheduler.step()
                avg_loss += loss.item() / len(train_loader)

            clf.eval()
            avg_val_loss = 0.
            with torch.no_grad():
                for x_batch, y_batch in valid_loader:
                    x_batch = x_batch.to(device)
                    y_batch = y_batch.to(device)
                    y_pred = clf(x_batch)
                    avg_val_loss += loss_fn(y_pred, y_batch).item() / len(valid_loader)

            elapsed_time = time.time() - start_time

            # Checkpoint (and log) improving epochs; count the others towards early stopping.
            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                print('Epoch {}  loss={:.5f}  val_loss={:.5f}  time={:.2f}s'.format(
                    epoch + 1, avg_loss, avg_val_loss, elapsed_time))
                torch.save(clf.state_dict(), 'best-model-parameters.pt')
                stop_counts = 0
            else:
                stop_counts += 1

            if stop_counts >= EARLY_STOPPING_STEPS:
                break

        # Fresh model with the best checkpoint; it stays on the CPU, matching
        # the loaders, which yield CPU tensors.
        pred_model = MoaModel(init_num, last_num)
        pred_model.load_state_dict(torch.load('best-model-parameters.pt'))
        pred_model.eval()

        # validation check ----------------
        oof_epoch = np.zeros([X_valid2.size(0), last_num])
        with torch.no_grad():
            for i, (x_batch, y_batch) in enumerate(valid_loader):
                rows = slice(i * batch_size, (i + 1) * batch_size)
                # Softmax over the class axis; dim is explicit to avoid the
                # deprecated implicit-dimension behaviour.
                oof_epoch[rows, :] = F.softmax(pred_model(x_batch).cpu(), dim=1).numpy()
        oof[valid_index, :] = oof_epoch
        #-----------------------------------

        # test prediction --------------
        test_preds = np.zeros([test_len, last_num])
        with torch.no_grad():
            for i, (x_batch,) in enumerate(test_loader):
                rows = slice(i * batch_size, (i + 1) * batch_size)
                test_preds[rows, :] = F.softmax(pred_model(x_batch).cpu(), dim=1).numpy()
        pred_value += test_preds / n_folds
        # ------------------------------

    return oof, pred_value

cuda


In [24]:
# Seeds whose models are averaged for the multiclass MLP.
seeds = [0,1,2,3,4]
# Accumulators for seed-averaged class probabilities (rows x classes).
multiclass_mlp_oof = np.zeros([len(multiclass_mlp_train), class_num])
# NOTE(review): this rebinds multiclass_mlp_test from the feature DataFrame to the
# prediction accumulator; only its row count is taken from the old value.
multiclass_mlp_test = np.zeros([len(multiclass_mlp_test), class_num])

for seed_ in seeds:
    # Every column of the fn_* matrix is a model input here (no trailing fold column).
    best_oof, pytorch_pred = modelling_torch(fn_multiclass_mlp_train, fn_multiclass_targets, fn_multiclass_mlp_test,
                                             seed_, fn_multiclass_mlp_train.shape[1], class_num)
    multiclass_mlp_oof += best_oof / len(seeds)
    multiclass_mlp_test += pytorch_pred / len(seeds)

(3624, 328)
Seed 0_Fold 1
Epoch 1  loss=4.11265  val_loss=3.39937  time=2.27s
Epoch 2  loss=3.19641  val_loss=3.13618  time=1.48s
Epoch 3  loss=2.95805  val_loss=3.04786  time=1.48s
Epoch 4  loss=2.80074  val_loss=2.99965  time=1.48s
Epoch 5  loss=2.61102  val_loss=2.99872  time=1.45s
Seed 0_Fold 2
Epoch 1  loss=4.11124  val_loss=3.26748  time=1.37s
Epoch 2  loss=3.23703  val_loss=3.03651  time=1.36s
Epoch 3  loss=2.98830  val_loss=2.95498  time=1.36s
Epoch 4  loss=2.80579  val_loss=2.89805  time=1.66s
Epoch 5  loss=2.63440  val_loss=2.89547  time=1.36s
Seed 0_Fold 3
Epoch 1  loss=4.10482  val_loss=3.38680  time=2.46s
Epoch 2  loss=3.18181  val_loss=3.18579  time=1.39s
Epoch 3  loss=2.95567  val_loss=3.10037  time=1.40s
Epoch 4  loss=2.77925  val_loss=3.04453  time=1.39s
Epoch 5  loss=2.59252  val_loss=3.02385  time=1.38s
Seed 0_Fold 4
Epoch 1  loss=4.10329  val_loss=3.39123  time=1.36s
Epoch 2  loss=3.15814  val_loss=3.20495  time=1.38s
Epoch 3  loss=2.95073  val_loss=3.14425  time=1.

In [25]:
# Project class probabilities onto the individual targets: row k of classid_target
# is the 0/1 label vector of class k, so the dot product sums, per target, the
# probability of every class that has that target set.
multiclass_mlp_oof = np.dot(multiclass_mlp_oof, classid_target)
multiclass_mlp_test = np.dot(multiclass_mlp_test, classid_target)

# Clip the projected probabilities away from exactly 0 and 1.
# NOTE(review): this rebinds the notebook-level p_min / p_max, which modelling_torch
# defined earlier reads as globals — any later call of it would use these bounds.
p_min = 0.001
p_max = 1 - p_min
multiclass_mlp_oof = np.clip(multiclass_mlp_oof, p_min, p_max)
multiclass_mlp_test = np.clip(multiclass_mlp_test, p_min, p_max)

In [26]:
# Score the multiclass-MLP OOF predictions against all training rows: rows outside
# cons_train_index were never predicted and keep a prediction of zero.
check_multiclass_mlp = np.zeros(y.shape)
check_multiclass_mlp[cons_train_index, :] = multiclass_mlp_oof
print('OOF log loss: ', log_loss(np.ravel(y), np.ravel(check_multiclass_mlp)))

OOF log loss:  0.014685356751855078


# 1st multiclass tabnet

In [27]:
MAX_EPOCH=200
device = "cuda" if torch.cuda.is_available() else "cpu"
        
def modelling_tabnet(tr, target, te, sample_seed, target_shape, n_splits=6):
    """Fit one TabNetClassifier per stratified fold and collect probabilities.

    Parameters
    ----------
    tr : array-like, indexed as ``tr[idx, :]``
        Training feature matrix.
    target : array-like, indexed as ``target[idx]``
        One class label per training row; used both to stratify the folds
        and as the fitting target.
    te : array-like
        Test feature matrix; scored by every fold's model.
    sample_seed : int
        Forwarded to ``seed_everything`` and to TabNet's ``seed`` parameter.
    target_shape : int
        Number of columns of the out-of-fold matrix. It must match the width
        of ``predict_proba``'s output, otherwise the OOF assignment fails.
    n_splits : int, default 6
        Number of cross-validation folds.

    Returns
    -------
    oof_preds : np.ndarray, shape ``(len(tr), target_shape)``
        Out-of-fold class probabilities.
    test_preds_all : np.ndarray
        Per-fold test probabilities stacked along a new leading axis
        (one slice per fold); callers average over axis 0.
    """
    seed_everything(sample_seed)
    tabnet_params = dict(n_d=32, n_a=32, n_steps=1, gamma=1.3, seed=sample_seed,
                         lambda_sparse=0, optimizer_fn=torch.optim.Adam,
                         optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
                         mask_type='entmax',
                         scheduler_params=dict(mode="min",
                                               patience=5,
                                               min_lr=1e-5,
                                               factor=0.9,),
                         scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
                         verbose=10,
                         )

    fold_test_preds = []
    oof_preds = np.zeros([len(tr), target_shape])

    # The splitter's random_state is fixed at 0, so every `sample_seed` trains
    # on the same fold assignment; only the model-side randomness changes.
    skf = StratifiedKFold(n_splits=n_splits, random_state=0, shuffle=True)
    for fold_nb, (train_idx, val_idx) in enumerate(skf.split(tr, target)):
        print("FOLDS : ", fold_nb+1)

        X_train, y_train = tr[train_idx, :], target[train_idx]
        X_val, y_val = tr[val_idx, :], target[val_idx]
        model = TabNetClassifier(**tabnet_params)

        # The validation fold drives both early stopping (patience=20) and
        # the reported "val_logloss" metric.
        model.fit(X_train=X_train,
                  y_train=y_train,
                  eval_set=[(X_val, y_val)],
                  eval_name=["val"],
                  eval_metric=['logloss'],
                  max_epochs=MAX_EPOCH,
                  patience=20, batch_size=256, virtual_batch_size=128,
                  num_workers=1)

        # Out-of-fold probabilities for the rows held out in this fold.
        oof_preds[val_idx, :] = model.predict_proba(X_val)

        # Every fold's model also scores the full test set.
        fold_test_preds.append(model.predict_proba(te))

    return oof_preds, np.stack(fold_test_preds)

# Zero-initialised accumulators for the seed-averaged class probabilities.
multiclass_tabnet_oof = np.zeros((len(multiclass_tab_train), class_num))
multiclass_tabnet_test = np.zeros((len(multiclass_tab_test), class_num))

# Five seeds; each contributes an equal 1/len(seeds) share to the averages.
seeds = list(range(5))
for seed_ in seeds:
    oof_preds, test_preds_all = modelling_tabnet(
        multiclass_tab_train,
        fn_multiclass_tabnet_targets,
        multiclass_tab_test,
        seed_,
        class_num,
    )
    multiclass_tabnet_oof += oof_preds / len(seeds)
    # Collapse the fold axis first, then fold the result into the seed average.
    multiclass_tabnet_test += np.mean(test_preds_all, axis=0) / len(seeds)

FOLDS :  1
Device used : cuda
epoch 0  | loss: 4.09597 | val_logloss: 3.74387 |  0:00:02s
epoch 10 | loss: 2.83402 | val_logloss: 3.16361 |  0:00:26s
epoch 20 | loss: 2.11482 | val_logloss: 3.60851 |  0:00:51s

Early stopping occured at epoch 27 with best_epoch = 7 and best_val_logloss = 3.15102
Best weights from best epoch are automatically used!
FOLDS :  2
Device used : cuda
epoch 0  | loss: 4.11473 | val_logloss: 3.73238 |  0:00:02s
epoch 10 | loss: 2.83603 | val_logloss: 3.13761 |  0:00:26s
epoch 20 | loss: 2.14984 | val_logloss: 3.52574 |  0:00:51s
epoch 30 | loss: 1.3328  | val_logloss: 5.05292 |  0:01:15s

Early stopping occured at epoch 31 with best_epoch = 11 and best_val_logloss = 3.11086
Best weights from best epoch are automatically used!
FOLDS :  3
Device used : cuda
epoch 0  | loss: 4.12638 | val_logloss: 3.6585  |  0:00:02s
epoch 10 | loss: 2.84536 | val_logloss: 3.14115 |  0:00:26s
epoch 20 | loss: 2.18855 | val_logloss: 3.60345 |  0:00:51s

Early stopping occured at ep

In [28]:
# Project the per-class probabilities through `classid_target`, keeping only
# the first 21948 OOF rows.
# NOTE(review): 21948 is presumably the number of original (non-augmented)
# training rows that the next cell scatters via `cons_train_index` — confirm
# and consider deriving it instead of hard-coding.
multiclass_tabnet_oof = np.dot(multiclass_tabnet_oof, classid_target)[:21948, :]
multiclass_tabnet_test = np.dot(multiclass_tabnet_test, classid_target)

# Keep predictions away from exactly 0 and 1 (symmetric clipping bounds).
p_min = 0.001
p_max = 1 - p_min
multiclass_tabnet_oof, multiclass_tabnet_test = (
    np.clip(projected, p_min, p_max)
    for projected in (multiclass_tabnet_oof, multiclass_tabnet_test)
)

In [29]:
# Build a zero matrix with the same (rows, columns) as `y`, drop the OOF
# predictions into the rows selected by `cons_train_index`, and score the
# flattened matrices with log loss. Rows not selected stay at 0.
check_multiclass_tabnet = np.zeros(y.shape[:2])
check_multiclass_tabnet[cons_train_index, :] = multiclass_tabnet_oof
print(
    'OOF log loss: ',
    log_loss(np.ravel(y), check_multiclass_tabnet.ravel()),
)

OOF log loss:  0.015278180066217983


# submission

In [30]:
# Clipping bounds for the final blend.
p_min = 0.0005
p_max = 1 - p_min

sub = pd.read_csv(DATA_DIR + 'sample_submission.csv')

# Equal-weight average of the four models' test predictions, clipped to
# [p_min, p_max], written into the rows selected by `cons_test_index`.
sub.loc[cons_test_index, target_feats] = np.clip(
    (mlp1_test + tabnet1_test + multiclass_mlp_test + multiclass_tabnet_test) / 4,
    p_min,
    p_max,
)

# Rows selected by `noncons_test_index` are set to 0 for every target column.
sub.loc[noncons_test_index, target_feats] = 0

sub.to_csv('submission.csv', index=False)