**late submission**
- copy TabNet from version 83
- solve as a multiclass classification problem (metric: logloss)
- data augmentation for the minority classes with CutMix
- 6 folds × 5 seeds
- change batch size to 256
- add RobustScaler

In [1]:
!pip install --no-index --find-links /kaggle/input/pytorchtabnet/pytorch_tabnet-2.0.0-py3-none-any.whl pytorch-tabnet

Looking in links: /kaggle/input/pytorchtabnet/pytorch_tabnet-2.0.0-py3-none-any.whl
Processing /kaggle/input/pytorchtabnet/pytorch_tabnet-2.0.0-py3-none-any.whl
Installing collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-2.0.0


In [2]:
import os
import random
import warnings
import numpy as np
import pandas as pd 
from sklearn.metrics import log_loss
from sklearn.utils import check_random_state
from sklearn.model_selection import StratifiedKFold
from sklearn import preprocessing
from pytorch_tabnet.tab_model import TabNetClassifier

import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import tensorflow as tf

warnings.filterwarnings('ignore')

In [3]:
# Every competition file is read from this directory. DATA_DIR is kept as a
# plain string (with trailing slash) because later cells concatenate onto it.
DATA_DIR = '/kaggle/input/lish-moa/'

# Feature tables.
train = pd.read_csv(f'{DATA_DIR}train_features.csv')
test = pd.read_csv(f'{DATA_DIR}test_features.csv')

# Target tables and the submission template.
targets = pd.read_csv(f'{DATA_DIR}train_targets_scored.csv')
non_targets = pd.read_csv(f'{DATA_DIR}train_targets_nonscored.csv')
sub = pd.read_csv(f'{DATA_DIR}sample_submission.csv')

# Drug identifiers for the training rows.
drug = pd.read_csv(f'{DATA_DIR}train_drug.csv')

In [4]:
# Column groups: every target column, and the feature columns whose name
# contains "g-" or "c-".
target_feats = [col for col in targets.columns if col != "sig_id"]
g_feats = [col for col in train.columns if "g-" in col]
c_feats = [col for col in train.columns if "c-" in col]

# Row labels split on cp_type == "ctl_vehicle". The "noncons" rows get their
# predictions forced to 0 later in the notebook; the "cons" rows are modelled.
train_is_ctl = train.cp_type == "ctl_vehicle"
test_is_ctl = test.cp_type == "ctl_vehicle"

noncons_train_index = train[train_is_ctl].index
cons_train_index = train[~train_is_ctl].index
noncons_test_index = test[test_is_ctl].index
cons_test_index = test[~test_is_ctl].index

# preprocess

In [5]:
def make_string(row):
    """Encode one row of the targets table as a separator-free digit string.

    The first element of ``row`` is skipped (the notebook applies this
    row-wise to the targets frame, whose first column is ``sig_id``) and the
    remaining values are concatenated, e.g. ``"0100"``.

    The string is built value by value instead of from ``str(ndarray)``:
    NumPy abbreviates arrays longer than its print threshold with ``...`` and
    its output depends on global print options, either of which would
    silently corrupt the key that is later decoded character by character.

    Parameters
    ----------
    row : pandas.Series
        One row; every element after the first must be convertible to ``int``.

    Returns
    -------
    str
        One character sequence per row, one integer rendering per value.
    """
    return ''.join(str(int(value)) for value in row.iloc[1:].values)

# Collapse each distinct combination of target labels into a single class id.
targets["target_pair"] = targets.apply(make_string, axis=1)
targetpair_id = dict(enumerate(targets["target_pair"].unique()))
id_targetpair = {pair: class_id for class_id, pair in targetpair_id.items()}
targets["target_pair_num"] = targets["target_pair"].map(id_targetpair)

multiclass_targets = pd.get_dummies(targets["target_pair_num"])

# Decoding matrix: row k holds the per-target labels of class k, recovered by
# splitting the class's digit string into characters. Built in one pass rather
# than by repeated np.vstack, which re-copies the whole matrix per class.
classid_target = np.array(
    [list(targetpair_id[class_id]) for class_id in range(len(targetpair_id))]
).astype(int)
class_num = multiclass_targets.shape[1]

multiclass_targets.shape, classid_target.shape

((23814, 328), (328, 206))

In [6]:
# Keep only the rows listed in cons_train_index (the non-"ctl_vehicle" rows)
# and renumber them from 0, for features and targets alike.
train_keep = train.index.isin(cons_train_index)
train = train.loc[train_keep].reset_index(drop=True)

targets_keep = targets.index.isin(cons_train_index)
targets = targets.loc[targets_keep].reset_index(drop=True)

# Class id per remaining row; an independent copy so later edits do not touch
# the targets frame.
multiclass_targets = targets["target_pair_num"].copy()

In [7]:
# Number of rows per class id.
tmp = multiclass_targets.value_counts().copy()

# Class ids that occur exactly once / exactly six times.
minor_class = tmp.loc[tmp.eq(1)].index
minor_class6 = tmp.loc[tmp.eq(6)].index
minor_class, minor_class6

(Int64Index([326, 327, 318, 312, 325, 310], dtype='int64'),
 Int64Index([321, 275,  19, 307, 323, 290, 146, 243, 322, 256,
             ...
             280, 168, 263, 303, 231, 199, 183,  71,  39,   7],
            dtype='int64', length=111))

# feature engineering

In [8]:
# RobustScaler computes its statistics independently per column, so a single
# fit over all "c-" and "g-" columns yields the same centering and scaling as
# one scaler per column while avoiding hundreds of separate fit calls.
# Statistics come from the training rows only and are reused for the test rows.
scaled_cols = c_feats + g_feats
ss = preprocessing.RobustScaler()
ss.fit(train[scaled_cols].to_numpy())
train[scaled_cols] = ss.transform(train[scaled_cols].to_numpy())
test[scaled_cols] = ss.transform(test[scaled_cols].to_numpy())

In [9]:
def fe(df):
    """Return a model-ready copy of a feature frame.

    * ``cp_dose`` is encoded as ``'D1' -> 0`` and ``'D2' -> 1`` (any other
      value becomes NaN, as with ``Series.map``).
    * ``sig_id`` and ``cp_type`` are removed.

    The encoded column is attached with ``assign`` rather than written through
    ``.loc[:, 'cp_dose'] = ...``: on pandas >= 2.0 the ``.loc`` form writes
    into the existing object-dtype column, which would leave ``cp_dose`` (and
    the whole ``.to_numpy()`` matrix) as ``object`` instead of numeric.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``sig_id``, ``cp_type`` and ``cp_dose``. Not modified.

    Returns
    -------
    pandas.DataFrame
        New frame with the remaining columns in their original order.
    """
    dose_codes = df['cp_dose'].map({'D1': 0, 'D2': 1})
    return df.drop(columns=["sig_id", "cp_type"]).assign(cp_dose=dose_codes)

# Encode both splits with the same transformation, then show that they end up
# with the same number of columns.
train, test = fe(train), fe(test)

print(train.shape, test.shape)

(21948, 874) (3982, 874)


In [10]:
# data augmentation by cutmix
# https://www.kaggle.com/yxohrxn/moa-cutmix

def cutmix_for_tabular(x, y=None, alpha=1.0, p=1.0, random_state=None, minor_classes=None):
    """Build CutMix-style mixed rows and return those labelled with a minor class.

    A mixing ratio ``l`` is drawn from ``Beta(alpha, alpha)``. Every feature
    column is selected independently with probability ``1 - l``; in the
    selected columns each row receives the values of a partner row (one random
    permutation of the rows, shared by all columns). The one-hot labels are
    blended as ``l * y + (1 - l) * y[partner]`` and the class with the largest
    blended weight becomes the row's label.

    Parameters
    ----------
    x : numpy.ndarray, shape (n, d)
        Feature matrix. Not modified.
    y : numpy.ndarray, shape (n, n_classes)
        One-hot labels. Required, despite the ``None`` default kept for
        signature compatibility.
    alpha : float
        Both shape parameters of the Beta distribution.
    p : float
        Probability of mixing at all. The decision is drawn from
        ``random_state`` so that results are reproducible; with ``p >= 1`` no
        draw is consumed.
    random_state : None, int or numpy.random.RandomState
        Passed to ``sklearn.utils.check_random_state``.
    minor_classes : array-like, optional
        Class ids to keep. Defaults to the notebook-level ``minor_class``.

    Returns
    -------
    (numpy.ndarray, numpy.ndarray)
        The selected mixed rows and their class ids.

    Raises
    ------
    ValueError
        If ``y`` is ``None``.
    """
    if y is None:
        raise ValueError("y (one-hot labels) is required to select minor-class rows")

    x_ = x.copy()
    n, d = x.shape
    rng = check_random_state(random_state)

    if p >= 1.0 or rng.random_sample() < p:
        l = rng.beta(alpha, alpha)
        mask = rng.choice([False, True], size=d, p=[l, 1.0 - l])
        mask = np.where(mask)[0]
        shuffle = rng.choice(n, n, replace=False)
        # The right-hand side is materialised before the assignment, so rows
        # are always mixed with partner values from before this step.
        x_[:, mask] = x_[np.ix_(shuffle, mask)]
        y = l * y + (1.0 - l) * y[shuffle]

    # extract minor class
    wanted = minor_class if minor_classes is None else minor_classes
    tmp_index = np.where(np.isin(np.argmax(y, axis=1), wanted))[0]
    print(tmp_index)

    return x_[tmp_index], np.argmax(y[tmp_index], axis=1)

# The inputs are identical for every augmentation round, so build them once
# instead of re-deriving the one-hot matrix and the feature array per seed.
train_values = train.values
onehot_targets = pd.get_dummies(multiclass_targets).values

# One CutMix round per seed; each returns (mixed rows, class ids).
augmented = [
    cutmix_for_tabular(train_values, onehot_targets, alpha=1, p=1, random_state=seed)
    for seed in [0, 1, 2, 3, 4]
]
train_mod = np.concatenate([x_aug for x_aug, _ in augmented], axis=0)
y_mod = np.concatenate([y_aug for _, y_aug in augmented])

# Append the synthetic rows after the original rows; later cells rely on the
# original rows coming first.
train_mod = pd.DataFrame(train_mod, columns = train.columns)
train = pd.concat([train, train_mod], axis=0).reset_index(drop=True)

y_mod = pd.DataFrame(y_mod)
multiclass_targets = pd.concat([multiclass_targets, y_mod]).reset_index(drop=True)

[  599  2348  5784 12816 17718 20062]
[ 6269  6613  9380 10755 13810 14313]
[ 7176  7455  9098 15661 15970 18894]
[  858  9318  9931 18208 19565 20789]
[ 7176  7455  9098 15661 15970 18894]


In [11]:
# Detached NumPy copies of the model inputs.
fn_train = train.to_numpy(copy=True)
fn_test = test.to_numpy(copy=True)

# Every targets column except sig_id, and the flat vector of class ids.
fn_targets = targets.drop(columns="sig_id").to_numpy(copy=True)
fn_multiclass_targets = multiclass_targets.to_numpy(copy=True).reshape(-1)

fn_train.shape, fn_test.shape, fn_multiclass_targets.shape

((21978, 874), (3982, 874), (21978,))

# modelling

In [12]:
# Upper bound on training epochs per fold.
MAX_EPOCH = 200
device = "cuda" if torch.cuda.is_available() else "cpu"


def seed_everything(seed_value):
    """Seed Python's ``random``, NumPy and PyTorch with ``seed_value``.

    Also exports ``PYTHONHASHSEED`` and, when CUDA is available, seeds the
    CUDA generators and switches cuDNN to deterministic, non-benchmarking mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed_value)
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed_value)

    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

def modelling_tabnet(tr, target, te, sample_seed, target_shape):
    """Train one TabNet classifier per stratified fold and collect predictions.

    Parameters
    ----------
    tr : array indexed as ``tr[idx, :]``
        Training features.
    target : array indexed as ``target[idx]``
        Class id per training row; also used to stratify the folds.
    te : array
        Test features, scored by every fold's model.
    sample_seed : int
        Seed passed to ``seed_everything`` and to TabNet. The fold split
        itself uses a fixed ``random_state=0``, so folds are the same for
        every seed.
    target_shape : int
        Number of columns of the out-of-fold matrix.
        NOTE(review): must equal the number of probability columns returned
        by ``predict_proba`` -- presumably every class has to be present in
        each training fold; confirm.

    Returns
    -------
    oof_preds : numpy.ndarray, shape (len(tr), target_shape)
        Validation-fold class probabilities, written at each row's position.
    test_preds_all : numpy.ndarray
        Per-fold test probabilities stacked along a new leading axis.
    """
    seed_everything(sample_seed) 
    tabnet_params = dict(n_d=32, n_a=32, n_steps=1, gamma=1.3, seed = sample_seed,
                     lambda_sparse=0, optimizer_fn=torch.optim.Adam,
                     optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
                     mask_type='entmax',
                     scheduler_params=dict(mode="min",
                                           patience=5,
                                           min_lr=1e-5,
                                           factor=0.9,),
                     scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
                     verbose=10,
                     )
    test_cv_preds = []

    oof_preds = np.zeros([len(tr),target_shape])
    # Best validation logloss per fold; collected but neither printed nor returned.
    scores = []
    NB_SPLITS = 6
    mskf = StratifiedKFold(n_splits=NB_SPLITS, random_state=0, shuffle=True)
    for fold_nb, (train_idx, val_idx) in enumerate(mskf.split(tr, target)):
        print("FOLDS : ", fold_nb+1)

        ## model
        X_train, y_train = tr[train_idx, :], target[train_idx]
        X_val, y_val = tr[val_idx, :], target[val_idx]
        model = TabNetClassifier(**tabnet_params)
    
        # The validation fold is the only eval set, so its logloss is the
        # metric reported as "val_logloss" in the training history.
        model.fit(X_train=X_train,
              y_train=y_train,
              eval_set=[(X_val, y_val)],
              eval_name = ["val"],
              eval_metric = ['logloss'],
              max_epochs=MAX_EPOCH,
              patience=20, batch_size=256, virtual_batch_size=128,
              num_workers=1)
    
        preds_val = model.predict_proba(X_val)
        score = np.min(model.history["val_logloss"])
        oof_preds[val_idx,:] = preds_val
        scores.append(score)

        # preds on test
        preds_test = model.predict_proba(te)
        test_cv_preds.append(preds_test)
        
    test_preds_all = np.stack(test_cv_preds)
    return oof_preds, test_preds_all

seeds = [0,1,2,3,4]
n_seeds = len(seeds)

# Running averages over seeds of the out-of-fold and test class probabilities.
target_oof = np.zeros([len(fn_train), class_num])
target_pred = np.zeros([len(fn_test), class_num])

for seed_ in seeds:
    oof_preds, test_preds_all = modelling_tabnet(
        fn_train, fn_multiclass_targets, fn_test, seed_, class_num
    )
    # Test predictions are first averaged over the folds of this seed.
    fold_mean_pred = test_preds_all.mean(axis=0)
    target_oof += oof_preds / n_seeds
    target_pred += fold_mean_pred / n_seeds

FOLDS :  1
Device used : cuda
epoch 0  | loss: 4.11951 | val_logloss: 3.61906 |  0:00:02s
epoch 10 | loss: 2.78922 | val_logloss: 3.1474  |  0:00:18s
epoch 20 | loss: 2.10873 | val_logloss: 3.61048 |  0:00:34s

Early stopping occured at epoch 29 with best_epoch = 9 and best_val_logloss = 3.12504
Best weights from best epoch are automatically used!
FOLDS :  2
Device used : cuda
epoch 0  | loss: 4.11944 | val_logloss: 3.69822 |  0:00:01s
epoch 10 | loss: 2.85335 | val_logloss: 3.03776 |  0:00:17s
epoch 20 | loss: 2.24707 | val_logloss: 3.44122 |  0:00:33s
epoch 30 | loss: 1.49847 | val_logloss: 4.64161 |  0:00:49s

Early stopping occured at epoch 30 with best_epoch = 10 and best_val_logloss = 3.03776
Best weights from best epoch are automatically used!
FOLDS :  3
Device used : cuda
epoch 0  | loss: 4.11078 | val_logloss: 3.63913 |  0:00:01s
epoch 10 | loss: 2.85814 | val_logloss: 3.15428 |  0:00:18s
epoch 20 | loss: 2.2583  | val_logloss: 3.62336 |  0:00:33s

Early stopping occured at ep

In [13]:
# Turn class probabilities into per-target probabilities: each class row of
# classid_target lists the targets that class switches on.
# Only the original training rows are kept for the out-of-fold matrix; the
# augmented rows were appended after them. The count comes from
# cons_train_index instead of a hard-coded row number.
n_original_rows = len(cons_train_index)
target_oof = np.dot(target_oof, classid_target)[:n_original_rows, :]
target_pred = np.dot(target_pred, classid_target)

In [14]:
# Clipping bounds applied to predicted probabilities.
p_min = 0.001
p_max = 1 - p_min

t = pd.read_csv(DATA_DIR + 'train_targets_scored.csv')
train_checkscore = t.copy()
# The target columns are read as integers. Cast them to float before storing
# probabilities so the assignment below does not rely on an implicit dtype
# upcast, which recent pandas versions deprecate.
train_checkscore[target_feats] = train_checkscore[target_feats].astype(float)
train_checkscore.loc[train_checkscore.index.isin(cons_train_index),target_feats] = target_oof
# Rows that were excluded from training are scored as all-zero predictions.
train_checkscore.loc[train_checkscore.index.isin(noncons_train_index),target_feats] = 0

t = t.drop(columns="sig_id")
oof_matrix = np.array(train_checkscore.iloc[:,1:])
# First line: raw predictions; second line: predictions clipped to [p_min, p_max].
print('OOF log loss: ', log_loss(np.ravel(t), np.ravel(oof_matrix)))
print('OOF log loss: ', log_loss(np.ravel(t), np.ravel(np.clip(oof_matrix, p_min, p_max))))

OOF log loss:  0.01492488092801164
OOF log loss:  0.015258924248195863


In [15]:
# Save the out-of-fold prediction table (sig_id plus one column per target)
# without the row index.
train_checkscore.to_csv("tab_newval_oof.csv", index=False)

In [16]:
# Clip the test probabilities into [p_min, p_max] and store them per target.
clipped_pred = np.clip(target_pred, p_min, p_max)
sub[target_feats] = clipped_pred

# Rows listed in noncons_test_index are overwritten with 0 after clipping.
sub.loc[noncons_test_index, target_feats] = 0

sub.to_csv('submission.csv', index=False)