- credit: https://www.kaggle.com/optimo/tabnetregressor-2-0-train-infer/data
- https://github.com/dreamquark-ai/tabnet
- change from the original kernel: the RobustScaler step is disabled, so features are fed to TabNet unscaled

In [1]:
# Offline install: `--no-index` keeps pip away from PyPI, so the pytorch-tabnet 2.0.0
# wheel is taken from the dataset mounted under /kaggle/input/pytorchtabnet.
!pip install --no-index --find-links /kaggle/input/pytorchtabnet/pytorch_tabnet-2.0.0-py3-none-any.whl pytorch-tabnet

Looking in links: /kaggle/input/pytorchtabnet/pytorch_tabnet-2.0.0-py3-none-any.whl
Processing /kaggle/input/pytorchtabnet/pytorch_tabnet-2.0.0-py3-none-any.whl
Installing collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-2.0.0


In [2]:
import os
import sys
import random
import warnings
import numpy as np
import pandas as pd 
from sklearn import preprocessing
from sklearn.metrics import log_loss,roc_auc_score
from sklearn.decomposition import PCA
from tqdm import tqdm_notebook as tqdm
from sklearn.cluster import KMeans
from sklearn.multioutput import MultiOutputClassifier
from sklearn.feature_selection import VarianceThreshold

from pytorch_tabnet.metrics import Metric
from pytorch_tabnet.tab_model import TabNetRegressor

sys.path.append('../input/multilabelstraifier/')
from ml_stratifiers import MultilabelStratifiedKFold
warnings.filterwarnings('ignore')

import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import tensorflow as tf

In [3]:
# Directory holding the competition CSVs; every file below is read from here.
DATA_DIR = '/kaggle/input/lish-moa/'

# Feature tables for the training and test samples.
train = pd.read_csv(f'{DATA_DIR}train_features.csv')
test = pd.read_csv(f'{DATA_DIR}test_features.csv')

# Label tables: the scored targets and the auxiliary non-scored targets.
targets = pd.read_csv(f'{DATA_DIR}train_targets_scored.csv')
non_targets = pd.read_csv(f'{DATA_DIR}train_targets_nonscored.csv')

# Template whose prediction columns are filled in at the end of the notebook.
sub = pd.read_csv(f'{DATA_DIR}sample_submission.csv')

In [4]:
# Column groups: every scored-target column, plus the "g-" and "c-" feature columns.
target_feats = [col for col in targets.columns if col != "sig_id"]
g_feats = [col for col in train.columns if "g-" in col]
c_feats = [col for col in train.columns if "c-" in col]

# Row labels of the control-vehicle rows ("noncons") and of all remaining rows ("cons"),
# computed separately for the training and the test features.
noncons_train_index = train.index[(train.cp_type == "ctl_vehicle").to_numpy()]
cons_train_index = train.index[~train.index.isin(noncons_train_index)]
noncons_test_index = test.index[(test.cp_type == "ctl_vehicle").to_numpy()]
cons_test_index = test.index[(test.cp_type != "ctl_vehicle").to_numpy()]

# preprocess

In [5]:
# Keep only the non-control rows in the features and in both label tables. All three frames
# are filtered by row label, so this relies on them sharing one row order.
# NOTE: the cell overwrites its own inputs and is therefore not idempotent -- re-running it
# without reloading the CSVs would filter the already re-numbered index a second time.
train = train[train.index.isin(cons_train_index)].copy().reset_index(drop=True)
targets = targets[targets.index.isin(cons_train_index)].copy().reset_index(drop=True)
non_targets = non_targets[non_targets.index.isin(cons_train_index)].copy().reset_index(drop=True)

# Column sum of every non-scored target, largest first. `.sum(axis=0)` is spelled out because
# `np.sum(frame)` forwards axis=None, which pandas is redefining from "per column" to
# "reduce the whole frame to one scalar".
non_target_feats = [col for col in non_targets.columns if col != "sig_id"]
nontarget_dists = (
    non_targets[non_target_feats]
    .sum(axis=0)
    .rename_axis("target")
    .reset_index(name="number")
    .sort_values("number", ascending=False)
    .reset_index(drop=True)
)

# Non-scored targets whose column sums to zero on the remaining rows are dropped.
drop_list1 = list(nontarget_dists.loc[nontarget_dists.number == 0, "target"].values)
print("first drop", len(drop_list1))
non_targets = non_targets.drop(columns=drop_list1)
print("shape after 1st drop:", non_targets.shape)
non_target_feats = [col for col in non_targets.columns if col != "sig_id"]
print(len(non_target_feats))

first drop 71
shape after 1st drop: (21948, 332)
331


# feature engineering

In [6]:
# Number of k-means clusters for the "g-" and the "c-" feature groups respectively
# (only consumed by the create_cluster calls, which are currently commented out).
n_clusters_g, n_clusters_c = 15, 5
def create_cluster(train, test, kind, n_clusters):
    """Append one-hot k-means cluster membership columns to ``train`` and ``test``.

    KMeans (``random_state=0``) is fitted on the selected feature group of ``train`` only;
    ``test`` rows are assigned to the nearest fitted centroid.

    Parameters
    ----------
    train, test : pd.DataFrame
        Frames containing the columns listed in ``g_feats`` / ``c_feats``. They are not
        modified; new frames are returned.
    kind : str
        ``"g"`` clusters on ``g_feats``; any other value clusters on ``c_feats``. The value is
        also used in the generated column names (``clusters_<kind>_<label>``).
    n_clusters : int
        Number of k-means clusters.

    Returns
    -------
    (pd.DataFrame, pd.DataFrame)
        ``train`` and ``test`` with ``n_clusters`` extra indicator columns each.
    """
    feature_cols = g_feats if kind == "g" else c_feats
    label_col = f'clusters_{kind}'
    kmeans = KMeans(n_clusters=n_clusters, random_state=0).fit(train[feature_cols])

    # A categorical with the full label range makes get_dummies emit the same indicator
    # columns for both frames, even when some cluster never occurs in `test`.
    all_labels = list(range(n_clusters))
    train_labels = pd.Categorical(kmeans.labels_, categories=all_labels)
    test_labels = pd.Categorical(kmeans.predict(test[feature_cols]), categories=all_labels)

    # `assign` works on a copy, so the caller's frames do not gain a stray label column.
    train = pd.get_dummies(train.assign(**{label_col: train_labels}), columns=[label_col])
    test = pd.get_dummies(test.assign(**{label_col: test_labels}), columns=[label_col])
    return train, test
    
#train, test = create_cluster(train, test, kind = 'g', n_clusters = n_clusters_g)
#train, test = create_cluster(train, test, kind = 'c', n_clusters = n_clusters_c)

In [7]:
#num = 10
#pca_c_cols = ["pca-c"+str(i+1) for i in range(num)]
#pca = PCA(n_components=num,random_state=42)
#c_train = pca.fit_transform(train[c_feats])
#c_test = pca.transform(test[c_feats])
#c_train = pd.DataFrame(c_train, columns=pca_c_cols)
#c_test = pd.DataFrame(c_test, columns=pca_c_cols)

#num = 30
#pca_g_cols = ["pca-g"+str(i+1) for i in range(num)]
#pca = PCA(n_components=num, random_state=42)
#g_train = pca.fit_transform(train[g_feats])
#g_test = pca.transform(test[g_feats])
#g_train = pd.DataFrame(g_train, columns=pca_g_cols)
#g_test = pd.DataFrame(g_test, columns=pca_g_cols)

#train = pd.concat([train, c_train],axis=1)
#test = pd.concat([test, c_test],axis=1)
#train = pd.concat([train, g_train],axis=1)
#test = pd.concat([test, g_test],axis=1)

In [8]:
def fe(df):
    """Return a model-ready copy of a feature frame.

    ``cp_dose`` is encoded as an integer (``'D1'`` -> 0, ``'D2'`` -> 1) and the ``cp_type`` and
    ``sig_id`` columns are removed. The input frame is left untouched.

    Parameters
    ----------
    df : pd.DataFrame
        Frame containing at least ``cp_dose``, ``cp_type`` and ``sig_id``.

    Returns
    -------
    pd.DataFrame
        New frame without ``cp_type`` / ``sig_id`` and with a numeric ``cp_dose``. Dose values
        other than ``'D1'`` / ``'D2'`` become NaN.
    """
    tmp = df.drop(columns=["cp_type", "sig_id"])
    # Plain column assignment replaces the column and therefore its dtype. Writing through
    # `.loc[:, 'cp_dose']` sets values in place on recent pandas, which leaves the column as
    # `object` and turns the later `to_numpy()` result into an object array.
    tmp["cp_dose"] = tmp["cp_dose"].map({'D1': 0, 'D2': 1})
    return tmp

# Apply the same column encoding/dropping to both splits and show the resulting shapes.
train, test = fe(train), fe(test)

print(train.shape, test.shape)

(21948, 874) (3982, 874)


In [9]:
# NumPy versions of the model inputs, taken from copies of the feature frames.
fn_train, fn_test = (frame.copy().to_numpy() for frame in (train, test))

# Scaling is left disabled; the lines are kept for reference.
#ss = preprocessing.RobustScaler()
#fn_train= ss.fit_transform(fn_train)
#fn_test = ss.transform(fn_test)

# Label matrices without the sig_id column.
fn_non_targets = non_targets.drop(columns="sig_id").copy().to_numpy()
fn_targets = targets.drop(columns="sig_id").copy().to_numpy()

# modelling

In [10]:
class LogitsLogLoss(Metric):
    """Mean binary log loss computed from raw (pre-sigmoid) model outputs.

    Registered under the name ``"logits_ll"``, which is the name passed in ``eval_metric``
    when fitting the model.
    """

    def __init__(self):
        self._maximize = False  # lower is better
        self._name = "logits_ll"

    def __call__(self, y_true, y_pred):
        """
        Compute the log loss of raw scores against binary targets.

        Parameters
        ----------
        y_true: np.ndarray
            Target matrix or vector
        y_pred: np.ndarray
            Raw score (logit) matrix or vector; the sigmoid is applied here.

        Returns
        -------
            float
            Mean log loss over all elements.
        """
        eps = 1e-15  # keeps both logarithms finite when a probability hits 0 or 1
        probs = 1 / (1 + np.exp(-y_pred))
        negative_term = (1 - y_true) * np.log(1 - probs + eps)
        positive_term = y_true * np.log(probs + eps)
        return np.mean(-(negative_term + positive_term))

In [11]:
# Upper bound on training epochs per fold, passed to `model.fit(max_epochs=...)`.
MAX_EPOCH = 200

def seed_everything(seed_value):
    """Seed the Python, NumPy and PyTorch random generators with one value.

    Also exports ``PYTHONHASHSEED`` and, when CUDA is available, seeds every CUDA device and
    switches cuDNN to deterministic, non-benchmarking mode.

    Parameters
    ----------
    seed_value : int
        Seed shared by all generators.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed_value)
    os.environ['PYTHONHASHSEED'] = str(seed_value)

    # Nothing GPU-related to configure on a CPU-only machine.
    if not torch.cuda.is_available():
        return

    torch.cuda.manual_seed(seed_value)
    torch.cuda.manual_seed_all(seed_value)
    cudnn = torch.backends.cudnn
    cudnn.deterministic = True
    cudnn.benchmark = False

def _sigmoid(raw_scores):
    """Map raw model outputs (logits) to probabilities element-wise."""
    return 1 / (1 + np.exp(-raw_scores))


def modelling_tabnet(tr, target, te, sample_seed):
    """Train one TabNet model per CV fold and collect out-of-fold and test probabilities.

    Parameters
    ----------
    tr : np.ndarray
        Training feature matrix, one row per sample.
    target : np.ndarray
        Binary label matrix with one row per row of ``tr``.
    te : np.ndarray
        Test feature matrix.
    sample_seed : int
        Seed for the global RNGs and for TabNet. The fold split itself always uses
        ``random_state=0``, so every seed is evaluated on the same folds.

    Returns
    -------
    oof_preds : np.ndarray
        Array of shape ``(len(tr), target.shape[1])``; each row holds the probabilities
        predicted by the fold model that did not train on it.
    test_preds_all : np.ndarray
        Per-fold test probabilities stacked along a new leading axis (one entry per fold).
    """
    seed_everything(sample_seed)
    tabnet_params = dict(n_d=12, n_a=12, n_steps=1, gamma=1.3, seed=sample_seed,
                         lambda_sparse=0, optimizer_fn=torch.optim.Adam,
                         optimizer_params=dict(lr=2e-2, weight_decay=1e-5),
                         mask_type='entmax',
                         scheduler_params=dict(mode="min",
                                               patience=5,
                                               min_lr=1e-5,
                                               factor=0.9,),
                         scheduler_fn=torch.optim.lr_scheduler.ReduceLROnPlateau,
                         verbose=10,
                         )

    NB_SPLITS = 5
    mskf = MultilabelStratifiedKFold(n_splits=NB_SPLITS, random_state=0, shuffle=True)
    oof_preds = np.zeros([len(tr), target.shape[1]])
    test_cv_preds = []
    scores = []

    # Split on the `tr` argument rather than on the notebook-level `train` frame, so the
    # function depends only on what it is given.
    for fold_nb, (train_idx, val_idx) in enumerate(mskf.split(tr, target)):
        print("FOLDS : ", fold_nb+1)

        ## model
        X_train, y_train = tr[train_idx, :], target[train_idx, :]
        X_val, y_val = tr[val_idx, :], target[val_idx, :]
        model = TabNetRegressor(**tabnet_params)

        model.fit(X_train=X_train,
                  y_train=y_train,
                  eval_set=[(X_val, y_val)],
                  eval_name=["val"],
                  eval_metric=["logits_ll"],
                  max_epochs=MAX_EPOCH,
                  patience=20, batch_size=1024, virtual_batch_size=128,
                  num_workers=1, drop_last=False,
                  # use binary cross entropy as this is not a regression problem
                  loss_fn=torch.nn.functional.binary_cross_entropy_with_logits)

        # The network outputs logits (it is trained with BCE-with-logits), so every
        # prediction goes through the sigmoid before it is stored.
        oof_preds[val_idx, :] = _sigmoid(model.predict(X_val))
        test_cv_preds.append(_sigmoid(model.predict(te)))

        # Lowest validation log loss recorded for this fold.
        scores.append(np.min(model.history["val_logits_ll"]))

    print("best val_logits_ll per fold:", np.round(scores, 5), "| mean:", np.mean(scores))

    test_preds_all = np.stack(test_cv_preds)
    return oof_preds, test_preds_all

# Seed ensemble: every seed runs the full cross-validation; the out-of-fold and the
# fold-averaged test probabilities are then averaged with equal weight over the seeds.
seeds = [0,1]

n_outputs = fn_targets.shape[1]
target_oof = np.zeros((len(fn_train), n_outputs))
target_pred = np.zeros((len(fn_test), n_outputs))

for seed_ in seeds:
    oof_preds, test_preds_all = modelling_tabnet(fn_train, fn_targets, fn_test, seed_)
    # Accumulate this seed's share of the final average.
    target_oof += oof_preds / len(seeds)
    target_pred += np.mean(test_preds_all, axis=0) / len(seeds)

FOLDS :  1
Device used : cuda
epoch 0  | loss: 0.56891 | val_logits_ll: 0.30189 |  0:00:01s
epoch 10 | loss: 0.02044 | val_logits_ll: 0.02035 |  0:00:11s
epoch 20 | loss: 0.01882 | val_logits_ll: 0.01888 |  0:00:21s
epoch 30 | loss: 0.01766 | val_logits_ll: 0.0192  |  0:00:31s
epoch 40 | loss: 0.01701 | val_logits_ll: 0.01758 |  0:00:41s
epoch 50 | loss: 0.01697 | val_logits_ll: 0.01728 |  0:00:52s
epoch 60 | loss: 0.01655 | val_logits_ll: 0.01714 |  0:01:02s
epoch 70 | loss: 0.01654 | val_logits_ll: 0.01709 |  0:01:11s
epoch 80 | loss: 0.01625 | val_logits_ll: 0.01701 |  0:01:21s
epoch 90 | loss: 0.01592 | val_logits_ll: 0.01686 |  0:01:31s
epoch 100| loss: 0.0161  | val_logits_ll: 0.01735 |  0:01:41s
epoch 110| loss: 0.01536 | val_logits_ll: 0.01688 |  0:01:52s
epoch 120| loss: 0.01541 | val_logits_ll: 0.01687 |  0:02:02s

Early stopping occured at epoch 122 with best_epoch = 102 and best_val_logits_ll = 0.01667
Best weights from best epoch are automatically used!
FOLDS :  2
Device u

In [12]:
# Score the out-of-fold predictions on the FULL training set: the label file is reloaded so
# the control-vehicle rows removed before training are part of the evaluation again.
t = pd.read_csv(DATA_DIR + 'train_targets_scored.csv')

# The label columns are integers; cast the prediction frame to float up front so that writing
# probabilities into it is not an incompatible-dtype assignment.
train_checkscore = t.astype(dict.fromkeys(target_feats, np.float64))
train_checkscore.loc[train_checkscore.index.isin(cons_train_index), target_feats] = target_oof
# Control-vehicle rows were never modelled; they are scored with an all-zero prediction.
train_checkscore.loc[train_checkscore.index.isin(noncons_train_index), target_feats] = 0.0

t = t.drop(columns="sig_id")
print('OOF log loss: ', log_loss(np.ravel(t), np.ravel(train_checkscore[target_feats].to_numpy())))

OOF log loss:  0.015306827882797481


In [13]:
# Write the seed- and fold-averaged test probabilities into the sample-submission frame.
# assumes `sub` rows are in the same order as the rows of `test` -- TODO confirm
sub[target_feats] = target_pred
# Rows whose test cp_type is "ctl_vehicle" are forced to all-zero predictions.
sub.loc[noncons_test_index,target_feats] = 0
sub.to_csv('submission.csv', index=False)