- First Keras model with label smoothing and new feature engineering

In [1]:
import os
import sys
import random
import warnings
import numpy as np
import pandas as pd 
from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.decomposition import PCA
from tqdm import tqdm_notebook as tqdm
from sklearn.multioutput import MultiOutputClassifier
from sklearn.feature_selection import VarianceThreshold
import tensorflow_addons as tfa

sys.path.append('../input/multilabelstraifier/')
from ml_stratifiers import MultilabelStratifiedKFold
warnings.filterwarnings('ignore')

from tensorflow.keras import layers,regularizers,Sequential,backend,callbacks,optimizers,metrics,losses
import tensorflow as tf

 The versions of TensorFlow you are currently using is 2.3.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


In [2]:
# Folder holding the competition CSV files; every table below is read from it.
DATA_DIR = '/kaggle/input/lish-moa/'

# Read the five tables in one pass; the names on the left pair up
# positionally with the file names on the right.
train, targets, non_targets, test, sub = (
    pd.read_csv(DATA_DIR + csv_name)
    for csv_name in (
        'train_features.csv',
        'train_targets_scored.csv',
        'train_targets_nonscored.csv',
        'test_features.csv',
        'sample_submission.csv',
    )
)

In [3]:
# Scored label columns: every column of `targets` except the row identifier.
target_feats = [col for col in targets.columns if col != "sig_id"]
# Feature columns of `train` whose name contains "g-" and "c-" respectively.
g_feats = [col for col in train.columns if "g-" in col]
c_feats = [col for col in train.columns if "c-" in col]

In [4]:
# Row labels of control-vehicle samples ("noncons") and of all other samples
# ("cons"), computed separately for the train and test feature tables.
noncons_train_index = train.loc[train["cp_type"] == "ctl_vehicle"].index
cons_train_index = train.loc[train["cp_type"] != "ctl_vehicle"].index
noncons_test_index = test.loc[test["cp_type"] == "ctl_vehicle"].index
cons_test_index = test.loc[test["cp_type"] != "ctl_vehicle"].index

# preprocess

In [5]:
# Keep only the non-control rows in all three training tables and renumber
# them from 0. The same row labels are applied to every table.
# NOTE(review): this relies on the three tables sharing row order — confirm.
train, targets, non_targets = (
    frame[frame.index.isin(cons_train_index)].copy().reset_index(drop=True)
    for frame in (train, targets, non_targets)
)

In [6]:
# Non-scored label columns: every column of `non_targets` except the identifier.
non_target_feats = [col for col in non_targets.columns if col != "sig_id"]

# Count the positives per non-scored target, most frequent first.
# The column-wise sum is requested explicitly (axis=0) instead of calling
# np.sum on the DataFrame, which forwards axis=None to pandas and therefore
# depends on how the installed pandas version interprets that.
nontarget_dists = (
    non_targets[non_target_feats]
    .sum(axis=0)
    .rename_axis("target")
    .reset_index(name="number")
    .sort_values("number", ascending=False)
    .reset_index(drop=True)
)

In [7]:
# Non-scored targets that have no positive row at all in the remaining data.
drop_list1 = nontarget_dists.loc[nontarget_dists["number"] == 0, "target"].tolist()
print("first drop", len(drop_list1))

# Remove those all-zero columns from the non-scored label table.
non_targets = non_targets.drop(columns=drop_list1)
print("shape after 1st drop:", non_targets.shape)

first drop 71
shape after 1st drop: (21948, 332)


In [8]:
def fe(df, g_cols=None, c_cols=None):
    """Build the model feature table from a raw feature DataFrame.

    The input frame is left untouched. The returned frame:

    * no longer contains the ``cp_type`` and ``sig_id`` columns,
    * has ``cp_dose`` encoded numerically (``'D1'`` -> 0, ``'D2'`` -> 1;
      any other value becomes NaN),
    * gains 15 row-wise summary columns appended at the end: sum, mean, std,
      kurtosis and skew over the "g" columns, over the "c" columns, and over
      both groups together (``g_*``, ``c_*``, ``gc_*``).

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``cp_type``, ``sig_id``, ``cp_dose`` and every column
        named in ``g_cols`` / ``c_cols``.
    g_cols, c_cols : list of str, optional
        Column names of the two feature groups. When omitted, the
        module-level ``g_feats`` / ``c_feats`` lists are used, which is what
        the original one-argument call did.

    Returns
    -------
    pandas.DataFrame
        A new frame with the columns described above.
    """
    g_cols = list(g_feats if g_cols is None else g_cols)
    c_cols = list(c_feats if c_cols is None else c_cols)

    # (output suffix, DataFrame reduction method) in output-column order.
    reducers = (
        ('sum', 'sum'),
        ('mean', 'mean'),
        ('std', 'std'),
        ('kurt', 'kurtosis'),
        ('skew', 'skew'),
    )
    # The combined group keeps the original "c columns first" ordering.
    groups = (
        ('g', g_cols),
        ('c', c_cols),
        ('gc', c_cols + g_cols),
    )

    stats = {}
    for prefix, cols in groups:
        block = df[cols]
        for suffix, method in reducers:
            stats[prefix + '_' + suffix] = getattr(block, method)(axis=1)

    # cp_type is dropped anyway, so it is not encoded first.
    base = df.drop(["cp_type", "sig_id"], axis=1)
    # Replacing the whole column (rather than writing through .loc[:, col])
    # gives cp_dose a numeric dtype on every pandas version.
    return base.assign(cp_dose=base['cp_dose'].map({'D1': 0, 'D2': 1}), **stats)

# Run the same feature engineering on both splits so their columns line up.
f_train, f_test = fe(train), fe(test)

print(f_train.shape, f_test.shape)

(21948, 889) (3982, 889)


In [9]:
# Fit the robust scaler on the training features only, then apply the fitted
# scaler to the test features.
ss = preprocessing.RobustScaler()
fn_train = ss.fit_transform(f_train.copy().to_numpy())
fn_test = ss.transform(f_test.copy().to_numpy())

# Label tables as plain arrays, with the identifier column removed.
fn_targets = targets.drop(columns="sig_id").copy().to_numpy()
fn_nontargets = non_targets.drop(columns="sig_id").copy().to_numpy()

# modelling

In [10]:
# Lower / upper bounds applied when predictions are clipped
p_min, p_max = 0.001, 0.999

# Number of cross-validation folds
n_folds = 5

def mean_log_loss(y_true, y_pred):
    """Return the binary log loss averaged over all label columns.

    The number of columns is taken from the arrays themselves rather than
    from the module-level ``target_feats`` list, so the function also works
    for label matrices with a different number of columns.

    Parameters
    ----------
    y_true : 2-D array indexable as ``y_true[:, i]``
        Ground-truth 0/1 labels, one column per target.
    y_pred : 2-D array with the same shape as ``y_true``
        Predicted probabilities.

    Returns
    -------
    float
        Mean of the per-column ``sklearn.metrics.log_loss`` values.

    Raises
    ------
    ValueError
        If the two arrays do not have the same shape.
    """
    if y_true.shape != y_pred.shape:
        raise ValueError(
            "y_true and y_pred must have the same shape, got {} and {}".format(
                y_true.shape, y_pred.shape
            )
        )

    # labels=[0, 1] keeps log_loss defined for columns containing one class only.
    per_target = [
        log_loss(y_true[:, col], y_pred[:, col].astype(float), labels=[0, 1])
        for col in range(y_true.shape[1])
    ]
    return np.mean(per_target)

# Evaluation Metric with clipping and no label smoothing
def logloss(y_true, y_pred):
    """Binary cross-entropy on predictions clipped to [p_min, p_max].

    Used as a Keras metric: the predictions are clipped first, then the mean
    of the element-wise cross-entropy over all entries is returned.
    """
    clipped = tf.clip_by_value(y_pred, p_min, p_max)
    positive_term = y_true * backend.log(clipped)
    negative_term = (1 - y_true) * backend.log(1 - clipped)
    return -backend.mean(positive_term + negative_term)

def seed_everything(seed=1234):
    """Seed the Python, NumPy and TensorFlow random generators with `seed`.

    Also stores the seed in the PYTHONHASHSEED environment variable.
    """
    # NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so this
    # assignment can only influence processes launched afterwards.
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed, tf.random.set_seed):
        seeder(seed)
    
def create_model(shape, n_outputs=206):
    """Build the (uncompiled) feed-forward network.

    Architecture: BatchNorm -> Dropout(0.2) -> Dense(2048, relu)
    -> BatchNorm -> Dropout(0.6) -> Dense(1048, relu) -> BatchNorm
    -> Dropout(0.6) -> Dense(n_outputs, sigmoid). Every Dense layer is
    wrapped in ``tfa.layers.WeightNormalization``.

    Parameters
    ----------
    shape : int or tuple/list of int
        Number of input features, or a ready-made input shape.
    n_outputs : int, default 206
        Number of sigmoid output units (one per label column). The default
        reproduces the previously hard-coded output width.

    Returns
    -------
    tf.keras.Model
    """
    # Keras expects a shape tuple; accept a bare feature count as well.
    input_shape = tuple(shape) if isinstance(shape, (tuple, list)) else (shape,)

    inp = tf.keras.layers.Input(shape=input_shape)
    x = tf.keras.layers.BatchNormalization()(inp)
    x = tf.keras.layers.Dropout(0.2)(x)
    x = tfa.layers.WeightNormalization(tf.keras.layers.Dense(2048, activation='relu'))(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Dropout(0.6)(x)
    # NOTE(review): 1048 may be a typo for 1024; kept to preserve the trained results.
    x = tfa.layers.WeightNormalization(tf.keras.layers.Dense(1048, activation='relu'))(x)
    x = tf.keras.layers.BatchNormalization()(x)
    x = tf.keras.layers.Dropout(0.6)(x)
    out = tfa.layers.WeightNormalization(tf.keras.layers.Dense(n_outputs, activation='sigmoid'))(x)
    return tf.keras.models.Model(inputs=inp, outputs=out)


def modelling_keras(X_train, y_train, X_test, input_features, output_features, sample_seed):
    """Train the network with stratified K-fold CV and predict the test set.

    For each of ``n_folds`` folds a fresh model is trained for 40 epochs, the
    checkpoint with the best ``val_logloss`` is reloaded, and that model
    predicts both the held-out fold and ``X_test``.

    Parameters
    ----------
    X_train : 2-D numpy array
        Training features.
    y_train : 2-D numpy array
        Training labels, one column per target.
    X_test : 2-D numpy array
        Test features.
    input_features : int
        Number of input features given to the model.
    output_features : int
        Number of output units of the model; expected to equal
        ``y_train.shape[1]``.
    sample_seed : int
        Seed passed to ``seed_everything`` before training. The fold split
        itself uses a fixed ``random_state=2``, so folds are identical for
        every seed.

    Returns
    -------
    (oof, pred_value) : tuple of numpy arrays
        ``oof`` holds the out-of-fold predictions for every training row;
        ``pred_value`` is the test prediction averaged over the folds.
    """
    seed_everything(seed=sample_seed)
    oof = np.zeros([len(X_train), y_train.shape[1]])
    pred_value = np.zeros([X_test.shape[0], y_train.shape[1]])

    scores = []
    mskf = MultilabelStratifiedKFold(n_splits=n_folds, shuffle=True, random_state=2)
    for i, (train_index, val_index) in enumerate(mskf.split(X_train, y_train)):
        print("Fold "+str(i+1))
        X_train2 = X_train[train_index, :]
        y_train2 = y_train[train_index, :]

        X_test2 = X_train[val_index, :]
        y_test2 = y_train[val_index, :]

        # The output width follows the caller-supplied label count.
        model = create_model(input_features, n_outputs=output_features)
        model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
                      loss=losses.BinaryCrossentropy(label_smoothing=0.001), metrics=[logloss])
        reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(monitor='val_logloss', factor=0.1, patience=3, mode='min', min_lr=1E-5)
        #early_stopping = tf.keras.callbacks.EarlyStopping(monitor='val_logloss', min_delta=1E-5,
        #                                                  patience=10, mode='min',restore_best_weights=True)
        save_best = tf.keras.callbacks.ModelCheckpoint('./nn_model.h5', save_best_only=True, monitor="val_logloss", verbose=1)

        model.fit(X_train2, y_train2, validation_data=(X_test2, y_test2), batch_size=128,
                  epochs=40, callbacks=[reduce_lr, save_best])

        # Score and predict with the best checkpoint, not the last epoch.
        model.load_weights('./nn_model.h5')
        valid = np.array(model.predict(X_test2))
        oof[val_index, :] = valid
        pred_value += model.predict(X_test) / n_folds

        # Compute the fold score once and reuse it for the log line and the summary.
        fold_score = mean_log_loss(y_test2, valid)
        print("Fold {} log loss: {}".format(i+1, fold_score))
        scores.append(fold_score)

    # Report the seed this call actually used (the parameter, not a global).
    print("Seed {}".format(sample_seed))
    for i, fold_score in enumerate(scores):
        print("Fold {} log loss: {}".format(i+1, fold_score))
    print("Std of log loss: {}".format(np.std(scores)))

    print("Total log loss: {}".format(mean_log_loss(y_train, oof)))

    return oof, pred_value

# non-targets, targets separate prediction

In [11]:
# One full cross-validated training run per seed; results are averaged.
seeds = [0,1,2]

# Accumulators for the seed-averaged out-of-fold and test predictions.
target_oof = np.zeros((fn_train.shape[0], fn_targets.shape[1]))
target_pred = np.zeros((fn_test.shape[0], fn_targets.shape[1]))

nontarget_oof = np.zeros((fn_train.shape[0], fn_nontargets.shape[1]))
nontarget_pred = np.zeros((fn_test.shape[0], fn_nontargets.shape[1]))

for seed_ in seeds:
    oof, keras_pred = modelling_keras(
        X_train=fn_train,
        y_train=fn_targets,
        X_test=fn_test,
        input_features=fn_train.shape[1],
        output_features=fn_targets.shape[1],
        sample_seed=seed_,
    )
    # Each seed contributes an equal share to the running average.
    target_oof += oof / len(seeds)
    target_pred += keras_pred / len(seeds)
print("Total log loss in targets: {}".format(mean_log_loss(fn_targets, target_oof)))

# Disabled: the same procedure for the non-scored targets.
#for seed_ in seeds:
#    oof, keras_pred = modelling_keras(fn_train, fn_nontargets, fn_test, fn_train.shape[1], fn_nontargets.shape[1], seed_)
#    nontarget_oof += oof / len(seeds)
#    nontarget_pred += keras_pred / len(seeds)
#print("Total log loss in Non targets: {}".format(mean_log_loss(oof_targets, nontarget_oof)))

Fold 1
Epoch 1/40
Epoch 00001: val_logloss improved from inf to 0.10674, saving model to ./nn_model.h5
Epoch 2/40
Epoch 00002: val_logloss improved from 0.10674 to 0.02940, saving model to ./nn_model.h5
Epoch 3/40
Epoch 00003: val_logloss improved from 0.02940 to 0.02274, saving model to ./nn_model.h5
Epoch 4/40
Epoch 00004: val_logloss improved from 0.02274 to 0.02126, saving model to ./nn_model.h5
Epoch 5/40
Epoch 00005: val_logloss improved from 0.02126 to 0.02008, saving model to ./nn_model.h5
Epoch 6/40
Epoch 00006: val_logloss improved from 0.02008 to 0.01941, saving model to ./nn_model.h5
Epoch 7/40
Epoch 00007: val_logloss improved from 0.01941 to 0.01894, saving model to ./nn_model.h5
Epoch 8/40
Epoch 00008: val_logloss improved from 0.01894 to 0.01866, saving model to ./nn_model.h5
Epoch 9/40
Epoch 00009: val_logloss improved from 0.01866 to 0.01837, saving model to ./nn_model.h5
Epoch 10/40
Epoch 00010: val_logloss improved from 0.01837 to 0.01812, saving model to ./nn_model

# final prediction

In [12]:
#n_train = f_train.copy()
#n_test = f_test.copy()

#n_train["target_sum"] = target_oof.sum(axis=1)
#n_train["nontarget_sum"] = nontarget_oof.sum(axis=1)
#n_test["target_sum"] = target_pred.sum(axis=1)
#n_test.loc[noncons_test_index, "target_sum"] = 0
#n_test["nontarget_sum"] = nontarget_pred.sum(axis=1)
#n_test.loc[noncons_test_index, "nontarget_sum"] = 0

#n_train = n_train.to_numpy()
#n_test = n_test.to_numpy()

In [13]:
#oof_final = np.zeros([len(n_train),fn_targets.shape[1]])
#pred_final = np.zeros([len(n_test),fn_targets.shape[1]])

#seeds = [10,40]
#for seed_ in seeds:
#    oof, keras_pred = modelling_keras(n_train, fn_targets, n_test, n_train.shape[1], fn_targets.shape[1], seed_)
#    oof_final += oof / len(seeds)
#    pred_final += keras_pred / len(seeds)
#print("Total log loss: {}".format(mean_log_loss(fn_targets, oof_final)))

In [14]:
# Re-read the full scored-target table (control rows included) so the
# out-of-fold score is computed over every training row.
t = pd.read_csv(DATA_DIR + 'train_targets_scored.csv')

# Work on a float copy of the label columns: probabilities are written into
# them below, and assigning floats into integer columns relies on an implicit
# upcast that recent pandas versions deprecate.
checkscore = t.astype(dict.fromkeys(target_feats, float))

# Non-control rows receive the clipped out-of-fold predictions ...
checkscore.loc[checkscore.index.isin(cons_train_index), target_feats] = np.clip(target_oof, p_min, p_max)
# ... and control rows are predicted as all zeros.
checkscore.loc[checkscore.index.isin(noncons_train_index), target_feats] = 0

t = t.drop(columns="sig_id")
# Select the prediction columns by name instead of by position.
print('OOF log loss: ', log_loss(t.to_numpy().ravel(), checkscore[target_feats].to_numpy().ravel()))

OOF log loss:  0.015114910729546308


# submission

In [15]:
# Clip the averaged test predictions into [p_min, p_max] (prediction clipping).
sub[target_feats] = target_pred.clip(p_min, p_max)
# Control-vehicle rows are submitted as all zeros.
sub.loc[noncons_test_index, target_feats] = 0
sub.to_csv('submission.csv', index=False)