- disable feature standardization
- add early stopping to avoid overfitting
- change the model architecture
- use different seed values

In [1]:
import os
import sys
import random
import warnings
import numpy as np
import pandas as pd 
from sklearn.metrics import log_loss
from sklearn import preprocessing
from tqdm import tqdm_notebook as tqdm
from sklearn.multioutput import MultiOutputClassifier

sys.path.append('../input/multilabelstraifier/')
sys.path.append('../input/lookahead/')
from ml_stratifiers import MultilabelStratifiedKFold
from lookahead import Lookahead
warnings.filterwarnings('ignore')

import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import tensorflow as tf

In [2]:
# Directory holding the input CSV files; the trailing slash matters because
# file paths are built by plain string concatenation.
DATA_DIR = '/kaggle/input/lish-moa/'

# Read the five input tables in a fixed order and bind them to their names.
train, targets, non_targets, test, sub = (
    pd.read_csv(DATA_DIR + csv_name)
    for csv_name in (
        'train_features.csv',
        'train_targets_scored.csv',
        'train_targets_nonscored.csv',
        'test_features.csv',
        'sample_submission.csv',
    )
)

In [3]:
# Every column of the scored-target table except the row identifier.
target_feats = list(filter(lambda name: name != "sig_id", targets.columns))
# Feature columns of `train` whose name contains "g-" / "c-".
g_feats = list(filter(lambda name: "g-" in name, train.columns))
c_feats = list(filter(lambda name: "c-" in name, train.columns))

In [4]:
# Row labels split by cp_type: "noncons" = rows whose cp_type is "ctl_vehicle",
# "cons" = every other row. Computed for both the train and the test table.
noncons_train_index = train.loc[train["cp_type"].eq("ctl_vehicle")].index
cons_train_index = train.loc[train["cp_type"].ne("ctl_vehicle")].index
noncons_test_index = test.loc[test["cp_type"].eq("ctl_vehicle")].index
cons_test_index = test.loc[test["cp_type"].ne("ctl_vehicle")].index

# preprocess

In [5]:
# Keep only the rows listed in cons_train_index in all three training tables and
# renumber them from zero. The same row labels are applied to each table, so this
# assumes the three tables share their row order -- TODO confirm.
train, targets, non_targets = (
    frame.loc[frame.index.isin(cons_train_index)].copy().reset_index(drop=True)
    for frame in (train, targets, non_targets)
)

In [6]:
# Per-column sums of the non-scored target table (presumably the number of
# positive labels per target), ranked from the largest sum to the smallest.
non_target_feats = [i for i in non_targets.columns if i != "sig_id"]
# Sum explicitly along axis 0: `np.sum(DataFrame)` goes through DataFrame.sum with
# axis=None, whose column-wise behaviour is deprecated in favour of a scalar result.
nontarget_dists = (
    non_targets[non_target_feats]
    .sum(axis=0)
    .rename_axis("target")
    .reset_index(name="number")
    .sort_values("number", ascending=False)
    .reset_index(drop=True)
)

In [7]:
#target_dists = pd.DataFrame(np.sum(targets[target_feats])).reset_index(drop=False)
#target_dists.columns = ["target", "number"]
#target_dists = target_dists.sort_values("number", ascending=False).reset_index(drop=True)

In [8]:
# First pass: remove the non-scored targets whose column sum is zero.
drop_list1 = nontarget_dists.loc[nontarget_dists["number"].eq(0), "target"].tolist()
print("first drop", len(drop_list1))
non_targets.drop(columns=drop_list1, inplace=True)
print("shape after 1st drop:", non_targets.shape)

# Second pass: remove the targets with a sum in (0, 6], except the last one in the
# ranking, which is kept.
# NOTE(review): keeping one extra column looks deliberate so that the remaining
# column count matches the model's output size -- confirm.
drop_list2 = nontarget_dists.loc[
    nontarget_dists["number"].gt(0) & nontarget_dists["number"].le(6),
    "target",
].tolist()[:-1]
print("second drop", len(drop_list2))
non_targets.drop(columns=drop_list2, inplace=True)
print("shape after 2nd drop:", non_targets.shape)

first drop 71
shape after 1st drop: (21948, 332)
second drop 125
shape after 2nd drop: (21948, 207)


# Feature engineering 

In [9]:
def fe(df):
    """Return a model-ready copy of a feature table.

    The ``cp_type`` and ``sig_id`` columns are removed and ``cp_dose`` is
    encoded as an integer (``'D1'`` -> 0, ``'D2'`` -> 1). All other columns are
    passed through unchanged and the input frame is not modified.

    Args:
        df: DataFrame containing at least ``cp_type``, ``sig_id`` and ``cp_dose``.

    Returns:
        A new DataFrame without ``cp_type`` / ``sig_id`` and with numeric ``cp_dose``.

    Raises:
        KeyError: if ``cp_type``, ``sig_id`` or ``cp_dose`` is missing.
    """
    # `cp_type` is dropped, so encoding it first (as the earlier version did)
    # was wasted work.
    tmp = df.drop(columns=["cp_type", "sig_id"])
    # Replace the whole column instead of writing through `.loc[:, ...]`:
    # recent pandas performs that write in place and keeps the object dtype,
    # which would turn the later `to_numpy()` result into an object array.
    return tmp.assign(cp_dose=tmp["cp_dose"].map({'D1': 0, 'D2': 1}))

# Apply the same feature preparation to the train and test tables.
f_train, f_test = (fe(frame) for frame in (train, test))

# Both tables should end up with the same number of columns.
print(f_train.shape, f_test.shape)

(21948, 874) (3982, 874)


# modelling

In [10]:
# Default to the CPU and switch to the GPU only when CUDA is usable.
device = "cpu"
if torch.cuda.is_available():
    device = "cuda"
print(device)
def mean_log_loss(y_true, y_pred):
    """Average the binary log loss over the columns of a label matrix.

    The number of columns is taken from ``y_true`` itself rather than from the
    module-level ``target_feats`` list, so the function gives the right answer
    for any label matrix it is handed (it is called for both the scored and
    the non-scored targets).

    Args:
        y_true: 2-D array of 0/1 labels, one column per target.
        y_pred: 2-D array of predicted probabilities with the same shape.

    Returns:
        Mean of the per-column ``log_loss`` values.
    """
    column_losses = [
        # labels=[0, 1] keeps log_loss defined for columns that contain a single class.
        log_loss(y_true[:, col], y_pred[:, col].astype(float), labels=[0, 1])
        for col in range(y_true.shape[1])
    ]
    return np.mean(column_losses)

def seed_everything(seed=1234):
    """Seed the random number generators used in this notebook.

    Covers Python's ``random``, TensorFlow, NumPy and PyTorch (CPU and every
    CUDA device) and puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    random.seed(seed)
    # Hash randomisation is fixed at interpreter start-up, so this only reaches
    # Python processes launched after this point.
    os.environ['PYTHONHASHSEED'] = str(seed)
    tf.random.set_seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    # Also seed any additional GPUs, not just the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Disable the cuDNN auto-tuner: it may select different kernels from run to run.
    torch.backends.cudnn.benchmark = False

class MoaModel(nn.Module):
    """Fully connected network that maps a feature vector to one logit per target.

    Layout: BatchNorm -> Dropout(0.5) -> Linear(num_columns, 512) -> ReLU ->
    BatchNorm -> Linear(512, 256) -> ReLU -> BatchNorm -> Linear(256, num_targets).
    All linear layers are wrapped in weight normalisation. The output is raw
    logits; callers apply the sigmoid themselves.

    Args:
        num_columns: Number of input features.
        num_targets: Number of output logits. Defaults to 206, the size that
            was previously hard-coded, so existing checkpoints and callers are
            unaffected.
    """

    def __init__(self, num_columns, num_targets=206):
        super(MoaModel, self).__init__()
        self.batch_norm1 = nn.BatchNorm1d(num_columns)
        self.dropout1 = nn.Dropout(0.5)
        self.dense1 = nn.utils.weight_norm(nn.Linear(num_columns, 512))

        self.batch_norm2 = nn.BatchNorm1d(512)
        # dropout2 / dropout3 are constructed but not applied in forward();
        # they hold no parameters, so checkpoints are unaffected either way.
        self.dropout2 = nn.Dropout(0.6)
        self.dense2 = nn.utils.weight_norm(nn.Linear(512, 256))

        self.batch_norm3 = nn.BatchNorm1d(256)
        self.dropout3 = nn.Dropout(0.6)
        self.dense3 = nn.utils.weight_norm(nn.Linear(256, num_targets))

    def forward(self, x):
        """Return the logits for a batch ``x`` of shape (batch, num_columns)."""
        x = self.batch_norm1(x)
        x = self.dropout1(x)
        x = F.relu(self.dense1(x))

        # Hidden-layer dropout is intentionally disabled.
        x = self.batch_norm2(x)
        x = F.relu(self.dense2(x))

        x = self.batch_norm3(x)
        x = self.dense3(x)

        return x

cuda


# train by non-targets

In [11]:
# Settings for the first training stage (read as globals by first_learning):
# mini-batch size, number of epochs per fold, number of cross-validation folds.
batch_size, train_epochs, n_folds = 128, 20, 5

def first_learning(tr, target, sample_seed, init_num):
    """Train one ``MoaModel`` per cross-validation fold and checkpoint its best epoch.

    Folds come from ``MultilabelStratifiedKFold`` with ``n_folds`` splits and a
    fixed ``random_state=2``; ``sample_seed`` is applied through
    ``seed_everything``. Every fold trains for the full ``train_epochs`` epochs
    (no early stopping) and the weights of the epoch with the lowest validation
    loss are saved to ``parameters<fold>.pt`` in the working directory,
    overwriting any earlier file of that name. Out-of-fold scores are printed.

    Reads the module-level ``batch_size``, ``train_epochs``, ``n_folds`` and
    ``device``.

    Args:
        tr: 2-D NumPy feature array, one row per sample.
        target: 2-D NumPy label array whose rows line up with ``tr``.
        sample_seed: Seed handed to ``seed_everything``.
        init_num: Number of input features passed to ``MoaModel``.

    Returns:
        List with the checkpoint file name of each fold, in fold order.
    """
    seed_everything(seed=sample_seed)
    X_train = tr.copy()
    y_train = target.copy()
    n_outputs = y_train.shape[1]

    mskf = MultilabelStratifiedKFold(n_splits=n_folds, shuffle=True, random_state=2)
    files = []

    oof = np.zeros([len(X_train), n_outputs])
    oof_targets = np.zeros([len(X_train), n_outputs])
    scores = []
    for fold, (train_index, valid_index) in enumerate(mskf.split(X_train, y_train)):
        print("Fold "+str(fold+1))
        checkpoint_path = 'parameters'+str(fold+1)+'.pt'

        X_train2 = torch.tensor(X_train[train_index,:], dtype=torch.float32)
        y_train2 = torch.tensor(y_train[train_index], dtype=torch.float32)

        X_valid2 = torch.tensor(X_train[valid_index,:], dtype=torch.float32)
        y_valid2 = torch.tensor(y_train[valid_index], dtype=torch.float32)

        clf = MoaModel(init_num)
        loss_fn = torch.nn.BCEWithLogitsLoss()
        optimizer = optim.Adam(clf.parameters(), lr = 0.001, weight_decay=1e-5)
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3, eps=1e-4, verbose=True)

        train = torch.utils.data.TensorDataset(X_train2, y_train2)
        valid = torch.utils.data.TensorDataset(X_valid2, y_valid2)

        clf.to(device)

        train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
        valid_loader = torch.utils.data.DataLoader(valid, batch_size=batch_size, shuffle=False)

        best_val_loss = np.inf
        for epoch in range(train_epochs):
            start_time = time.time()
            clf.train()
            avg_loss = 0.
            for x_batch, y_batch in tqdm(train_loader, disable=True):
                x_batch = x_batch.to(device)
                y_batch = y_batch.to(device)
                y_pred = clf(x_batch)
                loss = loss_fn(y_pred, y_batch)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                avg_loss += loss.item() / len(train_loader)

            clf.eval()
            avg_val_loss = 0.
            # No gradients are needed for validation; skip building the autograd graph.
            with torch.no_grad():
                for x_batch, y_batch in valid_loader:
                    x_batch = x_batch.to(device)
                    y_batch = y_batch.to(device)
                    y_pred = clf(x_batch)
                    avg_val_loss += loss_fn(y_pred, y_batch).item() / len(valid_loader)

            elapsed_time = time.time() - start_time
            scheduler.step(avg_val_loss)

            # Checkpoint whenever the validation loss improves.
            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                print('Best model: Epoch {} \t loss={:.6f} \t val_loss={:.6f} \t time={:.2f}s'.format(
                    epoch + 1, avg_loss, avg_val_loss, elapsed_time))
                torch.save(clf.state_dict(), checkpoint_path)

        files.append(checkpoint_path)
        # Reload the best epoch into a fresh model. It stays on the CPU, which
        # matches the CPU tensors yielded by valid_loader below.
        pred_model = MoaModel(init_num)
        pred_model.load_state_dict(torch.load(checkpoint_path))
        pred_model.eval()

        # validation check ----------------
        oof_epoch = np.zeros([X_valid2.size(0), n_outputs])
        target_epoch = np.zeros([X_valid2.size(0), n_outputs])
        # Track the write position from the real batch sizes instead of the
        # global batch_size, so the slices always match what the loader yields.
        row = 0
        with torch.no_grad():
            for x_batch, y_batch in valid_loader:
                stop = row + x_batch.size(0)
                oof_epoch[row:stop, :] = pred_model(x_batch).sigmoid().cpu().numpy()
                target_epoch[row:stop, :] = y_batch.cpu().numpy()
                row = stop
        fold_score = mean_log_loss(target_epoch, oof_epoch)
        print("Fold {} log loss: {}".format(fold+1, fold_score))
        scores.append(fold_score)
        oof[valid_index,:] = oof_epoch
        oof_targets[valid_index,:] = target_epoch
        #-----------------------------------

    # Report the seed this call was given (not whatever global happens to be set).
    print("Seed {}".format(sample_seed))
    for fold_number, fold_score in enumerate(scores, start=1):
        print("Fold {} log loss: {}".format(fold_number, fold_score))
    print("Std of log loss: {}".format(np.std(scores)))
    print("Total log loss: {}".format(mean_log_loss(oof_targets, oof)))

    return files

In [12]:
# NumPy versions of the prepared feature tables.
fn_train = f_train.copy().to_numpy()
fn_test = f_test.copy().to_numpy()

# Feature standardization is deliberately switched off:
#ss = preprocessing.StandardScaler()
#fn_train= ss.fit_transform(fn_train)
#fn_test = ss.transform(fn_test)

# Label matrix of the remaining non-scored targets, plus zero-filled buffers
# with one column per non-scored target.
fn_nontargets = non_targets.drop(columns="sig_id").copy().to_numpy()
nontarget_oof = np.zeros((len(fn_train), fn_nontargets.shape[1]))
nontarget_pred = np.zeros((len(fn_test), fn_nontargets.shape[1]))

# First stage: train on the non-scored targets and keep the checkpoint paths.
seeds = [0]
for seed_ in seeds:
    files = first_learning(
        tr=fn_train,
        target=fn_nontargets,
        sample_seed=seed_,
        init_num=fn_train.shape[1],
    )

Fold 1
Best model: Epoch 1 	 loss=0.405414 	 val_loss=0.132971 	 time=1.58s
Best model: Epoch 2 	 loss=0.073508 	 val_loss=0.040709 	 time=0.73s
Best model: Epoch 3 	 loss=0.029345 	 val_loss=0.021798 	 time=0.78s
Best model: Epoch 4 	 loss=0.017509 	 val_loss=0.014571 	 time=0.83s
Best model: Epoch 5 	 loss=0.012868 	 val_loss=0.011435 	 time=0.86s
Best model: Epoch 6 	 loss=0.010624 	 val_loss=0.009840 	 time=0.74s
Best model: Epoch 7 	 loss=0.009393 	 val_loss=0.009003 	 time=0.73s
Best model: Epoch 8 	 loss=0.008652 	 val_loss=0.008428 	 time=0.75s
Best model: Epoch 9 	 loss=0.008212 	 val_loss=0.008118 	 time=0.75s
Best model: Epoch 10 	 loss=0.007879 	 val_loss=0.007713 	 time=0.77s
Best model: Epoch 11 	 loss=0.007641 	 val_loss=0.007572 	 time=0.74s
Best model: Epoch 12 	 loss=0.007486 	 val_loss=0.007406 	 time=0.74s
Best model: Epoch 13 	 loss=0.007330 	 val_loss=0.007303 	 time=0.73s
Best model: Epoch 14 	 loss=0.007223 	 val_loss=0.007294 	 time=0.83s
Best model: Epoch 15 	

# train by targets

In [13]:
# Settings for the second training stage (read as globals by modelling_torch):
# mini-batch size, maximum epochs per fold, number of cross-validation folds.
batch_size, train_epochs, n_folds = 64, 30, 5
# Number of consecutive epochs without a better validation loss before a fold stops.
EARLY_STOPPING_STEPS = 3

def modelling_torch(tr, target, te, sample_seed, init_num, files):
    """Fine-tune per-fold checkpoints on ``target`` and predict the test set.

    Folds come from ``MultilabelStratifiedKFold`` with ``n_folds`` splits and a
    fixed ``random_state=2``; ``sample_seed`` is applied through
    ``seed_everything``. Each fold starts from the weights stored in
    ``files[fold]``, trains for at most ``train_epochs`` epochs and stops once
    the validation loss has failed to improve ``EARLY_STOPPING_STEPS`` times in
    a row. The best epoch is kept in ``best-model-parameters.pt`` (rewritten
    for every fold) and used for the out-of-fold and test predictions. Test
    predictions are averaged over the folds.

    Reads the module-level ``batch_size``, ``train_epochs``, ``n_folds``,
    ``EARLY_STOPPING_STEPS`` and ``device``.

    Args:
        tr: 2-D NumPy training feature array, one row per sample.
        target: 2-D NumPy label array whose rows line up with ``tr``.
        te: 2-D NumPy test feature array.
        sample_seed: Seed handed to ``seed_everything``.
        init_num: Number of input features passed to ``MoaModel``.
        files: Checkpoint paths indexed by fold, used as initial weights.

    Returns:
        Tuple ``(oof, oof_targets, pred_value)``: out-of-fold probabilities, the
        matching labels, and the fold-averaged test probabilities.
    """
    seed_everything(seed=sample_seed)
    X_train = tr.copy()
    y_train = target.copy()
    X_test = te.copy()
    test_len = X_test.shape[0]
    n_outputs = y_train.shape[1]
    best_checkpoint = 'best-model-parameters.pt'

    mskf = MultilabelStratifiedKFold(n_splits=n_folds, shuffle=True, random_state=2)

    X_test = torch.tensor(X_test, dtype=torch.float32)
    X_test = torch.utils.data.TensorDataset(X_test)
    test_loader = torch.utils.data.DataLoader(X_test, batch_size=batch_size, shuffle=False)

    oof = np.zeros([len(X_train), n_outputs])
    oof_targets = np.zeros([len(X_train), n_outputs])
    pred_value = np.zeros([test_len, n_outputs])
    scores = []
    for fold, (train_index, valid_index) in enumerate(mskf.split(X_train, y_train)):
        print("Fold "+str(fold+1))
        X_train2 = torch.tensor(X_train[train_index,:], dtype=torch.float32)
        y_train2 = torch.tensor(y_train[train_index], dtype=torch.float32)

        X_valid2 = torch.tensor(X_train[valid_index,:], dtype=torch.float32)
        y_valid2 = torch.tensor(y_train[valid_index], dtype=torch.float32)

        clf = MoaModel(init_num)
        # Start from the checkpoint saved for this fold index.
        clf.load_state_dict(torch.load(files[fold]))
        loss_fn = torch.nn.BCEWithLogitsLoss()
        optimizer = optim.Adam(clf.parameters(), lr = 0.001)
        #lookahead = Lookahead(optimizer, k=10, alpha=0.6) #lookahead
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=1, eps=1e-4, verbose=True)

        train = torch.utils.data.TensorDataset(X_train2, y_train2)
        valid = torch.utils.data.TensorDataset(X_valid2, y_valid2)

        clf.to(device)

        train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
        valid_loader = torch.utils.data.DataLoader(valid, batch_size=batch_size, shuffle=False)

        best_val_loss = np.inf
        stop_counts = 0
        for epoch in range(train_epochs):
            start_time = time.time()
            clf.train()
            avg_loss = 0.
            for x_batch, y_batch in tqdm(train_loader, disable=True):
                x_batch = x_batch.to(device)
                y_batch = y_batch.to(device)
                y_pred = clf(x_batch)
                loss = loss_fn(y_pred, y_batch)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                avg_loss += loss.item() / len(train_loader)

            clf.eval()
            avg_val_loss = 0.
            # No gradients are needed for validation; skip building the autograd graph.
            with torch.no_grad():
                for x_batch, y_batch in valid_loader:
                    x_batch = x_batch.to(device)
                    y_batch = y_batch.to(device)
                    y_pred = clf(x_batch)
                    avg_val_loss += loss_fn(y_pred, y_batch).item() / len(valid_loader)

            elapsed_time = time.time() - start_time
            scheduler.step(avg_val_loss)

            if avg_val_loss < best_val_loss:
                stop_counts = 0
                best_val_loss = avg_val_loss
                print('Best model: Epoch {} \t loss={:.6f} \t val_loss={:.6f} \t time={:.2f}s'.format(
                    epoch + 1, avg_loss, avg_val_loss, elapsed_time))
                torch.save(clf.state_dict(), best_checkpoint)
            else:
                stop_counts += 1

            # Early stopping: give up after too many epochs without improvement.
            if stop_counts >= EARLY_STOPPING_STEPS:
                break

        # Reload the best epoch into a fresh model. It stays on the CPU, which
        # matches the CPU tensors yielded by the validation and test loaders.
        pred_model = MoaModel(init_num)
        pred_model.load_state_dict(torch.load(best_checkpoint))
        pred_model.eval()

        # validation check ----------------
        oof_epoch = np.zeros([X_valid2.size(0), n_outputs])
        target_epoch = np.zeros([X_valid2.size(0), n_outputs])
        # Track the write position from the real batch sizes instead of the
        # global batch_size, so the slices always match what the loader yields.
        row = 0
        with torch.no_grad():
            for x_batch, y_batch in valid_loader:
                stop = row + x_batch.size(0)
                oof_epoch[row:stop, :] = pred_model(x_batch).sigmoid().cpu().numpy()
                target_epoch[row:stop, :] = y_batch.cpu().numpy()
                row = stop
        fold_score = mean_log_loss(target_epoch, oof_epoch)
        print("Fold {} log loss: {}".format(fold+1, fold_score))
        scores.append(fold_score)
        oof[valid_index,:] = oof_epoch
        oof_targets[valid_index,:] = target_epoch
        #-----------------------------------

        # test prediction --------------
        test_preds = np.zeros([test_len, n_outputs])
        row = 0
        with torch.no_grad():
            for (x_batch,) in test_loader:
                stop = row + x_batch.size(0)
                test_preds[row:stop, :] = pred_model(x_batch).sigmoid().cpu().numpy()
                row = stop
        # Each fold contributes an equal share of the final test prediction.
        pred_value += test_preds / n_folds
        # ------------------------------

    # Report the seed this call was given (not whatever global happens to be set).
    print("Seed {}".format(sample_seed))
    for fold_number, fold_score in enumerate(scores, start=1):
        print("Fold {} log loss: {}".format(fold_number, fold_score))
    print("Std of log loss: {}".format(np.std(scores)))
    print("Total log loss: {}".format(mean_log_loss(oof_targets, oof)))

    return oof, oof_targets, pred_value

In [14]:
# Label matrix of the scored targets and the accumulators for the seed average.
fn_targets = targets.drop(columns="sig_id").copy().to_numpy()
target_oof = np.zeros((len(fn_train), fn_targets.shape[1]))
target_pred = np.zeros((len(fn_test), fn_targets.shape[1]))

# Second stage: repeat the fine-tuning once per seed and average the
# out-of-fold and test predictions with equal weight.
seeds = [0, 100, 200, 300, 400]
for seed_ in seeds:
    oof, oof_targets, pytorch_pred = modelling_torch(
        tr=fn_train,
        target=fn_targets,
        te=fn_test,
        sample_seed=seed_,
        init_num=fn_train.shape[1],
        files=files,
    )
    target_oof += oof / len(seeds)
    target_pred += pytorch_pred / len(seeds)

# Score the seed-averaged out-of-fold predictions against the labels
# returned by the last run.
print("Total log loss in targets: {}".format(mean_log_loss(oof_targets, target_oof)))

Fold 1
Best model: Epoch 1 	 loss=0.019704 	 val_loss=0.018255 	 time=1.26s
Best model: Epoch 2 	 loss=0.018056 	 val_loss=0.017696 	 time=1.49s
Best model: Epoch 3 	 loss=0.017486 	 val_loss=0.017210 	 time=1.37s
Best model: Epoch 4 	 loss=0.017122 	 val_loss=0.017028 	 time=1.21s
Best model: Epoch 5 	 loss=0.016815 	 val_loss=0.016842 	 time=1.33s
Best model: Epoch 6 	 loss=0.016553 	 val_loss=0.016805 	 time=1.37s
Best model: Epoch 7 	 loss=0.016413 	 val_loss=0.016562 	 time=1.21s
Best model: Epoch 9 	 loss=0.015932 	 val_loss=0.016441 	 time=1.21s
Best model: Epoch 10 	 loss=0.015791 	 val_loss=0.016417 	 time=1.23s
Best model: Epoch 12 	 loss=0.015431 	 val_loss=0.016414 	 time=1.22s
Best model: Epoch 13 	 loss=0.015233 	 val_loss=0.016312 	 time=1.32s
Epoch    15: reducing learning rate of group 0 to 1.0000e-04.
Best model: Epoch 16 	 loss=0.014403 	 val_loss=0.016166 	 time=1.21s
Best model: Epoch 18 	 loss=0.014185 	 val_loss=0.016152 	 time=1.44s
Fold 1 log loss: 0.0161817439

In [15]:
# Score the out-of-fold predictions on the complete training table, including
# the rows that were excluded from training.
t = pd.read_csv(DATA_DIR + 'train_targets_scored.csv')
train_checkscore = t.copy()
# The label columns are integers in the CSV; convert them to float up front so
# the probabilities below are not written into integer columns (pandas
# deprecates that implicit upcast and will raise on it).
train_checkscore[target_feats] = train_checkscore[target_feats].astype(float)
# Rows used for training receive their averaged out-of-fold probabilities ...
train_checkscore.loc[train_checkscore.index.isin(cons_train_index),target_feats] = target_oof
# ... and the excluded ("ctl_vehicle") rows are predicted as all zeros.
train_checkscore.loc[train_checkscore.index.isin(noncons_train_index),target_feats] = 0
t.drop("sig_id", axis=1, inplace=True)
# Single log loss over all (row, target) cells flattened together;
# iloc[:, 1:] skips the leading sig_id column.
print('OOF log loss: ', log_loss(np.ravel(t), np.ravel(np.array(train_checkscore.iloc[:,1:]))))

OOF log loss:  0.014430810161293344


In [16]:
# Fill the submission template with the seed- and fold-averaged test predictions.
sub[target_feats] = target_pred
# Test rows whose cp_type is "ctl_vehicle" are overwritten with zeros for every target.
sub.loc[noncons_test_index,target_feats] = 0
# Write the result to the working directory without the DataFrame index.
sub.to_csv('submission.csv', index=False)