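- Idea: use the per-sample sum of the predicted MoA probabilities (summed along axis=1), for both scored and non-scored targets, as extra input features for the final model.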

In [1]:
import os
import sys
import random
import warnings
import numpy as np
import pandas as pd 
from sklearn.metrics import log_loss
from sklearn import preprocessing
from tqdm import tqdm_notebook as tqdm
from sklearn.multioutput import MultiOutputClassifier

sys.path.append('../input/multilabelstraifier/')
from ml_stratifiers import MultilabelStratifiedKFold
warnings.filterwarnings('ignore')

import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import tensorflow as tf

In [2]:
# Load the competition CSVs from the Kaggle input directory.
DATA_DIR = '/kaggle/input/lish-moa/'
train = pd.read_csv(DATA_DIR + 'train_features.csv')
# Scored targets are the labels written to the submission; non-scored targets
# are only used further down as an auxiliary prediction task.
targets = pd.read_csv(DATA_DIR + 'train_targets_scored.csv')
non_targets = pd.read_csv(DATA_DIR + 'train_targets_nonscored.csv')
test = pd.read_csv(DATA_DIR + 'test_features.csv')
# Submission template; its target columns are overwritten in the last cell.
sub = pd.read_csv(DATA_DIR + 'sample_submission.csv')

In [3]:
# Column-name groups used throughout the notebook:
#   target_feats - every column of the scored-target table except the row id
#   g_feats      - feature columns whose name contains "g-"
#   c_feats      - feature columns whose name contains "c-"
target_feats = [col for col in targets.columns if col != "sig_id"]
g_feats = [col for col in train.columns if "g-" in col]
c_feats = [col for col in train.columns if "c-" in col]

In [4]:
# Split row labels by perturbation type: "noncons" = control rows
# (cp_type == "ctl_vehicle"), "cons" = every other row.
is_ctl_train = train.cp_type == "ctl_vehicle"
is_ctl_test = test.cp_type == "ctl_vehicle"

noncons_train_index = train.index[is_ctl_train]
cons_train_index = train.index[~is_ctl_train]
noncons_test_index = test.index[is_ctl_test]
cons_test_index = test.index[~is_ctl_test]

# preprocess

In [5]:
# normalization by ctl group
# Collect the control rows of train and test; their per-(cp_dose, cp_time)
# feature means are the baseline that `fe` subtracts from every sample.
train_ctl = train[train.index.isin(noncons_train_index)].copy().reset_index(drop=True)
test_ctl = test[test.index.isin(noncons_test_index)].copy().reset_index(drop=True)
ctl_df = pd.concat([train_ctl, test_ctl])

mean_g_feats = ["mean-" + i for i in g_feats]
mean_c_feats = ["mean-" + i for i in c_feats]

# Aggregate only the numeric feature columns, in an explicit order. Averaging
# the whole frame would also hit the string columns (sig_id, cp_type), which
# newer pandas refuses to average, and the relabelling below relies on the
# result being exactly [cp_dose, cp_time, g-*, c-*].
ctl_group_data = ctl_df.groupby(["cp_dose", "cp_time"])[g_feats + c_feats].mean().reset_index()
columns = ["cp_dose", "cp_time"] + mean_g_feats + mean_c_feats
ctl_group_data.columns = columns

# Keep only the non-control rows for training. NOTE: this overwrites `train`,
# `targets` and `non_targets`, so the cell is not safe to run twice without
# reloading the data first.
train = train[train.index.isin(cons_train_index)].copy().reset_index(drop=True)
targets = targets[targets.index.isin(cons_train_index)].copy().reset_index(drop=True)
non_targets = non_targets[non_targets.index.isin(cons_train_index)].copy().reset_index(drop=True)

# Feature engineering 

In [6]:
def fe(df, remove_features):
    """Build control-normalised features for ``df``.

    Every row is joined with the control-group means for its
    ``(cp_time, cp_dose)`` pair (taken from the module-level
    ``ctl_group_data``), and a ``diff-<feature>`` column holding
    ``feature - control mean`` is added for each name in ``g_feats`` and
    ``c_feats``. ``cp_type`` and ``cp_dose`` are integer-encoded, then the
    columns listed in ``remove_features`` are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table containing ``cp_type``, ``cp_time``, ``cp_dose`` and
        all columns named in ``g_feats`` / ``c_feats``.
    remove_features : list of str
        Columns to drop from the result.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``df`` itself is not modified.
    """
    # A left merge returns a new frame with one row per row of `df`.
    tmp = pd.merge(df, ctl_group_data, on=["cp_time", "cp_dose"], how="left")

    # Build all difference columns in one shot instead of inserting hundreds
    # of columns one by one, and derive the names from the feature lists
    # rather than assuming they are numbered 0..n-1.
    feats = g_feats + c_feats
    diffs = pd.DataFrame(
        tmp[feats].to_numpy() - tmp[["mean-" + col for col in feats]].to_numpy(),
        columns=["diff-" + col for col in feats],
        index=tmp.index,
    )
    tmp = pd.concat([tmp, diffs], axis=1)

    # Plain column assignment replaces the column and its dtype. Writing via
    # `.loc[:, col] = ...` sets values in place on recent pandas, leaving an
    # object-dtype column that cannot be converted to a float tensor later.
    tmp['cp_type'] = tmp['cp_type'].map({'trt_cp': 0, 'ctl_vehicle': 1})
    tmp['cp_dose'] = tmp['cp_dose'].map({'D1': 0, 'D2': 1})

    return tmp.drop(remove_features, axis=1)

def fe2(df):
    """Return the raw features of ``df`` with ``cp_dose`` integer-encoded.

    ``cp_dose`` is mapped ``D1 -> 0`` / ``D2 -> 1`` and the ``cp_type`` and
    ``sig_id`` columns are dropped. ``df`` itself is not modified.
    """
    tmp = df.copy()
    # Plain column assignment replaces the column and its dtype. Writing via
    # `.loc[:, col] = ...` sets values in place on recent pandas, leaving an
    # object-dtype column that cannot be converted to a float tensor later.
    tmp['cp_dose'] = tmp['cp_dose'].map({'D1': 0, 'D2': 1})
    # cp_type is dropped anyway, so there is no point in encoding it first.
    return tmp.drop(["cp_type", "sig_id"], axis=1)

# f_*: feature tables produced by fe2 (inputs to the first-stage models).
f_train = fe2(train)
f_test = fe2(test)

# n_*: feature tables produced by fe, with the raw g-/c- columns and the
# merged "mean-" columns removed (inputs to the final model).
remove_features = ["cp_type" , "sig_id"] + mean_g_feats + mean_c_feats + g_feats + c_feats 
n_train = fe(train, remove_features)
n_test = fe(test, remove_features)

# Sanity check: print the shapes of both feature sets.
print(f_train.shape, f_test.shape)
print(n_train.shape, n_test.shape)

(21948, 874) (3982, 874)
(21948, 874) (3982, 874)


# modelling

In [7]:
# Train on the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)
batch_size = 128  # mini-batch size used by every DataLoader in modelling_torch
train_epochs = 40  # training epochs per fold
n_folds=5  # number of stratified cross-validation folds
EARLY_STOPPING_STEPS = 10  # intended early-stopping patience (epochs without a better validation loss)

def mean_log_loss(y_true, y_pred):
    """Column-wise mean of the binary log loss.

    Parameters
    ----------
    y_true : 2-D array of 0/1 labels, one column per target.
    y_pred : 2-D array of predicted probabilities, same layout as ``y_true``.

    Returns
    -------
    float
        Mean over all columns of ``y_true`` of ``log_loss(column)``.
    """
    # Iterate over the columns actually present in the arrays. Looping over
    # the global list of scored targets would silently ignore the extra
    # columns when this is called on the (wider) non-scored target matrix.
    n_cols = y_true.shape[1]
    return np.mean([
        log_loss(y_true[:, j], y_pred[:, j].astype(float), labels=[0, 1])
        for j in range(n_cols)
    ])

def seed_everything(seed=1234):
    """Seed every random number generator the notebook relies on.

    Covers Python's ``random`` module, the ``PYTHONHASHSEED`` environment
    variable, NumPy, TensorFlow and PyTorch (CPU and current CUDA device),
    and switches cuDNN to deterministic mode.
    """
    # Python-level sources of randomness.
    os.environ['PYTHONHASHSEED'] = str(seed)
    random.seed(seed)

    # Numeric libraries.
    np.random.seed(seed)
    tf.random.set_seed(seed)

    # PyTorch, including the CUDA generator and cuDNN.
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

class MoaModel(nn.Module):
    """Fully connected network that outputs one logit per target.

    Three stages of BatchNorm -> Dropout -> weight-normalised Linear. The
    first two stages are followed by ReLU; the last one returns raw logits
    (no sigmoid).
    """

    def __init__(self, num_columns, last_columns_num):
        """
        Parameters
        ----------
        num_columns : int
            Number of input features.
        last_columns_num : int
            Number of output logits.
        """
        super().__init__()
        first_width, second_width = 2048, 1048

        # Stage 1: input features -> first hidden layer.
        self.batch_norm1 = nn.BatchNorm1d(num_columns)
        self.dropout1 = nn.Dropout(0.3)
        self.dense1 = nn.utils.weight_norm(nn.Linear(num_columns, first_width))

        # Stage 2: first hidden layer -> second hidden layer.
        self.batch_norm2 = nn.BatchNorm1d(first_width)
        self.dropout2 = nn.Dropout(0.6)
        self.dense2 = nn.utils.weight_norm(nn.Linear(first_width, second_width))

        # Stage 3: second hidden layer -> output logits.
        self.batch_norm3 = nn.BatchNorm1d(second_width)
        self.dropout3 = nn.Dropout(0.6)
        self.dense3 = nn.utils.weight_norm(nn.Linear(second_width, last_columns_num))

    def forward(self, x):
        """Return the logits for a batch ``x`` of input feature rows."""
        hidden = F.relu(self.dense1(self.dropout1(self.batch_norm1(x))))
        hidden = F.relu(self.dense2(self.dropout2(self.batch_norm2(hidden))))
        return self.dense3(self.dropout3(self.batch_norm3(hidden)))
    
def _predict_proba(model, loader, n_rows, n_cols):
    """Run ``model`` over ``loader`` and return sigmoid probabilities as a NumPy array.

    The loader must iterate in dataset order (``shuffle=False``); the first
    element of every batch is taken as the feature tensor.
    """
    preds = np.zeros([n_rows, n_cols])
    step = loader.batch_size
    # Inference only: no autograd graph is needed.
    with torch.no_grad():
        for i, batch in enumerate(loader):
            preds[i * step:(i + 1) * step, :] = model(batch[0]).sigmoid().cpu().numpy()
    return preds


def modelling_torch(tr, target, te, sample_seed, init_num, last_num):
    """Train ``MoaModel`` with multilabel-stratified K-fold CV for one seed.

    For every fold a fresh model is trained for up to ``train_epochs``
    epochs; the weights with the lowest validation loss are checkpointed to
    ``best-model-parameters.pt`` and reloaded to produce out-of-fold and test
    predictions. Training of a fold stops early once the validation loss has
    not improved for ``EARLY_STOPPING_STEPS`` consecutive epochs.

    Parameters
    ----------
    tr : numpy.ndarray, shape (n_train, init_num)
        Training features.
    target : numpy.ndarray, shape (n_train, last_num)
        Binary training labels.
    te : numpy.ndarray, shape (n_test, init_num)
        Test features.
    sample_seed : int
        Seed passed to ``seed_everything``.
    init_num : int
        Number of input features of the model.
    last_num : int
        Number of outputs of the model.

    Returns
    -------
    oof : numpy.ndarray
        Out-of-fold predicted probabilities, aligned with ``tr``.
    oof_targets : numpy.ndarray
        The labels matching ``oof`` row by row.
    pred_value : numpy.ndarray
        Test probabilities averaged over the ``n_folds`` fold models.
    """
    seed_everything(seed=sample_seed)
    X_train = tr.copy()
    y_train = target.copy()
    X_test = te.copy()
    test_len = X_test.shape[0]
    n_targets = y_train.shape[1]

    mskf = MultilabelStratifiedKFold(n_splits=n_folds, shuffle=True, random_state=2)

    test_dataset = torch.utils.data.TensorDataset(torch.tensor(X_test, dtype=torch.float32))
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    oof = np.zeros([len(X_train), n_targets])
    oof_targets = np.zeros([len(X_train), n_targets])
    pred_value = np.zeros([test_len, n_targets])
    scores = []
    for fold, (train_index, valid_index) in enumerate(mskf.split(X_train, y_train)):
        print("Fold "+str(fold+1))
        X_train2 = torch.tensor(X_train[train_index,:], dtype=torch.float32)
        y_train2 = torch.tensor(y_train[train_index], dtype=torch.float32)

        X_valid2 = torch.tensor(X_train[valid_index,:], dtype=torch.float32)
        y_valid2 = torch.tensor(y_train[valid_index], dtype=torch.float32)

        clf = MoaModel(init_num, last_num)
        loss_fn = torch.nn.BCEWithLogitsLoss()
        optimizer = optim.Adam(clf.parameters(), lr = 0.001, weight_decay=1e-5)
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3, eps=1e-4, verbose=True)

        # Named *_ds so the datasets do not shadow the global `train` frame.
        train_ds = torch.utils.data.TensorDataset(X_train2, y_train2)
        valid_ds = torch.utils.data.TensorDataset(X_valid2, y_valid2)

        clf.to(device)

        train_loader = torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=True)
        valid_loader = torch.utils.data.DataLoader(valid_ds, batch_size=batch_size, shuffle=False)

        best_val_loss = np.inf
        stop_counts = 0
        for epoch in range(train_epochs):
            start_time = time.time()
            clf.train()
            avg_loss = 0.
            for x_batch, y_batch in train_loader:
                x_batch = x_batch.to(device)
                y_batch = y_batch.to(device)
                y_pred = clf(x_batch)
                loss = loss_fn(y_pred, y_batch)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                avg_loss += loss.item() / len(train_loader)

            clf.eval()
            avg_val_loss = 0.
            with torch.no_grad():
                for x_batch, y_batch in valid_loader:
                    x_batch = x_batch.to(device)
                    y_batch = y_batch.to(device)
                    y_pred = clf(x_batch)
                    avg_val_loss += loss_fn(y_pred, y_batch).item() / len(valid_loader)

            elapsed_time = time.time() - start_time
            scheduler.step(avg_val_loss)

            if avg_val_loss < best_val_loss:
                stop_counts = 0
                best_val_loss = avg_val_loss
                print('Best model: Epoch {} \t loss={:.6f} \t val_loss={:.6f} \t time={:.2f}s'.format(
                    epoch + 1, avg_loss, avg_val_loss, elapsed_time))
                torch.save(clf.state_dict(), 'best-model-parameters.pt')
            else:
                stop_counts += 1
                # Give up on this fold once the validation loss has been
                # stale for EARLY_STOPPING_STEPS epochs in a row.
                if stop_counts >= EARLY_STOPPING_STEPS:
                    break

        # Reload the best checkpoint into a fresh CPU model; the loaders
        # below yield CPU tensors, so inference runs on the CPU.
        pred_model = MoaModel(init_num, last_num)
        pred_model.load_state_dict(torch.load('best-model-parameters.pt', map_location='cpu'))
        pred_model.eval()

        # validation check ----------------
        oof_epoch = _predict_proba(pred_model, valid_loader, X_valid2.size(0), n_targets)
        # valid_loader is not shuffled, so the fold labels are already in
        # the same row order as the predictions.
        target_epoch = y_valid2.numpy().astype(np.float64)
        scores.append(mean_log_loss(target_epoch, oof_epoch))
        oof[valid_index,:] = oof_epoch
        oof_targets[valid_index,:] = target_epoch
        #-----------------------------------

        # test prediction --------------
        test_preds = _predict_proba(pred_model, test_loader, test_len, n_targets)
        pred_value += test_preds / n_folds
        # ------------------------------

    # Report the seed this function was called with (not a caller-side global).
    print("Seed {}".format(sample_seed))
    for i, fold_score in enumerate(scores):
        print("Fold {} log loss: {}".format(i+1, fold_score))
    print("Std of log loss: {}".format(np.std(scores)))
    print("Total log loss: {}".format(mean_log_loss(oof_targets, oof)))

    return oof, oof_targets, pred_value

cuda


# Predict scored targets and non-scored targets separately

In [8]:
# First stage: train on the fe2 features, once for the scored targets and
# once for the non-scored targets, averaging the predictions over `seeds`.
seeds = [0,1,2,3,4]
fn_train = f_train.copy().to_numpy()
fn_targets = targets.drop("sig_id", axis=1).copy().to_numpy()
fn_nontargets = non_targets.drop("sig_id", axis=1).copy().to_numpy()
fn_test = f_test.copy().to_numpy()

# Seed-averaged out-of-fold (train) and test predictions for scored targets.
target_oof = np.zeros([len(fn_train),fn_targets.shape[1]])
target_pred = np.zeros([len(fn_test),fn_targets.shape[1]])

# Same accumulators for the non-scored targets.
nontarget_oof = np.zeros([len(fn_train),fn_nontargets.shape[1]])
nontarget_pred = np.zeros([len(fn_test),fn_nontargets.shape[1]])

for seed_ in seeds:
    oof, oof_targets, pytorch_pred = modelling_torch(fn_train, fn_targets, fn_test, seed_, fn_train.shape[1], fn_targets.shape[1])
    target_oof += oof / len(seeds)
    target_pred += pytorch_pred / len(seeds)
# oof_targets here is the label matrix returned by the last seed's run.
print("Total log loss in targets: {}".format(mean_log_loss(oof_targets, target_oof)))

for seed_ in seeds:
    oof, oof_targets, pytorch_pred = modelling_torch(fn_train, fn_nontargets, fn_test, seed_, fn_train.shape[1], fn_nontargets.shape[1])
    nontarget_oof += oof / len(seeds)
    nontarget_pred += pytorch_pred / len(seeds)
print("Total log loss in Non targets: {}".format(mean_log_loss(oof_targets, nontarget_oof)))

# Stacking features for the final model: the per-row sum of predicted
# probabilities. Train rows use out-of-fold predictions; test rows use the
# averaged test predictions, with control rows forced to 0.
n_train["target_sum"] = target_oof.sum(axis=1)
n_train["nontarget_sum"] = nontarget_oof.sum(axis=1)
n_test["target_sum"] = target_pred.sum(axis=1)
n_test.loc[noncons_test_index, "target_sum"] = 0
n_test["nontarget_sum"] = nontarget_pred.sum(axis=1)
n_test.loc[noncons_test_index, "nontarget_sum"] = 0

Fold 1
Best model: Epoch 1 	 loss=0.414770 	 val_loss=0.080759 	 time=1.90s
Best model: Epoch 2 	 loss=0.048856 	 val_loss=0.028515 	 time=0.94s
Best model: Epoch 3 	 loss=0.027138 	 val_loss=0.023158 	 time=0.87s
Best model: Epoch 4 	 loss=0.023288 	 val_loss=0.021195 	 time=0.86s
Best model: Epoch 5 	 loss=0.021337 	 val_loss=0.019858 	 time=0.89s
Best model: Epoch 6 	 loss=0.020526 	 val_loss=0.019390 	 time=0.88s
Best model: Epoch 7 	 loss=0.019862 	 val_loss=0.018968 	 time=0.89s
Best model: Epoch 9 	 loss=0.019258 	 val_loss=0.018369 	 time=0.86s
Best model: Epoch 10 	 loss=0.018756 	 val_loss=0.018010 	 time=0.88s
Best model: Epoch 11 	 loss=0.018383 	 val_loss=0.017663 	 time=0.87s
Best model: Epoch 13 	 loss=0.017725 	 val_loss=0.017420 	 time=1.15s
Best model: Epoch 15 	 loss=0.017565 	 val_loss=0.017146 	 time=0.87s
Best model: Epoch 16 	 loss=0.017314 	 val_loss=0.016968 	 time=0.87s
Best model: Epoch 17 	 loss=0.016986 	 val_loss=0.016948 	 time=0.92s
Best model: Epoch 18 

# final prediction

In [9]:
# Final stage: train on the n_train / n_test feature tables (which by now
# include the two *_sum stacking columns) and average over more seeds.
seeds = [0,1,2,3,4,5,6] 
nn_train = n_train.copy().to_numpy()
nn_targets = targets.drop("sig_id", axis=1).copy().to_numpy()
nn_test = n_test.copy().to_numpy()

# Seed-averaged out-of-fold and test predictions of the final model.
oof_final = np.zeros([len(n_train),nn_targets.shape[1]])
pred_final = np.zeros([len(n_test),nn_targets.shape[1]])

for seed_ in seeds:
    oof, oof_targets, pytorch_pred = modelling_torch(nn_train, nn_targets, nn_test, seed_, nn_train.shape[1], nn_targets.shape[1])
    oof_final += oof / len(seeds)
    pred_final += pytorch_pred / len(seeds)
# oof_targets here is the label matrix returned by the last seed's run.
print("Total log loss: {}".format(mean_log_loss(oof_targets, oof_final)))

Fold 1
Best model: Epoch 1 	 loss=0.411053 	 val_loss=0.082669 	 time=0.96s
Best model: Epoch 2 	 loss=0.048884 	 val_loss=0.029771 	 time=1.03s
Best model: Epoch 3 	 loss=0.027667 	 val_loss=0.024162 	 time=1.27s
Best model: Epoch 4 	 loss=0.023129 	 val_loss=0.020741 	 time=1.13s
Best model: Epoch 5 	 loss=0.022136 	 val_loss=0.020504 	 time=0.92s
Best model: Epoch 6 	 loss=0.020886 	 val_loss=0.019311 	 time=1.10s
Best model: Epoch 7 	 loss=0.020160 	 val_loss=0.019125 	 time=0.87s
Best model: Epoch 8 	 loss=0.019546 	 val_loss=0.018681 	 time=0.91s
Best model: Epoch 9 	 loss=0.019174 	 val_loss=0.018354 	 time=0.88s
Best model: Epoch 10 	 loss=0.018924 	 val_loss=0.018203 	 time=0.91s
Best model: Epoch 11 	 loss=0.018586 	 val_loss=0.017870 	 time=0.97s
Best model: Epoch 12 	 loss=0.018367 	 val_loss=0.017657 	 time=1.46s
Best model: Epoch 13 	 loss=0.018054 	 val_loss=0.017556 	 time=0.89s
Best model: Epoch 14 	 loss=0.017986 	 val_loss=0.017538 	 time=0.88s
Best model: Epoch 15 	

In [10]:
# Write the final predictions into the submission template.
sub[target_feats] = pred_final
# Control rows (cp_type == "ctl_vehicle") get all-zero predictions.
sub.loc[noncons_test_index,target_feats] = 0
sub.to_csv('submission.csv', index=False)