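- try transfer learning: pretrain on the non-scored targets, then fine-tune on the scored targets
- cancel the control-group difference (normalization) preprocessing
- change the dropout rate to 0.3 in the first layer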

In [1]:
import os
import sys
import random
import warnings
import numpy as np
import pandas as pd 
from sklearn.metrics import log_loss
from sklearn import preprocessing
from tqdm import tqdm_notebook as tqdm
from sklearn.multioutput import MultiOutputClassifier

sys.path.append('../input/multilabelstraifier/')
from ml_stratifiers import MultilabelStratifiedKFold
warnings.filterwarnings('ignore')

import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import tensorflow as tf

In [2]:
# Root folder of the input data. The trailing slash matters because file
# names are appended with plain string concatenation.
DATA_DIR = '/kaggle/input/lish-moa/'

# Read the five CSV files in one pass: training features, the scored and
# non-scored label tables, the test features and the sample submission.
train, targets, non_targets, test, sub = (
    pd.read_csv(DATA_DIR + csv_name)
    for csv_name in (
        'train_features.csv',
        'train_targets_scored.csv',
        'train_targets_nonscored.csv',
        'test_features.csv',
        'sample_submission.csv',
    )
)

In [3]:
# Column groups reused further down:
#   target_feats - every column of `targets` except the sample id
#   g_feats      - feature columns whose name contains "g-"
#   c_feats      - feature columns whose name contains "c-"
target_feats = [col for col in targets.columns if col != "sig_id"]
g_feats = [col for col in train.columns if "g-" in col]
c_feats = [col for col in train.columns if "c-" in col]

In [4]:
# Row labels of the vehicle-control samples ("noncons") and of all remaining
# samples ("cons"), computed separately for the train and test feature tables.
noncons_train_index = train.loc[train["cp_type"] == "ctl_vehicle"].index
cons_train_index = train.loc[train["cp_type"] != "ctl_vehicle"].index
noncons_test_index = test.loc[test["cp_type"] == "ctl_vehicle"].index
cons_test_index = test.loc[test["cp_type"] != "ctl_vehicle"].index

# preprocess

In [5]:
# normalization by ctl group (disabled, kept for reference)
#train_ctl = train[train.index.isin(noncons_train_index)].copy().reset_index(drop=True)
#test_ctl = test[test.index.isin(noncons_test_index)].copy().reset_index(drop=True)
#ctl_df = pd.concat([train_ctl, test_ctl])

#ctl_group_data = ctl_df.groupby(["cp_dose", "cp_time"]).agg({"mean"}).reset_index()
#mean_g_feats = ["mean-" + i for i in g_feats]
#mean_c_feats = ["mean-" + i for i in c_feats]
#columns = ["cp_dose", "cp_time"] + mean_g_feats + mean_c_feats
#ctl_group_data.columns = columns

# Keep only the non-control samples in the features and in both label tables.
# Rows are selected by sig_id instead of by positional indices captured in an
# earlier cell: after reset_index those positions no longer describe the
# frames, so re-running a position-based filter would silently drop more rows.
# Filtering by id makes this cell a no-op when it is executed again.
treated_ids = set(train.loc[train.cp_type != "ctl_vehicle", "sig_id"])
train = train[train.sig_id.isin(treated_ids)].copy().reset_index(drop=True)
targets = targets[targets.sig_id.isin(treated_ids)].copy().reset_index(drop=True)
non_targets = non_targets[non_targets.sig_id.isin(treated_ids)].copy().reset_index(drop=True)

# The training code below pairs features and labels by row position.
assert len(train) == len(targets) == len(non_targets), "feature/label tables are out of sync"

In [6]:
# Count the positive samples of every non-scored label and list the labels
# from most to least frequent in a two-column table ("target", "number").
non_target_feats = [col for col in non_targets.columns if col != "sig_id"]
nontarget_dists = (
    non_targets[non_target_feats]
    # Explicit column-wise sum: np.sum(DataFrame) forwards axis=None, whose
    # per-column meaning is deprecated in pandas 2.x.
    .sum(axis=0)
    .rename_axis("target")
    .reset_index(name="number")
    .sort_values("number", ascending=False)
    .reset_index(drop=True)
)

In [7]:
# First pass: remove non-scored labels that have no positive sample at all.
# The frame is rebound (no inplace mutation) and missing columns are ignored,
# so executing this cell a second time does not raise a KeyError.
drop_list1 = list(nontarget_dists[nontarget_dists.number==0]["target"].values)
print("first drop", len(drop_list1))
non_targets = non_targets.drop(columns=drop_list1, errors="ignore")
print("shape after 1st drop:", non_targets.shape)

# Second pass: remove labels with 1..6 positives, except the last entry of
# that list, which is deliberately kept by the [:-1] slice.
# NOTE(review): presumably this leaves exactly 206 label columns so they match
# the output width of MoaModel (the printed shape is 207 including sig_id) - confirm.
drop_list2 = list(nontarget_dists[(nontarget_dists.number>0) & (nontarget_dists.number<=6)]["target"].values)[:-1]
print("second drop", len(drop_list2))
non_targets = non_targets.drop(columns=drop_list2, errors="ignore")
print("shape after 2nd drop:", non_targets.shape)

first drop 71
shape after 1st drop: (21948, 332)
second drop 125
shape after 2nd drop: (21948, 207)


# Feature engineering 

In [8]:
def fe(df):
    """Return a model-ready copy of a feature table.

    The `cp_type` and `sig_id` columns are removed and `cp_dose` is encoded
    as a number ('D1' -> 0, 'D2' -> 1; any other value becomes NaN). All
    remaining columns keep their original order. The input frame is left
    untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns `cp_type`, `sig_id` and `cp_dose`.

    Returns
    -------
    pandas.DataFrame
        A new frame without `cp_type` / `sig_id` and with numeric `cp_dose`.
    """
    # cp_type is dropped outright, so there is no point in encoding it first.
    out = df.drop(columns=["cp_type", "sig_id"])
    # Replace the whole column (instead of writing through .loc) so the
    # result takes the numeric dtype of the mapped values.
    out["cp_dose"] = out["cp_dose"].map({'D1': 0, 'D2': 1})
    return out

# Build the model-ready feature tables for both splits and print their
# shapes so a column-count mismatch is visible immediately.
f_train, f_test = fe(train), fe(test)

print(f_train.shape, f_test.shape)

(21948, 874) (3982, 874)


# modelling

In [9]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)
def mean_log_loss(y_true, y_pred):
    """Average the binary log loss over the label columns.

    Parameters
    ----------
    y_true : 2-D numpy array of 0/1 labels, indexed [row, label].
    y_pred : 2-D numpy array of predicted probabilities, same shape.

    Returns
    -------
    float
        Mean of the per-column `log_loss` values.
    """
    # Iterate over the columns of the arrays actually passed in, not over a
    # global label list: this function is also called with the non-scored
    # label matrix, whose width is independent of `target_feats`.
    n_labels = y_true.shape[1]
    metrics = [
        # labels=[0, 1] is passed explicitly so a column in which only one
        # class occurs can still be scored.
        log_loss(y_true[:, col], y_pred[:, col].astype(float), labels=[0,1])
        for col in range(n_labels)
    ]
    return np.mean(metrics)

def seed_everything(seed=1234):
    """Seed the random number sources used by this notebook.

    Sets the PYTHONHASHSEED environment variable, seeds Python's `random`,
    NumPy, TensorFlow and PyTorch (CPU and CUDA) with the same value, and
    switches cuDNN to deterministic mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    seeders = (
        random.seed,
        tf.random.set_seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
    )
    for apply_seed in seeders:
        apply_seed(seed)
    torch.backends.cudnn.deterministic = True

class MoaModel(nn.Module):
    """Three-layer feed-forward network that outputs one logit per label.

    Each stage applies batch normalisation, then dropout, then a
    weight-normalised linear layer. The first two linear layers are followed
    by ReLU; the last one returns raw logits (no sigmoid).

    Parameters
    ----------
    num_columns : int
        Number of input features.
    num_targets : int, default 206
        Number of output logits.
    hidden_sizes : pair of int, default (2048, 1048)
        Widths of the first and second hidden layer.
    dropout_rates : triple of float, default (0.3, 0.6, 0.6)
        Dropout probability applied before the first, second and third
        linear layer.

    The attribute names (batch_norm1, dropout1, dense1, ...) are part of the
    saved state_dict keys and must not be renamed, otherwise existing
    checkpoints stop loading.
    """

    def __init__(self, num_columns, num_targets=206, hidden_sizes=(2048, 1048),
                 dropout_rates=(0.3, 0.6, 0.6)):
        super(MoaModel, self).__init__()
        hidden1, hidden2 = hidden_sizes
        drop1, drop2, drop3 = dropout_rates

        # Layers are created in the same order as before so that a seeded
        # initialisation yields the same weights.
        self.batch_norm1 = nn.BatchNorm1d(num_columns)
        self.dropout1 = nn.Dropout(drop1)
        self.dense1 = nn.utils.weight_norm(nn.Linear(num_columns, hidden1))

        self.batch_norm2 = nn.BatchNorm1d(hidden1)
        self.dropout2 = nn.Dropout(drop2)
        self.dense2 = nn.utils.weight_norm(nn.Linear(hidden1, hidden2))

        self.batch_norm3 = nn.BatchNorm1d(hidden2)
        self.dropout3 = nn.Dropout(drop3)
        self.dense3 = nn.utils.weight_norm(nn.Linear(hidden2, num_targets))

    def forward(self, x):
        """Map a (batch, num_columns) float tensor to (batch, num_targets) logits."""
        x = self.batch_norm1(x)
        x = self.dropout1(x)
        x = F.relu(self.dense1(x))

        x = self.batch_norm2(x)
        x = self.dropout2(x)
        x = F.relu(self.dense2(x))

        x = self.batch_norm3(x)
        x = self.dropout3(x)
        x = self.dense3(x)

        return x

cuda


# train by non-targets

In [10]:
# Settings for the first training stage. They are module-level names that
# the training function reads when it is called, not when it is defined.
batch_size = 128            # rows per mini-batch in the data loaders
train_epochs = 40           # maximum number of epochs per fold
n_folds = 5                 # number of cross-validation splits
EARLY_STOPPING_STEPS = 10   # name suggests an early-stopping patience, in epochs

def first_learning(tr, target, sample_seed, init_num):
    """Pretrain one MoaModel per cross-validation fold and save its best weights.

    For every fold a fresh model is trained with BCE-with-logits loss, Adam
    and ReduceLROnPlateau. Whenever the validation loss improves, the weights
    are written to 'parameters<fold>.pt'. Training of a fold stops early once
    EARLY_STOPPING_STEPS consecutive epochs bring no improvement. After
    training, the saved weights are reloaded to score the validation rows.

    Parameters
    ----------
    tr : 2-D numpy array of features, indexed [row, feature].
    target : 2-D numpy array of 0/1 labels with one row per row of `tr`.
    sample_seed : int, passed to seed_everything and shown in the summary.
    init_num : int, number of input features handed to MoaModel.

    Returns
    -------
    list of str
        Checkpoint file names in fold order.

    Notes
    -----
    Reads the module-level names batch_size, train_epochs, n_folds,
    EARLY_STOPPING_STEPS and device at call time.
    """
    seed_everything(seed=sample_seed)
    X_train = tr.copy()
    y_train = target.copy()
    n_labels = y_train.shape[1]

    mskf = MultilabelStratifiedKFold(n_splits=n_folds, shuffle=True, random_state=2)
    files = []

    oof = np.zeros([len(X_train), n_labels])
    oof_targets = np.zeros([len(X_train), n_labels])
    scores = []
    for fold, (train_index, valid_index) in enumerate(mskf.split(X_train, y_train)):
        print("Fold "+str(fold+1))
        checkpoint_path = 'parameters'+str(fold+1)+'.pt'

        X_train2 = torch.tensor(X_train[train_index,:], dtype=torch.float32)
        y_train2 = torch.tensor(y_train[train_index], dtype=torch.float32)

        X_valid2 = torch.tensor(X_train[valid_index,:], dtype=torch.float32)
        y_valid2 = torch.tensor(y_train[valid_index], dtype=torch.float32)

        clf = MoaModel(init_num)
        loss_fn = torch.nn.BCEWithLogitsLoss()
        optimizer = optim.Adam(clf.parameters(), lr = 0.001, weight_decay=1e-5)
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3, eps=1e-4, verbose=True)

        train = torch.utils.data.TensorDataset(X_train2, y_train2)
        valid = torch.utils.data.TensorDataset(X_valid2, y_valid2)

        clf.to(device)

        train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
        valid_loader = torch.utils.data.DataLoader(valid, batch_size=batch_size, shuffle=False)

        best_val_loss = np.inf
        stop_counts = 0
        for epoch in range(train_epochs):
            start_time = time.time()
            clf.train()
            avg_loss = 0.
            for x_batch, y_batch in train_loader:
                x_batch = x_batch.to(device)
                y_batch = y_batch.to(device)
                y_pred = clf(x_batch)
                loss = loss_fn(y_pred, y_batch)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                avg_loss += loss.item() / len(train_loader)

            clf.eval()
            avg_val_loss = 0.
            # No gradients are needed for validation; skipping graph
            # construction saves memory and time.
            with torch.no_grad():
                for x_batch, y_batch in valid_loader:
                    x_batch = x_batch.to(device)
                    y_batch = y_batch.to(device)
                    y_pred = clf(x_batch)
                    avg_val_loss += loss_fn(y_pred, y_batch).item() / len(valid_loader)

            elapsed_time = time.time() - start_time
            scheduler.step(avg_val_loss)

            if avg_val_loss < best_val_loss:
                stop_counts = 0
                best_val_loss = avg_val_loss
                print('Best model: Epoch {} \t loss={:.6f} \t val_loss={:.6f} \t time={:.2f}s'.format(
                    epoch + 1, avg_loss, avg_val_loss, elapsed_time))
                torch.save(clf.state_dict(), checkpoint_path)
            else:
                stop_counts += 1
                # Actually honour the patience setting: the counter used to
                # be maintained but never acted upon.
                if stop_counts >= EARLY_STOPPING_STEPS:
                    print('Early stopping at epoch {}'.format(epoch + 1))
                    break

        files.append(checkpoint_path)
        pred_model = MoaModel(init_num)
        # map_location lets a checkpoint written on GPU load on a CPU-only run.
        pred_model.load_state_dict(torch.load(checkpoint_path, map_location=device))
        pred_model.to(device)
        pred_model.eval()

        # validation check ----------------
        oof_epoch = np.zeros([X_valid2.size(0), n_labels])
        target_epoch = np.zeros([X_valid2.size(0), n_labels])
        with torch.no_grad():
            for i, (x_batch, y_batch) in enumerate(valid_loader):
                y_pred = pred_model(x_batch.to(device)).sigmoid()
                # The loader is not shuffled, so batch i covers these rows;
                # the slice is clipped automatically for the last batch.
                rows = slice(i * batch_size, (i + 1) * batch_size)
                oof_epoch[rows, :] = y_pred.cpu().numpy()
                target_epoch[rows, :] = y_batch.cpu().numpy()
        fold_score = mean_log_loss(target_epoch, oof_epoch)
        print("Fold {} log loss: {}".format(fold+1, fold_score))
        scores.append(fold_score)
        oof[valid_index,:] = oof_epoch
        oof_targets[valid_index,:] = target_epoch
        #-----------------------------------

    # Report the seed this call was given rather than a global loop variable.
    print("Seed {}".format(sample_seed))
    for i, fold_score in enumerate(scores):
        print("Fold {} log loss: {}".format(i+1, fold_score))
    print("Std of log loss: {}".format(np.std(scores)))
    print("Total log loss: {}".format(mean_log_loss(oof_targets, oof)))

    return files

In [11]:
# Feature tables as plain numpy arrays.
fn_train = f_train.copy().to_numpy()
fn_test = f_test.copy().to_numpy()

# Standardise every feature with statistics estimated on the training rows
# only, then apply the same transformation to the test rows.
ss = preprocessing.StandardScaler()
ss.fit(fn_train)
fn_train = ss.transform(fn_train)
fn_test = ss.transform(fn_test)

# Non-scored labels as an array, plus zero-filled buffers of matching width.
fn_nontargets = non_targets.drop(columns="sig_id").copy().to_numpy()
nontarget_oof = np.zeros((len(fn_train), fn_nontargets.shape[1]))
nontarget_pred = np.zeros((len(fn_test), fn_nontargets.shape[1]))

# Stage 1: pretrain on the non-scored labels and keep the checkpoint paths.
seeds = [0]
for seed_ in seeds:
    files = first_learning(fn_train, fn_nontargets, seed_, fn_train.shape[1])

Fold 1
Best model: Epoch 1 	 loss=0.411553 	 val_loss=0.066848 	 time=1.66s
Best model: Epoch 2 	 loss=0.039160 	 val_loss=0.019184 	 time=0.98s
Best model: Epoch 3 	 loss=0.015622 	 val_loss=0.011588 	 time=0.79s
Best model: Epoch 4 	 loss=0.010761 	 val_loss=0.008867 	 time=0.78s
Best model: Epoch 5 	 loss=0.009001 	 val_loss=0.008143 	 time=0.77s
Best model: Epoch 6 	 loss=0.008491 	 val_loss=0.007786 	 time=0.81s
Best model: Epoch 7 	 loss=0.007914 	 val_loss=0.007451 	 time=0.92s
Best model: Epoch 8 	 loss=0.007738 	 val_loss=0.007360 	 time=0.78s
Best model: Epoch 11 	 loss=0.007491 	 val_loss=0.007235 	 time=0.78s
Best model: Epoch 13 	 loss=0.007401 	 val_loss=0.007163 	 time=0.88s
Best model: Epoch 14 	 loss=0.007295 	 val_loss=0.007146 	 time=0.80s
Best model: Epoch 15 	 loss=0.007134 	 val_loss=0.007122 	 time=0.99s
Best model: Epoch 16 	 loss=0.006997 	 val_loss=0.007050 	 time=0.83s
Best model: Epoch 17 	 loss=0.006939 	 val_loss=0.007000 	 time=0.82s
Best model: Epoch 18 

# train by targets

In [12]:
# Settings for the second training stage. These rebind the same module-level
# names that were set for the first stage, so any function reading them
# afterwards sees the values below.
batch_size = 64             # rows per mini-batch in the data loaders
train_epochs = 30           # maximum number of epochs per fold
n_folds = 5                 # number of cross-validation splits
EARLY_STOPPING_STEPS = 10   # name suggests an early-stopping patience, in epochs

def modelling_torch(tr, target, te, sample_seed, init_num, files):
    """Fine-tune pretrained MoaModels per fold and predict the test set.

    For fold k a model is initialised from the checkpoint `files[k]`,
    trained with BCE-with-logits loss, Adam and ReduceLROnPlateau, and its
    best epoch (lowest validation loss) is written to
    'best-model-parameters.pt'. Training of a fold stops early once
    EARLY_STOPPING_STEPS consecutive epochs bring no improvement. The best
    weights are reloaded to produce out-of-fold predictions and test
    predictions; test predictions are averaged over the folds.

    Parameters
    ----------
    tr : 2-D numpy array of training features, indexed [row, feature].
    target : 2-D numpy array of 0/1 labels with one row per row of `tr`.
    te : 2-D numpy array of test features.
    sample_seed : int, passed to seed_everything and shown in the summary.
    init_num : int, number of input features handed to MoaModel.
    files : sequence of checkpoint paths, one per fold, in fold order.

    Returns
    -------
    (oof, oof_targets, pred_value)
        Out-of-fold probabilities, the matching true labels, and the
        fold-averaged test probabilities, all as numpy arrays.

    Notes
    -----
    Reads the module-level names batch_size, train_epochs, n_folds,
    EARLY_STOPPING_STEPS and device at call time.

    NOTE(review): the folds here are derived from `target`, while the
    checkpoints in `files` came from folds derived from different labels.
    The two partitions need not coincide, so a pretrained model may already
    have seen the features of rows used for validation here - confirm this
    is acceptable.
    """
    seed_everything(seed=sample_seed)
    X_train = tr.copy()
    y_train = target.copy()
    X_test = te.copy()
    test_len = X_test.shape[0]
    n_labels = y_train.shape[1]

    mskf = MultilabelStratifiedKFold(n_splits=n_folds, shuffle=True, random_state=2)
    models = []

    test_dataset = torch.utils.data.TensorDataset(torch.tensor(X_test, dtype=torch.float32))
    test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    oof = np.zeros([len(X_train), n_labels])
    oof_targets = np.zeros([len(X_train), n_labels])
    pred_value = np.zeros([test_len, n_labels])
    scores = []
    for fold, (train_index, valid_index) in enumerate(mskf.split(X_train, y_train)):
        print("Fold "+str(fold+1))
        X_train2 = torch.tensor(X_train[train_index,:], dtype=torch.float32)
        y_train2 = torch.tensor(y_train[train_index], dtype=torch.float32)

        X_valid2 = torch.tensor(X_train[valid_index,:], dtype=torch.float32)
        y_valid2 = torch.tensor(y_train[valid_index], dtype=torch.float32)

        clf = MoaModel(init_num)
        # Start from the pretrained weights of the same fold number;
        # map_location lets a GPU-written checkpoint load on a CPU-only run.
        clf.load_state_dict(torch.load(files[fold], map_location=device))
        loss_fn = torch.nn.BCEWithLogitsLoss()
        optimizer = optim.Adam(clf.parameters(), lr = 0.001)
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3, eps=1e-4, verbose=True)

        train = torch.utils.data.TensorDataset(X_train2, y_train2)
        valid = torch.utils.data.TensorDataset(X_valid2, y_valid2)

        clf.to(device)

        train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True)
        valid_loader = torch.utils.data.DataLoader(valid, batch_size=batch_size, shuffle=False)

        best_val_loss = np.inf
        stop_counts = 0
        for epoch in range(train_epochs):
            start_time = time.time()
            clf.train()
            avg_loss = 0.
            for x_batch, y_batch in train_loader:
                x_batch = x_batch.to(device)
                y_batch = y_batch.to(device)
                y_pred = clf(x_batch)
                loss = loss_fn(y_pred, y_batch)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                avg_loss += loss.item() / len(train_loader)

            clf.eval()
            avg_val_loss = 0.
            # No gradients are needed for validation; skipping graph
            # construction saves memory and time.
            with torch.no_grad():
                for x_batch, y_batch in valid_loader:
                    x_batch = x_batch.to(device)
                    y_batch = y_batch.to(device)
                    y_pred = clf(x_batch)
                    avg_val_loss += loss_fn(y_pred, y_batch).item() / len(valid_loader)

            elapsed_time = time.time() - start_time
            scheduler.step(avg_val_loss)

            if avg_val_loss < best_val_loss:
                stop_counts = 0
                best_val_loss = avg_val_loss
                print('Best model: Epoch {} \t loss={:.6f} \t val_loss={:.6f} \t time={:.2f}s'.format(
                    epoch + 1, avg_loss, avg_val_loss, elapsed_time))
                torch.save(clf.state_dict(), 'best-model-parameters.pt')
            else:
                stop_counts += 1
                # Actually honour the patience setting: the counter used to
                # be maintained but never acted upon.
                if stop_counts >= EARLY_STOPPING_STEPS:
                    print('Early stopping at epoch {}'.format(epoch + 1))
                    break

        pred_model = MoaModel(init_num)
        pred_model.load_state_dict(torch.load('best-model-parameters.pt', map_location=device))
        pred_model.to(device)
        pred_model.eval()

        # validation check ----------------
        oof_epoch = np.zeros([X_valid2.size(0), n_labels])
        target_epoch = np.zeros([X_valid2.size(0), n_labels])
        with torch.no_grad():
            for i, (x_batch, y_batch) in enumerate(valid_loader):
                y_pred = pred_model(x_batch.to(device)).sigmoid()
                # The loader is not shuffled, so batch i covers these rows;
                # the slice is clipped automatically for the last batch.
                rows = slice(i * batch_size, (i + 1) * batch_size)
                oof_epoch[rows, :] = y_pred.cpu().numpy()
                target_epoch[rows, :] = y_batch.cpu().numpy()
        fold_score = mean_log_loss(target_epoch, oof_epoch)
        print("Fold {} log loss: {}".format(fold+1, fold_score))
        scores.append(fold_score)
        oof[valid_index,:] = oof_epoch
        oof_targets[valid_index,:] = target_epoch
        #-----------------------------------

        # test prediction --------------
        test_preds = np.zeros([test_len, n_labels])
        with torch.no_grad():
            for i, (x_batch,) in enumerate(test_loader):
                y_pred = pred_model(x_batch.to(device)).sigmoid()
                test_preds[i * batch_size:(i+1) * batch_size, :] = y_pred.cpu().numpy()
        # Every fold contributes an equal share to the final test prediction.
        pred_value += test_preds / n_folds
        # ------------------------------

    # Report the seed this call was given rather than a global loop variable.
    print("Seed {}".format(sample_seed))
    for i, fold_score in enumerate(scores):
        print("Fold {} log loss: {}".format(i+1, fold_score))
    print("Std of log loss: {}".format(np.std(scores)))
    print("Total log loss: {}".format(mean_log_loss(oof_targets, oof)))

    return oof, oof_targets, pred_value

In [13]:
# Scored labels as an array, plus accumulators for the seed-averaged
# out-of-fold and test predictions.
fn_targets = targets.drop(columns="sig_id").copy().to_numpy()
target_oof = np.zeros((len(fn_train), fn_targets.shape[1]))
target_pred = np.zeros((len(fn_test), fn_targets.shape[1]))

# Stage 2: fine-tune once per seed; each run contributes an equal share.
seeds = [0, 1, 2, 3, 4]
for seed_ in seeds:
    oof, oof_targets, pytorch_pred = modelling_torch(
        fn_train, fn_targets, fn_test, seed_, fn_train.shape[1], files
    )
    target_oof += oof / len(seeds)
    target_pred += pytorch_pred / len(seeds)

# Score the seed-averaged out-of-fold predictions against the true labels
# returned by the last run.
print("Total log loss in targets: {}".format(mean_log_loss(oof_targets, target_oof)))

Fold 1
Best model: Epoch 1 	 loss=0.020109 	 val_loss=0.018032 	 time=1.36s
Best model: Epoch 2 	 loss=0.018119 	 val_loss=0.017476 	 time=1.56s
Best model: Epoch 3 	 loss=0.017589 	 val_loss=0.017089 	 time=1.62s
Best model: Epoch 4 	 loss=0.017159 	 val_loss=0.016899 	 time=1.41s
Best model: Epoch 5 	 loss=0.016893 	 val_loss=0.016732 	 time=1.45s
Best model: Epoch 6 	 loss=0.016661 	 val_loss=0.016641 	 time=1.34s
Best model: Epoch 7 	 loss=0.016442 	 val_loss=0.016635 	 time=1.38s
Best model: Epoch 8 	 loss=0.016214 	 val_loss=0.016451 	 time=1.92s
Best model: Epoch 9 	 loss=0.016110 	 val_loss=0.016405 	 time=1.36s
Best model: Epoch 10 	 loss=0.015885 	 val_loss=0.016390 	 time=1.33s
Best model: Epoch 11 	 loss=0.015690 	 val_loss=0.016352 	 time=1.66s
Best model: Epoch 12 	 loss=0.015583 	 val_loss=0.016346 	 time=1.40s
Best model: Epoch 14 	 loss=0.015258 	 val_loss=0.016267 	 time=1.36s
Best model: Epoch 15 	 loss=0.015106 	 val_loss=0.016260 	 time=1.58s
Best model: Epoch 16 	

In [14]:
# Copy the seed-averaged test predictions into the submission table
# (assumes the rows of `sub` are in the same order as `test` - TODO confirm).
sub[target_feats] = target_pred
# Test rows whose cp_type is "ctl_vehicle" get 0 for every target column;
# this must happen after the predictions are filled in.
sub.loc[noncons_test_index,target_feats] = 0
# Write the result without the pandas index column.
sub.to_csv('submission.csv', index=False)