In [1]:
import os
import sys
import random
import warnings
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from category_encoders import CountEncoder
from sklearn.pipeline import Pipeline
from sklearn.metrics import log_loss
from sklearn import preprocessing
from tqdm import tqdm_notebook as tqdm
from sklearn.multioutput import MultiOutputClassifier

sys.path.append('../input/multilabelstraifier/')
from ml_stratifiers import MultilabelStratifiedKFold
warnings.filterwarnings('ignore')

In [2]:
# Location of the competition CSV files; every path below is built from it.
DATA_DIR = '/kaggle/input/lish-moa/'

# Features and scored targets for training, features for the test rows,
# and the submission template that is filled in at the end.
train = pd.read_csv(f"{DATA_DIR}train_features.csv")
targets = pd.read_csv(f"{DATA_DIR}train_targets_scored.csv")
#non_targets = pd.read_csv(DATA_DIR + 'train_targets_scored.csv')
test = pd.read_csv(f"{DATA_DIR}test_features.csv")
sub = pd.read_csv(f"{DATA_DIR}sample_submission.csv")

In [3]:
# Column-name lists used throughout the notebook.
# Every column of the targets table except the row identifier is a target.
target_feats = [col for col in targets.columns if col != "sig_id"]
# Match on the prefix rather than on a substring: a later cell adds
# "mean-g-*" / "diff-g-*" (and the "c-" equivalents) to `train`, and a
# substring test would pick those up if this cell were ever re-run.
g_feats = [col for col in train.columns if col.startswith("g-")]
c_feats = [col for col in train.columns if col.startswith("c-")]

In [4]:
# Split the row labels of each frame by cp_type:
#   noncons_* -> rows whose cp_type is "ctl_vehicle"
#   cons_*    -> every other row
train_is_ctl = train["cp_type"] == "ctl_vehicle"
test_is_ctl = test["cp_type"] == "ctl_vehicle"

noncons_train_index = train.loc[train_is_ctl].index
cons_train_index = train.loc[~train_is_ctl].index
noncons_test_index = test.loc[test_is_ctl].index
cons_test_index = test.loc[~test_is_ctl].index

# Preprocessing

In [5]:
# Normalisation by control group.
# For every (cp_dose, cp_time) combination, compute the mean of each g-/c-
# feature over the control rows (train and test controls pooled), then express
# every row as its difference from the control mean of its own group.
train_ctl = train.loc[noncons_train_index].reset_index(drop=True)
test_ctl = test.loc[noncons_test_index].reset_index(drop=True)
ctl_df = pd.concat([train_ctl, test_ctl])

expr_feats = g_feats + c_feats
mean_g_feats = ["mean-" + i for i in g_feats]
mean_c_feats = ["mean-" + i for i in c_feats]
mean_feats = mean_g_feats + mean_c_feats
diff_feats = ["diff-" + i for i in expr_feats]

# Aggregate only the numeric feature columns. Selecting them explicitly avoids
# averaging the string columns (sig_id, cp_type) and names every output column
# from its source column instead of relying on positional order.
ctl_group_data = (
    ctl_df.groupby(["cp_dose", "cp_time"])[expr_feats]
    .mean()
    .add_prefix("mean-")
    .reset_index()
)
columns = ["cp_dose", "cp_time"] + mean_g_feats + mean_c_feats
assert list(ctl_group_data.columns) == columns, "unexpected control-mean columns"

# A left merge keeps every row of train/test in its original order.
train = pd.merge(train, ctl_group_data, on=["cp_time", "cp_dose"], how="left")
test = pd.merge(test, ctl_group_data, on=["cp_time", "cp_dose"], how="left")


def _append_ctl_diff(df):
    """Return `df` with one "diff-<feature>" column appended per g-/c- feature.

    Each new column is the feature minus the matching "mean-<feature>" column.
    All columns are built in one array operation and attached with a single
    concat instead of being inserted one at a time.
    """
    diff = pd.DataFrame(
        df[expr_feats].to_numpy() - df[mean_feats].to_numpy(),
        columns=diff_feats,
        index=df.index,
    )
    return pd.concat([df, diff], axis=1)


train = _append_ctl_diff(train)
test = _append_ctl_diff(test)

In [6]:
# Columns that are converted to integer codes before modelling.
categoricals = ["cp_dose"]


def encoding(tr, te):
    """Label-encode every column named in `categoricals`.

    One encoder per column is fitted on the values of `tr` and then used to
    transform that column in both `tr` and `te`. Both frames are modified in
    place and are also returned as a `(tr, te)` tuple.
    """
    for col in categoricals:
        encoder = preprocessing.LabelEncoder().fit(list(tr[col]))
        tr[col] = encoder.transform(list(tr[col]))
        te[col] = encoder.transform(list(te[col]))

    return tr, te


train, test = encoding(train, test)

# Feature engineering 

In [7]:
def fe(df, remove_features):
    """Drop the columns listed in `remove_features` from `df`.

    The frame is modified in place; the same object is returned so the call
    can be used in an assignment.
    """
    df.drop(columns=remove_features, inplace=True)
    return df

# Keep only cp_time, cp_dose and the diff-* columns: drop the identifier
# columns, the control-group means and the raw g-/c- features.
remove_features = [
    "cp_type",
    "sig_id",
    *mean_g_feats,
    *mean_c_feats,
    *g_feats,
    *c_feats,
]

train = fe(train, remove_features)
test = fe(test, remove_features)

print(train.shape, test.shape)

(23814, 874) (3982, 874)


# Modelling

In [8]:
import os
import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import tensorflow as tf
from tensorflow.keras.models import *
from tensorflow.keras.layers import *
from tensorflow.keras.optimizers import *
from tensorflow.keras.metrics import *
from tensorflow.keras.utils import *
from tensorflow.keras.callbacks import *

# Train on the GPU when torch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Training configuration, read as module-level globals by the code below.
batch_size, train_epochs = 64, 30
n_folds = 7

def mean_log_loss(y_true, y_pred):
    """Return the mean of the per-column binary log losses.

    Parameters
    ----------
    y_true : 2-D array of 0/1 labels, one column per target.
    y_pred : 2-D array of predicted probabilities with the same layout.

    The columns are taken from `y_true` itself rather than from a global list
    of target names, so the function works for any number of targets.
    `labels=[0, 1]` is passed so that a column containing a single class can
    still be scored.
    """
    metrics = [
        log_loss(y_true[:, col], y_pred[:, col].astype(float), labels=[0, 1])
        for col in range(y_true.shape[1])
    ]
    return np.mean(metrics)

def seed_everything(seed=1234):
    """Seed every random number generator used by the notebook.

    Covers Python's `random`, NumPy, TensorFlow and PyTorch (CPU and CUDA),
    exports PYTHONHASHSEED, and switches cuDNN to deterministic mode.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)

    # Python / NumPy
    random.seed(seed)
    np.random.seed(seed)

    # TensorFlow
    tf.random.set_seed(seed)

    # PyTorch
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

class MoaModel(nn.Module):
    """Three-layer fully connected network for multi-label prediction.

    Each stage is BatchNorm1d -> Dropout -> weight-normalised Linear. The two
    hidden stages (2048 and 1048 units) use ReLU; the output stage applies a
    sigmoid so every output lies in [0, 1].

    Parameters
    ----------
    num_columns : int
        Number of input features.
    num_targets : int, default 206
        Number of output units. The default keeps the original fixed size.
    """

    def __init__(self, num_columns, num_targets=206):
        super().__init__()
        self.batch_norm1 = nn.BatchNorm1d(num_columns)
        self.dropout1 = nn.Dropout(0.2)
        self.dense1 = nn.utils.weight_norm(nn.Linear(num_columns, 2048))

        self.batch_norm2 = nn.BatchNorm1d(2048)
        self.dropout2 = nn.Dropout(0.5)
        self.dense2 = nn.utils.weight_norm(nn.Linear(2048, 1048))

        self.batch_norm3 = nn.BatchNorm1d(1048)
        self.dropout3 = nn.Dropout(0.5)
        self.dense3 = nn.utils.weight_norm(nn.Linear(1048, num_targets))

    def forward(self, x):
        """Map a (batch, num_columns) float tensor to (batch, num_targets) probabilities."""
        x = self.batch_norm1(x)
        x = self.dropout1(x)
        x = F.relu(self.dense1(x))

        x = self.batch_norm2(x)
        x = self.dropout2(x)
        x = F.relu(self.dense2(x))

        x = self.batch_norm3(x)
        x = self.dropout3(x)
        # torch.sigmoid replaces the deprecated F.sigmoid; the result is identical.
        x = torch.sigmoid(self.dense3(x))

        return x
    
def modelling_torch(tr, target, te, sample_seed, init_num):
    """Train one MoaModel per cross-validation fold and predict the test set.

    Parameters
    ----------
    tr : 2-D numpy array of training features.
    target : 2-D numpy array of training labels, rows aligned with `tr`.
    te : 2-D numpy array of test features.
    sample_seed : seed handed to `seed_everything` before anything else runs.
    init_num : number of input columns handed to `MoaModel`.

    Returns
    -------
    (oof, pred_value)
        `oof` holds the out-of-fold predictions for every training row;
        `pred_value` holds the test predictions averaged over the folds.

    Notes
    -----
    Reads the module-level `n_folds`, `batch_size`, `train_epochs` and
    `device`. The best weights of the fold being trained are written to
    'best-model-parameters.pt' in the working directory, which is overwritten
    on every improvement.
    """
    seed_everything(seed=sample_seed)
    X_train = tr.copy()
    y_train = target.copy()
    X_test = te.copy()
    test_len = X_test.shape[0]
    n_targets = y_train.shape[1]

    mskf = MultilabelStratifiedKFold(n_splits=n_folds, shuffle=True, random_state=2)

    test_ds = torch.utils.data.TensorDataset(torch.tensor(X_test, dtype=torch.float32))
    test_loader = torch.utils.data.DataLoader(test_ds, batch_size=batch_size, shuffle=False)

    oof = np.zeros([len(X_train), n_targets])
    oof_targets = np.zeros([len(X_train), n_targets])
    pred_value = np.zeros([test_len, n_targets])
    scores = []
    for fold, (train_index, valid_index) in enumerate(mskf.split(X_train, y_train)):
        print("Fold "+str(fold+1))
        X_train2 = torch.tensor(X_train[train_index, :], dtype=torch.float32)
        y_train2 = torch.tensor(y_train[train_index], dtype=torch.float32)

        X_valid2 = torch.tensor(X_train[valid_index, :], dtype=torch.float32)
        y_valid2 = torch.tensor(y_train[valid_index], dtype=torch.float32)

        clf = MoaModel(num_columns=init_num)
        loss_fn = torch.nn.BCELoss()
        optimizer = optim.Adam(clf.parameters(), lr=0.001)  # , weight_decay=1e-5
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3, eps=1e-4, verbose=True)

        train_ds = torch.utils.data.TensorDataset(X_train2, y_train2)
        valid_ds = torch.utils.data.TensorDataset(X_valid2, y_valid2)

        clf.to(device)

        train_loader = torch.utils.data.DataLoader(train_ds, batch_size=batch_size, shuffle=True)
        # shuffle=False: the positional slicing of the out-of-fold arrays
        # below relies on batches arriving in row order.
        valid_loader = torch.utils.data.DataLoader(valid_ds, batch_size=batch_size, shuffle=False)

        # Tracked across ALL epochs of this fold. Resetting it inside the epoch
        # loop would make every epoch look like an improvement, so the saved
        # checkpoint would always be the last epoch instead of the best one.
        best_val_loss = np.inf
        for epoch in range(train_epochs):
            start_time = time.time()
            clf.train()
            avg_loss = 0.
            for x_batch, y_batch in train_loader:
                x_batch = x_batch.to(device)
                y_batch = y_batch.to(device)
                y_pred = clf(x_batch)
                loss = loss_fn(y_pred, y_batch)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                avg_loss += loss.item() / len(train_loader)

            clf.eval()
            avg_val_loss = 0.
            # No gradients are needed for evaluation.
            with torch.no_grad():
                for x_batch, y_batch in valid_loader:
                    x_batch = x_batch.to(device)
                    y_batch = y_batch.to(device)
                    y_pred = clf(x_batch)
                    avg_val_loss += loss_fn(y_pred, y_batch).item() / len(valid_loader)

            elapsed_time = time.time() - start_time
            print('Epoch {}/{} \t loss={:.4f} \t val_loss={:.4f} \t time={:.2f}s'.format(
                epoch + 1, train_epochs, avg_loss, avg_val_loss, elapsed_time))
            scheduler.step(avg_val_loss)
            #print('Epoch-{0} lr: {1}'.format(epoch, optimizer.param_groups[0]['lr']))

            # Always checkpoint the first epoch so that a checkpoint left over
            # from the previous fold can never be loaded for this one (e.g. if
            # the validation loss were NaN throughout).
            if epoch == 0 or avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                torch.save(clf.state_dict(), 'best-model-parameters.pt')

        # Reload the best checkpoint of this fold into a fresh model.
        pred_model = MoaModel(num_columns=init_num)
        pred_model.load_state_dict(torch.load('best-model-parameters.pt', map_location=device))
        pred_model.to(device)
        pred_model.eval()

        # validation check ----------------
        oof_epoch = np.zeros([X_valid2.size(0), n_targets])
        target_epoch = np.zeros([X_valid2.size(0), n_targets])
        with torch.no_grad():
            for i, (x_batch, y_batch) in enumerate(valid_loader):
                rows = slice(i * batch_size, (i + 1) * batch_size)
                y_pred = pred_model(x_batch.to(device))
                oof_epoch[rows, :] = y_pred.cpu().numpy()
                target_epoch[rows, :] = y_batch.cpu().numpy()
        fold_score = mean_log_loss(target_epoch, oof_epoch)
        print("Fold {} log loss: {}".format(fold+1, fold_score))
        scores.append(fold_score)
        oof[valid_index, :] = oof_epoch
        oof_targets[valid_index, :] = target_epoch
        #-----------------------------------

        # test prediction --------------
        test_preds = np.zeros([test_len, n_targets])
        with torch.no_grad():
            for i, (x_batch,) in enumerate(test_loader):
                rows = slice(i * batch_size, (i + 1) * batch_size)
                y_pred = pred_model(x_batch.to(device))
                test_preds[rows, :] = y_pred.cpu().numpy()
        # Each fold contributes an equal share to the final test prediction.
        pred_value += test_preds / n_folds
        # ------------------------------

    for i, score in enumerate(scores):
        print("Fold {} log loss: {}".format(i+1, score))
    print("Total log loss: {}".format(mean_log_loss(oof_targets, oof)))

    return oof, pred_value

In [9]:
print(device)

# Convert the prepared frames to plain numpy arrays for the torch pipeline;
# the targets lose their identifier column first.
nn_train = train.copy().to_numpy()
nn_targets = targets.drop(columns="sig_id").copy().to_numpy()
nn_test = test.copy().to_numpy()
print(nn_train.shape, nn_targets.shape, nn_test.shape)

# Cross-validated training: out-of-fold predictions plus fold-averaged test predictions.
n_input_columns = nn_train.shape[1]
oof, pytorch_pred = modelling_torch(
    nn_train,
    nn_targets,
    nn_test,
    sample_seed=13,
    init_num=n_input_columns,
)

cuda
(23814, 874) (23814, 206) (3982, 874)
Fold 1
Epoch 1/30 	 loss=0.2009 	 val_loss=0.0240 	 time=2.53s
Epoch 2/30 	 loss=0.0222 	 val_loss=0.0192 	 time=1.55s
Epoch 3/30 	 loss=0.0194 	 val_loss=0.0180 	 time=1.50s
Epoch 4/30 	 loss=0.0185 	 val_loss=0.0173 	 time=1.50s
Epoch 5/30 	 loss=0.0176 	 val_loss=0.0168 	 time=1.52s
Epoch 6/30 	 loss=0.0169 	 val_loss=0.0165 	 time=1.51s
Epoch 7/30 	 loss=0.0164 	 val_loss=0.0163 	 time=1.49s
Epoch 8/30 	 loss=0.0160 	 val_loss=0.0159 	 time=1.69s
Epoch 9/30 	 loss=0.0157 	 val_loss=0.0158 	 time=1.55s
Epoch 10/30 	 loss=0.0152 	 val_loss=0.0156 	 time=1.56s
Epoch 11/30 	 loss=0.0149 	 val_loss=0.0155 	 time=1.60s
Epoch 12/30 	 loss=0.0148 	 val_loss=0.0154 	 time=1.60s
Epoch 13/30 	 loss=0.0143 	 val_loss=0.0154 	 time=1.57s
Epoch 14/30 	 loss=0.0138 	 val_loss=0.0154 	 time=1.50s
Epoch 15/30 	 loss=0.0134 	 val_loss=0.0154 	 time=1.76s
Epoch 16/30 	 loss=0.0131 	 val_loss=0.0155 	 time=1.70s
Epoch 17/30 	 loss=0.0129 	 val_loss=0.0155 	 t

In [10]:
# Write the averaged test predictions into the submission template.
sub[target_feats] = pytorch_pred
# Rows that were "ctl_vehicle" in the test features get 0 for every target.
sub.loc[noncons_test_index, target_feats] = 0
sub.to_csv(path_or_buf='submission.csv', index=False)