**Late submission**
- solve as a multiclass classification problem
- change learning rate scheduler
- change PCA size

In [1]:
import os
import random
import warnings
import numpy as np
import pandas as pd 
from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.decomposition import PCA
from tqdm import tqdm_notebook as tqdm
from sklearn.feature_selection import VarianceThreshold
from sklearn.model_selection import StratifiedKFold, KFold
from tqdm._tqdm_notebook import tqdm_notebook

import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import tensorflow as tf

warnings.filterwarnings('ignore')

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  if sys.path[0] == '':


# final engineering

In [2]:
DATA_DIR = '/kaggle/input/lish-moa/'
train = pd.read_csv(DATA_DIR + 'train_features.csv')
targets = pd.read_csv(DATA_DIR + 'train_targets_scored.csv')
non_targets = pd.read_csv(DATA_DIR + 'train_targets_nonscored.csv')
test = pd.read_csv(DATA_DIR + 'test_features.csv')
sub = pd.read_csv(DATA_DIR + 'sample_submission.csv')
drug = pd.read_csv(DATA_DIR + 'train_drug.csv')

In [3]:
target_feats = [ i for i in targets.columns if i != "sig_id"]
g_feats = [i for i in train.columns if "g-" in i]
c_feats = [i for i in train.columns if "c-" in i]

In [4]:
noncons_train_index = train[train.cp_type=="ctl_vehicle"].index
cons_train_index = train[train.cp_type!="ctl_vehicle"].index
noncons_test_index = test[test.cp_type=="ctl_vehicle"].index
cons_test_index = test[test.cp_type!="ctl_vehicle"].index

# target group

In [5]:
y = targets.drop("sig_id", axis=1)

In [6]:
def make_string(row):
    return str(row[1:].values).replace('[','').replace(']','').replace('\n','').replace(' ','')

targets["target_pair"] = targets.apply(make_string, axis=1)
targetpair_id = dict(enumerate(targets["target_pair"].unique()))
id_targetpair = {y:x for x,y in targetpair_id.items()}
targets["target_pair_num"] = targets["target_pair"].map(id_targetpair)

multiclass_targets = pd.get_dummies(targets["target_pair_num"])

for i in range(len(id_targetpair.keys())):
    if i == 0:
        classid_target = np.array(list((targetpair_id[i]))).reshape(1,-1)      
    else:
        classid_target = np.vstack([classid_target, np.array(list((targetpair_id[i]))).reshape(1,-1)])
        
classid_target = classid_target.astype(int)
class_num = multiclass_targets.shape[1]
        
multiclass_targets.shape, classid_target.shape

((23814, 328), (328, 206))

# preprocess

In [7]:
train = train[train.index.isin(cons_train_index)].copy().reset_index(drop=True)
test = test[test.index.isin(cons_test_index)].copy().reset_index(drop=True)
targets = targets[targets.index.isin(cons_train_index)].copy().reset_index(drop=True)

# no need to change to one-hot encoding form
# https://discuss.pytorch.org/t/runtimeerror-multi-target-not-supported-newbie/10216
multiclass_targets = targets["target_pair_num"].copy().reset_index(drop=True)

# Feature engineering 

In [8]:
#X = train.iloc[:,4:].copy().values
#select = VarianceThreshold(threshold=0.9)
#X_new = select.fit_transform(X)
#drop_feats = list(np.array(train.iloc[:,4:].columns)[select.get_support()==False])
#print(len(drop_feats))

#train.drop(drop_feats, axis=1, inplace=True)
#test.drop(drop_feats, axis=1, inplace=True)

g_feats = [i for i in train.columns if "g-" in i]
c_feats = [i for i in train.columns if "c-" in i]

In [9]:
# rank gauss
for i in c_feats + g_feats:
    ss = preprocessing.QuantileTransformer(n_quantiles=100, random_state=0, output_distribution="normal")
    ss.fit(train[i].values.reshape(-1,1))
    train[i] = ss.transform(train[i].values.reshape(-1,1))
    test[i] = ss.transform(test[i].values.reshape(-1,1))

In [10]:
c_num = 8
pca_c_cols = ["pca-c"+str(i+1) for i in range(c_num)]
pca = PCA(n_components=c_num,random_state=42)
c_train = pca.fit_transform(train[c_feats])
c_test = pca.transform(test[c_feats])
c_train = pd.DataFrame(c_train, columns=pca_c_cols)
c_test = pd.DataFrame(c_test, columns=pca_c_cols)

g_num = 32
pca_g_cols = ["pca-g"+str(i+1) for i in range(g_num)]
pca = PCA(n_components=g_num, random_state=42)
g_train = pca.fit_transform(train[g_feats])
g_test = pca.transform(test[g_feats])
g_train = pd.DataFrame(g_train, columns=pca_g_cols)
g_test = pd.DataFrame(g_test, columns=pca_g_cols)

train = pd.concat([train, c_train],axis=1)
test = pd.concat([test, c_test],axis=1)
train = pd.concat([train, g_train],axis=1)
test = pd.concat([test, g_test],axis=1)

In [11]:
def fe(df):
    tmp = df.copy()
    tmp['g_kurt'] = tmp[g_feats].kurtosis(axis = 1)
    tmp['g_skew'] = tmp[g_feats].skew(axis = 1)
    tmp['c_kurt'] = tmp[c_feats].kurtosis(axis = 1)
    tmp['c_skew'] = tmp[c_feats].skew(axis = 1)
    tmp = pd.get_dummies(tmp, columns=['cp_time','cp_dose'])
    tmp.drop(["cp_type", "sig_id"], axis=1, inplace=True)
    return tmp

train = fe(train)
test = fe(test)

In [12]:
fn_train = train.copy().to_numpy()
fn_test = test.copy().to_numpy()
fn_targets = y.copy().to_numpy()
fn_multiclass_targets = multiclass_targets.copy().to_numpy()
print(fn_train.shape, fn_test.shape, fn_targets.shape, fn_multiclass_targets.shape)

(21948, 921) (3624, 921) (23814, 206) (21948,)


# modelling

In [13]:
device = "cuda" if torch.cuda.is_available() else "cpu"

In [14]:
print(device)
def seed_everything(seed=42): 
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    tf.random.set_seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

class MoaModel(nn.Module):
    def __init__(self, num_columns, last_num):
        super(MoaModel, self).__init__()
        self.batch_norm1 = nn.BatchNorm1d(num_columns)
        self.dropout1 = nn.Dropout(0.2)
        self.dense1 = nn.utils.weight_norm(nn.Linear(num_columns, 1024))
        self.relu1 = nn.LeakyReLU()
        
        self.batch_norm2 = nn.BatchNorm1d(1024)
        self.dropout2 = nn.Dropout(0.2)
        self.dense2 = nn.utils.weight_norm(nn.Linear(1024, 1024))
        self.relu2 = nn.LeakyReLU()
        
        self.batch_norm3 = nn.BatchNorm1d(1024)
        self.dropout3 = nn.Dropout(0.2)
        self.dense3 = nn.utils.weight_norm(nn.Linear(1024, 1024))
        self.relu3 = nn.LeakyReLU()
        
        self.batch_norm4 = nn.BatchNorm1d(1024)
        self.dropout4 = nn.Dropout(0.2)
        self.dense4 = nn.utils.weight_norm(nn.Linear(1024, last_num))
        
    def forward(self, x):
        x = self.batch_norm1(x)
        x = self.dropout1(x)
        x = self.relu1(self.dense1(x))
        
        x = self.batch_norm2(x)
        x = self.dropout2(x)
        x = self.relu2(self.dense2(x))
        
        x = self.batch_norm3(x)
        x = self.dropout3(x)
        x = self.relu3(self.dense3(x))
        
        x = self.batch_norm4(x)
        x = self.dropout4(x)
        x = self.dense4(x)
        
        return x

cuda


# modelling

In [15]:
batch_size = 128
n_folds=7
EARLY_STOPPING_STEPS = 10
smoothing = 0.001
p_min = smoothing
p_max = 1 - smoothing
train_epochs = 10

def multi_log_loss(y_true, y_pred):
    metrics = []
    for i, target in enumerate(target_feats):
        metrics.append(log_loss(y_true[:, i], y_pred[:, i].astype(float), labels=[0,1]))
    return np.mean(metrics)
    
def modelling_torch(tr, target, te, sample_seed, init_num, last_num):
    seed_everything(seed=sample_seed) 
    X_train = tr.copy()
    y_train = target.copy()
    X_test = te.copy()
    test_len = X_test.shape[0]
    
    mskf=KFold(n_splits = n_folds, shuffle=True, random_state=224)

    models = []
    
    X_test2 = torch.tensor(X_test, dtype=torch.float32)
    test = torch.utils.data.TensorDataset(X_test2) 
    test_loader = torch.utils.data.DataLoader(test, batch_size=batch_size, shuffle=False)
    
    oof = np.zeros([len(X_train),last_num])
    oof_targets = np.zeros([len(X_train),last_num])
    pred_value = np.zeros([test_len, last_num])
    for fold, (train_index, valid_index) in enumerate(mskf.split(X_train, y_train)):
        print("Seed "+str(sample_seed)+"_Fold "+str(fold+1))
        X_train2 = torch.tensor(X_train[train_index,:], dtype=torch.float32)
        y_train2 = torch.tensor(y_train[train_index], dtype=torch.long)

        X_valid2 = torch.tensor(X_train[valid_index,:], dtype=torch.float32)
        y_valid2 = torch.tensor(y_train[valid_index], dtype=torch.long)
        
        train = torch.utils.data.TensorDataset(X_train2, y_train2)
        valid = torch.utils.data.TensorDataset(X_valid2, y_valid2)
        
        train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True) 
        valid_loader = torch.utils.data.DataLoader(valid, batch_size=batch_size, shuffle=False)
            
        clf = MoaModel(init_num, last_num)
        loss_fn = nn.CrossEntropyLoss() 

        optimizer = optim.Adam(clf.parameters(), lr = 0.01, weight_decay=1e-4) 
        scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.1, div_factor=1e3, 
                                              max_lr=2e-2, epochs=train_epochs, steps_per_epoch=len(train_loader))
        
        clf.to(device)
        
        best_val_loss = np.inf
        stop_counts = 0
        for epoch in range(train_epochs):
            start_time = time.time()
            clf.train()
            avg_loss = 0.
            for x_batch, y_batch in tqdm(train_loader, disable=True):
                x_batch = x_batch.to(device)
                y_batch = y_batch.to(device)
                y_pred = clf(x_batch)
                loss = loss_fn(y_pred, y_batch)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                scheduler.step()
                avg_loss += loss.item() / len(train_loader)  
                
            clf.eval()
            avg_val_loss = 0.
            for i, (x_batch, y_batch) in enumerate(valid_loader): 
                x_batch = x_batch.to(device)
                y_batch = y_batch.to(device)
                y_pred = clf(x_batch).detach()
                avg_val_loss += loss_fn(y_pred, y_batch).item() / len(valid_loader)
        
            elapsed_time = time.time() - start_time 
                    
            if avg_val_loss < best_val_loss:
                best_val_loss = avg_val_loss
                print('Epoch {}  loss={:.5f}  val_loss={:.5f}  time={:.2f}s'.format(
                    epoch + 1, avg_loss, avg_val_loss, elapsed_time))
                torch.save(clf.state_dict(), 'best-model-parameters.pt')
                stop_counts = 0
            else:
                stop_counts += 1
                
            if stop_counts >= EARLY_STOPPING_STEPS:
                break
        
        pred_model = MoaModel(init_num, last_num)
        pred_model.load_state_dict(torch.load('best-model-parameters.pt'))         
        pred_model.eval()
        
        # validation check ----------------
        oof_epoch = np.zeros([X_valid2.size(0), last_num])
        target_epoch = np.zeros([X_valid2.size(0), last_num])
        for i, (x_batch, y_batch) in enumerate(valid_loader): 
            y_pred = pred_model(x_batch).detach()
            oof_epoch[i * batch_size:(i+1) * batch_size,:] = F.softmax(y_pred.cpu()) 
        oof[valid_index,:] = oof_epoch
        #-----------------------------------
        
        # test predcition --------------
        test_preds = np.zeros([test_len, last_num])
        for i, (x_batch,) in enumerate(test_loader): 
            y_pred = pred_model(x_batch).detach()
            test_preds[i * batch_size:(i+1) * batch_size, :] = F.softmax(y_pred.cpu()) 
        pred_value += test_preds / n_folds
        # ------------------------------
    
    return oof, pred_value

In [16]:
seeds = [0,1,2,3,4]
target_oof = np.zeros([len(fn_train), class_num])
target_pred = np.zeros([len(fn_test), class_num])

for seed_ in seeds:
    best_oof, pytorch_pred = modelling_torch(fn_train, fn_multiclass_targets, fn_test, seed_, fn_train.shape[1], class_num)
    target_oof += best_oof / len(seeds)
    target_pred += pytorch_pred / len(seeds)

Seed 0_Fold 1
Epoch 1  loss=4.01575  val_loss=3.34556  time=1.43s
Epoch 2  loss=3.23968  val_loss=3.19645  time=0.84s
Epoch 3  loss=3.09389  val_loss=3.13372  time=0.85s
Epoch 4  loss=3.01580  val_loss=3.09864  time=0.85s
Epoch 5  loss=2.93302  val_loss=3.02499  time=0.85s
Epoch 6  loss=2.82352  val_loss=2.99378  time=0.97s
Epoch 7  loss=2.68276  val_loss=2.96382  time=0.95s
Epoch 8  loss=2.48420  val_loss=2.91642  time=0.89s
Seed 0_Fold 2
Epoch 1  loss=4.02648  val_loss=3.27061  time=0.84s
Epoch 2  loss=3.25370  val_loss=3.09267  time=0.86s
Epoch 3  loss=3.11123  val_loss=3.03394  time=0.84s
Epoch 4  loss=3.03129  val_loss=3.00535  time=0.84s
Epoch 5  loss=2.94985  val_loss=2.93242  time=0.84s
Epoch 6  loss=2.84200  val_loss=2.90246  time=0.88s
Epoch 7  loss=2.70030  val_loss=2.85700  time=1.06s
Epoch 8  loss=2.48780  val_loss=2.83162  time=0.85s
Epoch 9  loss=2.21096  val_loss=2.82899  time=0.91s
Seed 0_Fold 3
Epoch 1  loss=3.97721  val_loss=3.50129  time=0.85s
Epoch 2  loss=3.23790 

In [17]:
target_oof = np.dot(target_oof, classid_target)
target_pred = np.dot(target_pred, classid_target)

In [18]:
t = pd.read_csv(DATA_DIR + 'train_targets_scored.csv')
train_checkscore = t.copy()
train_checkscore.loc[train_checkscore.index.isin(cons_train_index),target_feats] = np.clip(target_oof, p_min, p_max)
train_checkscore.loc[train_checkscore.index.isin(noncons_train_index),target_feats] = 0
t.drop("sig_id", axis=1, inplace=True)
print('OOF log loss: ', log_loss(np.ravel(t), np.ravel(np.array(train_checkscore.iloc[:,1:]))))

OOF log loss:  0.01449598959310228


In [19]:
train_checkscore.to_csv("mlp_oof.csv", index=False)

In [20]:
sub.loc[cons_test_index,target_feats] = np.clip(target_pred, p_min, p_max)
sub.loc[noncons_test_index,target_feats] = 0
sub.to_csv('submission.csv', index=False)