- incorporate new validation

In [1]:
import os
import sys
import random
import warnings
import numpy as np
import pandas as pd 
from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.decomposition import PCA
from tqdm import tqdm_notebook as tqdm
from sklearn.multioutput import MultiOutputClassifier
from sklearn.feature_selection import VarianceThreshold

sys.path.append('../input/multilabelstraifier/')
from ml_stratifiers import MultilabelStratifiedKFold
warnings.filterwarnings('ignore')

import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import tensorflow as tf
from torch.nn.modules.loss import _WeightedLoss

In [2]:
DATA_DIR = '/kaggle/input/lish-moa/'
train = pd.read_csv(DATA_DIR + 'train_features.csv')
targets = pd.read_csv(DATA_DIR + 'train_targets_scored.csv')
non_targets = pd.read_csv(DATA_DIR + 'train_targets_nonscored.csv')
test = pd.read_csv(DATA_DIR + 'test_features.csv')
sub = pd.read_csv(DATA_DIR + 'sample_submission.csv')
drug = pd.read_csv(DATA_DIR + 'train_drug.csv')

In [3]:
target_feats = [ i for i in targets.columns if i != "sig_id"]
g_feats = [i for i in train.columns if "g-" in i]
c_feats = [i for i in train.columns if "c-" in i]

In [4]:
noncons_train_index = train[train.cp_type=="ctl_vehicle"].index
cons_train_index = train[train.cp_type!="ctl_vehicle"].index
noncons_test_index = test[test.cp_type=="ctl_vehicle"].index
cons_test_index = test[test.cp_type!="ctl_vehicle"].index

# preprocess

In [5]:
def make_fold(NB_SPLITS, seed):   
    folds = []
    # LOAD FILES
    train_score = targets.merge(drug, on='sig_id', how='left') 

    # LOCATE DRUGS
    vc = train_score.drug_id.value_counts()
    vc1 = vc.loc[vc <= 19].index.sort_values()
    vc2 = vc.loc[vc > 19].index.sort_values()
    
    # STRATIFY DRUGS 18X OR LESS
    dct1 = {}; dct2 = {}
    skf = MultilabelStratifiedKFold(n_splits = NB_SPLITS, shuffle = True, random_state = seed)
    tmp = train_score.groupby('drug_id')[target_feats].mean().loc[vc1]
    for fold,(idxT,idxV) in enumerate(skf.split(tmp,tmp[target_feats])):
        dd = {k:fold for k in tmp.index[idxV].values}
        dct1.update(dd)

    # STRATIFY DRUGS MORE THAN 18X
    skf = MultilabelStratifiedKFold(n_splits = NB_SPLITS, shuffle = True, random_state = seed)
    tmp = train_score.loc[train_score.drug_id.isin(vc2)].reset_index(drop = True)
    for fold,(idxT,idxV) in enumerate(skf.split(tmp,tmp[target_feats])):
        dd = {k:fold for k in tmp.sig_id[idxV].values}
        dct2.update(dd)

    # ASSIGN FOLDS
    train_score['fold'] = train_score.drug_id.map(dct1)
    train_score.loc[train_score.fold.isna(),'fold'] = train_score.loc[train_score.fold.isna(),'sig_id'].map(dct2)
    train_score.fold = train_score.fold.astype('int8')
    folds.append(train_score.fold.values)
    
    return np.array(folds).reshape(-1,1)

In [6]:
train = train[train.index.isin(cons_train_index)].copy().reset_index(drop=True)
targets = targets[targets.index.isin(cons_train_index)].copy().reset_index(drop=True)
non_targets = non_targets[non_targets.index.isin(cons_train_index)].copy().reset_index(drop=True)

In [7]:
lstm_folds = make_fold(7, 34)

# feature engineering

In [8]:
def fe(df):
    tmp = df.copy()
    #tmp.loc[:, 'cp_dose'] = tmp.loc[:, 'cp_dose'].map({'D1': 0, 'D2': 1})        
    tmp.drop(["cp_type", "sig_id", "cp_dose", "cp_time"], axis=1, inplace=True)
    return tmp

f_train = fe(train)
f_test = fe(test)

print(f_train.shape, f_test.shape)

(21948, 872) (3982, 872)


In [9]:
X = f_train.copy().values
select = VarianceThreshold(threshold=0.4)
X_new = select.fit_transform(X)
drop_feats = list(np.array(f_train.columns)[select.get_support()==False])
len(drop_feats)

4

In [10]:
f_train.drop(drop_feats, axis = 1, inplace=True)
f_test.drop(drop_feats, axis = 1, inplace=True)

modg_feats = [i for i in f_train.columns if "g-" in i]
modc_feats = [i for i in f_train.columns if "c-" in i]

for i in c_feats + g_feats:
    ss = preprocessing.RobustScaler()
    ss.fit(f_train[i].values.reshape(-1,1))
    f_train[i] = ss.transform(f_train[i].values.reshape(-1,1))
    f_test[i] = ss.transform(f_test[i].values.reshape(-1,1))

f_train["fold"] = lstm_folds
fn_train = f_train.copy().to_numpy()
fn_test = f_test.copy().to_numpy()

fn_targets = targets.drop("sig_id", axis=1).copy().to_numpy()

KeyError: 'g-104'

In [11]:
class SmoothCrossEntropyLoss(_WeightedLoss):
    def __init__(self, weight=None, reduction='mean', smoothing=0.0):
        super().__init__(weight=weight, reduction=reduction)
        self.smoothing = smoothing
        self.weight = weight
        self.reduction = reduction

    @staticmethod
    def _smooth(targets, n_classes, smoothing=0.0):
        assert 0 <= smoothing <= 1
        with torch.no_grad():
#             targets = targets * (1.0 - smoothing) + 0.5 * smoothing
            targets = targets * (1 - smoothing) + torch.ones_like(targets).to(device) * smoothing / n_classes
        return targets

    def forward(self, inputs, targets):
        targets = SmoothCrossEntropyLoss()._smooth(targets, inputs.shape[1], self.smoothing)

        if self.weight is not None:
            inputs = inputs * self.weight.unsqueeze(0)

        loss = F.binary_cross_entropy_with_logits(inputs, targets)

        return loss

In [12]:
class myLSTM(nn.Module):
    def __init__(self, lstm_hidden_size, c_lstm_hidden_size, last_num):
        super().__init__()

        self.g_layer_num = 1
        self.c_layer_num = 1

        self.hidden_dim = 512
        self.hidden_dim_c = 10
        
        self.lstm = nn.LSTM(lstm_hidden_size, self.hidden_dim, batch_first=True, bidirectional=True, num_layers=self.g_layer_num)
        self.c_lstm = nn.LSTM(c_lstm_hidden_size, self.hidden_dim_c, batch_first=True, bidirectional=True, num_layers=self.c_layer_num)
        
        self.batch_norm = nn.BatchNorm1d((self.hidden_dim+self.hidden_dim_c) * 2)
        self.dropout = nn.Dropout(0.1)
        self.out = nn.utils.weight_norm(nn.Linear((self.hidden_dim+self.hidden_dim_c) * 2, last_num))
        
    def forward(self, cont_g, cont_c): 
        cont_g = torch.unsqueeze(cont_g, 1)
        h_lstm, lstm_out = self.lstm(cont_g) # h_lstm: 256 * 1 * (2 * 512)
        conc_g = h_lstm.view(-1, self.hidden_dim * 2)
        
        cont_c = torch.unsqueeze(cont_c, 1)
        h_lstm_c, lstm_out_c = self.c_lstm(cont_c) # h_lstm: 256 * 1 * (2 * 5)
        conc_c = h_lstm_c.view(-1, self.hidden_dim_c * 2)
        
        conc = torch.cat((conc_g, conc_c),1)
        conc = self.batch_norm(conc)
        dropped = self.dropout(conc)
        out = self.out(dropped)

        return out

In [13]:
batch_size = 256
train_epochs = 30
n_folds=7
EARLY_STOPPING_STEPS = 10
smoothing = 0.001
p_min = smoothing
p_max = 1 - smoothing

device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)
def mean_log_loss(y_true, y_pred):
    metrics = []
    for i, target in enumerate(target_feats):
        metrics.append(log_loss(y_true[:, i], y_pred[:, i].astype(float), labels=[0,1]))
    return np.mean(metrics)

def seed_everything(seed=1234): 
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    tf.random.set_seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

def modelling_lstm(tr, target, te, sample_seed, last_num):
    
    mskf=MultilabelStratifiedKFold(n_splits = n_folds, shuffle=True, random_state=2)
    metric = lambda inputs, targets : F.binary_cross_entropy((torch.clamp(torch.sigmoid(inputs), p_min, p_max)), targets)
    
    seed_everything(seed=sample_seed) 
    X_train = tr.copy()
    y_train = target.copy()
    X_test = te.copy()
    test_len = X_test.shape[0]
    
    models = []
    
    X_test_g = torch.tensor(X_test[:,:len(modg_feats)], dtype=torch.float32)
    X_test_c = torch.tensor(X_test[:,len(modg_feats):], dtype=torch.float32)

    X_test = torch.utils.data.TensorDataset(X_test_g, X_test_c) 
    test_loader = torch.utils.data.DataLoader(X_test, batch_size=batch_size, shuffle=False)
    
    oof = np.zeros([len(X_train),y_train.shape[1]])
    oof_targets = np.zeros([len(X_train),y_train.shape[1]])
    pred_value = np.zeros([test_len, y_train.shape[1]])
    scores = []
            
    for fold in range(n_folds):
        valid_index = X_train[:,-1] == fold
        train_index = X_train[:,-1] != fold        
        print("Fold "+str(fold+1))
        X_train2_g = torch.tensor(X_train[train_index,:len(modg_feats)], dtype=torch.float32)
        X_valid2_g = torch.tensor(X_train[valid_index,:len(modg_feats)], dtype=torch.float32)
        X_train2_c = torch.tensor(X_train[train_index,len(modg_feats):-1], dtype=torch.float32)
        X_valid2_c = torch.tensor(X_train[valid_index,len(modg_feats):-1], dtype=torch.float32)
        
        y_train2 = torch.tensor(y_train[train_index], dtype=torch.float32)
        y_valid2 = torch.tensor(y_train[valid_index], dtype=torch.float32)
        
        train = torch.utils.data.TensorDataset(X_train2_g, X_train2_c, y_train2)
        valid = torch.utils.data.TensorDataset(X_valid2_g, X_valid2_c, y_valid2)
        
        train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True) 
        valid_loader = torch.utils.data.DataLoader(valid, batch_size=batch_size, shuffle=False)
            
        clf = myLSTM(len(modg_feats), len(modc_feats), last_num)
        loss_fn = torch.nn.BCEWithLogitsLoss() 
        #loss_fn = SmoothCrossEntropyLoss(smoothing=smoothing)

        optimizer = optim.Adam(clf.parameters(), lr = 0.01, weight_decay=1e-5) 
        #lookahead = Lookahead(optimizer, k=3, alpha=0.5) #lookahead
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3, eps=1e-4, verbose=True)
        #scheduler2 = optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.1, div_factor=1e1, 
        #                                      max_lr=1e-2, epochs=train_epochs, steps_per_epoch=len(train_loader))
    
        clf.to(device)
        
        best_val_loss = np.inf
        stop_counts = 0
        for epoch in range(train_epochs):
            start_time = time.time()
            clf.train()
            avg_loss = 0.
            sm_avg_loss = 0.
            for x_batch_g, x_batch_c, y_batch in tqdm(train_loader, disable=True):
                x_batch_g = x_batch_g.to(device)
                x_batch_c = x_batch_c.to(device)
                y_batch = y_batch.to(device)
                y_pred = clf(x_batch_g, x_batch_c) 
                loss = loss_fn(y_pred, y_batch)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                #scheduler.step()
                avg_loss += loss.item() / len(train_loader)  
                sm_avg_loss += metric(y_pred, y_batch) / len(train_loader) 
            
            clf.eval()
            avg_val_loss = 0.
            sm_avg_val_loss = 0.
            for i, (x_batch_g, x_batch_c, y_batch) in enumerate(valid_loader): 
                x_batch_g = x_batch_g.to(device)
                x_batch_c = x_batch_c.to(device)
                y_batch = y_batch.to(device)
                y_pred = clf(x_batch_g, x_batch_c).detach()
                avg_val_loss += loss_fn(y_pred, y_batch).item() / len(valid_loader)
                sm_avg_val_loss += metric(y_pred, y_batch) / len(valid_loader)
                
            elapsed_time = time.time() - start_time 
            scheduler.step(avg_val_loss)
                    
            if avg_val_loss < best_val_loss:
                stop_counts = 0
                best_val_loss = avg_val_loss
                print('Best: Epoch {} \t loss={:.6f}  val_loss={:.6f}  sm_loss={:.6f} \t sm_val_loss={:.6f} \t time={:.2f}s'.format(
                    epoch + 1, avg_loss, avg_val_loss, sm_avg_loss, sm_avg_val_loss, elapsed_time))
                torch.save(clf.state_dict(), 'best-model-parameters.pt')

            else:
                stop_counts += 1
        
            #if stop_counts >= EARLY_STOPPING_STEPS: 
            #    break
         
        pred_model = myLSTM(len(modg_feats), len(modc_feats), last_num)
        pred_model.load_state_dict(torch.load('best-model-parameters.pt'))
        pred_model.eval()
        
        # validation check ----------------
        oof_epoch = np.zeros([X_valid2_g.size(0), y_train.shape[1]])
        target_epoch = np.zeros([X_valid2_g.size(0), y_train.shape[1]])
        for i, (x_batch_g, x_batch_c, y_batch) in enumerate(valid_loader): 
                y_pred = pred_model(x_batch_g, x_batch_c).sigmoid().detach() #
                oof_epoch[i * batch_size:(i+1) * batch_size,:] = y_pred.cpu().numpy() #torch.clamp(torch.sigmoid(y_pred.cpu()), p_min, p_max) #
                target_epoch[i * batch_size:(i+1) * batch_size,:] = y_batch.cpu().numpy()
        print("Fold {} log loss: {}".format(fold+1, mean_log_loss(target_epoch, oof_epoch)))
        scores.append(mean_log_loss(target_epoch, oof_epoch))
        oof[valid_index,:] = oof_epoch
        oof_targets[valid_index,:] = target_epoch
        #-----------------------------------
        
        # test predcition --------------
        test_preds = np.zeros([test_len, y_train.shape[1]])
        for i, (x_batch_g, x_batch_c, ) in enumerate(test_loader): 
            y_pred = pred_model(x_batch_g, x_batch_c).sigmoid().detach() #
            test_preds[i * batch_size:(i+1) * batch_size, :] = y_pred.cpu().numpy() #torch.clamp(torch.sigmoid(y_pred.cpu()), p_min, p_max) #
        pred_value += test_preds / n_folds
        # ------------------------------
        
    print("Seed {}".format(seed_))
    for i, ele in enumerate(scores):
        print("Fold {} log loss: {}".format(i+1, scores[i]))
    print("Std of log loss: {}".format(np.std(scores)))
    print("Total log loss: {}".format(mean_log_loss(oof_targets, oof)))

    return oof, oof_targets, pred_value

cuda


In [14]:
target_oof = np.zeros([len(fn_train),fn_targets.shape[1]])
target_pred = np.zeros([len(fn_test),fn_targets.shape[1]])

seeds = [0, 1, 2]

for seed_ in seeds:
    oof, oof_targets, pytorch_pred = modelling_lstm(fn_train, fn_targets, fn_test, seed_, fn_targets.shape[1])
    target_oof += oof / len(seeds)
    target_pred += pytorch_pred / len(seeds)
print("Total log loss in targets: {}".format(mean_log_loss(oof_targets, target_oof)))

NameError: name 'fn_train' is not defined

In [15]:
t = pd.read_csv(DATA_DIR + 'train_targets_scored.csv')
train_checkscore = t.copy()
train_checkscore.loc[train_checkscore.index.isin(cons_train_index),target_feats] = target_oof
train_checkscore.loc[train_checkscore.index.isin(noncons_train_index),target_feats] = 0

t.drop("sig_id", axis=1, inplace=True)

print('OOF log loss: ', log_loss(np.ravel(t), np.ravel(np.array(train_checkscore.iloc[:,1:]))))

NameError: name 'target_oof' is not defined

In [16]:
train_checkscore.to_csv("lstm_newval_oof.csv", index=False)

In [17]:
sub[target_feats] = target_pred
sub.loc[noncons_test_index,target_feats] = 0
sub.to_csv('submission.csv', index=False)

NameError: name 'target_pred' is not defined