- 1st lstm

In [1]:
import os
import sys
import random
import warnings
import numpy as np
import pandas as pd 
from sklearn import preprocessing
from sklearn.metrics import log_loss, mean_squared_error
from sklearn.decomposition import PCA
from tqdm import tqdm_notebook as tqdm
from sklearn.multioutput import MultiOutputClassifier
from sklearn.feature_selection import VarianceThreshold

sys.path.append('../input/multilabelstraifier/')
sys.path.append('../input/lookahead/')
from ml_stratifiers import MultilabelStratifiedKFold
from lookahead import Lookahead
warnings.filterwarnings('ignore')

import time
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import tensorflow as tf

In [2]:
DATA_DIR = '/kaggle/input/lish-moa/'
train = pd.read_csv(DATA_DIR + 'train_features.csv')
targets = pd.read_csv(DATA_DIR + 'train_targets_scored.csv')
non_targets = pd.read_csv(DATA_DIR + 'train_targets_nonscored.csv')
test = pd.read_csv(DATA_DIR + 'test_features.csv')
sub = pd.read_csv(DATA_DIR + 'sample_submission.csv')

In [3]:
target_feats = [ i for i in targets.columns if i != "sig_id"]
g_feats = [i for i in train.columns if "g-" in i]
c_feats = [i for i in train.columns if "c-" in i]

In [4]:
noncons_train_index = train[train.cp_type=="ctl_vehicle"].index
cons_train_index = train[train.cp_type!="ctl_vehicle"].index
noncons_test_index = test[test.cp_type=="ctl_vehicle"].index
cons_test_index = test[test.cp_type!="ctl_vehicle"].index

# preprocess

In [5]:
train = train[train.index.isin(cons_train_index)].copy().reset_index(drop=True)
targets = targets[targets.index.isin(cons_train_index)].copy().reset_index(drop=True)
non_targets = non_targets[non_targets.index.isin(cons_train_index)].copy().reset_index(drop=True)

# feature engineering

In [6]:
def fe(df):
    tmp = df.copy()
    #tmp['g_sum'] = tmp[g_feats].sum(axis = 1)
    #tmp['g_mean'] = tmp[g_feats].mean(axis = 1)
    #tmp['g_std'] = tmp[g_feats].std(axis = 1)
    #tmp['g_kurt'] = tmp[g_feats].kurtosis(axis = 1)
    #tmp['g_skew'] = tmp[g_feats].skew(axis = 1)
    #tmp['c_sum'] = tmp[c_feats].sum(axis = 1)
    #tmp['c_mean'] = tmp[c_feats].mean(axis = 1)
    #tmp['c_std'] = tmp[c_feats].std(axis = 1)
    #tmp['c_kurt'] = tmp[c_feats].kurtosis(axis = 1)
    #tmp['c_skew'] = tmp[c_feats].skew(axis = 1)
    #tmp['gc_sum'] = tmp[c_feats + g_feats].sum(axis = 1)
    #tmp['gc_mean'] = tmp[c_feats + g_feats].mean(axis = 1)
    #tmp['gc_std'] = tmp[c_feats + g_feats].std(axis = 1)
    #tmp['gc_kurt'] = tmp[c_feats + g_feats].kurtosis(axis = 1)
    #tmp['gc_skew'] = tmp[c_feats + g_feats].skew(axis = 1)
    tmp.loc[:, 'cp_dose'] = tmp.loc[:, 'cp_dose'].map({'D1': 0, 'D2': 1})
        
    tmp.drop(["cp_type", "sig_id", "cp_time"], axis=1, inplace=True)
    return tmp

f_train = fe(train)
f_test = fe(test)

print(f_train.shape, f_test.shape)

(21948, 873) (3982, 873)


In [7]:
fn_train = f_train.copy().to_numpy()
fn_test = f_test.copy().to_numpy()

ss = preprocessing.RobustScaler()
fn_train= ss.fit_transform(fn_train)
fn_test = ss.transform(fn_test)

fn_nontargets = non_targets.drop("sig_id", axis=1).copy().to_numpy()
fn_targets = targets.drop("sig_id", axis=1).copy().to_numpy()

In [8]:
class CFG:
    max_grad_norm = 1000
    gradient_accumulation_steps = 1
    hidden_size = 1024
    hidden_dim = 256
    dropout = 0.5
    lr = 1e-2
    weight_decay = 1e-6
    batch_size = 256
    epochs = 20
    # total_cate_size=5
    # emb_size=4
    #num_features = num_features
    #cat_features = cat_features
    #target_cols = target_cols
    target_length = 206


class LSTMClassifier(nn.Module):
    def __init__(self, cfg):
        super().__init__()

        self.gru_hidden_size = 64
        self.lstm_hidden_size = 873

        self.embedding_dropout = nn.Dropout2d(0.2)
        self.lstm = nn.LSTM(self.lstm_hidden_size, cfg.hidden_dim, batch_first=True, bidirectional=True)

        self.gru = nn.GRU(cfg.hidden_dim * 2, self.gru_hidden_size, bidirectional=True, batch_first=True)
        self.linear = nn.Linear(384, self.gru_hidden_size * 6)
        self.cls = nn.Linear(cfg.hidden_dim, cfg.target_length)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.2)
        self.out = nn.Linear(self.gru_hidden_size * 6, cfg.target_length)
        
    def forward(self, cont_x):
        cont_x = torch.unsqueeze(cont_x, 1)
        h_lstm, lstm_out = self.lstm(cont_x)
        h_gru, hh_gru = self.gru(h_lstm)
        hh_gru = hh_gru.view(-1, self.gru_hidden_size * 2)

        avg_pool = torch.mean(h_gru, 1)
        max_pool, _ = torch.max(h_gru, 1)
        conc = torch.cat((hh_gru, avg_pool, max_pool), 1)

        dropped = self.dropout(conc)
        out = self.out(dropped)
        return out

In [9]:
batch_size = 128
train_epochs = 30
n_folds=5
EARLY_STOPPING_STEPS = 10

device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)
def mean_log_loss(y_true, y_pred):
    metrics = []
    for i, target in enumerate(target_feats):
        metrics.append(log_loss(y_true[:, i], y_pred[:, i].astype(float), labels=[0,1]))
    return np.mean(metrics)

def seed_everything(seed=1234): 
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    tf.random.set_seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True

def modelling_lstm(tr, target, te, sample_seed, init_num, last_num, files):
    seed_everything(seed=sample_seed) 
    X_train = tr.copy()
    y_train = target.copy()
    X_test = te.copy()
    test_len = X_test.shape[0]

    mskf=MultilabelStratifiedKFold(n_splits = n_folds, shuffle=True, random_state=2)
    models = []
    
    X_test = torch.tensor(X_test, dtype=torch.float32)
    X_test = torch.utils.data.TensorDataset(X_test) 
    test_loader = torch.utils.data.DataLoader(X_test, batch_size=batch_size, shuffle=False)
    
    oof = np.zeros([len(X_train),y_train.shape[1]])
    oof_targets = np.zeros([len(X_train),y_train.shape[1]])
    pred_value = np.zeros([test_len, y_train.shape[1]])
    scores = []
    
    for fold, (train_index, valid_index) in enumerate(mskf.split(X_train, y_train)):
        print("Fold "+str(fold+1))
        X_train2 = torch.tensor(X_train[train_index,:], dtype=torch.float32)
        y_train2 = torch.tensor(y_train[train_index], dtype=torch.float32)

        X_valid2 = torch.tensor(X_train[valid_index,:], dtype=torch.float32)
        y_valid2 = torch.tensor(y_train[valid_index], dtype=torch.float32)
            
        clf = LSTMClassifier(CFG)
        if files != []:
            clf.load_state_dict(torch.load(files[fold]))
        loss_fn = torch.nn.BCEWithLogitsLoss() 
        optimizer = optim.Adam(clf.parameters(), lr = 0.001, weight_decay=1e-5) 
        #lookahead = Lookahead(optimizer, k=10, alpha=0.6) #lookahead
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3, eps=1e-4, verbose=True)
        
        train = torch.utils.data.TensorDataset(X_train2, y_train2)
        valid = torch.utils.data.TensorDataset(X_valid2, y_valid2)
        
        clf.to(device)
        
        train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True) 
        valid_loader = torch.utils.data.DataLoader(valid, batch_size=batch_size, shuffle=False)
        
        best_val_loss = np.inf
        stop_counts = 0
        for epoch in range(train_epochs):
            start_time = time.time()
            clf.train()
            avg_loss = 0.
            for x_batch, y_batch in tqdm(train_loader, disable=True):
                x_batch = x_batch.to(device)
                y_batch = y_batch.to(device)
                y_pred = clf(x_batch) 
                loss = loss_fn(y_pred, y_batch)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                avg_loss += loss.item() / len(train_loader)        
            
            clf.eval()
            avg_val_loss = 0.
            for i, (x_batch, y_batch) in enumerate(valid_loader): 
                x_batch = x_batch.to(device)
                y_batch = y_batch.to(device)
                y_pred = clf(x_batch).detach()
                avg_val_loss += loss_fn(y_pred, y_batch).item() / len(valid_loader)
                
            elapsed_time = time.time() - start_time 
            scheduler.step(avg_val_loss)
                    
            if avg_val_loss < best_val_loss:
                stop_counts = 0
                best_val_loss = avg_val_loss
                print('Best model: Epoch {} \t loss={:.6f} \t val_loss={:.6f} \t time={:.2f}s'.format(
                    epoch + 1, avg_loss, avg_val_loss, elapsed_time))
                torch.save(clf.state_dict(), 'best-model-parameters.pt')
            else:
                stop_counts += 1
        
            #if stop_counts >= EARLY_STOPPING_STEPS: 
            #    break
         
        pred_model = LSTMClassifier(CFG)
        pred_model.load_state_dict(torch.load('best-model-parameters.pt'))
        pred_model.eval()
        
        # validation check ----------------
        oof_epoch = np.zeros([X_valid2.size(0), y_train.shape[1]])
        target_epoch = np.zeros([X_valid2.size(0), y_train.shape[1]])
        for i, (x_batch, y_batch) in enumerate(valid_loader): 
                y_pred = pred_model(x_batch).sigmoid().detach()
                oof_epoch[i * batch_size:(i+1) * batch_size,:] = y_pred.cpu().numpy()
                target_epoch[i * batch_size:(i+1) * batch_size,:] = y_batch.cpu().numpy()
        print("Fold {} log loss: {}".format(fold+1, mean_log_loss(target_epoch, oof_epoch)))
        scores.append(mean_log_loss(target_epoch, oof_epoch))
        oof[valid_index,:] = oof_epoch
        oof_targets[valid_index,:] = target_epoch
        #-----------------------------------
        
        # test predcition --------------
        test_preds = np.zeros([test_len, y_train.shape[1]])
        for i, (x_batch,) in enumerate(test_loader): 
            y_pred = pred_model(x_batch).sigmoid().detach()
            test_preds[i * batch_size:(i+1) * batch_size, :] = y_pred.cpu().numpy()
        pred_value += test_preds / n_folds
        # ------------------------------
        
    print("Seed {}".format(seed_))
    for i, ele in enumerate(scores):
        print("Fold {} log loss: {}".format(i+1, scores[i]))
    print("Std of log loss: {}".format(np.std(scores)))
    print("Total log loss: {}".format(mean_log_loss(oof_targets, oof)))

    return oof, oof_targets, pred_value

cuda


In [10]:
seeds = [10,40]
#seeds = [0,1,2,3]

target_oof = np.zeros([len(fn_train),fn_targets.shape[1]])
target_pred = np.zeros([len(fn_test),fn_targets.shape[1]])

nontarget_oof = np.zeros([len(fn_train),fn_nontargets.shape[1]])
nontarget_pred = np.zeros([len(fn_test),fn_nontargets.shape[1]])

for seed_ in seeds:
    oof, oof_targets, pytorch_pred = modelling_lstm(fn_train, fn_targets, fn_test, seed_, fn_train.shape[1], fn_targets.shape[1],[])
    target_oof += oof / len(seeds)
    target_pred += pytorch_pred / len(seeds)
print("Total log loss in targets: {}".format(mean_log_loss(oof_targets, target_oof)))

#for seed_ in seeds:
#    oof, oof_targets, pytorch_pred = modelling_lstm(fn_train, fn_nontargets, fn_test, seed_, fn_train.shape[1], fn_nontargets.shape[1],[])
#    nontarget_oof += oof / len(seeds)
#    nontarget_pred += pytorch_pred / len(seeds)
#print("Total log loss in Non targets: {}".format(mean_log_loss(oof_targets, nontarget_oof)))

Fold 1
Best model: Epoch 1 	 loss=0.076298 	 val_loss=0.021729 	 time=1.11s
Best model: Epoch 2 	 loss=0.021409 	 val_loss=0.020980 	 time=0.93s
Best model: Epoch 3 	 loss=0.020804 	 val_loss=0.020529 	 time=1.14s
Best model: Epoch 4 	 loss=0.020367 	 val_loss=0.020025 	 time=0.94s
Best model: Epoch 5 	 loss=0.019787 	 val_loss=0.019520 	 time=0.93s
Best model: Epoch 6 	 loss=0.019163 	 val_loss=0.019026 	 time=1.04s
Best model: Epoch 7 	 loss=0.018694 	 val_loss=0.018681 	 time=0.96s
Best model: Epoch 8 	 loss=0.018319 	 val_loss=0.018489 	 time=1.04s
Best model: Epoch 9 	 loss=0.017975 	 val_loss=0.018239 	 time=0.97s
Best model: Epoch 10 	 loss=0.017668 	 val_loss=0.018077 	 time=0.94s
Best model: Epoch 11 	 loss=0.017424 	 val_loss=0.017945 	 time=0.93s
Best model: Epoch 13 	 loss=0.016890 	 val_loss=0.017737 	 time=0.94s
Best model: Epoch 15 	 loss=0.016328 	 val_loss=0.017559 	 time=0.95s
Best model: Epoch 18 	 loss=0.015533 	 val_loss=0.017555 	 time=0.92s
Best model: Epoch 20 	

In [11]:
t = pd.read_csv(DATA_DIR + 'train_targets_scored.csv')
train_checkscore = t.copy()
train_checkscore.loc[train_checkscore.index.isin(cons_train_index),target_feats] = target_oof
train_checkscore.loc[train_checkscore.index.isin(noncons_train_index),target_feats] = 0
t.drop("sig_id", axis=1, inplace=True)
print('OOF log loss: ', log_loss(np.ravel(t), np.ravel(np.array(train_checkscore.iloc[:,1:]))))

OOF log loss:  0.015772764380511603


In [12]:
sub[target_feats] = target_pred
sub.loc[noncons_test_index,target_feats] = 0
sub.to_csv('submission.csv', index=False)