In [1]:
import os
import sys
import warnings
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from tqdm import tqdm_notebook as tqdm
from category_encoders import CountEncoder
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import log_loss, mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold
sys.path.append('../input/multilabelstraifier/')
from ml_stratifiers import MultilabelStratifiedKFold
warnings.filterwarnings('ignore')

import time
import torch
import random
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import tensorflow as tf

from sklearn.kernel_approximation import Nystroem
from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import LogisticRegression
from sklearn.kernel_ridge import KernelRidge

# Preprocess & Feature engineering

In [2]:
DATA_DIR = '/kaggle/input/lish-moa/'
train = pd.read_csv(DATA_DIR + 'train_features.csv')
targets = pd.read_csv(DATA_DIR + 'train_targets_scored.csv')
#non_targets = pd.read_csv(DATA_DIR + 'train_targets_nonscored.csv')
test = pd.read_csv(DATA_DIR + 'test_features.csv')

In [3]:
target_feats = [ i for i in targets.columns if i != "sig_id"]
g_feats = [i for i in train.columns if "g-" in i]
c_feats = [i for i in train.columns if "c-" in i]

In [4]:
noncons_train_index = train[train.cp_type=="ctl_vehicle"].index
cons_train_index = train[train.cp_type!="ctl_vehicle"].index
noncons_test_index = test[test.cp_type=="ctl_vehicle"].index
cons_test_index = test[test.cp_type!="ctl_vehicle"].index

In [5]:
test = test[test.index.isin(cons_test_index)].reset_index(drop=True)

In [6]:
categoricals = ["cp_dose"]

def encoding(tr, te):
    for f in categoricals:
        lbl = preprocessing.LabelEncoder()
        lbl.fit(list(tr[f]))
        tr[f] = lbl.transform(list(tr[f]))
        te[f] = lbl.transform(list(te[f])) 
        
    return tr, te

train, test = encoding(train, test)

In [7]:
def fe(df, remove_features):
    tmp = df.copy()
    tmp.drop(remove_features, axis=1, inplace=True)
    return tmp

remove_features = ["cp_type" , "sig_id"]
        
train = fe(train, remove_features)
test = fe(test, remove_features)
    
print(train.shape, test.shape)

(23814, 874) (3624, 874)


# data preparation

In [8]:
# xgb --------------------------
X = train.copy()
y = targets.drop("sig_id", axis=1).copy()
X_test = test.copy()

# pytorch and logistic regression-----------------------
fn_train = train.copy() 
fn_test = test.copy() 
fn_targets = targets.drop("sig_id", axis=1).copy()

fn_train = fn_train[fn_train.index.isin(cons_train_index)].copy().reset_index(drop=True).to_numpy()
fn_targets = fn_targets[fn_targets.index.isin(cons_train_index)].copy().reset_index(drop=True).to_numpy()

ss = preprocessing.StandardScaler()
fn_train= ss.fit_transform(fn_train)
fn_test = ss.transform(fn_test)

# 1st XGB

In [9]:
classifier = MultiOutputClassifier(XGBClassifier(tree_method='gpu_hist'))

clf = Pipeline([('classify', classifier)
               ])

params = {'classify__estimator__gamma': 3.6975,
          'classify__estimator__learning_rate': 0.0703,
          'classify__estimator__max_delta_step': 2.0706,
          'classify__estimator__max_depth': 10,
          'classify__estimator__min_child_weight': 31.5800,
          'classify__estimator__n_estimators': 166,
         }

clf.set_params(**params)

Pipeline(steps=[('classify',
                 MultiOutputClassifier(estimator=XGBClassifier(base_score=None,
                                                               booster=None,
                                                               colsample_bylevel=None,
                                                               colsample_bynode=None,
                                                               colsample_bytree=None,
                                                               gamma=3.6975,
                                                               gpu_id=None,
                                                               importance_type='gain',
                                                               interaction_constraints=None,
                                                               learning_rate=0.0703,
                                                               max_delta_step=2.0706,
                                                   

In [10]:
def modelling_xgb(X, y, X_test, seed):
    NFOLDS=5
    oof_preds = np.zeros(y.shape)
    test_preds = np.zeros((X_test.shape[0], y.shape[1]))
    oof_losses = []
    mskf = MultilabelStratifiedKFold(n_splits=NFOLDS, random_state=seed, shuffle=True)
    for fn, (trn_idx, val_idx) in enumerate(mskf.split(X, y)):
        print('Starting fold: ', fn)
        X_train, X_val = X.iloc[trn_idx,:], X.iloc[val_idx,:].to_numpy()
        y_train, y_val = y.iloc[trn_idx], y.iloc[val_idx].to_numpy()
    
        X_train = X_train[X_train.index.isin(cons_train_index)].to_numpy()
        y_train = y_train[y_train.index.isin(cons_train_index)].to_numpy()
    
        clf.fit(X_train, y_train)
        val_preds = clf.predict_proba(X_val) # list of preds per class
        val_preds = np.array(val_preds)[:,:,1].T # take the positive class
        oof_preds[val_idx] = val_preds
    
        loss = log_loss(np.ravel(y_val), np.ravel(val_preds))
        print(loss)
        oof_losses.append(loss)
        preds = clf.predict_proba(X_test)
        preds = np.array(preds)[:,:,1].T # take the positive class
        test_preds += preds / NFOLDS
    
    print(oof_losses)
    print('Mean OOF loss across folds', np.mean(oof_losses))
    print('STD OOF loss across folds', np.std(oof_losses))
    return oof_preds, test_preds

In [11]:
seeds = [42,43,44]
xgb1_oof = np.zeros(y.shape)
xgb1_test = np.zeros((test.shape[0], y.shape[1]))
for seed_ in seeds:
    ind_preds, ind_test_preds = modelling_xgb(X, y, X_test, seed_)
    xgb1_oof += ind_preds / len(seeds)
    xgb1_test += ind_test_preds / len(seeds)

Starting fold:  0
0.016738832513000318
Starting fold:  1
0.016908957230028557
Starting fold:  2
0.016773734659711288
Starting fold:  3
0.016497753560878757
Starting fold:  4
0.01667905765085504
[0.016738832513000318, 0.016908957230028557, 0.016773734659711288, 0.016497753560878757, 0.01667905765085504]
Mean OOF loss across folds 0.01671966712289479
STD OOF loss across folds 0.00013417608848077545
Starting fold:  0
0.01668812742202729
Starting fold:  1
0.01672661540135191
Starting fold:  2
0.01671405352510354
Starting fold:  3
0.016696373422081447
Starting fold:  4
0.016772998153547904
[0.01668812742202729, 0.01672661540135191, 0.01671405352510354, 0.016696373422081447, 0.016772998153547904]
Mean OOF loss across folds 0.016719633584822417
STD OOF loss across folds 2.9870833562472123e-05
Starting fold:  0
0.016696372969820933
Starting fold:  1
0.01672151248200614
Starting fold:  2
0.016674213856001072
Starting fold:  3
0.016673970749857346
Starting fold:  4
0.016831564237686652
[0.016696

In [12]:
check_xgb1 = targets.copy()
check_xgb1.iloc[:,1:] = xgb1_oof
check_xgb1.loc[check_xgb1.index.isin(noncons_train_index),target_feats] = 0
print('OOF log loss: ', log_loss(np.ravel(y), np.ravel(np.array(check_xgb1.iloc[:,1:]))))

OOF log loss:  0.016352895291178133


# 1st NN 

In [13]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(device)
batch_size = 128
train_epochs = 40
n_folds=5
EARLY_STOPPING_STEPS = 10

def mean_log_loss(y_true, y_pred):
    metrics = []
    for i, target in enumerate(target_feats):
        metrics.append(log_loss(y_true[:, i], y_pred[:, i].astype(float), labels=[0,1]))
    return np.mean(metrics)

def seed_everything(seed=1234): 
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    tf.random.set_seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
class MoaModel(nn.Module):
    def __init__(self, num_columns, last_columns_num):
        super(MoaModel, self).__init__()
        self.batch_norm1 = nn.BatchNorm1d(num_columns)
        self.dropout1 = nn.Dropout(0.2)
        self.dense1 = nn.utils.weight_norm(nn.Linear(num_columns, 2048))
        
        self.batch_norm2 = nn.BatchNorm1d(2048)
        self.dropout2 = nn.Dropout(0.6)
        self.dense2 = nn.utils.weight_norm(nn.Linear(2048, 1048))
        
        self.batch_norm3 = nn.BatchNorm1d(1048)
        self.dropout3 = nn.Dropout(0.6)
        self.dense3 = nn.utils.weight_norm(nn.Linear(1048, last_columns_num))
        
    def forward(self, x):
        x = self.batch_norm1(x)
        x = self.dropout1(x)
        x = F.relu(self.dense1(x))
        
        x = self.batch_norm2(x)
        x = self.dropout2(x)
        x = F.relu(self.dense2(x))
        
        x = self.batch_norm3(x)
        x = self.dropout3(x)
        x = self.dense3(x)
        
        return x
    
def modelling_torch(tr, target, te, sample_seed, init_num, last_num, layer):
    seed_everything(seed=sample_seed) 
    X_train = tr.copy()
    y_train = target.copy()
    X_test = te.copy()
    test_len = X_test.shape[0]

    mskf=MultilabelStratifiedKFold(n_splits = n_folds, shuffle=True, random_state=2)
    models = []
    
    X_test = torch.tensor(X_test, dtype=torch.float32)
    X_test = torch.utils.data.TensorDataset(X_test) 
    test_loader = torch.utils.data.DataLoader(X_test, batch_size=batch_size, shuffle=False)
    
    oof = np.zeros([len(X_train),y_train.shape[1]])
    oof_targets = np.zeros([len(X_train),y_train.shape[1]])
    pred_value = np.zeros([test_len, y_train.shape[1]])
    scores = []
    for fold, (train_index, valid_index) in enumerate(mskf.split(X_train, y_train)):
        print("Fold "+str(fold+1))
        X_train2 = torch.tensor(X_train[train_index,:], dtype=torch.float32)
        y_train2 = torch.tensor(y_train[train_index], dtype=torch.float32)

        X_valid2 = torch.tensor(X_train[valid_index,:], dtype=torch.float32)
        y_valid2 = torch.tensor(y_train[valid_index], dtype=torch.float32)
            
        clf = MoaModel(init_num, last_num)
        loss_fn = torch.nn.BCEWithLogitsLoss() 
        optimizer = optim.Adam(clf.parameters(), lr = 0.001, weight_decay=1e-5) 
        scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3, eps=1e-4, verbose=True)
        
        train = torch.utils.data.TensorDataset(X_train2, y_train2)
        valid = torch.utils.data.TensorDataset(X_valid2, y_valid2)
        
        clf.to(device)
        
        train_loader = torch.utils.data.DataLoader(train, batch_size=batch_size, shuffle=True) 
        valid_loader = torch.utils.data.DataLoader(valid, batch_size=batch_size, shuffle=False)
        
        best_val_loss = np.inf
        stop_counts = 0
        for epoch in range(train_epochs):
            start_time = time.time()
            clf.train()
            avg_loss = 0.
            for x_batch, y_batch in tqdm(train_loader, disable=True):
                x_batch = x_batch.to(device)
                y_batch = y_batch.to(device)
                y_pred = clf(x_batch) 
                loss = loss_fn(y_pred, y_batch)
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
                avg_loss += loss.item() / len(train_loader) 
    
            clf.eval()
            avg_val_loss = 0.
            for i, (x_batch, y_batch) in enumerate(valid_loader): 
                x_batch = x_batch.to(device)
                y_batch = y_batch.to(device)
                y_pred = clf(x_batch).detach()
                avg_val_loss += loss_fn(y_pred, y_batch).item() / len(valid_loader)
        
            elapsed_time = time.time() - start_time 
            scheduler.step(avg_val_loss)

            if avg_val_loss < best_val_loss:
                stop_counts = 0
                best_val_loss = avg_val_loss
                print('Best model: Epoch {} \t loss={:.6f} \t val_loss={:.6f} \t time={:.2f}s'.format(
                    epoch + 1, avg_loss, avg_val_loss, elapsed_time))
                torch.save(clf.state_dict(), 'best-model-parameters.pt')
            else:
                stop_counts += 1
         
        pred_model = MoaModel(init_num, last_num)
        pred_model.load_state_dict(torch.load('best-model-parameters.pt'))
        pred_model.eval()
        
        # validation check ----------------
        oof_epoch = np.zeros([X_valid2.size(0), y_train.shape[1]])
        target_epoch = np.zeros([X_valid2.size(0), y_train.shape[1]])
        for i, (x_batch, y_batch) in enumerate(valid_loader): 
                y_pred = pred_model(x_batch).sigmoid().detach()
                oof_epoch[i * batch_size:(i+1) * batch_size,:] = y_pred.cpu().numpy()
                target_epoch[i * batch_size:(i+1) * batch_size,:] = y_batch.cpu().numpy()
        print("Fold {} log loss: {}".format(fold+1, mean_log_loss(target_epoch, oof_epoch)))
        scores.append(mean_log_loss(target_epoch, oof_epoch))
        oof[valid_index,:] = oof_epoch
        oof_targets[valid_index,:] = target_epoch
        #-----------------------------------
        
        # test predcition --------------
        test_preds = np.zeros([test_len, y_train.shape[1]])
        for i, (x_batch,) in enumerate(test_loader): 
            y_pred = pred_model(x_batch).sigmoid().detach()
            test_preds[i * batch_size:(i+1) * batch_size, :] = y_pred.cpu().numpy()
        pred_value += test_preds / n_folds
        # ------------------------------
        
    print("Seed {}".format(seed_))
    for i, ele in enumerate(scores):
        print("Fold {} log loss: {}".format(i+1, scores[i]))
    print("Std of log loss: {}".format(np.std(scores)))
    print("Total log loss: {}".format(mean_log_loss(oof_targets, oof)))
    
    return oof, oof_targets, pred_value

cuda


In [14]:
seeds = [0,1,2,3,4]
pytorch1_oof = np.zeros([len(fn_train),fn_targets.shape[1]])
pytorch1_test = np.zeros([len(fn_test),fn_targets.shape[1]])

for seed_ in seeds:
    oof, oof_targets, pytorch_pred = modelling_torch(fn_train, fn_targets, fn_test, seed_, fn_train.shape[1], fn_targets.shape[1],1)
    pytorch1_oof += oof / len(seeds)
    pytorch1_test += pytorch_pred / len(seeds)

Fold 1
Best model: Epoch 1 	 loss=0.412591 	 val_loss=0.077221 	 time=1.70s
Best model: Epoch 2 	 loss=0.048580 	 val_loss=0.028151 	 time=0.79s
Best model: Epoch 3 	 loss=0.027003 	 val_loss=0.022851 	 time=0.81s
Best model: Epoch 4 	 loss=0.023460 	 val_loss=0.021035 	 time=0.76s
Best model: Epoch 5 	 loss=0.021372 	 val_loss=0.019930 	 time=0.76s
Best model: Epoch 6 	 loss=0.020710 	 val_loss=0.019312 	 time=0.76s
Best model: Epoch 7 	 loss=0.020091 	 val_loss=0.019111 	 time=0.75s
Best model: Epoch 8 	 loss=0.019413 	 val_loss=0.018614 	 time=0.76s
Best model: Epoch 9 	 loss=0.018994 	 val_loss=0.018114 	 time=0.95s
Best model: Epoch 10 	 loss=0.018733 	 val_loss=0.017949 	 time=0.76s
Best model: Epoch 11 	 loss=0.018251 	 val_loss=0.017597 	 time=0.77s
Best model: Epoch 13 	 loss=0.017651 	 val_loss=0.017239 	 time=0.74s
Best model: Epoch 14 	 loss=0.017350 	 val_loss=0.017176 	 time=0.75s
Best model: Epoch 15 	 loss=0.017291 	 val_loss=0.017116 	 time=0.76s
Best model: Epoch 16 	

In [15]:
check_pytorch1 = targets.copy()
check_pytorch1.loc[cons_train_index,target_feats] = pytorch1_oof
check_pytorch1.loc[noncons_train_index,target_feats] = 0
print('OOF log loss: ', log_loss(np.ravel(y), np.ravel(np.array(check_pytorch1.iloc[:,1:]))))

OOF log loss:  0.014671957119226644


# 1st Logistic Regression

In [16]:
N_SPLITS = 5

def log_loss_metric(y_true, y_pred):
    y_pred_clip = np.clip(y_pred, 1e-15, 1 - 1e-15)
    loss = - np.mean(np.mean(y_true * np.log(y_pred_clip) + (1 - y_true) * np.log(1 - y_pred_clip), axis = 1))
    return loss

def modelling_lr(tr, ta, te):    
    oof = np.zeros([len(tr),ta.shape[1]])
    pred_value = np.zeros([te.shape[0], ta.shape[1]])
    
    mskf_lr = MultilabelStratifiedKFold(n_splits = N_SPLITS, random_state = 0, shuffle = True)
    
    for n, (train_index, val_index) in enumerate(mskf_lr.split(tr, ta)):
        x_tr, x_val = tr[train_index], tr[val_index]
        y_tr, y_val = ta[train_index], ta[val_index]
        
        model = KernelRidge(alpha = 80, kernel = 'rbf')
        model.fit(x_tr, y_tr)

        fold_pred = model.predict(x_val)
        pred_value += model.predict(te) / N_SPLITS
        oof[val_index,:] = fold_pred
        fold_score = log_loss_metric(y_val, fold_pred)
        print('KRR: Fold {} Score {}:'.format(n+1, fold_score))
    return oof, pred_value

In [17]:
lr0_oof = np.zeros([len(fn_train), fn_targets.shape[1]])
lr0_test = np.zeros([len(fn_test), fn_targets.shape[1]])
lr0_oof, lr0_test = modelling_lr(fn_train, fn_targets, fn_test)

KRR: Fold 1 Score 0.020053231410728768:
KRR: Fold 2 Score 0.020395236493920596:
KRR: Fold 3 Score 0.02040202134683225:
KRR: Fold 4 Score 0.02020233106232731:
KRR: Fold 5 Score 0.02046674791227364:


In [18]:
lr1_test = pd.read_csv('../input/lish-moa/sample_submission.csv')
lr1_test.loc[:, target_feats] = 0
lr1_oof = np.zeros([fn_targets.shape[0],fn_targets.shape[1]]) 

for ind in tqdm(range(len(target_feats))):

    ind_target_sum = targets.drop("sig_id", axis=1).copy().values[:, ind].sum()

    if ind_target_sum >= N_SPLITS:

        skf = StratifiedKFold(n_splits = N_SPLITS, random_state = 0, shuffle = True)
        for n, (train_index, val_index) in enumerate(skf.split(lr0_oof, fn_targets[:,ind])):
            x_tr, x_val = lr0_oof[train_index, ind].reshape(-1, 1), lr0_oof[val_index, ind].reshape(-1, 1)
            y_tr, y_val = fn_targets[train_index,ind], fn_targets[val_index,ind]
            model = LogisticRegression(penalty = 'none', max_iter = 1000)
            model.fit(x_tr, y_tr)
            
            lr1_test.loc[cons_test_index, target_feats[ind]] += model.predict_proba(lr0_test[:, ind].reshape(-1, 1))[:, 1] / N_SPLITS
            lr1_oof[val_index, ind] += model.predict_proba(x_val)[:, 1]

HBox(children=(FloatProgress(value=0.0, max=206.0), HTML(value='')))




In [19]:
check_lr1 = targets.copy()
check_lr1.loc[cons_train_index,target_feats] = res_lr
check_lr1.loc[noncons_train_index,target_feats] = 0
print('OOF log loss: ', log_loss(np.ravel(y), np.ravel(np.array(check_lr1.iloc[:,1:]))))

NameError: name 'res_lr' is not defined

# 1st SVM

# submission

In [20]:
# weight optimization
#class OptimizedRounder(object):
#    def __init__(self, length):
#        self.coef_ = [0 for in range(length)]

#    def _log_loss(self, coef, Xs, y):
#        X_p = np.zeros_like(Xs[0])
#        for i in range(len(coef)):
#            X_p += coef[i] * Xs[i]
#        return log_loss(np.ravel(y), np.ravel(np.array(X_p)))
    
#    def fit(self, X, y, random_flg = False):
#        loss_partial = partial(self._log_loss, X=X, y=y)
#        if random_flg:
#            initial_coef = [np.random.uniform(0.4,0.5), np.random.uniform(0.5,0.6), np.random.uniform(0.6,0.7)]
#        else:
#            initial_coef = [1/length for i in range(length)]
#        self.coef_ = sp.optimize.minimize(loss_partial, initial_coef, method='nelder-mead') #Powell
        
#    def predict(self, X, coef):
#        return pd.cut(X, [-np.inf] + list(np.sort(coef)) + [np.inf], labels = [0, 1, 2, 3])

#    def coefficients(self):
#        return self.coef_
    
#best_score = 100
#for i in range(10):
#    optR = OptimizedRounder()
#    optR.fit(, y, random_flg=False)
#    coefficients = optR.coefficients()
#    score = qwk(new_train.accuracy_group, final_valid_pred)
#    print(i, np.sort(coefficients), score)
#    if score > best_score:
#        best_score = score
#        best_coefficients = coefficients
#final_test_pred = pd.cut(np.array(test_exp_accuracy).reshape(-1,), [-np.inf] + list(np.sort(best_coefficients)) + [np.inf], labels = [0, 1, 2, 3])

In [21]:
check = 0.1 * check_lr1.iloc[:,1:] + 0.2 * check_xgb1.iloc[:,1:] + 0.7 * check_pytorch1.iloc[:,1:]
print('OOF log loss: ', log_loss(np.ravel(y), np.ravel(np.array(check))))

OOF log loss:  0.007991414664058021


In [22]:
sub_torch = pd.read_csv(DATA_DIR + 'sample_submission.csv')
sub_xgb = pd.read_csv(DATA_DIR + 'sample_submission.csv')
sub_lr = pd.read_csv(DATA_DIR + 'sample_submission.csv')
sub = pd.read_csv(DATA_DIR + 'sample_submission.csv')

sub_torch.loc[cons_test_index,target_feats] = pytorch1_test
sub_torch.loc[noncons_test_index,target_feats] = 0
sub_xgb.loc[cons_test_index,target_feats] = xgb1_test
sub_xgb.loc[noncons_test_index,target_feats] = 0
sub_lr.loc[cons_test_index,target_feats] = lr1_test
sub_lr.loc[noncons_test_index,target_feats] = 0

sub[target_feats] = 0.1 * sub_lr.iloc[:,1:] + 0.2 * sub_xgb.iloc[:,1:] + 0.7 * sub_torch.iloc[:,1:]
sub.to_csv('submission.csv', index=False)