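- 1st linear model: RBF kernel ridge regression (level 0), followed by a per-target logistic regression fitted on its out-of-fold predictions (level 1)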

In [1]:
import os
import sys
import warnings
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from xgboost import XGBClassifier
from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from tqdm import tqdm_notebook as tqdm
from category_encoders import CountEncoder
from sklearn.multioutput import MultiOutputClassifier
from sklearn.metrics import log_loss, mean_squared_error
from sklearn.model_selection import KFold, StratifiedKFold
sys.path.append('../input/multilabelstraifier/')
from ml_stratifiers import MultilabelStratifiedKFold
warnings.filterwarnings('ignore')

import time
import torch
import random
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import tensorflow as tf

from sklearn.kernel_approximation import Nystroem
from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import LogisticRegression
from sklearn.kernel_ridge import KernelRidge

# Preprocessing and feature engineering

In [2]:
# Folder that holds the competition CSV files.
DATA_DIR = '/kaggle/input/lish-moa/'

# train: training feature rows, targets: scored training labels, test: feature
# rows to predict. train_targets_nonscored.csv is not loaded by this notebook.
train, targets, test = (
    pd.read_csv(DATA_DIR + csv_name)
    for csv_name in ('train_features.csv', 'train_targets_scored.csv', 'test_features.csv')
)

In [3]:
# Every label column (everything in `targets` except the row identifier).
target_feats = [name for name in targets.columns if name != "sig_id"]
# Training columns whose name contains "g-" and "c-" respectively.
g_feats, c_feats = (
    [name for name in train.columns if marker in name]
    for marker in ("g-", "c-")
)

In [4]:
# Row labels of the "ctl_vehicle" rows ("noncons") and of all remaining rows
# ("cons"), recorded for both frames while the cp_type column is still present.
noncons_train_index = train.loc[train["cp_type"] == "ctl_vehicle"].index
cons_train_index = train.loc[train["cp_type"] != "ctl_vehicle"].index
noncons_test_index = test.loc[test["cp_type"] == "ctl_vehicle"].index
cons_test_index = test.loc[test["cp_type"] != "ctl_vehicle"].index

In [5]:
# Keep only the non-"ctl_vehicle" test rows and renumber them from 0.
# The filter looks at cp_type itself instead of the saved labels in
# cons_test_index: once reset_index has run, those labels no longer identify
# the same rows, so re-running a label-based filter would silently discard
# the wrong samples. With this form a second run is a no-op, and a run after
# cp_type has been dropped fails loudly instead of corrupting `test`.
test = test[test.cp_type != "ctl_vehicle"].reset_index(drop=True)

In [6]:
# Columns that are integer-coded by `encoding`.
categoricals = ["cp_dose"]


def encoding(tr, te):
    """Label-encode each column listed in ``categoricals`` in both frames.

    One ``LabelEncoder`` per column is fitted on the values found in ``tr``
    and the same fitted encoder is then applied to ``te``.

    Both frames are modified in place and are also returned as ``(tr, te)``.
    """
    for col in categoricals:
        encoder = preprocessing.LabelEncoder()
        tr[col] = encoder.fit_transform(list(tr[col]))
        te[col] = encoder.transform(list(te[col]))
    return tr, te


train, test = encoding(train, test)

In [7]:
def fe(df, remove_features):
    """Return a new frame equal to ``df`` minus the columns in ``remove_features``.

    ``df`` itself is not modified.
    """
    return df.drop(columns=remove_features)

# Columns stripped from both feature frames before modelling.
remove_features = ["cp_type" , "sig_id"]

train, test = (fe(frame, remove_features) for frame in (train, test))

print(train.shape, test.shape)

(23814, 874) (3624, 874)


In [8]:
# Feature and label matrices restricted to the rows listed in cons_train_index,
# converted to plain NumPy arrays; the test frame is taken as a copy.
fn_train = train[train.index.isin(cons_train_index)].to_numpy()
fn_targets = targets.drop(columns="sig_id")
fn_targets = fn_targets[fn_targets.index.isin(cons_train_index)].to_numpy()
fn_test = test.copy()

# Standardise the features; the scaler is fitted on the training rows only
# and then applied unchanged to the test rows.
ss = preprocessing.StandardScaler()
fn_train = ss.fit_transform(fn_train)
fn_test = ss.transform(fn_test)

# modelling

In [9]:
# Number of cross-validation folds used by both model levels.
N_SPLITS = 5


def log_loss_metric(y_true, y_pred):
    """Mean binary cross-entropy between labels and predicted probabilities.

    Parameters
    ----------
    y_true : array-like
        0/1 labels, either 2-D (rows x targets) or 1-D (treated as one row).
    y_pred : array-like
        Predictions with the same shape as ``y_true``. Values are clipped to
        ``[1e-15, 1 - 1e-15]`` before the logarithm, so scores outside the
        unit interval give a finite loss.

    Returns
    -------
    float
        Negative mean over rows of the per-row mean log-likelihood.
    """
    # Accept lists / frames / 1-D vectors; 2-D arrays pass through unchanged.
    y_true = np.atleast_2d(np.asarray(y_true, dtype=float))
    y_pred = np.atleast_2d(np.asarray(y_pred, dtype=float))
    y_pred_clip = np.clip(y_pred, 1e-15, 1 - 1e-15)
    per_row = np.mean(
        y_true * np.log(y_pred_clip) + (1 - y_true) * np.log(1 - y_pred_clip),
        axis=1,
    )
    return -np.mean(per_row)

def modelling_lr(tr, ta, te, n_splits=None, alpha=80, kernel='rbf', random_state=0):
    """Cross-validated kernel ridge regression on all targets at once.

    Parameters
    ----------
    tr, ta : array
        Training features and training targets. Rows are selected with
        integer index arrays, so both must support NumPy-style indexing and
        share the same number of rows.
    te : array
        Test features; predicted by every fold model.
    n_splits : int, optional
        Number of folds. ``None`` (default) uses the module-level
        ``N_SPLITS``, looked up at call time.
    alpha, kernel :
        Passed to ``KernelRidge``; the defaults are the original settings.
    random_state : int
        Seed for the shuffled ``MultilabelStratifiedKFold`` split.

    Returns
    -------
    oof : ndarray, shape (len(tr), ta.shape[1])
        Out-of-fold predictions; each row is predicted by the model that did
        not see it during fitting.
    pred_value : ndarray, shape (te.shape[0], ta.shape[1])
        Test predictions averaged over the fold models.

    The raw regression outputs are returned as they are (not clipped); the
    per-fold score that gets printed is computed by ``log_loss_metric``.
    """
    if n_splits is None:
        n_splits = N_SPLITS

    oof = np.zeros([len(tr), ta.shape[1]])
    pred_value = np.zeros([te.shape[0], ta.shape[1]])

    mskf_lr = MultilabelStratifiedKFold(n_splits=n_splits, random_state=random_state, shuffle=True)

    for n, (train_index, val_index) in enumerate(mskf_lr.split(tr, ta)):
        x_tr, x_val = tr[train_index], tr[val_index]
        y_tr, y_val = ta[train_index], ta[val_index]

        model = KernelRidge(alpha=alpha, kernel=kernel)
        model.fit(x_tr, y_tr)

        fold_pred = model.predict(x_val)
        # Each fold contributes an equal share to the averaged test prediction.
        pred_value += model.predict(te) / n_splits
        oof[val_index, :] = fold_pred
        fold_score = log_loss_metric(y_val, fold_pred)
        print('KRR: Fold {} Score {}:'.format(n+1, fold_score))
    return oof, pred_value

In [10]:
# Level-0 model: out-of-fold predictions for the training rows and
# fold-averaged predictions for the test rows.
# No zero-filled placeholders are created beforehand: they would be overwritten
# immediately on success, and on failure they would leave plausible-looking
# all-zero arrays behind for the following cells.
lr0_oof, lr0_test = modelling_lr(fn_train, fn_targets, fn_test)

KRR: Fold 1 Score 0.020053231410728768:
KRR: Fold 2 Score 0.020395236493920593:
KRR: Fold 3 Score 0.02040202134683225:
KRR: Fold 4 Score 0.020202331062327317:
KRR: Fold 5 Score 0.02046674791227364:


In [11]:
# Level-1 model: for every target, a one-feature logistic regression maps the
# level-0 score of that target to a probability.
lr1_test = pd.read_csv(DATA_DIR + 'sample_submission.csv')
lr1_test.loc[:, target_feats] = 0
lr1_oof = np.zeros([fn_targets.shape[0], fn_targets.shape[1]])

# Positives per target, counted once and on the rows that are actually split
# below (fn_targets), not on the unfiltered `targets` frame.
positives_per_target = fn_targets.sum(axis=0)

# Fold-averaged test probabilities are accumulated in a plain array and
# written into the frame once after the loop.
lr1_test_pred = np.zeros([lr0_test.shape[0], fn_targets.shape[1]])

for ind in tqdm(range(len(target_feats))):

    # Targets with fewer than N_SPLITS positives are skipped and keep their
    # all-zero prediction.
    if positives_per_target[ind] < N_SPLITS:
        continue

    x_all = lr0_oof[:, ind].reshape(-1, 1)
    x_test = lr0_test[:, ind].reshape(-1, 1)
    y_all = fn_targets[:, ind]

    skf = StratifiedKFold(n_splits = N_SPLITS, random_state = 0, shuffle = True)
    for train_index, val_index in skf.split(x_all, y_all):
        # NOTE(review): newer scikit-learn releases deprecate the string
        # 'none' in favour of penalty=None; switch when upgrading.
        model = LogisticRegression(penalty = 'none', max_iter = 1000)
        model.fit(x_all[train_index], y_all[train_index])

        lr1_test_pred[:, ind] += model.predict_proba(x_test)[:, 1] / N_SPLITS
        # Every row belongs to exactly one validation fold, so plain assignment suffices.
        lr1_oof[val_index, ind] = model.predict_proba(x_all[val_index])[:, 1]

# Rows outside cons_test_index keep the 0 set above.
lr1_test.loc[cons_test_index, target_feats] = lr1_test_pred

HBox(children=(FloatProgress(value=0.0, max=206.0), HTML(value='')))




In [12]:
# Rebuild a full-size prediction frame (all training rows) for scoring.
check_lr1 = targets.copy()
# The label columns are integers; cast them up front so that writing float
# probabilities below does not depend on implicit dtype upcasting.
check_lr1[target_feats] = check_lr1[target_feats].astype(float)
check_lr1.loc[cons_train_index, target_feats] = lr1_oof
# Rows in noncons_train_index were not modelled; they are scored as all-zero predictions.
check_lr1.loc[noncons_train_index, target_feats] = 0
# Columns are selected by name rather than by position.
print('OOF log loss: ', log_loss(targets[target_feats].to_numpy().ravel(),
                                 check_lr1[target_feats].to_numpy().ravel()))

OOF log loss:  0.01635319513482686


# submission

In [13]:
# Start from the sample submission so row order and sig_id match the expected file.
sub_lr = pd.read_csv(DATA_DIR + 'sample_submission.csv')
# Copy the level-1 predictions for the rows in cons_test_index; the rows and
# columns are selected explicitly instead of relying on frame alignment.
sub_lr.loc[cons_test_index, target_feats] = lr1_test.loc[cons_test_index, target_feats].to_numpy()
# All rows in noncons_test_index are submitted as zeros.
sub_lr.loc[noncons_test_index, target_feats] = 0
sub_lr.to_csv('submission.csv', index=False)