- incorporate validation method

In [1]:
import sys
!cp ../input/rapids/rapids.0.15.0 /opt/conda/envs/rapids.tar.gz
!cd /opt/conda/envs/ && tar -xzvf rapids.tar.gz > /dev/null
sys.path = ["/opt/conda/envs/rapids/lib/python3.7/site-packages"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib/python3.7"] + sys.path
sys.path = ["/opt/conda/envs/rapids/lib"] + sys.path
!cp /opt/conda/envs/rapids/lib/libxgboost.so /opt/conda/lib/

In [2]:
import os
import warnings
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from tqdm import tqdm_notebook as tqdm
from category_encoders import CountEncoder
from sklearn.multioutput import MultiOutputClassifier
from sklearn.model_selection import KFold, StratifiedKFold
sys.path.append('../input/multilabelstraifier/')
from ml_stratifiers import MultilabelStratifiedKFold
warnings.filterwarnings('ignore')

from sklearn.kernel_approximation import Nystroem
from sklearn.isotonic import IsotonicRegression
from sklearn.linear_model import LogisticRegression
from sklearn.kernel_ridge import KernelRidge
from cuml.svm import SVC, SVR

# preprocess

In [3]:
DATA_DIR = '/kaggle/input/lish-moa/'
train = pd.read_csv(DATA_DIR + 'train_features.csv')
targets = pd.read_csv(DATA_DIR + 'train_targets_scored.csv')
test = pd.read_csv(DATA_DIR + 'test_features.csv')
drug = pd.read_csv(DATA_DIR + 'train_drug.csv')

In [4]:
target_feats = [ i for i in targets.columns if i != "sig_id"]
g_feats = [i for i in train.columns if "g-" in i]
c_feats = [i for i in train.columns if "c-" in i]

In [5]:
noncons_train_index = train[train.cp_type=="ctl_vehicle"].index
cons_train_index = train[train.cp_type!="ctl_vehicle"].index
noncons_test_index = test[test.cp_type=="ctl_vehicle"].index
cons_test_index = test[test.cp_type!="ctl_vehicle"].index

test = test[test.index.isin(cons_test_index)].reset_index(drop=True)
train = train[train.index.isin(cons_train_index)].reset_index(drop=True)
targets = targets[targets.index.isin(cons_train_index)].reset_index(drop=True)
fn_targets = targets.drop("sig_id", axis=1).copy().to_numpy()
y = targets.drop("sig_id", axis=1).copy()

In [6]:
# https://www.kaggle.com/c/lish-moa/discussion/195195
N_SPLITS = 5
seed = 34

folds = []
    
# LOAD FILES
train_score = targets.merge(drug, on='sig_id', how='left') 

# LOCATE DRUGS
vc = train_score.drug_id.value_counts()
vc1 = vc.loc[vc <= 18].index.sort_values()
vc2 = vc.loc[vc > 18].index.sort_values()
    
# STRATIFY DRUGS 18X OR LESS
dct1 = {}; dct2 = {}
skf = MultilabelStratifiedKFold(n_splits = N_SPLITS, shuffle = True, random_state = seed)
tmp = train_score.groupby('drug_id')[target_feats].mean().loc[vc1]
for fold,(idxT,idxV) in enumerate(skf.split(tmp,tmp[target_feats])):
    dd = {k:fold for k in tmp.index[idxV].values}
    dct1.update(dd)

# STRATIFY DRUGS MORE THAN 18X
skf = MultilabelStratifiedKFold(n_splits = N_SPLITS, shuffle = True, random_state = seed)
tmp = train_score.loc[train_score.drug_id.isin(vc2)].reset_index(drop = True)
for fold,(idxT,idxV) in enumerate(skf.split(tmp,tmp[target_feats])):
    dd = {k:fold for k in tmp.sig_id[idxV].values}
    dct2.update(dd)
    
# ASSIGN FOLDS
train_score['fold'] = train_score.drug_id.map(dct1)
train_score.loc[train_score.fold.isna(),'fold'] = train_score.loc[train_score.fold.isna(),'sig_id'].map(dct2)
train_score.fold = train_score.fold.astype('int8')
folds.append(train_score.fold.values)
    
np.array(folds)

array([[1, 1, 4, ..., 1, 0, 1]], dtype=int8)

# Feature engineering

In [7]:
for i in c_feats + g_feats:
    ss = preprocessing.StandardScaler()
    ss.fit(train[i].values.reshape(-1,1))
    train[i] = ss.transform(train[i].values.reshape(-1,1))
    test[i] = ss.transform(test[i].values.reshape(-1,1))

In [8]:
def fe_simple(df, remove_features):
    tmp = df.copy()
    tmp.drop(remove_features, axis=1, inplace=True)
    tmp.loc[:, 'cp_dose'] = tmp.loc[:, 'cp_dose'].map({'D1': 0, 'D2': 1})
    return tmp

remove_features = ["cp_type" , "sig_id"]
train_fn = fe_simple(train, remove_features)
test_fn = fe_simple(test, remove_features)
train_fn["fold"] = np.array(folds).reshape(-1,1)

print(train_fn.shape, test_fn.shape)

(21948, 875) (3624, 874)


In [9]:
fn_train = train_fn.copy().to_numpy()
fn_test = test_fn.copy().to_numpy()

# modelling

In [10]:
N_STARTS = 1

def modelling_svm(tr, ta, te):
    svm0_oof = np.zeros([len(tr), ta.shape[1]])
    svm0_test = np.zeros([len(te), ta.shape[1]])

    svm1_test = np.zeros([len(te),ta.shape[1]])
    svm1_oof = np.zeros([ta.shape[0],ta.shape[1]]) 

    for ind in tqdm(range(ta.shape[1])):
        ind_target_sum = ta[:, ind].sum()
        if ind_target_sum >= N_SPLITS:
            for seed in range(N_STARTS):
                for n in range(N_SPLITS):
                    val_index = tr[:,-1] == n
                    train_index = tr[:,-1] != n
                    x_tr, x_val = tr[train_index], tr[val_index]
                    y_tr, y_val = ta[train_index,ind], ta[val_index,ind]
                    x_tr = np.delete(x_tr, -1, 1)
                    x_val = np.delete(x_val, -1, 1)
                    model = SVC(C = 40, cache_size = 2000)
                    model.fit(x_tr, y_tr)
                    svm0_test[:, ind] += model.decision_function(te) / (N_SPLITS * N_STARTS)
                    svm0_oof[val_index, ind] += model.decision_function(x_val) / N_STARTS
                    
            for seed in range(N_STARTS):
                for n in range(N_SPLITS):
                    val_index = tr[:,-1] == n
                    train_index = tr[:,-1] != n
                    x_tr, x_val = svm0_oof[train_index, ind].reshape(-1, 1), svm0_oof[val_index, ind].reshape(-1, 1)
                    y_tr, y_val = ta[train_index,ind], ta[val_index,ind]

                    model = LogisticRegression(C = 35, max_iter = 1000)
                    model.fit(x_tr, y_tr)
                    svm1_test[:, ind] += model.predict_proba(svm0_test[:, ind].reshape(-1, 1))[:, 1] / (N_SPLITS * N_STARTS)
                    svm1_oof[val_index, ind] += model.predict_proba(x_val)[:, 1] / N_STARTS

        score1 = log_loss(ta[:, ind], svm0_oof[:, ind])
        score2 = log_loss(ta[:, ind], svm1_oof[:, ind])
        print('SVM Target ind {} score {}: {}'.format(ind, score1, score2))

    return svm1_test, svm1_oof

In [11]:
svm1_test, svm1_oof = modelling_svm(fn_train, fn_targets, fn_test)

HBox(children=(FloatProgress(value=0.0, max=206.0), HTML(value='')))

SVM Target ind 0 score 0.0267522871657328: 0.004743647562673678
SVM Target ind 1 score 0.028325951116658196: 0.006774335586245682
SVM Target ind 2 score 0.0377679348222106: 0.008631807249958105
SVM Target ind 3 score 0.2990824020150525: 0.049010819801783526
SVM Target ind 4 score 0.4736795111378154: 0.07145334988722146
SVM Target ind 5 score 0.11487829264805978: 0.02235007404877279
SVM Target ind 6 score 0.08498090986327342: 0.0172405251690922
SVM Target ind 7 score 0.15109277046175193: 0.02791993784308129
SVM Target ind 8 score 0.018894670386136802: 0.004206256770236577
SVM Target ind 9 score 0.4179283738202106: 0.06164773392913534
SVM Target ind 10 score 0.5665615646808058: 0.08267443288447668
SVM Target ind 11 score 0.08605773811471114: 0.017576320635726304



RuntimeError: Exception occured! file=/opt/conda/envs/rapids/conda-bld/libcuml_1598469299551/work/cpp/src/svm/svc_impl.cuh line=66: Only binary classification is implemented at the moment
Obtained 64 stack frames
#0 in /opt/conda/envs/rapids/lib/python3.7/site-packages/cuml/common/../../../../libcuml++.so(_ZN8MLCommon9Exception16collectCallStackEv+0x3e) [0x7f3285ab166e]
#1 in /opt/conda/envs/rapids/lib/python3.7/site-packages/cuml/common/../../../../libcuml++.so(_ZN8MLCommon9ExceptionC2ERKNSt7__cxx1112basic_stringIcSt11char_traitsIcESaIcEEE+0x71) [0x7f3285ab21e1]
#2 in /opt/conda/envs/rapids/lib/python3.7/site-packages/cuml/common/../../../../libcuml++.so(_ZN2ML3SVM6svcFitIdEEvRKNS_10cumlHandleEPT_iiS6_RKNS0_12svmParameterERN8MLCommon6Matrix12KernelParamsERNS0_8svmModelIS5_EEPKS5_+0x1628) [0x7f3285dd6718]
#3 in /opt/conda/envs/rapids/lib/python3.7/site-packages/cuml/svm/svc.cpython-37m-x86_64-linux-gnu.so(+0x24994) [0x7f32c6627994]
#4 in /opt/conda/envs/rapids/lib/python3.7/site-packages/cuml/svm/svc.cpython-37m-x86_64-linux-gnu.so(+0x28c49) [0x7f32c662bc49]
#5 in /opt/conda/bin/python(PyObject_Call+0x6e) [0x55e72110875e]
#6 in /opt/conda/bin/python(_PyEval_EvalFrameDefault+0x1f6a) [0x55e7211afd6a]
#7 in /opt/conda/bin/python(_PyEval_EvalCodeWithName+0x5da) [0x55e7210f6bda]
#8 in /opt/conda/bin/python(_PyFunction_FastCallKeywords+0x387) [0x55e7211459e7]
#9 in /opt/conda/bin/python(_PyEval_EvalFrameDefault+0x4cb9) [0x55e7211b2ab9]
#10 in /opt/conda/bin/python(_PyFunction_FastCallKeywords+0xfb) [0x55e72114575b]
#11 in /opt/conda/bin/python(_PyEval_EvalFrameDefault+0x416) [0x55e7211ae216]
#12 in /opt/conda/bin/python(_PyEval_EvalCodeWithName+0x2f9) [0x55e7210f68f9]
#13 in /opt/conda/bin/python(PyEval_EvalCodeEx+0x44) [0x55e7210f7824]
#14 in /opt/conda/bin/python(PyEval_EvalCode+0x1c) [0x55e7210f784c]
#15 in /opt/conda/bin/python(+0x1dcafd) [0x55e7211bdafd]
#16 in /opt/conda/bin/python(_PyMethodDef_RawFastCallKeywords+0xe9) [0x55e721146069]
#17 in /opt/conda/bin/python(_PyCFunction_FastCallKeywords+0x21) [0x55e721146301]
#18 in /opt/conda/bin/python(_PyEval_EvalFrameDefault+0x4904) [0x55e7211b2704]
#19 in /opt/conda/bin/python(_PyGen_Send+0x2a2) [0x55e72114e212]
#20 in /opt/conda/bin/python(_PyEval_EvalFrameDefault+0x1add) [0x55e7211af8dd]
#21 in /opt/conda/bin/python(_PyGen_Send+0x2a2) [0x55e72114e212]
#22 in /opt/conda/bin/python(_PyEval_EvalFrameDefault+0x1add) [0x55e7211af8dd]
#23 in /opt/conda/bin/python(_PyGen_Send+0x2a2) [0x55e72114e212]
#24 in /opt/conda/bin/python(_PyMethodDef_RawFastCallKeywords+0x8d) [0x55e72114600d]
#25 in /opt/conda/bin/python(_PyMethodDescr_FastCallKeywords+0x4f) [0x55e72114d04f]
#26 in /opt/conda/bin/python(_PyEval_EvalFrameDefault+0x4e0d) [0x55e7211b2c0d]
#27 in /opt/conda/bin/python(_PyFunction_FastCallKeywords+0xfb) [0x55e72114575b]
#28 in /opt/conda/bin/python(_PyEval_EvalFrameDefault+0x416) [0x55e7211ae216]
#29 in /opt/conda/bin/python(_PyFunction_FastCallKeywords+0xfb) [0x55e72114575b]
#30 in /opt/conda/bin/python(_PyEval_EvalFrameDefault+0x6a0) [0x55e7211ae4a0]
#31 in /opt/conda/bin/python(_PyEval_EvalCodeWithName+0x2f9) [0x55e7210f68f9]
#32 in /opt/conda/bin/python(_PyFunction_FastCallDict+0x400) [0x55e7210f7c60]
#33 in /opt/conda/bin/python(_PyObject_Call_Prepend+0x63) [0x55e721115e03]
#34 in /opt/conda/bin/python(PyObject_Call+0x6e) [0x55e72110875e]
#35 in /opt/conda/bin/python(_PyEval_EvalFrameDefault+0x1f6a) [0x55e7211afd6a]
#36 in /opt/conda/bin/python(_PyEval_EvalCodeWithName+0x5da) [0x55e7210f6bda]
#37 in /opt/conda/bin/python(_PyFunction_FastCallKeywords+0x387) [0x55e7211459e7]
#38 in /opt/conda/bin/python(_PyEval_EvalFrameDefault+0x14e7) [0x55e7211af2e7]
#39 in /opt/conda/bin/python(+0x16ccd9) [0x55e72114dcd9]
#40 in /opt/conda/bin/python(_PyMethodDef_RawFastCallKeywords+0xe9) [0x55e721146069]
#41 in /opt/conda/bin/python(_PyCFunction_FastCallKeywords+0x21) [0x55e721146301]
#42 in /opt/conda/bin/python(_PyEval_EvalFrameDefault+0x4904) [0x55e7211b2704]
#43 in /opt/conda/bin/python(_PyEval_EvalCodeWithName+0xab8) [0x55e7210f70b8]
#44 in /opt/conda/bin/python(_PyFunction_FastCallKeywords+0x387) [0x55e7211459e7]
#45 in /opt/conda/bin/python(_PyEval_EvalFrameDefault+0x6a0) [0x55e7211ae4a0]
#46 in /opt/conda/bin/python(+0x16ccd9) [0x55e72114dcd9]
#47 in /opt/conda/bin/python(_PyMethodDef_RawFastCallKeywords+0xe9) [0x55e721146069]
#48 in /opt/conda/bin/python(_PyCFunction_FastCallKeywords+0x21) [0x55e721146301]
#49 in /opt/conda/bin/python(_PyEval_EvalFrameDefault+0x4904) [0x55e7211b2704]
#50 in /opt/conda/bin/python(_PyEval_EvalCodeWithName+0xab8) [0x55e7210f70b8]
#51 in /opt/conda/bin/python(_PyFunction_FastCallKeywords+0x387) [0x55e7211459e7]
#52 in /opt/conda/bin/python(_PyEval_EvalFrameDefault+0x416) [0x55e7211ae216]
#53 in /opt/conda/bin/python(+0x16ccd9) [0x55e72114dcd9]
#54 in /opt/conda/bin/python(_PyMethodDef_RawFastCallKeywords+0xe9) [0x55e721146069]
#55 in /opt/conda/bin/python(_PyCFunction_FastCallKeywords+0x21) [0x55e721146301]
#56 in /opt/conda/bin/python(_PyEval_EvalFrameDefault+0x4904) [0x55e7211b2704]
#57 in /opt/conda/bin/python(_PyEval_EvalCodeWithName+0xab8) [0x55e7210f70b8]
#58 in /opt/conda/bin/python(_PyFunction_FastCallDict+0x1d5) [0x55e7210f7a35]
#59 in /opt/conda/bin/python(_PyObject_Call_Prepend+0x63) [0x55e721115e03]
#60 in /opt/conda/bin/python(PyObject_Call+0x6e) [0x55e72110875e]
#61 in /opt/conda/bin/python(_PyEval_EvalFrameDefault+0x1f6a) [0x55e7211afd6a]
#62 in /opt/conda/bin/python(_PyGen_Send+0x14c) [0x55e72114e0bc]
#63 in /opt/conda/bin/python(_PyMethodDef_RawFastCallKeywords+0x8d) [0x55e72114600d]


In [12]:
check_svm1 = np.zeros([targets.shape[0], targets.shape[1]-1])
check_svm1[cons_train_index,:] = svm1_oof
print('OOF log loss: ', log_loss(np.ravel(y), np.ravel(check_svm1)))

NameError: name 'svm1_oof' is not defined

# submission

In [13]:
sub = pd.read_csv(DATA_DIR + 'sample_submission.csv')

sub.loc[cons_test_index,target_feats] =  svm1_test 
sub.loc[noncons_test_index,target_feats] = 0
sub.to_csv('submission.csv', index=False)

NameError: name 'svm1_test' is not defined