# base 1d cnn model


In [56]:
EPOCHS = 25
BATCH_SIZE = 128
LEARNING_RATE = 1e-3
WEIGHT_DECAY = 1e-5
NFOLDS = 5
EARLY_STOPPING_STEPS = 10
EARLY_STOP = False
SEED = [0, 1, 2, 3 ,4, 5]

In [57]:
#ColaboratoryかKaggleNotebookか判別
import sys
import os
from pathlib import Path

if 'google.colab' in sys.modules:  # colab環境
    SEED = [0] # 1つ目のみ
    INPUT = Path('/content/input/')

    # drive mount
    from google.colab import drive
    drive.mount('/content/drive/')

    # working dir に移動
    os.chdir('/content/drive/MyDrive/git/kaggle-lish-moa/working')
    print("cwd:", os.getcwd())
    !pip install iterative-stratification

elif 'kaggle_web_client' in sys.modules:  # kaggle環境
    INPUT = Path('../input/')
    sys.path.append('../input/iterative-stratification/iterative-stratification-master')

from iterstrat.ml_stratifiers import MultilabelStratifiedKFold


Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).
cwd: /content/drive/MyDrive/git/kaggle-lish-moa/working


In [58]:
import numpy as np
import random
import pandas as pd
import matplotlib.pyplot as plt
import os
import copy
import seaborn as sns

from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

import warnings
warnings.filterwarnings('ignore')

In [59]:
os.listdir('../input/lish-moa')

['train_drug.csv',
 'test_features.csv',
 'train_targets_scored.csv',
 'train_targets_nonscored.csv',
 'train_features.csv',
 'sample_submission.csv']

In [60]:
train_features = pd.read_csv('../input/lish-moa/train_features.csv')
train_targets_scored = pd.read_csv('../input/lish-moa/train_targets_scored.csv')
train_targets_nonscored = pd.read_csv('../input/lish-moa/train_targets_nonscored.csv')

test_features = pd.read_csv('../input/lish-moa/test_features.csv')
sample_submission = pd.read_csv('../input/lish-moa/sample_submission.csv')

In [61]:
# 相関係数の高い nonscored を抽出
corr = pd.concat([train_targets_scored, train_targets_nonscored],axis=1).corr()
corr[train_targets_nonscored.drop("sig_id",axis=1).columns]
corr_se = corr[:len(train_targets_scored.drop("sig_id",axis=1).columns)][train_targets_nonscored.drop("sig_id",axis=1).columns].abs().max(axis=0).sort_values(ascending=False)
len(corr_se[corr_se>0.3])
non_scored_target_high_corr = list(corr_se[corr_se>0.3].index)
train_targets_nonscored_high_corr = train_targets_nonscored[["sig_id"]+non_scored_target_high_corr]
print(train_targets_nonscored_high_corr)

             sig_id  ...  differentiation_inducer
0      id_000644bb2  ...                        0
1      id_000779bfc  ...                        0
2      id_000a6266a  ...                        0
3      id_0015fd391  ...                        0
4      id_001626bd3  ...                        0
...             ...  ...                      ...
23809  id_fffb1ceed  ...                        0
23810  id_fffb70c0c  ...                        0
23811  id_fffc1c3f4  ...                        0
23812  id_fffcb9e7c  ...                        0
23813  id_ffffdd77b  ...                        0

[23814 rows x 34 columns]


In [62]:
GENES = [col for col in train_features.columns if col.startswith('g-')]
CELLS = [col for col in train_features.columns if col.startswith('c-')]
feat_dic = {}
feat_dic['gene'] = GENES
feat_dic['cell'] = CELLS

def norm_fit(df_1,saveM = True, sc_name = 'zsco'):   
    from sklearn.preprocessing import StandardScaler,MinMaxScaler,MaxAbsScaler,RobustScaler,Normalizer,QuantileTransformer,PowerTransformer
    ss_1_dic = {'zsco':StandardScaler(),
                'mima':MinMaxScaler(),
                'maxb':MaxAbsScaler(), 
                'robu':RobustScaler(),
                'norm':Normalizer(), 
                'quan':QuantileTransformer(n_quantiles=100,random_state=0, output_distribution="normal"),
                'powe':PowerTransformer()}
    ss_1 = ss_1_dic[sc_name]
    df_2 = pd.DataFrame(ss_1.fit_transform(df_1),index = df_1.index,columns = df_1.columns)
    if saveM == False:
        return(df_2)
    else:
        return(df_2,ss_1)
  
def norm_tra(df_1,ss_x):
    df_2 = pd.DataFrame(ss_x.transform(df_1),index = df_1.index,columns = df_1.columns)
    return(df_2)
    
# sample norm 
q2 = train_features[feat_dic['gene']].apply(np.quantile,axis = 1,q = 0.25).copy()
q7 = train_features[feat_dic['gene']].apply(np.quantile,axis = 1,q = 0.75).copy()
qmean = (q2+q7)/2
train_features[feat_dic['gene']] = (train_features[feat_dic['gene']].T - qmean.values).T
q2 = test_features[feat_dic['gene']].apply(np.quantile,axis = 1,q = 0.25).copy()
q7 = test_features[feat_dic['gene']].apply(np.quantile,axis = 1,q = 0.75).copy()
qmean = (q2+q7)/2
test_features[feat_dic['gene']] = (test_features[feat_dic['gene']].T - qmean.values).T

q2 = train_features[feat_dic['cell']].apply(np.quantile,axis = 1,q = 0.25).copy()
q7 = train_features[feat_dic['cell']].apply(np.quantile,axis = 1,q = 0.72).copy()
qmean = (q2+q7)/2
train_features[feat_dic['cell']] = (train_features[feat_dic['cell']].T - qmean.values).T
qmean2 = train_features[feat_dic['cell']].abs().apply(np.quantile,axis = 1,q = 0.75).copy()+4
train_features[feat_dic['cell']] = (train_features[feat_dic['cell']].T / qmean2.values).T.copy()

q2 = test_features[feat_dic['cell']].apply(np.quantile,axis = 1,q = 0.25).copy()
q7 = test_features[feat_dic['cell']].apply(np.quantile,axis = 1,q = 0.72).copy()
qmean = (q2+q7)/2
test_features[feat_dic['cell']] = (test_features[feat_dic['cell']].T - qmean.values).T
qmean2 = test_features[feat_dic['cell']].abs().apply(np.quantile,axis = 1,q = 0.75).copy()+4
test_features[feat_dic['cell']] = (test_features[feat_dic['cell']].T / qmean2.values).T.copy()

#------------ norm --------------
col_num = list(set(feat_dic['gene'] + feat_dic['cell']) )
col_num.sort()
train_features[col_num], ss = norm_fit(train_features[col_num],True,'quan')
test_features[col_num]     = norm_tra(test_features[col_num],ss)

In [63]:
def seed_everything(seed=42):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    
seed_everything(seed=42)

In [64]:
train_targets_scored.sum()[1:].sort_values()

atp-sensitive_potassium_channel_antagonist      1
erbb2_inhibitor                                 1
diuretic                                        6
autotaxin_inhibitor                             6
protein_phosphatase_inhibitor                   6
                                             ... 
serotonin_receptor_antagonist                 404
dopamine_receptor_antagonist                  424
cyclooxygenase_inhibitor                      435
proteasome_inhibitor                          726
nfkb_inhibitor                                832
Length: 206, dtype: object

In [65]:
train_features['cp_type'].unique()

array(['trt_cp', 'ctl_vehicle'], dtype=object)

# PCA features + Existing features

In [66]:
# GENES
n_comp = 50

data = pd.concat([pd.DataFrame(train_features[GENES]), pd.DataFrame(test_features[GENES])])
data2 = (PCA(n_components=n_comp, random_state=42).fit_transform(data[GENES]))
train2 = data2[:train_features.shape[0]]; test2 = data2[-test_features.shape[0]:]

train2 = pd.DataFrame(train2, columns=[f'pca_G-{i}' for i in range(n_comp)])
test2 = pd.DataFrame(test2, columns=[f'pca_G-{i}' for i in range(n_comp)])

# drop_cols = [f'c-{i}' for i in range(n_comp,len(GENES))]
train_features = pd.concat((train_features, train2), axis=1)
test_features = pd.concat((test_features, test2), axis=1)

In [67]:
#CELLS
n_comp = 15

data = pd.concat([pd.DataFrame(train_features[CELLS]), pd.DataFrame(test_features[CELLS])])
data2 = (PCA(n_components=n_comp, random_state=42).fit_transform(data[CELLS]))
train2 = data2[:train_features.shape[0]]; test2 = data2[-test_features.shape[0]:]

train2 = pd.DataFrame(train2, columns=[f'pca_C-{i}' for i in range(n_comp)])
test2 = pd.DataFrame(test2, columns=[f'pca_C-{i}' for i in range(n_comp)])

# drop_cols = [f'c-{i}' for i in range(n_comp,len(CELLS))]
train_features = pd.concat((train_features, train2), axis=1)
test_features = pd.concat((test_features, test2), axis=1)

# feature Selection using Variance Encoding

In [68]:
from sklearn.feature_selection import VarianceThreshold


var_thresh = VarianceThreshold(threshold=0.5)
data = train_features.append(test_features)
data_transformed = var_thresh.fit_transform(data.iloc[:, 4:])

train_features_transformed = data_transformed[ : train_features.shape[0]]
test_features_transformed = data_transformed[-test_features.shape[0] : ]


train_features = pd.DataFrame(train_features[['sig_id','cp_type','cp_time','cp_dose']].values.reshape(-1, 4),\
                              columns=['sig_id','cp_type','cp_time','cp_dose'])

train_features = pd.concat([train_features, pd.DataFrame(train_features_transformed)], axis=1)


test_features = pd.DataFrame(test_features[['sig_id','cp_type','cp_time','cp_dose']].values.reshape(-1, 4),\
                             columns=['sig_id','cp_type','cp_time','cp_dose'])

test_features = pd.concat([test_features, pd.DataFrame(test_features_transformed)], axis=1)

train_features


Unnamed: 0,sig_id,cp_type,cp_time,cp_dose,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,...,897,898,899,900,901,902,903,904,905,906,907,908,909,910,911,912,913,914,915,916,917,918,919,920,921,922,923,924,925,926,927,928,929,930,931,932,933,934,935,936
0,id_000644bb2,trt_cp,24,D1,1.126588,0.896743,-0.419975,-0.987724,-0.261362,-1.019833,-1.357432,-0.036391,0.683932,-0.312484,1.544427,0.169884,0.629200,-0.560967,0.284361,-1.063809,-1.140764,0.868345,0.382571,-0.515578,-0.744083,-1.308057,-1.685607,1.232221,0.551543,0.398992,0.238473,0.166220,-0.535662,0.779434,0.414264,-1.121055,-0.059489,-0.445588,-0.204124,0.266637,...,1.874684,0.728876,-0.682201,-0.867972,-0.727414,0.896742,-0.160439,0.201803,-1.701895,0.038630,0.217904,-1.560113,-0.251815,2.059157,0.177284,0.753183,0.172003,0.254249,1.103743,-2.423089,0.282657,-0.056549,-0.550481,0.470953,-0.819566,0.155108,1.129632,1.769502,0.061783,-1.103373,-0.687035,2.363661,-0.119732,1.058464,-0.396841,-0.482863,1.082089,-0.608718,-0.950615,-0.598907
1,id_000779bfc,trt_cp,72,D1,0.123960,0.686144,0.281471,0.089220,1.197490,0.695488,0.325732,0.558904,-0.528336,0.846367,-1.255438,-0.563654,-0.200083,0.549301,0.162757,0.402200,0.406731,0.410560,-0.831429,0.511447,1.256254,-0.124842,-0.387620,-0.415918,0.400921,-0.579409,0.651503,0.234805,-0.737432,-0.179216,-0.106861,-0.537304,1.652468,-0.360877,0.367221,-0.269058,...,-2.507209,0.812504,-0.494634,0.511296,0.945359,2.548873,0.652772,-1.680784,-1.395383,2.575009,-0.298309,1.074672,1.706043,-0.366847,-1.130237,1.450744,-0.765327,-0.052048,-0.485376,0.853620,-1.366777,0.516627,0.262665,-0.224431,-1.143572,-0.334881,-1.205319,-0.054506,-0.800481,-0.276697,0.462977,-0.174409,-0.245746,-2.197200,0.679922,0.844831,0.382629,0.042944,0.224284,-0.149290
2,id_000a6266a,trt_cp,48,D1,0.791802,0.963103,1.446197,-0.104188,0.016850,1.506677,0.259336,0.388398,0.018690,1.263162,-0.631766,-0.744935,-0.104532,-2.272928,0.915987,-0.516485,0.488828,-0.381854,-0.264876,-0.014108,-0.085669,-0.977670,-1.943257,0.552849,0.644842,0.732406,-1.375768,2.391415,-0.062331,1.604275,-1.456559,0.835065,0.757337,0.233871,0.026929,-1.628269,...,-0.397246,-1.556993,-1.364556,-1.276963,-0.575073,1.139321,-2.757371,-0.148768,1.124297,-1.284978,2.288263,3.239749,-3.147330,1.647471,2.900226,0.035748,1.511792,-0.538020,-0.514330,1.722944,-0.042691,-2.499278,-0.361552,-0.065624,-3.439062,0.360763,0.236295,-0.459681,0.782969,-0.706135,0.228319,0.043046,0.128723,0.095275,-0.093033,-1.067634,-0.158550,-0.425903,1.510575,-0.614885
3,id_0015fd391,trt_cp,48,D1,-0.678723,-0.225107,-0.390217,0.817606,2.347629,-0.829998,-2.240530,0.358999,-0.144604,-1.366325,-0.983661,-0.448022,-1.119056,-0.759329,-1.758028,1.454682,-0.186508,-1.017801,0.333821,-2.112178,2.098888,-1.292435,-1.057987,-1.011599,-1.391260,0.050921,-1.030224,0.311702,-0.397560,-1.224687,-0.954690,0.193184,-2.071169,1.082679,0.848135,-2.098790,...,-0.403723,0.799991,-0.846906,-1.825898,0.275374,3.215788,2.466925,2.896389,-1.712297,1.412401,-1.530897,1.101305,-5.110287,-4.281156,1.345197,-1.717697,0.219488,1.444570,2.686697,-0.720024,1.434588,1.635843,0.502135,1.529322,-0.407975,5.915576,7.043598,-1.278447,-3.642887,-0.170505,1.328007,-0.341669,-0.700178,-0.568794,0.617534,-1.144837,2.252996,-0.307845,2.358449,-0.733371
4,id_001626bd3,trt_cp,72,D2,-0.455668,-0.480522,0.967960,0.978515,1.455881,-0.875480,-0.378600,-0.213123,-1.067782,0.846761,-0.349202,-0.708051,-1.257696,1.176129,0.319188,0.333307,-0.030044,-1.567980,-0.863951,-0.708170,-0.464143,-0.291901,1.055318,0.481658,1.671455,1.008205,1.084597,-0.562678,-1.550207,0.376295,-0.157058,0.842966,1.108839,-0.143759,1.506831,-1.012107,...,-2.842882,0.590315,0.866107,0.838733,0.742959,-0.048764,1.566977,-0.588171,-1.045144,3.052931,-1.138468,-1.484278,0.894380,-2.876091,-0.384195,-0.206759,-2.312379,-0.468270,1.313270,1.193817,0.719543,-0.504175,0.625338,0.591882,0.430694,-0.147733,0.656648,-0.200896,1.242065,-0.432718,-0.425102,0.018450,0.033976,0.282757,0.236529,-0.090623,-0.186010,0.049554,0.686977,0.187832
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
23809,id_fffb1ceed,trt_cp,24,D2,0.187188,-0.034056,-0.249429,-0.832378,-0.679101,0.908953,0.718568,0.480088,-0.637730,-0.055882,0.281132,0.187133,-0.706960,-1.506956,0.370626,-0.778314,0.218043,0.214944,-1.535446,-0.137503,1.132241,-0.510657,0.823143,0.509190,-0.700666,-0.614566,-1.087769,-0.383896,-1.097243,0.279529,0.955677,-0.028320,-0.581780,-0.880376,0.969181,-0.243005,...,-1.756401,-0.818403,0.732983,0.507685,-0.406591,-0.649918,-0.660144,2.410254,-1.417525,-0.057994,2.201000,0.117971,0.400822,2.322524,0.551766,-0.900840,0.855837,-1.174991,0.228284,0.609089,-1.026241,0.148294,2.244669,1.241264,-0.043656,-1.050282,0.709188,-0.058357,-0.780204,1.271030,0.939382,-0.901208,-0.039406,-1.717758,-2.110985,-0.094941,0.872448,-0.179214,0.339059,0.011609
23810,id_fffb70c0c,trt_cp,24,D2,-1.884473,0.577520,-0.592163,1.313730,-0.988445,0.850154,-0.302785,-0.700000,0.635047,0.647216,-0.298722,1.773136,-0.374706,-0.855289,1.187777,0.966057,-0.497439,0.184017,0.010274,0.193440,-0.300944,-1.103900,0.141793,-1.815007,-0.966702,-1.222007,0.397687,0.179500,0.138097,0.220735,0.236966,0.755984,-0.004636,-0.105144,1.283041,0.547557,...,0.673692,1.248552,-1.891382,-0.396722,-0.010035,-1.606604,-0.422678,-1.524455,-1.018971,-0.047771,0.936799,1.223066,-0.410030,0.715196,-0.677887,0.861431,0.573731,0.581244,-0.391945,0.546381,-1.140731,-1.245623,-0.559947,-1.892817,0.162867,-0.440551,-0.393629,2.727819,1.514617,0.884857,2.089866,-0.903363,1.596745,-0.966179,-0.909414,-0.350229,-1.627276,1.369043,-1.987220,0.880771
23811,id_fffc1c3f4,ctl_vehicle,48,D2,0.523746,0.629871,0.293035,-1.155001,0.755713,0.006751,-0.332541,0.301984,-0.054996,0.419032,-0.291796,0.074075,0.159425,-0.063017,0.259025,-0.356916,-0.197651,-0.550486,0.821665,-0.726219,0.218768,-0.274455,0.390740,-1.060709,0.632625,1.031457,-0.516732,-1.737759,0.718004,1.550451,-0.332772,-2.226475,-0.474578,-0.033901,-0.385621,-0.309447,...,4.020869,0.882611,-2.952872,-1.611710,0.571437,-2.019117,0.426538,-2.023727,3.154268,-0.598999,-1.030936,2.518993,-0.232341,1.191275,1.262104,-0.391270,0.928126,0.937796,1.476072,-0.571739,0.667097,0.488269,0.758886,1.743447,-1.043349,0.174079,0.821518,1.073441,-0.860698,0.939051,-0.362093,-0.767579,0.407726,0.141635,-0.161827,0.258464,-0.177991,0.050372,0.134395,0.132962
23812,id_fffcb9e7c,trt_cp,24,D1,0.733179,0.285483,0.339431,0.179894,0.977203,-0.131723,-0.065934,-0.037950,0.427271,-1.271809,0.637745,0.054030,1.676909,0.141191,0.678802,-2.306730,-0.095084,0.724772,0.260592,1.378512,-1.056449,1.255206,0.427376,2.436134,0.517181,0.875201,0.192560,-0.615563,1.151749,-1.824834,-0.457983,-0.596739,1.000482,-0.177672,-0.229582,1.576964,...,0.604516,0.958135,-0.576911,3.018175,3.840337,-1.996945,-3.727985,2.913305,0.082226,0.026248,-1.123510,-1.742035,1.632227,-2.704517,1.593665,-0.284915,0.758484,-0.177536,0.690516,0.573094,-2.316931,-0.084358,-0.436005,0.478374,0.263108,-0.916321,0.248089,1.720493,1.079618,1.223236,-0.936273,0.540446,-1.280423,-0.100979,2.306954,-1.543399,0.672200,2.061812,0.069075,0.463491


In [69]:
train = train_features.merge(train_targets_scored, on='sig_id')
train = train.merge(train_targets_nonscored_high_corr, on='sig_id')

train = train[train['cp_type']!='ctl_vehicle'].reset_index(drop=True)
test = test_features[test_features['cp_type']!='ctl_vehicle'].reset_index(drop=True)

#----
# augmentation for less target
skew_target_cols = train_targets_scored.drop('sig_id', axis=1).sum()[lambda x:x<10].index.values
print(skew_target_cols)
augmented_num = 0
for col in skew_target_cols:
    tmp_df = train[train[col]==1].copy()
for _ in range(4):
    train = pd.concat((train, tmp_df))
    augmented_num += len(tmp_df.index) * 4
print(train.shape)
train = train.reset_index(drop=True)
#---
target = train[train_targets_scored.columns]

['aldehyde_dehydrogenase_inhibitor' 'antiarrhythmic'
 'atm_kinase_inhibitor' 'atp-sensitive_potassium_channel_antagonist'
 'autotaxin_inhibitor' 'bacterial_membrane_integrity_inhibitor'
 'calcineurin_inhibitor' 'coagulation_factor_inhibitor' 'diuretic'
 'elastase_inhibitor' 'erbb2_inhibitor' 'laxative' 'leukotriene_inhibitor'
 'lxr_agonist' 'nicotinic_receptor_agonist'
 'norepinephrine_reuptake_inhibitor' 'protein_phosphatase_inhibitor'
 'retinoid_receptor_antagonist' 'steroid' 'tlr_antagonist'
 'tropomyosin_receptor_kinase_inhibitor'
 'ubiquitin_specific_protease_inhibitor']
(21972, 1180)


In [70]:
train = train.drop('cp_type', axis=1)
test = test.drop('cp_type', axis=1)

In [71]:
train

Unnamed: 0,sig_id,cp_time,cp_dose,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31,32,33,34,35,36,...,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor,abc_transporter_expression_enhancer,dna_methyltransferase_inhibitor,ror_inverse_agonist,nfkb_activator,sars_coronavirus_3c-like_protease_inhibitor,glucocorticoid_receptor_antagonist,macrophage_migration_inhibiting_factor_inhibitor,heme_oxygenase_activators,glutathione_reductase_(nadph)_activators,diacylglycerol_o_acyltransferase_inhibitor,keap1_ligand,steryl_sulfatase_inhibitor,hiv_protease_inhibitor,quorum_sensing_signaling_modulator,camp_stimulant,macrophage_inhibitor,abl_inhibitor,membrane_permeability_inhibitor,ephrin_inhibitor,gaba_gated_chloride_channel_blocker,omega_3_fatty_acid_stimulant,niemann-pick_c1-like_1_protein_antagonist,gap_junction_modulator,dna_dependent_protein_kinase_inhibitor,reducing_agent,big1_inhibitor,tyrosine_phosphatase_inhibitor,hgf_receptor_inhibitor,caspase_inhibitor,selective_estrogen_receptor_modulator_(serm),imidazoline_ligand,sphingosine_1_phosphate_receptor_agonist,differentiation_inducer
0,id_000644bb2,24,D1,1.126588,0.896743,-0.419975,-0.987724,-0.261362,-1.019833,-1.357432,-0.036391,0.683932,-0.312484,1.544427,0.169884,0.629200,-0.560967,0.284361,-1.063809,-1.140764,0.868345,0.382571,-0.515578,-0.744083,-1.308057,-1.685607,1.232221,0.551543,0.398992,0.238473,0.166220,-0.535662,0.779434,0.414264,-1.121055,-0.059489,-0.445588,-0.204124,0.266637,0.381819,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,id_000779bfc,72,D1,0.123960,0.686144,0.281471,0.089220,1.197490,0.695488,0.325732,0.558904,-0.528336,0.846367,-1.255438,-0.563654,-0.200083,0.549301,0.162757,0.402200,0.406731,0.410560,-0.831429,0.511447,1.256254,-0.124842,-0.387620,-0.415918,0.400921,-0.579409,0.651503,0.234805,-0.737432,-0.179216,-0.106861,-0.537304,1.652468,-0.360877,0.367221,-0.269058,0.210515,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,id_000a6266a,48,D1,0.791802,0.963103,1.446197,-0.104188,0.016850,1.506677,0.259336,0.388398,0.018690,1.263162,-0.631766,-0.744935,-0.104532,-2.272928,0.915987,-0.516485,0.488828,-0.381854,-0.264876,-0.014108,-0.085669,-0.977670,-1.943257,0.552849,0.644842,0.732406,-1.375768,2.391415,-0.062331,1.604275,-1.456559,0.835065,0.757337,0.233871,0.026929,-1.628269,0.171128,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,id_0015fd391,48,D1,-0.678723,-0.225107,-0.390217,0.817606,2.347629,-0.829998,-2.240530,0.358999,-0.144604,-1.366325,-0.983661,-0.448022,-1.119056,-0.759329,-1.758028,1.454682,-0.186508,-1.017801,0.333821,-2.112178,2.098888,-1.292435,-1.057987,-1.011599,-1.391260,0.050921,-1.030224,0.311702,-0.397560,-1.224687,-0.954690,0.193184,-2.071169,1.082679,0.848135,-2.098790,-1.399664,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,id_001626bd3,72,D2,-0.455668,-0.480522,0.967960,0.978515,1.455881,-0.875480,-0.378600,-0.213123,-1.067782,0.846761,-0.349202,-0.708051,-1.257696,1.176129,0.319188,0.333307,-0.030044,-1.567980,-0.863951,-0.708170,-0.464143,-0.291901,1.055318,0.481658,1.671455,1.008205,1.084597,-0.562678,-1.550207,0.376295,-0.157058,0.842966,1.108839,-0.143759,1.506831,-1.012107,-1.331879,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
21967,id_2c1f61f65,72,D2,-0.052487,-0.090507,0.634989,0.970022,0.622500,0.058178,0.543512,-0.594642,-0.360361,-1.218731,0.958293,0.761964,-0.271883,-0.683985,-0.375000,0.607132,0.853628,-0.138479,0.934118,0.331971,1.218196,0.522146,-0.699850,0.532469,-1.542698,-0.152300,-0.686239,-0.799104,0.505779,1.270769,0.650029,-0.513454,0.267579,0.143805,0.563469,0.368879,0.065039,...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
21968,id_311b47f39,24,D1,0.053022,-1.254248,1.023506,-0.701955,0.653108,1.511247,1.466840,0.904672,-1.123266,-1.249615,-1.425165,0.120341,0.176264,-0.592334,-1.659904,1.388189,0.402469,-1.403861,0.312515,1.156923,0.190132,-0.334289,-1.512546,-1.219286,1.489906,0.965432,-0.745090,-0.772918,-0.040477,1.231560,-0.695933,0.976067,-1.158849,1.105856,1.018993,-1.305189,-1.050904,...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
21969,id_452abbea4,72,D1,-0.274378,0.379436,-0.168918,0.203143,-0.624543,-0.607401,-0.423641,0.337792,-0.127849,0.221964,0.611517,0.070140,0.251288,-0.552554,0.816557,-0.645957,1.025548,0.353395,-0.508629,0.345481,0.703846,-0.855306,-0.694089,0.934408,0.520652,0.520284,-0.502054,-0.609899,0.609651,0.385123,0.634011,0.438970,0.878351,1.195516,-0.725871,-1.578988,0.924955,...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
21970,id_5c8f28f62,48,D2,-0.421358,-0.895211,-0.399550,0.043395,-0.333042,0.693621,-0.440438,-0.715162,0.174992,-0.366629,0.673011,-0.145082,0.186435,-0.277007,0.444210,-0.634385,0.761734,0.423898,2.526040,0.803327,-0.945090,0.315366,-1.372486,0.496475,-0.405035,1.305608,-0.162337,0.420401,0.985519,0.191848,0.928903,0.668194,-0.676193,0.597841,-0.175968,0.640769,-0.702191,...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [72]:
target_cols = target.drop('sig_id', axis=1).columns.values.tolist()

# CV folds

In [73]:
train_drug = pd.read_csv('../input/lish-moa/train_drug.csv')

folds = train.copy()
def make_fold():
    folds = train.copy()

    """
    # LOCATE DRUGS
    vc = train_drug.drug_id.value_counts()
    vc1 = vc.loc[vc <= 19].index
    vc2 = vc.loc[vc > 19].index


    # kfold - leave drug out target2 = target.copy()
    dct1 = {}; dct2 = {}
    skf = MultilabelStratifiedKFold(n_splits = 5) # , shuffle = True, random_state = seed
    tmp = target2.groupby('drug_id')[target_cols].mean().loc[vc1]
    tmp_idx = tmp.index.tolist()
    tmp_idx.sort()
    tmp_idx2 = random.sample(tmp_idx,len(tmp_idx))
    tmp = tmp.loc[tmp_idx2]
    for fold,(idxT,idxV) in enumerate(skf.split(tmp,tmp[target_cols])):
        dd = {k:fold for k in tmp.index[idxV].values}
        dct1.update(dd)

    skf = MultilabelStratifiedKFold(n_splits = 5) # , shuffle = True, random_state = seed
    tmp = target2.loc[target2.drug_id.isin(vc2)].reset_index(drop = True)
    tmp_idx = tmp.index.tolist()
    tmp_idx.sort()
    tmp_idx2 = random.sample(tmp_idx,len(tmp_idx))
    tmp = tmp.loc[tmp_idx2]
    for fold,(idxT,idxV) in enumerate(skf.split(tmp,tmp[target_cols])):
        dd = {k:fold for k in tmp.sig_id[idxV].values}
        dct2.update(dd)

    target2['kfold'] = target2.drug_id.map(dct1)
    target2.loc[target2.kfold.isna(),'kfold'] = target2.loc[target2.kfold.isna(),'sig_id'].map(dct2)
    target2.kfold = target2.kfold.astype(int)
    folds['kfold'] = target2['kfold'].copy()

    """
    folds = train.copy()

    mskf = MultilabelStratifiedKFold(n_splits=5)

    for f, (t_idx, v_idx) in enumerate(mskf.split(X=train, y=target)):
        folds.loc[v_idx, 'kfold'] = int(f)

    folds['kfold'] = folds['kfold'].astype(int)
    

    return folds


In [74]:
print(train.shape)
print(folds.shape)
print(test.shape)
print(target.shape)
print(sample_submission.shape)

(21972, 1179)
(21972, 1179)
(3624, 940)
(21972, 207)
(3982, 207)


# Dataset Classes

In [75]:
class MoADataset:
    def __init__(self, features, targets):
        self.features = features
        self.targets = targets
        
    def __len__(self):
        return (self.features.shape[0])
    
    def __getitem__(self, idx):
        dct = {
            'x' : torch.tensor(self.features[idx, :], dtype=torch.float),
            'y' : torch.tensor(self.targets[idx, :], dtype=torch.float)            
        }
        return dct
    
class TestDataset:
    def __init__(self, features):
        self.features = features
        
    def __len__(self):
        return (self.features.shape[0])
    
    def __getitem__(self, idx):
        dct = {
            'x' : torch.tensor(self.features[idx, :], dtype=torch.float)
        }
        return dct
    

In [76]:
def train_fn(model, optimizer, scheduler, loss_fn, dataloader, device):
    model.train()
    final_loss = 0
    
    for data in dataloader:
        optimizer.zero_grad()
        inputs, targets = data['x'].to(device), data['y'].to(device)
#         print(inputs.shape)
        outputs = model(inputs)
        loss = loss_fn(outputs, targets)
        loss.backward()
        optimizer.step()
        scheduler.step()
        
        final_loss += loss.item()
        
    final_loss /= len(dataloader)
    
    return final_loss


def valid_fn(model, loss_fn, dataloader, device):
    model.eval()
    final_loss = 0
    valid_preds = []
    
    for data in dataloader:
        inputs, targets = data['x'].to(device), data['y'].to(device)
        outputs = model(inputs)
        loss = loss_fn(outputs, targets)
        
        final_loss += loss.item()
        valid_preds.append(outputs.sigmoid().detach().cpu().numpy())
        
    final_loss /= len(dataloader)
    valid_preds = np.concatenate(valid_preds)
    
    return final_loss, valid_preds

def inference_fn(model, dataloader, device):
    model.eval()
    preds = []
    
    for data in dataloader:
        inputs = data['x'].to(device)

        with torch.no_grad():
            outputs = model(inputs)
        
        preds.append(outputs.sigmoid().detach().cpu().numpy())
        
    preds = np.concatenate(preds)
    
    return preds
   
    

# Model

In [77]:
class Model(nn.Module):
    def __init__(self, num_features, num_targets, hidden_size):
        super(Model, self).__init__()
        cha_1 = 256
        cha_2 = 512
        cha_3 = 512

        cha_1_reshape = int(hidden_size/cha_1)
        cha_po_1 = int(hidden_size/cha_1/2)
        cha_po_2 = int(hidden_size/cha_1/2/2) * cha_3

        self.cha_1 = cha_1
        self.cha_2 = cha_2
        self.cha_3 = cha_3
        self.cha_1_reshape = cha_1_reshape
        self.cha_po_1 = cha_po_1
        self.cha_po_2 = cha_po_2

        self.batch_norm1 = nn.BatchNorm1d(num_features)
        self.dropout1 = nn.Dropout(0.1)
        self.dense1 = nn.utils.weight_norm(nn.Linear(num_features, hidden_size))

        self.batch_norm_c1 = nn.BatchNorm1d(cha_1)
        self.dropout_c1 = nn.Dropout(0.1)
        self.conv1 = nn.utils.weight_norm(nn.Conv1d(cha_1,cha_2, kernel_size = 5, stride = 1, padding=2,  bias=False),dim=None)

        self.ave_po_c1 = nn.AdaptiveAvgPool1d(output_size = cha_po_1)

        self.batch_norm_c2 = nn.BatchNorm1d(cha_2)
        self.dropout_c2 = nn.Dropout(0.1)
        self.conv2 = nn.utils.weight_norm(nn.Conv1d(cha_2,cha_2, kernel_size = 3, stride = 1, padding=1, bias=True),dim=None)

        self.batch_norm_c2_1 = nn.BatchNorm1d(cha_2)
        self.dropout_c2_1 = nn.Dropout(0.3)
        self.conv2_1 = nn.utils.weight_norm(nn.Conv1d(cha_2,cha_2, kernel_size = 3, stride = 1, padding=1, bias=True),dim=None)

        self.batch_norm_c2_2 = nn.BatchNorm1d(cha_2)
        self.dropout_c2_2 = nn.Dropout(0.2)
        self.conv2_2 = nn.utils.weight_norm(nn.Conv1d(cha_2,cha_3, kernel_size = 5, stride = 1, padding=2, bias=True),dim=None)

        self.max_po_c2 = nn.MaxPool1d(kernel_size=4, stride=2, padding=1)

        self.flt = nn.Flatten()

        self.batch_norm3 = nn.BatchNorm1d(cha_po_2)
        self.dropout3 = nn.Dropout(0.2)
        self.dense3 = nn.utils.weight_norm(nn.Linear(cha_po_2, num_targets))

    def forward(self, x):
        x = self.batch_norm1(x)
        x = self.dropout1(x)
        x = F.celu(self.dense1(x), alpha=0.06)

        x = x.reshape(x.shape[0],self.cha_1,self.cha_1_reshape)

        x = self.batch_norm_c1(x)
        x = self.dropout_c1(x)
        x = F.relu(self.conv1(x))

        x = self.ave_po_c1(x)

        x = self.batch_norm_c2(x)
        x = self.dropout_c2(x)
        x = F.relu(self.conv2(x))
        x_s = x

        x = self.batch_norm_c2_1(x)
        x = self.dropout_c2_1(x)
        x = F.relu(self.conv2_1(x))

        x = self.batch_norm_c2_2(x)
        x = self.dropout_c2_2(x)
        x = F.relu(self.conv2_2(x))
        x =  x * x_s

        x = self.max_po_c2(x)

        x = self.flt(x)

        x = self.batch_norm3(x)
        x = self.dropout3(x)
        x = self.dense3(x)

        return x

from torch.nn.modules.loss import _WeightedLoss
class SmoothBCEwLogits(_WeightedLoss):
    def __init__(self, weight=None, reduction='mean', smoothing=0.0):
        super().__init__(weight=weight, reduction=reduction)
        self.smoothing = smoothing
        self.weight = weight
        self.reduction = reduction
    @staticmethod
    def _smooth(targets:torch.Tensor, n_labels:int, smoothing=0.0):
        assert 0 <= smoothing < 1
        with torch.no_grad():
            targets = targets * (1.0 - smoothing) + 0.5 * smoothing
        return targets
    def forward(self, inputs, targets):
        targets = SmoothBCEwLogits._smooth(targets, inputs.size(-1),self.smoothing)
        loss = F.binary_cross_entropy_with_logits(inputs, targets,self.weight,
                                                      pos_weight = pos_weight)
        if  self.reduction == 'sum':
            loss = loss.sum()
        elif  self.reduction == 'mean':
            loss = loss.mean()
        return loss

# Preprocessing steps

In [78]:
def process_data(data):
    
    data = pd.get_dummies(data, columns=['cp_time','cp_dose'])
    
    return data

In [79]:
feature_cols = [c for c in process_data(folds).columns if c not in target_cols]
feature_cols = [c for c in feature_cols if c not in non_scored_target_high_corr]
feature_cols = [c for c in feature_cols if c not in ['kfold','sig_id']]
len(feature_cols)

942

In [80]:
# HyperParameters

DEVICE = ('cuda' if torch.cuda.is_available() else 'cpu')

num_features=len(feature_cols)
num_targets=len(target_cols)
hidden_size=4096


# Single fold training

In [81]:
print(process_data(folds).shape)
print(process_data(test).shape)


(21972, 1182)
(3624, 943)


In [82]:


def run_training(fold, seed):
    
    seed_everything(seed)
    
    train = process_data(folds)
    test_ = process_data(test)
    
    trn_idx = train[train['kfold'] != fold].index
    val_idx = train[train['kfold'] == fold].index
    
    train_df = train[train['kfold'] != fold].reset_index(drop=True)
    valid_df = train[train['kfold'] == fold].reset_index(drop=True)


    #----
    # pretrain non_scored_target_high_corr
    print("pretrain ")
    print(" start")
    loss_fr = nn.BCEWithLogitsLoss()
    loss_va = nn.BCEWithLogitsLoss()    
    model = Model(
        num_features=num_features,
        num_targets=len(non_scored_target_high_corr),
        hidden_size=hidden_size,
    )
    model.to(DEVICE)

    x_train, y_train  = train_df[feature_cols].values, train_df[non_scored_target_high_corr].values
    x_valid, y_valid =  valid_df[feature_cols].values, valid_df[non_scored_target_high_corr].values
    
    train_dataset = MoADataset(x_train, y_train)
    valid_dataset = MoADataset(x_valid, y_valid)
    trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    validloader = torch.utils.data.DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)

    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE*0.1, weight_decay=WEIGHT_DECAY)
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.1, div_factor=1e5,max_lr=0.0001, epochs=EPOCHS, steps_per_epoch=len(trainloader))
    #scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=10, eta_min=0, last_epoch=-1, verbose=False)

    for epoch in range(1):
        train_loss = train_fn(model, optimizer,scheduler, loss_fr, trainloader, DEVICE)
        valid_loss, valid_preds = valid_fn(model, loss_va, validloader, DEVICE)
        print(f"FOLD: {fold}, EPOCH: {epoch},train_loss: {train_loss}, valid_loss: {valid_loss}")

    model.dense3 = nn.utils.weight_norm(nn.Linear(model.cha_po_2, num_targets))
    model.to(DEVICE)
    #----


    x_train, y_train  = train_df[feature_cols].values, train_df[target_cols].values
    x_valid, y_valid =  valid_df[feature_cols].values, valid_df[target_cols].values
    
    train_dataset = MoADataset(x_train, y_train)
    valid_dataset = MoADataset(x_valid, y_valid)
    trainloader = torch.utils.data.DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
    validloader = torch.utils.data.DataLoader(valid_dataset, batch_size=BATCH_SIZE, shuffle=False)
    
    model.to(DEVICE)
    
    optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer=optimizer, pct_start=0.1, div_factor=1e3,   max_lr=1e-2, epochs=EPOCHS, steps_per_epoch=len(trainloader))
    #scheduler = torch.optim.lr_scheduler.CosineAnnealingWarmRestarts(optimizer, T_0=10, eta_min=0, last_epoch=-1, verbose=False)

    loss_tr = SmoothBCEwLogits(smoothing = 0.001)
    loss_va = nn.BCEWithLogitsLoss()    

    early_stopping_steps = EARLY_STOPPING_STEPS
    early_step = 0
    
    oof = np.zeros((len(train), target.iloc[:, 1:].shape[1]))
    best_loss = np.inf
    
    for epoch in range(EPOCHS):
        
        train_loss = train_fn(model, optimizer, scheduler, loss_fr, trainloader, DEVICE)
        print(f"FOLD: {fold}, EPOCH: {epoch}, train_loss: {train_loss}")
        valid_loss, valid_preds = valid_fn(model, loss_va, validloader, DEVICE)
        print(f"FOLD: {fold}, EPOCH: {epoch}, valid_loss: {valid_loss}")
        
        if valid_loss < best_loss:
            
            best_loss = valid_loss
            oof[val_idx] = valid_preds
            torch.save(model.state_dict(), f"FOLD{fold}_.pth")
        
        elif(EARLY_STOP == True):
            
            early_step += 1
            if (early_step >= early_stopping_steps):
                break
            
    
    #--------------------- PREDICTION---------------------
    x_test = test_[feature_cols].values
    testdataset = TestDataset(x_test)
    testloader = torch.utils.data.DataLoader(testdataset, batch_size=BATCH_SIZE, shuffle=False)
    
    model = Model(
        num_features=num_features,
        num_targets=num_targets,
        hidden_size=hidden_size,
    )
    
    model.load_state_dict(torch.load(f"FOLD{fold}_.pth"))
    model.to(DEVICE)
    
    predictions = np.zeros((len(test_), target.iloc[:, 1:].shape[1]))
    predictions = inference_fn(model, testloader, DEVICE)
    
    return oof, predictions


In [83]:
def run_k_fold(NFOLDS, seed):
    oof = np.zeros((len(train), len(target_cols)))
    predictions = np.zeros((len(test), len(target_cols)))
    
    for fold in range(NFOLDS):
        oof_, pred_ = run_training(fold, seed)
        
        predictions += pred_ / NFOLDS
        oof += oof_
        
    return oof, predictions

In [85]:
# Averaging on multiple SEEDS


oof = np.zeros((len(train), len(target_cols)))
predictions = np.zeros((len(test), len(target_cols)))

for seed in SEED:
    folds = make_fold()
    oof_, predictions_ = run_k_fold(NFOLDS, seed)
    oof += oof_ / len(SEED)
    predictions += predictions_ / len(SEED)

train[target_cols] = oof
test[target_cols] = predictions


pretrain 
 start
FOLD: 0, EPOCH: 0,train_loss: 0.7368861959464308, valid_loss: 0.7072242719786508
FOLD: 0, EPOCH: 0, train_loss: 0.4670236624399389
FOLD: 0, EPOCH: 0, valid_loss: 0.023579517592276847
FOLD: 0, EPOCH: 1, train_loss: 0.020559584020056587
FOLD: 0, EPOCH: 1, valid_loss: 0.018554492933409553
FOLD: 0, EPOCH: 2, train_loss: 0.018331372192588406
FOLD: 0, EPOCH: 2, valid_loss: 0.01779971197247505
FOLD: 0, EPOCH: 3, train_loss: 0.01732049588168013
FOLD: 0, EPOCH: 3, valid_loss: 0.01749737097748688
FOLD: 0, EPOCH: 4, train_loss: 0.016990593580556088
FOLD: 0, EPOCH: 4, valid_loss: 0.017389837758881706
FOLD: 0, EPOCH: 5, train_loss: 0.016852821601365787
FOLD: 0, EPOCH: 5, valid_loss: 0.016983456616955144
FOLD: 0, EPOCH: 6, train_loss: 0.01682835237622477
FOLD: 0, EPOCH: 6, valid_loss: 0.01711713464132377
FOLD: 0, EPOCH: 7, train_loss: 0.016810515369086163
FOLD: 0, EPOCH: 7, valid_loss: 0.01712890150291579
FOLD: 0, EPOCH: 8, train_loss: 0.01688063299904267
FOLD: 0, EPOCH: 8, valid_lo

In [86]:
valid_results = train_targets_scored.drop(columns=target_cols).merge(train.iloc[:-augmented_num,:][['sig_id']+target_cols], on='sig_id', how='left').fillna(0)
print(len(train_targets_scored))
print(len(valid_results))

y_true = train_targets_scored[target_cols].values
y_pred = valid_results[target_cols].values

score = 0
for i in range(len(target_cols)):
    score_ = log_loss(y_true[:, i], y_pred[:, i])
    score += score_ / target.shape[1]
    
print("CV log_loss: ", score)
    

23814
23814
CV log_loss:  0.014696218863042863


In [87]:
sub = sample_submission.drop(columns=target_cols).merge(test[['sig_id']+target_cols], on='sig_id', how='left').fillna(0)
sub.to_csv('submission.csv', index=False)

In [88]:
sub.shape

(3982, 207)