- Apply PCA to compress the redundant c- variables
- Remove low-variance g- variables with a variance threshold
- Try the 2nd-layer model again
- Refactor

In [1]:
import os
import sys
import warnings
import numpy as np
import pandas as pd
import seaborn as sns
import lightgbm as lgb
import matplotlib.pyplot as plt
from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold
from tqdm._tqdm_notebook import tqdm_notebook
from sklearn.model_selection import StratifiedKFold, KFold
sys.path.append('../input/multilabelstraifier/')
from ml_stratifiers import MultilabelStratifiedKFold
warnings.filterwarnings('ignore')

Please use `tqdm.notebook.*` instead of `tqdm._tqdm_notebook.*`
  del sys.path[0]


In [2]:
# Folder holding the competition CSVs; every table below is read from it.
DIR = "/kaggle/input/lish-moa/"

# Load the five input tables in a fixed order and bind them in one step.
train, test, nontargets, targets, sub = (
    pd.read_csv(DIR + csv_name)
    for csv_name in (
        "train_features.csv",
        "test_features.csv",
        "train_targets_nonscored.csv",
        "train_targets_scored.csv",
        "sample_submission.csv",
    )
)

In [3]:
# Label columns: every column of the scored-targets table except the row id.
target_feats = targets.columns[targets.columns != "sig_id"].tolist()

# Feature groups, selected by a literal "g-" / "c-" substring in the column name.
g_feats = train.columns[train.columns.str.contains("g-", regex=False)].tolist()
c_feats = train.columns[train.columns.str.contains("c-", regex=False)].tolist()

In [4]:
# Boolean masks marking the rows whose cp_type is "ctl_vehicle".
ctl_rows_train = train.cp_type == "ctl_vehicle"
ctl_rows_test = test.cp_type == "ctl_vehicle"

# "noncons" = ctl_vehicle rows, "cons" = all remaining rows.
noncons_train_index = train[ctl_rows_train].index
cons_train_index = train[~ctl_rows_train].index
noncons_test_index = test[ctl_rows_test].index
cons_test_index = test[~ctl_rows_test].index

In [5]:
# Keep only the non-control training rows in the features and both label
# tables, renumbering each result from 0 so the three stay positionally aligned.
train, n_targets, n_nontargets = (
    frame[frame.index.isin(cons_train_index)].copy().reset_index(drop=True)
    for frame in (train, targets, nontargets)
)

In [6]:
# Compress the c- columns into `c_num` principal components.
c_num = 10
pca_c_cols = ["pca-c"+str(i) for i in range(c_num)]

# Seed the decomposition: scikit-learn may pick a randomized SVD solver for
# this shape, and an unseeded run would then not be reproducible.
pca = PCA(n_components=c_num, random_state=0)

# Fit on the training rows only, then project the test rows with the same
# components. Each result carries the index of the frame it is appended to, so
# the column-wise concat below aligns row-for-row by construction.
pca_c_train = pd.DataFrame(
    pca.fit_transform(train[c_feats]), columns=pca_c_cols, index=train.index
)
pca_c_test = pd.DataFrame(
    pca.transform(test[c_feats]), columns=pca_c_cols, index=test.index
)

train = pd.concat([train, pca_c_train], axis=1)
test = pd.concat([test, pca_c_test], axis=1)

In [7]:
# Fit a variance filter (threshold 1) on the g- columns of the training set.
select = VarianceThreshold(threshold=1)
X = train[g_feats].to_numpy(copy=True)
X_new = select.fit_transform(X)
X.shape, X_new.shape

# Columns the filter did NOT keep are the ones to drop later.
kept_mask = select.get_support()
drop_g_feats = list(np.array(g_feats)[~kept_mask])
len(drop_g_feats)

181

# preprocess

In [8]:
# Columns that are label-encoded before modelling.
categoricals = ["cp_dose"]


def encoding(tr, te):
    """Label-encode every column listed in ``categoricals``.

    The encoder for each column is fitted on the training frame only and then
    applied to both frames. Both frames are modified in place and the same
    objects are returned as ``(tr, te)``.
    """
    for col in categoricals:
        encoder = preprocessing.LabelEncoder()
        encoder.fit(list(tr[col]))
        for frame in (tr, te):
            frame[col] = encoder.transform(list(frame[col]))

    return tr, te


n_train, n_test = encoding(train, test)

# feature engineering

In [9]:
def fe(df, remove_features):
    """Drop the columns named in ``remove_features`` from ``df``.

    The frame is modified in place and the same object is returned.
    """
    df.drop(columns=remove_features, inplace=True)
    return df

# Columns removed before modelling: the two id/type columns, the raw c-
# columns, and the g- columns rejected by the variance filter.
remove_features = ["cp_type", "sig_id", *c_feats, *drop_g_feats]

# fe() drops in place, so train/test shrink as well.
n_train, n_test = fe(n_train, remove_features), fe(n_test, remove_features)

print(train.shape, test.shape)

(21948, 603) (3982, 603)


# 1st model

In [10]:
def modelling_lgb(new_train, target_train, new_test, target, layer):
    """Fit a 4-fold cross-validated binary LightGBM model for one target.

    Parameters
    ----------
    new_train : pandas.DataFrame
        Training features. Rows are selected positionally (``iloc``), so they
        must be in the same order as the rows of ``target_train``.
    target_train : pandas.DataFrame
        Label table; the column ``target`` is used as the binary label.
    new_test : pandas.DataFrame
        Features to predict with each fold's model.
    target : str
        Name of the label column to model.
    layer : int
        ``1`` selects a learning rate of 0.01; any other value selects 0.02.

    Returns
    -------
    valid : numpy.ndarray
        Out-of-fold prediction for every training row.
    pred_value : numpy.ndarray
        Test prediction averaged over the folds.
    score : float
        Log loss of ``valid`` against the training labels.
    """
    # The inputs are only read (sliced with iloc / passed to LightGBM), so no
    # defensive copies are taken; copying the whole matrix per target is waste.
    X_train = new_train
    y_train = target_train[target]
    X_test = new_test

    # Both layers share every setting except the learning rate.
    params = {
        'objective': 'binary', 'boosting_type': 'gbdt', 'tree_learner': 'serial',
        'learning_rate': 0.01 if layer == 1 else 0.02,
        "num_leaves": 10, 'random_seed': 44, 'max_depth': 5,
    }

    n_folds = 4
    # These two targets are split without stratification.
    # NOTE(review): presumably they have too few positives to stratify - confirm.
    if target not in ["erbb2_inhibitor", "atp-sensitive_potassium_channel_antagonist"]:
        skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=0)
    else:
        skf = KFold(n_splits=n_folds, shuffle=True, random_state=0)

    pred_value = np.zeros(X_test.shape[0])
    valid = np.zeros([X_train.shape[0]])
    for train_index, test_index in skf.split(X_train, y_train):
        X_train2 = X_train.iloc[train_index, :]
        y_train2 = y_train.iloc[train_index]

        X_valid2 = X_train.iloc[test_index, :]
        y_valid2 = y_train.iloc[test_index]

        lgb_train = lgb.Dataset(X_train2, y_train2)
        lgb_eval = lgb.Dataset(X_valid2, y_valid2, reference=lgb_train)

        # Up to 10000 rounds, stopping after 25 rounds without improvement.
        clf = lgb.train(params, lgb_train, valid_sets=[lgb_train, lgb_eval],
               num_boost_round=10000, early_stopping_rounds=25, verbose_eval=0)

        # Each training row is predicted exactly once, by the fold that held it out.
        valid[test_index] = clf.predict(X_valid2, num_iteration=clf.best_iteration)
        pred_value += clf.predict(X_test, num_iteration=clf.best_iteration) / n_folds

    # Pass the label set explicitly so the metric is still defined when the
    # label column happens to contain a single class.
    score = log_loss(y_train, valid, labels=[0, 1])

    return valid, pred_value, score

In [11]:
# First-layer predictions for every training row (out-of-fold for the modelled
# rows, 0 for the ctl_vehicle rows), stored in a copy of the label table.
train_checkscore = targets.copy()
# The label columns are presumably integer 0/1 as read from the CSV. Cast them
# to float up front so probabilities can be written in without relying on
# pandas' deprecated implicit upcast on assignment (whose warning is silenced
# by the notebook's global warnings filter).
train_checkscore[target_feats] = train_checkscore[target_feats].astype(float)
target_list = []
log_loss_list = []

for ind, target in enumerate(target_feats):
    print(ind, target)
    valid, pred_value, score = modelling_lgb(n_train, n_targets, n_test, target,1)
    train_checkscore.loc[cons_train_index, target] = valid
    # ctl_vehicle rows are not modelled; their prediction is fixed at 0.
    train_checkscore.loc[noncons_train_index, target] = 0
    print("oof log_loss= {}, all log_loss= {}".format(score, log_loss(targets[target], train_checkscore[target])))
    target_list.append(target)
    log_loss_list.append(score)
    # n_test still contains every test row, so the whole column is assigned
    # first and the ctl_vehicle rows are then overwritten with 0.
    sub[target] = pred_value
    sub.loc[noncons_test_index, target] = 0

0 5-alpha_reductase_inhibitor
oof log_loss= 0.0059683912524128, all log_loss= 0.0055007244145443
1 11-beta-hsd1_inhibitor
oof log_loss= 0.006582542422770321, all log_loss= 0.006066752376541733
2 acat_inhibitor
oof log_loss= 0.008541915591584945, all log_loss= 0.007872594415222483
3 acetylcholine_receptor_agonist
oof log_loss= 0.048923522632396944, all log_loss= 0.04509000901721046
4 acetylcholine_receptor_antagonist
oof log_loss= 0.0706912686301947, all log_loss= 0.06515209388996035
5 acetylcholinesterase_inhibitor
oof log_loss= 0.022266243277750446, all log_loss= 0.02052152126732463
6 adenosine_receptor_agonist
oof log_loss= 0.017230753256759365, all log_loss= 0.015880598491616545
7 adenosine_receptor_antagonist
oof log_loss= 0.027462410753360627, all log_loss= 0.02531053125114474
8 adenylyl_cyclase_activator
oof log_loss= 0.0037872430592830555, all log_loss= 0.0034904850367492384
9 adrenergic_receptor_agonist
oof log_loss= 0.06147672954787622, all log_loss= 0.056659580923691485
10 ad

In [12]:
# local score
scores = []
for target_col in target_feats:
    scores.append(log_loss(targets[target_col], train_checkscore[target_col]))
print(np.mean(scores))

0.015958240181755234


# 2nd model

In [13]:
# Second-layer inputs: the first-layer predictions of the non-control rows,
# without the id column.
treated_train_rows = train_checkscore.index.isin(cons_train_index)
treated_test_rows = sub.index.isin(cons_test_index)

first_layer_train = train_checkscore[treated_train_rows].drop(columns="sig_id")
first_layer_test = sub[treated_test_rows].drop(columns="sig_id")

# NOTE: only the first-layer predictions are used as features here; the
# first-layer input features are not merged back in.

In [14]:
# Fresh copy of the submission template, to be filled by the second layer.
sample_submission_path = DIR + "sample_submission.csv"
final = pd.read_csv(sample_submission_path)

In [15]:
# Second-layer predictions for every training row (out-of-fold for the
# modelled rows, 0 for the ctl_vehicle rows), stored in a copy of the label table.
train_checkscore2 = targets.copy()
# The label columns are presumably integer 0/1 as read from the CSV. Cast them
# to float up front so probabilities can be written in without relying on
# pandas' deprecated implicit upcast on assignment (whose warning is silenced
# by the notebook's global warnings filter).
train_checkscore2[target_feats] = train_checkscore2[target_feats].astype(float)
target_list2 = []
log_loss_list2 = []

for ind, target in enumerate(target_feats):
    print(ind, target)
    valid, pred_value, score = modelling_lgb(first_layer_train, n_targets, first_layer_test, target, 2)
    train_checkscore2.loc[cons_train_index, target] = valid
    # ctl_vehicle rows are not modelled; their prediction is fixed at 0.
    train_checkscore2.loc[noncons_train_index, target] = 0
    print("oof log_loss= {}, all log_loss= {}".format(score, log_loss(targets[target], train_checkscore2[target])))
    target_list2.append(target)
    log_loss_list2.append(score)
    # first_layer_test holds only the non-control test rows, so the predictions
    # are written to exactly those rows and the rest are set to 0.
    final.loc[cons_test_index, target] = pred_value
    final.loc[noncons_test_index, target] = 0

0 5-alpha_reductase_inhibitor
oof log_loss= 0.006135499462646391, all log_loss= 0.005654738481824341
1 11-beta-hsd1_inhibitor
oof log_loss= 0.006661695931129607, all log_loss= 0.0061397036321674
2 acat_inhibitor
oof log_loss= 0.00849497070345881, all log_loss= 0.007829328000315605
3 acetylcholine_receptor_agonist
oof log_loss= 0.04854364904554355, all log_loss= 0.04473990128712487
4 acetylcholine_receptor_antagonist
oof log_loss= 0.07035453199225535, all log_loss= 0.06484174301528606
5 acetylcholinesterase_inhibitor
oof log_loss= 0.02196866452474375, all log_loss= 0.020247259972666406
6 adenosine_receptor_agonist
oof log_loss= 0.01712068620411885, all log_loss= 0.015779155992609487
7 adenosine_receptor_antagonist
oof log_loss= 0.027699246147637257, all log_loss= 0.02552880887076276
8 adenylyl_cyclase_activator
oof log_loss= 0.0045651639792127175, all log_loss= 0.004207450198024801
9 adrenergic_receptor_agonist
oof log_loss= 0.061731746063859304, all log_loss= 0.05689461504197472
10 adr

In [16]:
# local score
scores = []
for target_col in target_feats:
    scores.append(log_loss(targets[target_col], train_checkscore2[target_col]))
print(np.mean(scores))

0.01595553579771805


# submission

In [17]:
# Write the second-layer predictions held in `final` to the submission file,
# leaving out the DataFrame index.
final.to_csv("submission.csv", index=False)