%%javascript 
// Ask the Python kernel to execute an assignment that stores this notebook's file name in a variable called nb_name.
IPython.notebook.kernel.execute('nb_name = "' + IPython.notebook.notebook_name + '"')

In [38]:
# lib
import os
import gc
import random
import math
import time

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd

from sklearn.model_selection import StratifiedKFold, GroupKFold, KFold
from sklearn.multioutput import ClassifierChain
from lightgbm import LGBMClassifier

#import warnings
#warnings.filterwarnings("ignore")

from fastprogress import master_bar, progress_bar
from iterstrat.ml_stratifiers import MultilabelStratifiedKFold

# Make sure the model output directory exists. exist_ok=True removes the
# check-then-create race and keeps the cell safe to re-run.
os.makedirs('models1', exist_ok=True)


def seed_everything(seed=777):
    """Seed the random sources used by this notebook.

    Sets the ``PYTHONHASHSEED`` environment variable to ``str(seed)`` and
    seeds both the standard-library ``random`` module and NumPy's global
    generator with ``seed``.

    Args:
        seed: Integer seed applied to every source (default 777).
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    # torch seeding (manual_seed / cuda.manual_seed / cudnn.deterministic)
    # was left disabled in the original and is not performed here.


# Notebook-wide seed; the CV splitters and the ClassifierChain further down reuse it as random_state.
SEED = 777
seed_everything(SEED)


In [39]:
# Directory holding the competition CSVs. File names are appended directly to
# this string when loading, so the trailing slash is required.
data_path = '../input/lish-moa/'

In [3]:
def get_logger(filename='log', save=True):
    """Return this module's logger, configured to emit bare messages.

    The logger is looked up by ``__name__``, so every call returns the same
    object. Handlers attached to it by an earlier call are removed (and
    closed) first; otherwise re-running the cell would stack handlers and
    print/write every message several times.

    Args:
        filename: Stem of the log file; messages go to ``f"{filename}.log"``.
        save: When true, also attach a file handler in addition to the
            stream handler.

    Returns:
        The configured ``logging.Logger`` at level INFO.
    """
    from logging import getLogger, INFO, StreamHandler, FileHandler, Formatter

    logger = getLogger(__name__)
    logger.setLevel(INFO)

    # Drop handlers left over from previous calls so each message is emitted
    # once per destination; close() releases any open log file.
    for stale in list(logger.handlers):
        logger.removeHandler(stale)
        stale.close()

    console = StreamHandler()
    console.setFormatter(Formatter("%(message)s"))
    logger.addHandler(console)

    if save:
        to_file = FileHandler(filename=f"{filename}.log")
        to_file.setFormatter(Formatter("%(message)s"))
        logger.addHandler(to_file)
    return logger

# Stem of the log file. It is hard-coded here, which replaces any nb_name value
# the %%javascript cell may have set in the kernel; the trailing comment keeps
# the alternative of deriving it from that value by cutting the last 6 characters.
nb_name = 'log'  # nb_name[:-6]

# Build the logger (stream output plus f"{nb_name}.log") and record run metadata.
logger = get_logger(nb_name)
logger.info(f'file_name: {nb_name}')
logger.info(f'Time: {time.ctime()}')
logger.info(f'Set seed: {SEED}')

file_name: log
Time: Mon Nov 16 00:13:58 2020
Set seed: 777


# load df

In [4]:
# Load the four competition CSVs from data_path and print each frame's shape.
# training features
trn_feature = pd.read_csv(f'{data_path}train_features.csv')
print(trn_feature.shape)

# training targets (scored)
trn_tar_df = pd.read_csv(f'{data_path}train_targets_scored.csv')
print(trn_tar_df.shape)

# test features
test_df = pd.read_csv(f'{data_path}test_features.csv')
print(test_df.shape)

# sample submission
sub_df = pd.read_csv(f'{data_path}sample_submission.csv')
print(sub_df.shape)

# target column names: every column of the targets frame except the first
y_keys = trn_tar_df.columns[1:].tolist()

# numerical feature columns, picked by their name prefix
genes = [name for name in trn_feature.columns if name[:2] == 'g-']
cells = [name for name in trn_feature.columns if name[:2] == 'c-']

# categorical columns
cat_col = ['cp_time', 'cp_dose']

(23814, 876)
(23814, 207)
(3982, 876)
(3982, 207)


In [5]:
# Join features with targets on sig_id, then keep only the rows whose cp_type
# is not 'ctl_vehicle' (same filter for the test frame); indices are rebuilt
# from 0 so positional and label indexing agree afterwards.
trn_df = (
    trn_feature
    .merge(trn_tar_df, on='sig_id')
    .loc[lambda frame: frame['cp_type'] != 'ctl_vehicle']
    .reset_index(drop=True)
)
test_df = test_df.loc[test_df['cp_type'] != 'ctl_vehicle'].reset_index(drop=True)
print(trn_df.shape, test_df.shape)
trn_df.sample(20)

(21948, 1082) (3624, 876)


Unnamed: 0,sig_id,cp_type,cp_time,cp_dose,g-0,g-1,g-2,g-3,g-4,g-5,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
14261,id_a6a4f3626,trt_cp,24,D2,-0.3265,0.1577,-0.3381,0.1789,0.2595,0.7117,...,0,0,0,0,0,0,0,0,0,0
14266,id_a6b456ebb,trt_cp,24,D1,-0.7239,0.0713,0.225,0.5864,2.628,-0.4762,...,0,0,0,0,0,0,0,0,0,0
5119,id_3bd739795,trt_cp,24,D1,0.5283,1.235,2.715,-0.0206,0.5865,0.2824,...,0,0,0,0,0,0,0,0,0,0
692,id_07b7426b2,trt_cp,24,D1,-0.2856,-0.6573,1.889,-0.3083,-0.5899,0.5369,...,0,0,0,0,0,0,0,0,0,0
6833,id_4fbf1ada4,trt_cp,72,D2,0.9492,-0.5027,-0.0934,-2.283,-0.1903,0.023,...,0,0,0,0,0,0,0,0,0,0
7051,id_52620ac73,trt_cp,24,D1,-0.4717,-0.2592,-0.5129,0.0462,0.5863,-0.6585,...,0,0,0,0,0,0,0,0,0,0
13131,id_995a3b2d4,trt_cp,48,D1,2.419,-1.138,-0.4596,-0.1602,2.548,1.523,...,0,0,0,0,0,0,0,0,0,0
2435,id_1c33757b3,trt_cp,24,D1,-0.0171,-0.0432,-0.4036,-0.6486,-0.7183,-0.5947,...,0,0,0,0,0,0,0,0,0,0
10021,id_7514b50fb,trt_cp,48,D1,0.5772,0.8006,1.508,-0.8161,-0.1879,-0.1965,...,0,0,0,0,0,0,0,0,0,0
13768,id_a0ad27715,trt_cp,48,D2,0.212,0.2485,1.03,-1.073,-0.018,-1.721,...,0,0,0,0,0,0,0,0,0,0


# cv

In [6]:
# kfolds index
# Preview of the 5-fold multilabel-stratified split: prints the train and
# validation row positions of every fold. run_kfolds builds its own splitter
# with the same parameters, so this cell is informational only.
# NOTE: n, trn_idx and val_idx stay defined at notebook level after the loop
# (holding the last fold's values).
folds = MultilabelStratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
for n, (trn_idx, val_idx) in enumerate(folds.split(trn_df, trn_df[y_keys])):
    print(trn_idx, val_idx)



[    0     1     2 ... 21945 21946 21947] [    3    15    21 ... 21939 21942 21944]
[    1     2     3 ... 21944 21945 21947] [    0     4     7 ... 21934 21936 21946]
[    0     2     3 ... 21945 21946 21947] [    1    14    18 ... 21928 21940 21943]
[    0     1     2 ... 21944 21946 21947] [    8     9    10 ... 21920 21932 21945]
[    0     1     3 ... 21944 21945 21946] [    2     5     6 ... 21937 21941 21947]


# dataset

In [7]:
# Model inputs: every 'g-' column, then every 'c-' column, then the two
# categorical columns (in that order).
cat_features = ['cp_time', 'cp_dose']
feature_cols = genes + cells + cat_features


def cate2num(df):
    """Encode the ``cp_time`` and ``cp_dose`` columns of ``df`` as integers.

    ``cp_time`` is mapped 24 -> 0, 48 -> 1, 72 -> 2 and ``cp_dose`` is mapped
    'D1' -> 1, 'D2' -> 0. The frame is modified in place and also returned.

    The function is idempotent: a column whose values are already all valid
    codes is left untouched, so re-running the notebook cell does not turn
    the encoded values into NaN (which a second plain ``map`` would do).
    Values that are neither raw categories nor codes still become NaN.

    Args:
        df: DataFrame containing ``cp_time`` and ``cp_dose`` columns.

    Returns:
        The same DataFrame object, with both columns encoded.
    """
    encodings = {
        'cp_time': {24: 0, 48: 1, 72: 2},
        'cp_dose': {'D1': 1, 'D2': 0},
    }
    for col, mapping in encodings.items():
        # Raw categories and codes are disjoint for both columns, so "every
        # value is a code" reliably means the column was encoded earlier.
        if df[col].isin(list(mapping.values())).all():
            continue
        df[col] = df[col].map(mapping)
    return df


# Encode cp_time / cp_dose as integers in both frames. cate2num writes into
# the frame it receives and returns that same frame.
trn_df = cate2num(trn_df)
test_df = cate2num(test_df)

In [8]:
# Notebook-level views of the model inputs and targets.
# NOTE: run_single slices trn_df / test_df itself into locals of the same
# names, so the training code below does not read these variables.
x_trn = trn_df[feature_cols]
y_trn = trn_df[y_keys]

x_test = test_df[feature_cols]

# model

In [23]:
def compute_metric(preds, ys):
    """Mean binary log loss over a 2-D block of predictions.

    Predictions are clipped to ``[1e-15, 1 - 1e-15]`` so the logarithms stay
    finite. The loss is averaged over axis 1 (per row) and then over axis 0.

    The clipped values are used directly inside the logs. Adding the epsilon
    a second time (as the previous version did) halves the penalty at the
    clip boundary and no longer matches the usual clipped log loss.

    Args:
        preds: 2-D array-like of predicted probabilities.
        ys: 2-D array-like of 0/1 labels, same shape as ``preds``.

    Returns:
        The log loss as a float scalar (lower is better).
    """
    eps = 1e-15
    # Work on plain arrays so DataFrame inputs are combined by position,
    # never aligned by index/column labels.
    probs = np.clip(np.asarray(preds, dtype=float), eps, 1 - eps)
    labels = np.asarray(ys, dtype=float)

    log_lik = labels * np.log(probs) + (1 - labels) * np.log(1 - probs)
    per_row = np.mean(log_lik, axis=1)
    return -np.mean(per_row, axis=0)

# train

In [58]:
def run_single(model,
               trn_df,
               test_df,
               fold_idx,
               feature_cols,
               targets=None,
               fold_num=0):
    """Train a ClassifierChain on one fold and score it.

    Args:
        model: Base estimator handed to ``ClassifierChain``.
        trn_df: Training frame containing ``feature_cols`` and ``targets``.
        test_df: Test frame containing ``feature_cols``.
        fold_idx: Pair ``(train_positions, validation_positions)`` used with
            ``.iloc`` on ``trn_df``.
        feature_cols: Names of the input columns.
        targets: Names of the label columns. ``None`` falls back to the
            notebook-level ``y_keys`` (previously ``None`` raised on
            ``trn_df[None]``).
        fold_num: Fold number, used only in the log header.

    Returns:
        Tuple ``(oof, predict, val_score)``:
        ``oof`` is a ``(len(trn_df), len(targets))`` array that is zero
        except for the validation rows, which hold this fold's predictions;
        ``predict`` is the ``predict_proba`` output for ``test_df``;
        ``val_score`` is ``compute_metric`` on the validation rows.
    """
    if targets is None:
        targets = y_keys

    # index
    trn_idx, val_idx = fold_idx[0], fold_idx[1]

    # model
    chain = ClassifierChain(model, random_state=SEED)

    # data -- plain arrays everywhere, so fit and predict see the same input
    # type (the test frame used to be passed as a DataFrame).
    features = trn_df[feature_cols]
    labels = trn_df[targets]
    x_trn = features.iloc[trn_idx].values
    y_trn = labels.iloc[trn_idx].values
    x_val = features.iloc[val_idx].values
    y_val = labels.iloc[val_idx].values
    x_test = test_df[feature_cols].values

    # fit
    chain.fit(x_trn, y_trn)

    # test
    predict = chain.predict_proba(x_test)

    # train / validation predictions and scores (computed once, reused in the log)
    trn_preds = chain.predict_proba(x_trn)
    score = compute_metric(trn_preds, y_trn)
    val_preds = chain.predict_proba(x_val)
    val_score = compute_metric(val_preds, y_val)

    # oof -- sized from `targets`, not from the global label list
    oof = np.zeros((len(trn_df), len(targets)))
    oof[val_idx] = val_preds

    # log
    logger.info(f'==================={fold_num} fold========================')
    logger.info(f'train metric: {score: .8f}')
    logger.info(f'val metric: {val_score: .8f}')
    logger.info("\n")

    return oof, predict, val_score



def run_kfolds(model,
               trn_df,
               test_df,
               feature_cols,
               targets=None,
               folds=5,
               submit=False):
    """Run multilabel-stratified K-fold training via ``run_single``.

    Args:
        model: Base estimator forwarded to ``run_single``.
        trn_df: Training frame containing ``feature_cols`` and ``targets``.
        test_df: Test frame containing ``feature_cols``.
        feature_cols: Names of the input columns.
        targets: Names of the label columns; they drive the stratification
            and the width of the prediction arrays. ``None`` falls back to
            the notebook-level ``y_keys``.
        folds: Number of CV splits.
        submit: Unused; kept so existing calls remain valid.

    Returns:
        Tuple ``(oof, preds)``: out-of-fold predictions for every training
        row, and test predictions averaged over the folds.
    """
    if targets is None:
        targets = y_keys

    n_targets = len(targets)
    oof = np.zeros((len(trn_df), n_targets))
    preds = np.zeros((len(test_df), n_targets))

    cv_scores = []

    splitter = MultilabelStratifiedKFold(n_splits=folds, shuffle=True, random_state=SEED)
    for n, fold_idx in enumerate(splitter.split(trn_df, trn_df[targets])):

        oof_, pred_, score = run_single(
            model,
            trn_df,
            test_df,
            fold_idx,
            feature_cols,
            targets=targets,
            fold_num=n)

        # Each fold fills only its own validation rows, so summing assembles
        # the full OOF matrix; test predictions are averaged across folds.
        oof += oof_
        preds += pred_ / folds
        cv_scores.append(score)

    # log
    logger.info(f'CV score: {np.mean(cv_scores): .6f} ± {np.std(cv_scores): .10f}')

    return oof, preds

In [59]:
# LightGBM hyperparameters.
# NOTE: this dict is not applied -- the model below is constructed with no
# arguments (the `**param` is commented out), so the logged scores come from
# the library defaults.
param = {
    'num_leaves': 9,
    'bagging_fraction': 0.6,
    'feature_fraction': 0.8,
    'max_depth': 5,
    'metric': 'rmse',
    'min_child_samples': 29,
    'n_estimators': 70,
    'reg_alpha': 0,
    'reg_lambda': 0.001,
}

model = LGBMClassifier()  # **param

# 5-fold CV: returns the out-of-fold predictions for the training rows and
# the fold-averaged predictions for the test rows.
oof, predict = run_kfolds(
    model,
    trn_df,
    test_df,
    feature_cols,
    targets=y_keys,
    folds=5,
)

train metric:  0.04240407
val metric:  0.12904927


train metric:  0.01961647
val metric:  0.09967067


train metric:  0.01532416
val metric:  0.10271711


train metric:  0.02135757
val metric:  0.10891392


train metric:  0.01733714
val metric:  0.10577034


CV score:  0.109224 ±  0.0103795064


In [60]:
# OOF score without 'ctl_vehicle'
# trn_df only contains the rows kept after the ctl_vehicle filter, so this
# score covers those rows only. ys is a DataFrame, preds the OOF array.
ys = trn_df[y_keys]
preds = oof

score = compute_metric(preds, ys)
logger.info(f"OOF result: {score}")

OOF result: 0.10922499478853526


In [62]:
# ys is taken from the target columns before they are overwritten below.
ys = trn_df[y_keys]
# NOTE(review): this replaces the label columns of trn_df with the OOF
# predictions. Later cells read the predictions back from trn_df, and any
# cell that expects labels in trn_df[y_keys] must not be re-run after this.
trn_df[y_keys] = pd.DataFrame(oof)
trn_df[['sig_id'] + y_keys].to_csv('oof.csv', index=False)

# Same for the test frame: attach the averaged test predictions and save them.
test_df[y_keys] = pd.DataFrame(predict)
test_df[['sig_id'] + y_keys].to_csv('pred.csv', index=False)

In [63]:
# Final result with 'cp_type'=='ctl_vehicle' data
# Left-join the predictions stored in trn_df onto every sig_id of the full
# target table. Ids missing from trn_df (the rows removed by the ctl_vehicle
# filter) come back as NaN and are filled with 0.
# NOTE: relies on trn_df[y_keys] holding OOF predictions, which is set up by
# the cell that writes oof.csv.
result = trn_tar_df.drop(columns=y_keys).merge(trn_df[['sig_id']+y_keys], on='sig_id', how='left').fillna(0)

# Score against the untouched labels of the full target table.
ys = trn_tar_df[y_keys].values
preds = result[y_keys].values

score = compute_metric(preds, ys)
logger.info(f"Final result: {score}")

Final result: 0.10066642250855576


In [64]:
# Build the submission: keep every sig_id of the sample submission, attach the
# predictions stored in test_df, and use 0 for ids that test_df no longer
# contains (the rows removed by the ctl_vehicle filter).
sub = sub_df.drop(columns=y_keys).merge(test_df[['sig_id'] + y_keys], on='sig_id', how='left').fillna(0)
sub.to_csv('submission.csv', index=False)
sub.head()

Unnamed: 0,sig_id,5-alpha_reductase_inhibitor,11-beta-hsd1_inhibitor,acat_inhibitor,acetylcholine_receptor_agonist,acetylcholine_receptor_antagonist,acetylcholinesterase_inhibitor,adenosine_receptor_agonist,adenosine_receptor_antagonist,adenylyl_cyclase_activator,...,tropomyosin_receptor_kinase_inhibitor,trpv_agonist,trpv_antagonist,tubulin_inhibitor,tyrosine_kinase_inhibitor,ubiquitin_specific_protease_inhibitor,vegfr_inhibitor,vitamin_b,vitamin_d_receptor_agonist,wnt_inhibitor
0,id_0004d9e33,1.231463e-12,2.710965e-08,1.549782e-07,0.000511,0.000733,7.382951e-07,3.187991e-07,1.650766e-05,0.002829148,...,0.0,1.613427e-06,2.151101e-07,3e-06,2.382135e-07,0.2,1.696424e-06,6.015713e-08,2.354105e-07,5.982959e-08
1,id_001897cda,2.985527e-09,1.939319e-08,1.06074e-07,7e-06,6.3e-05,7.803625e-07,1.270994e-07,7.213096e-07,5.170899e-09,...,0.0,6.382178e-09,2.603881e-07,1e-06,1.75103e-06,0.0,1.660233e-06,4.472936e-08,1.379482e-07,1.167755e-06
2,id_002429b5b,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,id_00276f245,1.408916e-07,1.856235e-08,7.432585e-08,1.5e-05,0.000172,1.144074e-06,1.791847e-07,1.202384e-06,1.545812e-08,...,0.0,5.134229e-09,4.85915e-07,1e-05,2.987221e-07,0.0,1.301399e-06,9.34155e-08,1.069867e-07,9.452473e-08
4,id_0027f1083,0.2,2.122316e-08,9.081297e-08,3.6e-05,0.00048,3.435959e-07,1.139989e-06,4.120607e-06,5.184215e-09,...,0.0,1.034091e-08,2.419079e-07,3e-06,3.3605e-07,8.338939e-86,7.049854e-07,9.013518e-07,8.622804e-08,7.092378e-08
