In [12]:
import os
import pandas as pd
import numpy as np
from model import kaggle_metric

from sklearn.metrics import roc_auc_score
from utils import to_logits, sigmoid
import joblib
from lightgbm import LGBMClassifier
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.model_selection import GroupKFold
import optuna
optuna.logging.set_verbosity(optuna.logging.WARNING)

# Load data

In [2]:
sub_names = ['b5-fold0-seed0']

In [3]:
def get_sub(sub_name):
    submission = pd.read_csv(f'../../submission/v22/{sub_name}.csv', index_col='StudyInstanceUID')
    submission = submission.rename(lambda x: x+sub_name, axis=1)
    return submission

In [4]:
submissions = [get_sub(sub_name) for sub_name in sub_names]
train = pd.read_csv('../../input/kaggle/train.csv', index_col='StudyInstanceUID')
label_cols = [
    'ETT - Abnormal', 'ETT - Borderline', 'ETT - Normal', 'NGT - Abnormal', 'NGT - Borderline',
    'NGT - Incompletely Imaged', 'NGT - Normal', 'CVC - Abnormal', 'CVC - Borderline', 'CVC - Normal',
    'Swan Ganz Catheter Present'
]
df = pd.concat([train.loc[submissions[0].index]] + submissions, axis=1)

In [5]:
df

Unnamed: 0_level_0,ETT - Abnormal,ETT - Borderline,ETT - Normal,NGT - Abnormal,NGT - Borderline,NGT - Incompletely Imaged,NGT - Normal,CVC - Abnormal,CVC - Borderline,CVC - Normal,...,ETT - Borderlineb5-fold0-seed0,ETT - Normalb5-fold0-seed0,NGT - Abnormalb5-fold0-seed0,NGT - Borderlineb5-fold0-seed0,NGT - Incompletely Imagedb5-fold0-seed0,NGT - Normalb5-fold0-seed0,CVC - Abnormalb5-fold0-seed0,CVC - Borderlineb5-fold0-seed0,CVC - Normalb5-fold0-seed0,Swan Ganz Catheter Presentb5-fold0-seed0
StudyInstanceUID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.2.826.0.1.3680043.8.498.28811236589118628938068261212733429211,0,0,0,0,0,0,0,0,0,1,...,0.000052,0.000118,0.000053,0.000060,8.401572e-06,0.000067,0.000316,0.003180,0.999078,0.000001
1.2.826.0.1.3680043.8.498.88157424941800234649339861086375648978,0,0,1,0,0,0,0,0,0,0,...,0.325414,0.627799,0.452231,0.182352,1.650982e-02,0.234033,0.021033,0.028103,0.002185,0.000021
1.2.826.0.1.3680043.8.498.13301225220440590738189758741551530261,0,0,0,0,0,0,0,0,0,1,...,0.000012,0.000020,0.000022,0.000070,9.049413e-07,0.000045,0.000069,0.003054,0.997641,0.000001
1.2.826.0.1.3680043.8.498.11248355962522836935440090714457754916,0,0,1,0,0,1,0,0,0,1,...,0.003879,0.995063,0.002109,0.049097,9.150911e-01,0.066824,0.000852,0.026313,0.995916,0.000149
1.2.826.0.1.3680043.8.498.50125510495983316504316300193406208108,0,0,0,0,0,0,0,0,0,1,...,0.000012,0.000028,0.000107,0.000103,8.238742e-06,0.000092,0.002000,0.038309,0.969052,0.000003
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1.2.826.0.1.3680043.8.498.11764913860547225796976468993633871783,0,0,0,0,0,0,0,0,1,0,...,0.000006,0.000019,0.000012,0.000043,1.533556e-06,0.000030,0.011028,0.673253,0.400632,0.000002
1.2.826.0.1.3680043.8.498.78972319974538066240357460297787693641,0,0,1,0,0,1,0,0,1,1,...,0.004181,0.994442,0.002274,0.002906,9.700571e-01,0.021126,0.001179,0.674615,0.796876,0.004134
1.2.826.0.1.3680043.8.498.56841284433498016581766180231512368207,0,0,0,0,0,0,0,0,0,1,...,0.000012,0.000037,0.000016,0.000032,1.828292e-06,0.000042,0.000288,0.001302,0.998567,0.000005
1.2.826.0.1.3680043.8.498.24986243088473858123783492767925302975,0,0,0,0,0,0,0,0,0,1,...,0.000044,0.000191,0.000078,0.000246,4.973071e-05,0.000225,0.004721,0.264532,0.737755,0.000001


In [6]:
def assign_folds(df):
    df['fold'] = 0
    folds = list(GroupKFold(n_splits=5).split(df, df[label_cols], df['PatientID']))
    for i, (trn_idx, val_idx) in enumerate(folds):
        df['fold'].iloc[val_idx] = i
    return df

df = assign_folds(df)
df['fold'].value_counts()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


0    1204
1    1204
4    1203
2    1203
3    1203
Name: fold, dtype: int64

# Baseline

In [7]:
X_COLS = [c for c in df.columns if c not in ['PatientID', 'fold']+label_cols]
Y_COLS = label_cols
for col in label_cols:
    df[f'init_score_{col}'] = to_logits(df[[c for c in X_COLS if c.startswith(col)]].mean(axis=1))

In [9]:
for c in label_cols:
    print(c, roc_auc_score(df[c].values, df[f'init_score_{c}']))

ETT - Abnormal 0.995656862745098
ETT - Borderline 0.9669219658799374
ETT - Normal 0.9917320119454895
NGT - Abnormal 0.9830989060058047
NGT - Borderline 0.968687364502802
NGT - Incompletely Imaged 0.9859753122234585
NGT - Normal 0.9877088630231782
CVC - Abnormal 0.9477036193237715
CVC - Borderline 0.8786828120809276
CVC - Normal 0.9317352992964276
Swan Ganz Catheter Present 0.9994619888163754


# LGB

In [29]:
def train_lgb(x_train, y_train, x_val, y_val, train_init_score, val_init_score, col_num, params={}, seed=0, es=100, n_jobs=24, callbacks=None):
    params['seed'] = seed
    params['device_type'] = 'cpu'
    params['n_jobs'] = n_jobs
    params['objective'] = 'binary'
    params['n_estimators'] = 100000
    params['metric'] = 'auc'
    lgb = LGBMClassifier(**params)
    lgb.fit(x_train, y_train[:, col_num],
            eval_set=[(x_val, y_val[:, col_num])],
            early_stopping_rounds=es,
            verbose=None,
            init_score=train_init_score[:, col_num],
            eval_init_score=[val_init_score[:, col_num]],
            callbacks=callbacks)
    pred = lgb.predict_proba(x_val, raw_score=True) + val_init_score[:, col_num]
    return lgb, pred, lgb.best_score_['valid_0']['auc']

def train_oof(pred_df, col_num, params={}, seed=0, es=100, n_jobs=24, nfold=5):
    models = []
    preds = pd.Series(np.nan, index=pred_df.index)
    for i in range(nfold):
        train, valid = pred_df[pred_df['fold']!=i], pred_df[pred_df['fold']==i]
        x_train = train[X_COLS].values
        x_valid = valid[X_COLS].values
        y_train = train[Y_COLS].values
        y_valid = valid[Y_COLS].values
        train_init_score = train[[c for c in train.columns if c.startswith('init_score')]].values
        val_init_score = valid[[c for c in valid.columns if c.startswith('init_score')]].values
        model, pred, score = train_lgb(x_train, y_train, x_valid, y_valid, 
                                       train_init_score, val_init_score, col_num,
                                       params=params, seed=0, es=es, n_jobs=n_jobs)
        models.append(model)
        preds.loc[valid.index] = pred
    rel_index = preds[~preds.isna()].index
    score = roc_auc_score(pred_df.loc[rel_index].values[:, col_num], preds.loc[rel_index].values)
    return models, preds, score

In [30]:
for col_num in range(11):
    if os.path.exists(f'lgb/best{col_num}.jl'):
        best_score = joblib.load(f'lgb/best{col_num}.jl')[2]
    else:
        best_score = 0

        
    def objective_lgb(trial):
        params = {
            'num_leaves': trial.suggest_int('num_leaves', 10, 512),  # 31
            'min_child_samples': trial.suggest_int('min_child_samples', 5, 100),  # 20
            'min_child_weight': trial.suggest_loguniform('min_child_weight', 1e-8, 1.0),  # 1e-3
            'learning_rate': trial.suggest_uniform('learning_rate', 0.03, 0.1),  # 0.1
            'reg_alpha': trial.suggest_loguniform('reg_alpha', 1e-8, 1.0),  # 0.0
            'reg_lambda': trial.suggest_loguniform('reg_lambda', 1e-8, 1.0),  # 0.0
            # 'min_split_gain': trial.suggest_loguniform('min_split_gain', 1e-8, 1.0),  # 0.0
            'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),  # 1.0
            'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),  # 1.0
            'subsample_freq': trial.suggest_int('subsample_freq', 1, 7),  # 0
            # 'max_bin': trial.suggest_int('max_bin', 63, 255),  # 255
        }

        # callbacks = [optuna.integration.LightGBMPruningCallback(trial, "auc")]
        callbacks = None

    #     lgb, pred, score = train_lgb(x_train, y_train, x_valid, y_valid, params, 0, 100, 24, callbacks=callbacks)
        lgbs, preds, score = train_oof(df, col_num, params, 0, 100, 32, nfold=1)

        print(trial.number, score)

        global best_score
        if score > best_score:
            best_score = score
            joblib.dump((lgbs, preds, score), f'lgb/best{col_num}.jl')

        return score

    def callback(study, _):
        joblib.dump(study, f'lgb/study{col_num}.jl')


    if os.path.exists(f'lgb/study{col_num}.jl'):
        lgb_study = joblib.load(f'lgb/study{col_num}.jl')
    else:
        lgb_study = optuna.create_study(
            sampler=optuna.samplers.TPESampler(n_startup_trials=100, multivariate=True),
            # pruner=optuna.pruners.PercentilePruner(90.0, n_startup_trials=10, n_warmup_steps=100),
            direction='maximize'
        )

    lgb_study.optimize(objective_lgb, n_trials=300, n_jobs=1, callbacks=[callback])

[33m[W 2021-03-04 23:59:34,662][0m Trial 0 failed because of the following error: ValueError('unknown format is not supported')
Traceback (most recent call last):
  File "/home/vnfmadl97/anaconda3/envs/gpu/lib/python3.8/site-packages/optuna/_optimize.py", line 189, in _run_trial
    value = func(trial)
  File "<ipython-input-30-815b3cb517a6>", line 27, in objective_lgb
    lgbs, preds, score = train_oof(df, col_num, params, 0, 100, 32, nfold=1)
  File "<ipython-input-29-502f770c8087>", line 37, in train_oof
    score = roc_auc_score(pred_df.loc[rel_index].values[:, col_num], preds.loc[rel_index].values)
  File "/home/vnfmadl97/anaconda3/envs/gpu/lib/python3.8/site-packages/sklearn/utils/validation.py", line 72, in inner_f
    return f(**kwargs)
  File "/home/vnfmadl97/anaconda3/envs/gpu/lib/python3.8/site-packages/sklearn/metrics/_ranking.py", line 395, in roc_auc_score
    return _average_binary_score(partial(_binary_roc_auc_score,
  File "/home/vnfmadl97/anaconda3/envs/gpu/lib/pyth

[0 0 0 ... 0 0 0] [-11.54539309 -12.41865356 -10.86040695 ... -12.19135854 -11.99796301
 -11.91387282]


ValueError: unknown format is not supported

In [None]:
study = joblib.load('study.jl')
params = study.best_params
print(study.best_value)
print(study.best_params)

0.816515

In [28]:
lgbs, preds, score = train_oof(pred_df, params, 0, 100, 32, nfold=5)
score

0.8168961288159211

In [31]:
lgbs, preds, score = train_oof(pred_df, params, 0, 100, 32, nfold=5)
score

0.8169142388260161

In [32]:
joblib.dump(lgbs, '../../model/final_models/lgbs.jl')

['lgbs_seq800.jl']

## END

In [34]:
study = joblib.load('study.jl')
params = study.best_params
print(study.best_value)
print(study.best_params)

0.817857952055393
{'num_leaves': 81, 'min_child_samples': 83, 'min_child_weight': 0.1303821087566591, 'learning_rate': 0.03757025751059658, 'reg_alpha': 0.0011759670245325162, 'reg_lambda': 0.00828634910217317, 'colsample_bytree': 0.830327111500236, 'subsample': 0.6598225978204229, 'subsample_freq': 3}


In [187]:
lgbs, preds, score = train_oof(pred_df, params, 0, 100, 24, nfold=7
score

0.8161451258114345

In [188]:
lgbs[0].feature_importances_

array([431, 388, 480, 592, 405, 351, 442, 395,  81, 543, 558,  36,  28,
       506, 426,  43, 458, 509, 559, 459, 518, 184,   8], dtype=int32)