In [1]:
# env setting
import sys
sys.path.append("../src")
sys.path.append("../models")

import numpy as np
import pandas as pd

from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from pycaret.classification import *
from tqdm import tqdm
from tqdm.notebook import tqdm as tqdm_notebook
tqdm_notebook.get_lock().locks = []

#custom function
import config
import helpers
from custom_pipeline import Custom_Pipeline

In [2]:
(train, test, origin, submission) = helpers.data_loader()
train.head(5)

train = train.replace('None', np.NaN)
test = test.replace('None', np.NaN)
origin = origin.replace('None', np.NaN)

categorical_features = config.CATEGORICAL_FEATURES
target = 'outcome'
numerical_features = list(set(train.columns) - set(categorical_features) - set(config.USELESS_FEATURES) - set([target]))

train = pd.concat(
    [train, origin], ignore_index=True
)
train = train.drop_duplicates()

print(train.shape)
print(test.shape)

(1534, 29)
(824, 28)


In [31]:
X_tr = train.copy()
X_test = test.copy()
y = train.outcome

USECOLS = categorical_features
DROPCOLS = ['lesion_3', 'id']
ALPHA = 0.5

pipe = Custom_Pipeline(X_tr, y)
X_tr = pipe.fit_transform(USECOLS, ALPHA, DROPCOLS)
print(X_tr.shape)

(1534, 29)


In [32]:
X_test = pipe.transform(X_test, USECOLS, ALPHA, DROPCOLS)

In [33]:
from lightgbm import LGBMClassifier

X_train, X_val, y_train, y_val = train_test_split(X_tr, y, test_size=0.2, random_state=42)


lgbm = LGBMClassifier()
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
lgbm.fit(X_train, y_train)

scores = cross_val_score(lgbm, X_train, y_train,
                         cv=folds, scoring='f1_micro',
                         n_jobs=-1)

y_pred = lgbm.predict(X_val)
val_score = f1_score(y_true=y_val, y_pred=y_pred, average='micro')

print("==== TRAIN CV SCORE ====")
print("mean score: ", np.mean(scores))
print(scores)

print("==== Valid Score ==== ")
print("score: ", val_score)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000574 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 968
[LightGBM] [Info] Number of data points in the train set: 1227, number of used features: 28
[LightGBM] [Info] Start training from score -1.146181
[LightGBM] [Info] Start training from score -1.631689
[LightGBM] [Info] Start training from score -0.720410
==== TRAIN CV SCORE ====
mean score:  0.7546905591504893
[0.76829268 0.73577236 0.73469388 0.75510204 0.77959184]
==== Valid Score ==== 
score:  0.7622149837133552


In [None]:
import warnings
warnings.filterwarnings('ignore')

import json, joblib, optuna
from optuna.terminator import report_cross_validation_scores
from optuna.visualization import plot_terminator_improvement

def objective(trial):
    
    params = {
        "objective": "multiclass",
        "metric": "multi_logloss",
        "verbosity": -1,
        "boosting_type": "gbdt",
        "num_class": 3,
        "lambda_l1": trial.suggest_float("lambda_l1", 1e-8, 10.0, log=True),
        "lambda_l2": trial.suggest_float("lambda_l2", 1e-8, 10.0, log=True),
        "num_leaves": trial.suggest_int("num_leaves", 2, 256),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.4, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.4, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 7),
        "min_child_samples": trial.suggest_int("min_child_samples", 5, 100),
    }
    
    classifier_obj = LGBMClassifier(**params)
    scores = cross_val_score(classifier_obj, X_train, y_train, 
                            scoring='f1_micro',
                            n_jobs=-1)
    
    return np.mean(scores)

study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=200)

In [28]:
print('Best parameters:', study.best_params) # 0.779


Best parameters: {'lambda_l1': 6.078318923394912e-08, 'lambda_l2': 1.0686575886673962e-06, 'num_leaves': 157, 'feature_fraction': 0.686139775051318, 'bagging_fraction': 0.6823838787125444, 'bagging_freq': 2, 'min_child_samples': 7}


In [35]:
model = LGBMClassifier(**study.best_params)
model.fit(X_train, y_train)

f1_score(y_val, model.predict(X_val), average='micro')

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001172 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 971
[LightGBM] [Info] Number of data points in the train set: 1227, number of used features: 29
[LightGBM] [Info] Start training from score -1.146181
[LightGBM] [Info] Start training from score -1.631689
[LightGBM] [Info] Start training from score -0.720410


0.758957654723127

In [None]:
model = LGBMClassifier(**study.best_params)
model.fit(X_tr, y)

prediction = model.predict(X_test)
decode_map = {
    0 : 'died',
    1 : 'euthanized',
    2 : 'lived'
}

sample_submission = pd.read_csv(config.SUBMISSION_FILE)
#sample_submission['outcome'] = prediction['prediction_label'].map(decode_map)
sample_submission['outcome'] = prediction
sample_submission.to_csv("../output/sample_submission_V3.csv", index=False)
sample_submission

In [44]:
pd.DataFrame(model.feature_importances_, index=model.feature_name_).sort_values(0, ascending=False)

Unnamed: 0,0
hospital_number,2652
total_protein,2417
pulse,2290
packed_cell_volume,2259
lesion_1,2164
rectal_temp,2049
abdomo_protein,1928
nasogastric_reflux_ph,1846
respiratory_rate,1733
deviation_from_normal_temp,1631
