# V5.

Custom Pipeline, Ensemble 기법 사용

In [1]:
# env setting
import sys
sys.path.append("../src")
sys.path.append("../models")

import numpy as np
import pandas as pd

from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
# from sklearn.pipeline import Pipeline
from sklearn.impute import KNNImputer

# model import
from sklearn.ensemble import GradientBoostingClassifier, HistGradientBoostingClassifier, RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

#custom function
import config
import helpers
from custom_pipeline import Custom_Pipeline

Read Data

In [2]:
# Load the train / test frames, the original dataset and the sample submission
# through the project helper.
(train, test, origin, submission) = helpers.data_loader()

# Missing values are spelled as the string 'None' in the raw frames; turn them
# into real NaN. `np.nan` is used because the `np.NaN` alias was removed in
# NumPy 2.0.
train = train.replace('None', np.nan)
test = test.replace('None', np.nan)
origin = origin.replace('None', np.nan)

# Build a NEW list instead of appending to config.CATEGORICAL_FEATURES:
# appending mutated the shared config list, so every re-run of this cell added
# another duplicate 'number_of_treatements' entry.
# NOTE(review): if Custom_Pipeline reads config.CATEGORICAL_FEATURES directly,
# confirm it does not rely on the old in-place append.
categorical_features = [*config.CATEGORICAL_FEATURES, 'number_of_treatements']
# categorical_features.remove('lesion_2')
# categorical_features.remove('lesion_3')
target = 'outcome'

# Every column that is not categorical, useless or the target counts as
# numerical. Iterating over train.columns keeps a stable column order
# (list(set(...)) returned the same names in an arbitrary order).
non_numerical = {*categorical_features, *config.USELESS_FEATURES, target}
numerical_features = [col for col in train.columns if col not in non_numerical]

# Extend the training data with the original dataset, then drop exact
# duplicate rows.
train = pd.concat(
    [train, origin], ignore_index=True
)
train = train.drop_duplicates()

print(train.shape)
print(test.shape)

(1534, 29)
(824, 28)


Data Preprocessing

In [3]:
# Integer codes for the three outcome classes; the submission cells decode
# predictions with the inverse of this mapping.
OUTCOME_ENCODING = {
    'died' : 0,
    'euthanized' : 1,
    'lived' : 2
}

X_tr = train.copy()
X_test = test.copy()

# Encode the target into a new Series instead of overwriting train['outcome'].
# The in-place overwrite made this cell non-idempotent (a second run mapped the
# integer codes through the string-keyed dict and produced NaN) and broke any
# later cell that maps the raw string labels again.
y = train['outcome'].map(OUTCOME_ENCODING)
assert y.notna().all(), "unexpected label in train['outcome']"

USECOLS = categorical_features
DROPCOLS = ['lesion_2', 'lesion_3', 'id']
ALPHA = 0.5

# The pipeline is fitted on the training frame and then applied to the test
# frame with the same arguments.
pipe = Custom_Pipeline(X_tr, y)
X_tr = pipe.fit_transform(USECOLS, ALPHA, DROPCOLS)
X_test = pipe.transform(X_test, USECOLS, ALPHA, DROPCOLS)
print(X_tr.shape)

(1534, 27)


With Optimizing

In [5]:
# Base learners for the soft-voting ensemble; hyper-parameters live in config.
estimators = [
    ('xgb', XGBClassifier(**config.XGB_PARAMS)),
    ('lgbm', LGBMClassifier(**config.LGBM_PARAMS)),
    ('hgb', HistGradientBoostingClassifier(**config.HGB_PARAMS)),
]

# Soft voting with one weight per base learner, in the order listed above.
ensemble = VotingClassifier(
    estimators,
    voting='soft',
    weights=[1.15176, 0.37643, 1.20592],
)

# Hold out 10% (stratified) for a final check; cross-validate on the rest.
x_train, x_val, y_train, y_val = train_test_split(
    X_tr, y,
    test_size=0.1,
    stratify=y,
    random_state=42,
)
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_val_score(
    ensemble, x_train, y_train,
    scoring='f1_micro',
    cv=folds,
    n_jobs=-1,
)

print(scores)
print(scores.mean())

# Fit on the training split and score the held-out validation split.
prediction = ensemble.fit(x_train, y_train).predict(x_val)
print(f1_score(y_val, prediction, average='micro'))

[0.78985507 0.75362319 0.80072464 0.7826087  0.76449275]
0.7782608695652173
0.7922077922077922


In [10]:
# Refit the voting ensemble on every training row, then predict the test set.
prediction = ensemble.fit(X_tr, y).predict(X_test)

# Inverse of the integer encoding applied to the target.
decode_map = {0: 'died', 1: 'euthanized', 2: 'lived'}

# Fill the sample submission with the decoded labels and write it out.
sample_submission = pd.read_csv(config.SUBMISSION_FILE)
sample_submission['outcome'] = pd.Series(
    prediction, index=sample_submission.index
).map(decode_map)
sample_submission.to_csv(
    '../output/sample_submission_V4(ensemble_sklearn 2nd).csv',
    index=False,
)
sample_submission

Unnamed: 0,id,outcome
0,1235,lived
1,1236,died
2,1237,lived
3,1238,euthanized
4,1239,lived
...,...,...
819,2054,died
820,2055,euthanized
821,2056,died
822,2057,lived


In [6]:
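# Disabled: refit the ensemble on the full training data and persist the model
# together with the transformed training matrix and target.
# import pickle
# from joblib import dump, load
# ensemble.fit(X_tr, y)

# dump(ensemble, '../models/ensembleV5.pkl')
# X_tr.to_csv("../input/X_train.csv", index=False)
# y.to_csv("../input/target.csv", index=False)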

### stacking

In [12]:
from sklearn.ensemble import StackingClassifier
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.svm import LinearSVC

# The linear SVM is the only base learner that gets standardised inputs, so it
# is wrapped in its own scaler pipeline.
scaled_svc = make_pipeline(
    StandardScaler(),
    LinearSVC(dual='auto', random_state=42),
)

# Base learners for the stacking ensemble.
estimators = [
    ('xgb', XGBClassifier(**config.XGB_PARAMS)),
    ('lgbm', LGBMClassifier(**config.LGBM_PARAMS)),
    ('hgb', HistGradientBoostingClassifier(**config.HGB_PARAMS)),
    ('svr', scaled_svc),
]

# Hold out 10% (stratified) for a final check; cross-validate on the rest.
x_train, x_val, y_train, y_val = train_test_split(
    X_tr, y,
    test_size=0.1,
    stratify=y,
    random_state=42,
)
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Recorded results per meta model (CV mean / hold-out):
# meta model - logistic regression - 0.7789 / 0.7922
# meta model - random forest - 0.7601 / 0.7467
# No final_estimator is passed, so scikit-learn's default meta model is used
# (documented as LogisticRegression).
staking = StackingClassifier(estimators=estimators, n_jobs=-1)

scores = cross_val_score(
    staking, x_train, y_train,
    scoring='f1_micro',
    cv=folds,
    n_jobs=-1,
)

print(scores)
print(scores.mean())

# Fit on the training split and score the held-out validation split.
prediction = staking.fit(x_train, y_train).predict(x_val)
print(f1_score(y_val, prediction, average='micro'))

[0.77898551 0.76449275 0.79710145 0.8115942  0.78623188]
0.7876811594202899
0.7987012987012987


In [13]:
# Refit the stacking ensemble on every training row, then predict the test set.
prediction = staking.fit(X_tr, y).predict(X_test)

# Inverse of the integer encoding applied to the target.
decode_map = {0: 'died', 1: 'euthanized', 2: 'lived'}

# Fill the sample submission with the decoded labels and write it out.
sample_submission = pd.read_csv(config.SUBMISSION_FILE)
sample_submission['outcome'] = pd.Series(
    prediction, index=sample_submission.index
).map(decode_map)
sample_submission.to_csv(
    '../output/sample_submission_V5(stacking).csv',
    index=False,
)
sample_submission

Unnamed: 0,id,outcome
0,1235,lived
1,1236,died
2,1237,lived
3,1238,euthanized
4,1239,lived
...,...,...
819,2054,died
820,2055,euthanized
821,2056,died
822,2057,lived


### Train only Euthanized

In [3]:
# One-vs-rest target: POSITIVE_CLASS -> 1, every other outcome -> 0.
# Set POSITIVE_CLASS to 'euthanized' or 'lived' to build the target for the
# other one-vs-rest models.
# NOTE(review): the section heading mentions euthanized, but the mapping in
# this cell marks 'died' as the positive class - confirm which one is intended.
OUTCOME_LABELS = ('died', 'euthanized', 'lived')
POSITIVE_CLASS = 'died'

enth_tr = train.copy()
enth_test = test.copy()

outcome_labels = train['outcome']
if not outcome_labels.isin(OUTCOME_LABELS).all():
    # An earlier preprocessing cell may have overwritten train['outcome'] with
    # the integer codes 0/1/2 (in OUTCOME_LABELS order). Mapping the string
    # keys over those codes silently produced an all-NaN target, so decode the
    # codes back to labels first.
    outcome_labels = outcome_labels.map(dict(enumerate(OUTCOME_LABELS)))

# Build y as a new Series; train['outcome'] is left untouched so this cell can
# be re-run (or re-run with another POSITIVE_CLASS) without a kernel restart.
y = outcome_labels.map(
    {label: int(label == POSITIVE_CLASS) for label in OUTCOME_LABELS}
)
assert y.notna().all(), "unexpected label in train['outcome']"

USECOLS = categorical_features
DROPCOLS = ['lesion_2', 'lesion_3', 'id']
ALPHA = 0.5

# The pipeline is fitted on the training frame and then applied to the test
# frame with the same arguments.
pipe = Custom_Pipeline(enth_tr, y)
enth_tr = pipe.fit_transform(USECOLS, ALPHA, DROPCOLS)
enth_test = pipe.transform(enth_test, USECOLS, ALPHA, DROPCOLS)
print(enth_tr.shape)

(1534, 27)


In [4]:
# Euthanized
# NOTE(review): this cell does not pick the class itself; it trains on whatever
# binary target `y` the target-building cell last produced.
model = XGBClassifier()

# Hold out 10% (stratified) for a final check; cross-validate on the rest.
x_train, x_val, y_train, y_val = train_test_split(
    enth_tr, y,
    test_size=0.1,
    stratify=y,
    random_state=42,
)
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_val_score(
    model, x_train, y_train,
    scoring='f1',
    cv=folds,
    n_jobs=-1,
)

print(scores)
print(scores.mean())

# Fit on the training split and score the held-out validation split.
prediction = model.fit(x_train, y_train).predict(x_val)
print(f1_score(y_val, prediction))

[0.71578947 0.61363636 0.63736264 0.60606061 0.49382716]
0.6133352482475289
0.7924528301886793


In [5]:
from sklearn.metrics import precision_score

# Precision of the positive class on the held-out validation split.
precision_score(y_true=y_val, y_pred=prediction)

0.9130434782608695

In [4]:
# Died
# NOTE(review): this cell does not pick the class itself; it trains on whatever
# binary target `y` the target-building cell last produced.
model = XGBClassifier()

# Hold out 10% (stratified) for a final check; cross-validate on the rest.
x_train, x_val, y_train, y_val = train_test_split(
    enth_tr, y,
    test_size=0.1,
    stratify=y,
    random_state=42,
)
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_val_score(
    model, x_train, y_train,
    scoring='f1',
    cv=folds,
    n_jobs=-1,
)

print(scores)
print(scores.mean())

# Fit on the training split and score the held-out validation split.
prediction = model.fit(x_train, y_train).predict(x_val)
print(f1_score(y_val, prediction))

[0.72941176 0.73033708 0.69090909 0.72289157 0.7431694 ]
0.7233437798877645
0.7692307692307693


In [5]:
from sklearn.metrics import precision_score

# Precision of the positive class on the held-out validation split.
precision_score(y_true=y_val, y_pred=prediction)

0.7272727272727273

In [4]:
# Lived
# NOTE(review): this cell does not pick the class itself; it trains on whatever
# binary target `y` the target-building cell last produced.
model = XGBClassifier()

# Hold out 10% (stratified) for a final check; cross-validate on the rest.
x_train, x_val, y_train, y_val = train_test_split(
    enth_tr, y,
    test_size=0.1,
    stratify=y,
    random_state=42,
)
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

scores = cross_val_score(
    model, x_train, y_train,
    scoring='f1',
    cv=folds,
    n_jobs=-1,
)

print(scores)
print(scores.mean())

# Fit on the training split and score the held-out validation split.
prediction = model.fit(x_train, y_train).predict(x_val)
print(f1_score(y_val, prediction))

[0.81712062 0.81538462 0.82527881 0.80740741 0.7896679 ]
0.810971870489601
0.8211920529801323


In [6]:
from sklearn.metrics import precision_score

# Precision of the positive class on the held-out validation split.
precision_score(y_true=y_val, y_pred=prediction)

0.8157894736842105

F1 score (hold-out): euthanized - 0.79, died - 0.77, lived - 0.82  
Precision score (hold-out): euthanized - 0.91, died - 0.73, lived - 0.82

(154,)
55
