In [1]:
# env setting
import sys
sys.path.append("../src")
sys.path.append("../models")

import numpy as np
import pandas as pd
from numpy.random import normal
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.preprocessing import TargetEncoder
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
from pycaret.classification import *
from tqdm import tqdm
from tqdm.notebook import tqdm as tqdm_notebook
tqdm_notebook.get_lock().locks = []

#custom function
import config
import helpers

# V2. Code
* Target Encoding으로 진행 (모델 고정 후 결과를 비교해보기)
* Missing Value를 예측값으로 Impute 하기
* 추가적인 변수 생성 해보기

In [38]:
(train, test, origin, submission) = helpers.data_loader()
train.head(5)

Unnamed: 0,id,surgery,age,hospital_number,rectal_temp,pulse,respiratory_rate,temp_of_extremities,peripheral_pulse,mucous_membrane,...,packed_cell_volume,total_protein,abdomo_appearance,abdomo_protein,surgical_lesion,lesion_1,lesion_2,lesion_3,cp_data,outcome
0,0,yes,adult,530001,38.1,132.0,24.0,cool,reduced,dark_cyanotic,...,57.0,8.5,serosanguious,3.4,yes,2209,0,0,no,died
1,1,yes,adult,533836,37.5,88.0,12.0,cool,normal,pale_cyanotic,...,33.0,64.0,serosanguious,2.0,yes,2208,0,0,no,euthanized
2,2,yes,adult,529812,38.3,120.0,28.0,cool,reduced,pale_pink,...,37.0,6.4,serosanguious,3.4,yes,5124,0,0,no,lived
3,3,yes,adult,5262541,37.1,72.0,30.0,cold,reduced,pale_pink,...,53.0,7.0,cloudy,3.9,yes,2208,0,0,yes,lived
4,4,no,adult,5299629,38.0,52.0,48.0,normal,normal,normal_pink,...,47.0,7.3,cloudy,2.6,no,0,0,0,yes,lived


In [39]:
train = train.replace('None', np.NaN)
test = test.replace('None', np.NaN)
origin = origin.replace('None', np.NaN)

categorical_features = config.CATEGORICAL_FEATURES
target = 'outcome'
numerical_features = list(set(train.columns) - set(categorical_features) - set(config.USELESS_FEATURES) - set([target]))

In [40]:
print(f"TRAIN SHAPE : {train.shape}")
print(f"TEST SHAPE : {test.shape}")
print(f"ORIGINAL SHAPE : {origin.shape}")

TRAIN SHAPE : (1235, 29)
TEST SHAPE : (824, 28)
ORIGINAL SHAPE : (299, 28)


In [41]:
train['outcome'] = train['outcome'].map({
    'died' : 0,
    'euthanized' : 1,
    'lived' : 2
})

## Concat Data

In [14]:
def encode(y, target_map):
    y = np.array(y)
    encoded_y = [target_map[f] for f in y]
    return encoded_y

In [15]:
# concat train, test, origin
# first, concat train & origin
train = pd.concat(
    [train, origin], ignore_index=True
)
train = train.drop_duplicates().drop(['id'], axis=1)
test = test.drop(['id'], axis=1)


target_map = {
    'died' : 0,
    'euthanized' : 1,
    'lived' : 2
}

train['outcome'] = encode(train.outcome, target_map)

## Missing Values
train 데이터를 helpers.preprocessing output으로 고정시켜 Impute 실험을 총 4개로 진행. F1 score를 비교하였음.
1. SimpleImputer (mode, mean)
2. BeysianRidge Imputer
3. RandomForest Imputer
4. KNN Imputer

결과 :  
Train Set에 대해선 2, 3, 4 모두 0.73 정도로 약간의 차이만 있었지만  
Valid Set에 대해선 KNN Imputer가 가장 높은 <font color='orange'>F1 Score(0.702)</font>를 기록하여 KNN Imputer로 채택해보았음.

문제점:  
Hyperparameter세팅을 하지 않고 진행하였음.  
Target Encoding으로 Preprocess를 진행하면 결과가 또 달라질 수 있다고 생각됨.

In [7]:
print("Train NaN : \n", train.isna().sum())
print("Test Nan : \n", test.isna().sum())

Train NaN : 
 surgery                    0
age                        0
hospital_number            0
rectal_temp               60
pulse                     24
respiratory_rate          58
temp_of_extremities       95
peripheral_pulse         129
mucous_membrane           68
capillary_refill_time     38
pain                      99
peristalsis               64
abdominal_distention      79
nasogastric_tube         184
nasogastric_reflux       127
nasogastric_reflux_ph    246
rectal_exam_feces        292
abdomen                  331
packed_cell_volume        29
total_protein             33
abdomo_appearance        213
abdomo_protein           198
surgical_lesion            0
lesion_1                   0
lesion_2                   0
lesion_3                   0
cp_data                    0
outcome                    0
dtype: int64
Test Nan : 
 surgery                    0
age                        0
hospital_number            0
rectal_temp                0
pulse                      0
res

In [36]:
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer
from sklearn.metrics import f1_score
from sklearn.model_selection import StratifiedKFold
from sklearn.ensemble import RandomForestRegressor

le_cols = ['surgery', 'age', 'surgical_lesion', 'cp_data']
ohe_cols = ['mucous_membrane']

train_preprocessing = helpers.preprocessing(train.copy(), le_cols, ohe_cols)

train_encoding_v1 = train.copy().drop(['outcome'], axis=1) # Mode, Mean
train_encoding_v2 = train_preprocessing.copy().drop(['outcome'], axis=1) # Baysian Ridge
train_encoding_v3 = train_preprocessing.copy().drop(['outcome'], axis=1) # Tree
train_encoding_v4 = train_preprocessing.copy().drop(['outcome'], axis=1) # KNN

train_encoding_v1[categorical_features] = (SimpleImputer(strategy="most_frequent").set_output(transform="pandas")
                                           .fit_transform(train_encoding_v1[categorical_features]))

train_encoding_v1[numerical_features] = (SimpleImputer(strategy="mean").set_output(transform="pandas")
                                         .fit_transform(train_encoding_v1[numerical_features]))

train_encoding_v2 = (IterativeImputer(random_state=42).set_output(transform="pandas")
                    .fit_transform(train_encoding_v2, train.outcome))

train_encoding_v3 = (IterativeImputer(estimator= RandomForestRegressor(random_state=42), random_state=42).set_output(transform="pandas")
                                     .fit_transform(train_encoding_v3, train.outcome))

train_encoding_v4 = (KNNImputer(n_neighbors=5).set_output(transform="pandas")
                    .fit_transform(train_encoding_v4))


In [52]:
from sklearn.ensemble import RandomForestClassifier

train_experiments = [helpers.preprocessing(train_encoding_v1, le_cols, ohe_cols), train_encoding_v2, train_encoding_v3, train_encoding_v4]
target = train.outcome.map({
    'died' : 0,
    'euthanized' : 1,
    'lived' : 2
})

rf = RandomForestClassifier(max_depth=6, random_state=42)
folds = StratifiedKFold(n_splits=5)

f1_score_fold = {}
for ind, train_ in enumerate(train_experiments):
    print(f'imputation version {ind + 1}')
    scores_trn, scores_val = [], []
    for fold_, (train_ind, val_ind) in enumerate(folds.split(train_, target)):
        trn_data, val_data = train_.iloc[train_ind], train_.iloc[val_ind]
        y_trn, y_val = target.iloc[train_ind], target.iloc[val_ind]
        
        rf = RandomForestClassifier(max_depth=4, random_state=42)
        rf.fit(trn_data, y_trn)
        y_hat = rf.predict(trn_data)
        y_hat_val = rf.predict(val_data)
        
        train_score = f1_score(y_trn, y_hat, average = 'micro')
        val_score = f1_score(y_val, y_hat_val, average = 'micro')
        
        scores_trn.append(train_score)
        scores_val.append(val_score)
        
    print(f"        [Train]: {np.mean(scores_trn)}")
    print(f"        [Valid]: {np.mean(scores_val)}")


imputation version 1
        [Train]: 0.7310951474558587
        [Valid]: 0.6910029592727428
imputation version 2
        [Train]: 0.7337028689449385
        [Valid]: 0.7001447701773434
imputation version 3
        [Train]: 0.7333770033104232
        [Valid]: 0.6929722594792531
imputation version 4
        [Train]: 0.7333774015169012
        [Valid]: 0.7020991675714803


## Target Encoding with KNN Imputation
https://www.kaggle.com/code/vprokopev/mean-likelihood-encodings-a-comprehensive-study/notebook -> 공부하기 좋은 링크  

위 링크에서 먼저 label Encoding, Frequency Encoding을 사용해보라고 하여서 Frequency Encoding과 비교 
Encoding 후 KNN Imputer로 결측값 채워넣고 진행.

결과:
Freq, Target 방법이 n_neighbors = 11에서 `CV F1 Score (0.72425)`의 최고점을 기록하였음.  
따라서 둘 중에 하나만 선택해도 될 것 같음.

In [21]:
from numpy.random import normal
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.preprocessing import TargetEncoder
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, IterativeImputer, KNNImputer


    
def freq_encode(train_data, columns):
    encoded_cols = []
    n_samples = train_data.shape[0]
    for col in columns:
        freqs_cat = train_data.groupby(col)[col].count() / n_samples
        encoded_col_train = train_data[col].map(freqs_cat)
        encoded_cols.append(pd.DataFrame({'freq_'+ col:encoded_col_train}))
    return pd.concat(encoded_cols, axis=1)

def target_encode(train_data, target, columns, alpha=0.5):
    n_rows = train_data.shape[0]
    global_mean = train_data[target].mean()
    encoded_cols = []
    
    for col in columns:
        target_mean = train_data.groupby(col)[target].mean()
        smoothing = (target_mean * n_rows + global_mean * alpha) / (n_rows + alpha)
        encoded_col_train = train_data[col].map(smoothing)
        #encoded_col_test = test_data[col].map(smoothing)
        encoded_cols.append(pd.DataFrame({'target_' + col:encoded_col_train}))
        
    return pd.concat(encoded_cols, axis=1)
    


In [51]:
le_cols = ['surgery', 'age', 'surgical_lesion', 'cp_data']
ohe_cols = ['mucous_membrane']

rf = RandomForestClassifier(max_depth=6, random_state=42)
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

train_experiments = []
for _ in range(3):
    train_experiments.append(train.copy())

y = train.copy().outcome

freq_cat = freq_encode(train_experiments[0],  categorical_features)
train_experiments[0] = pd.concat([train_experiments[0].drop(categorical_features, axis=1), freq_cat], axis=1).drop(['outcome'], axis=1) # concat with numerics

train_experiments[1] = helpers.preprocessing(train_experiments[1], le_cols, ohe_cols).drop(['outcome'], axis=1)

targ_cat = target_encode(train_experiments[2], 'outcome', categorical_features, alpha=0.5)
train_experiments[2] = pd.concat([train_experiments[2].drop(categorical_features, axis=1), freq_cat], axis=1).drop(['outcome'], axis=1) # concat with numerics

# freq, target은 n_neighbors=11일때 잘 작동

for setup_x in train_experiments:
    setup_x = (KNNImputer(n_neighbors=11).set_output(transform="pandas")
               .fit_transform(setup_x))
    
    _scores = cross_val_score(rf, setup_x, y,
                                cv=folds, scoring='f1_micro',
                                n_jobs=-1)
    
    print(f"CV Average Score : {round(np.mean(_scores), 5)}")
    print(f"    {_scores}")


CV Average Score : 0.72425
    [0.72312704 0.71335505 0.70684039 0.75895765 0.71895425]
CV Average Score : 0.70079
    [0.70358306 0.71009772 0.65798046 0.72312704 0.70915033]
CV Average Score : 0.72425
    [0.72312704 0.71335505 0.70684039 0.75895765 0.71895425]


### KNN Imputer
KNN Imputer의 경우 n_neighbors parameter를 바꿔가며 실험해 보자. 
<font color='gray'>PS S3E22 | EDA - Preprocessing - Ensemble</font>

결과:  
n_neighbors 수가 10인 경우가 CV Score 가 7.13으로 가장 높게 나타났다.


In [138]:
def knn_imputer_test(X, y, n_search_grid:list):
    train_knn = X.copy()
    
    rf = RandomForestClassifier(max_depth=6, random_state=42)
    folds = StratifiedKFold(n_splits=5)
    
    target = y.map({
        'died' : 0,
        'euthanized' : 1,
        'lived' : 2
    })
    
    knn_expirement_data = []
    for n in n_search_grid:
        _result = (KNNImputer(n_neighbors=n).set_output(transform="pandas")
                .fit_transform(train_knn))
        
        scores = cross_val_score(rf, _result, target,
                                cv=folds, scoring='f1_micro',
                                n_jobs=-1)
        
        print(f"{n} Neighbors CV Average Score : {round(np.mean(scores), 5)}")
        print(f"    {scores}")

In [142]:
# test n_neighbors = 10
train_knn = train_preprocessing.copy().drop(['outcome'], axis=1) # KNN

# init model, folds
rf = RandomForestClassifier(max_depth=6, random_state=42)
folds = StratifiedKFold(n_splits=5)

target = train_preprocessing.copy().outcome.map({
    'died' : 0,
    'euthanized' : 1,
    'lived' : 2
})

grid = [3, 5, 7, 10, 12, 15, 20]
knn_expirement_data = []
for n in grid:
    _result = (KNNImputer(n_neighbors=n).set_output(transform="pandas")
               .fit_transform(train_knn))
    
    scores = cross_val_score(rf, _result, target,
                             cv=folds, scoring='f1_micro',
                             n_jobs=-1)
    
    print(f"{n} Neighbors CV Average Score : {round(np.mean(scores), 5)}")
    print(f"    {scores}")

3 Neighbors CV Average Score : 0.70276
    [0.66775244 0.70358306 0.70358306 0.70684039 0.73202614]
5 Neighbors CV Average Score : 0.70796
    [0.6742671  0.71009772 0.71661238 0.72312704 0.71568627]
7 Neighbors CV Average Score : 0.70471
    [0.66449511 0.70032573 0.70032573 0.73289902 0.7254902 ]
10 Neighbors CV Average Score : 0.71318
    [0.67100977 0.70358306 0.72312704 0.72964169 0.73856209]
12 Neighbors CV Average Score : 0.70797
    [0.67752443 0.69381107 0.71335505 0.72638436 0.72875817]
15 Neighbors CV Average Score : 0.70471
    [0.66775244 0.69381107 0.70684039 0.72638436 0.72875817]
20 Neighbors CV Average Score : 0.7034
    [0.65798046 0.70358306 0.71009772 0.71986971 0.7254902 ]


## Pycaret Modeling
중간 Test 결과:  
Highest Score : 0.705 -> 0.76

In [52]:
from pycaret.classification import *

s = setup(data = train, target = 'outcome', categorical_features=categorical_features,
          normalize=True, fold=5, session_id=42)

compare_models()

Unnamed: 0,Description,Value
0,Session id,42
1,Target,outcome
2,Target type,Multiclass
3,Original data shape,"(1534, 28)"
4,Transformed data shape,"(1534, 70)"
5,Transformed train set shape,"(1073, 70)"
6,Transformed test set shape,"(461, 70)"
7,Ordinal features,5
8,Numeric features,8
9,Categorical features,19


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.7102,0.8569,0.7102,0.7094,0.7048,0.524,0.5273,0.134
rf,Random Forest Classifier,0.7037,0.8578,0.7037,0.7055,0.7006,0.5143,0.5164,0.128
lightgbm,Light Gradient Boosting Machine,0.7028,0.8232,0.7028,0.7036,0.7017,0.5171,0.5181,0.588
gbc,Gradient Boosting Classifier,0.699,0.8308,0.699,0.6998,0.6973,0.5089,0.5106,0.274
xgboost,Extreme Gradient Boosting,0.699,0.8254,0.699,0.6993,0.6979,0.5111,0.5121,0.12
catboost,CatBoost Classifier,0.6972,0.8396,0.6972,0.6967,0.6953,0.5063,0.5074,1.51
ridge,Ridge Classifier,0.6907,0.0,0.6907,0.6898,0.6855,0.4899,0.4927,0.066
lda,Linear Discriminant Analysis,0.6813,0.8316,0.6813,0.6799,0.6791,0.4821,0.4831,0.072
lr,Logistic Regression,0.6795,0.8328,0.6795,0.6776,0.6768,0.4779,0.4791,0.914
ada,Ada Boost Classifier,0.6459,0.7897,0.6459,0.6452,0.6432,0.4235,0.4256,0.104


In [57]:
def target_encode(train_data, test_data, target, columns, alpha=0.5):
    n_rows = train_data.shape[0]
    global_mean = train_data[target].mean()
    encoded_cols_train = []
    encoded_cols_test = []
    
    for col in columns:
        target_mean = train_data.groupby(col)[target].mean()
        smoothing = (target_mean * n_rows + global_mean * alpha) / (n_rows + alpha)
        
        encoded_col_train = train_data[col].map(smoothing)
        encoded_col_test = test_data[col].map(smoothing)
        
        encoded_cols_train.append(pd.DataFrame({col:encoded_col_train}))
        encoded_cols_test.append(pd.DataFrame({col:encoded_col_test}))
        
    return pd.concat(encoded_cols_train, axis=1), pd.concat(encoded_cols_test, axis=1)

df_train = train.copy()
df_test = test.copy()

tar_train, tar_test = target_encode(df_train, df_test, "outcome", categorical_features, alpha=0.5)
df_train = pd.concat([df_train.drop(categorical_features, axis=1), tar_train], axis=1) # concat with numerics

df_train = (KNNImputer(n_neighbors=11).set_output(transform="pandas")
            .fit_transform(df_train))

s = setup(data = df_train, target = 'outcome', categorical_features=categorical_features,
          normalize=True, fold=5, session_id=42)

compare_models()



Unnamed: 0,Description,Value
0,Session id,42
1,Target,outcome
2,Target type,Multiclass
3,Original data shape,"(1534, 28)"
4,Transformed data shape,"(1534, 42)"
5,Transformed train set shape,"(1073, 42)"
6,Transformed test set shape,"(461, 42)"
7,Ordinal features,5
8,Numeric features,8
9,Categorical features,19


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.7708,0.8988,0.7708,0.7717,0.7681,0.626,0.6282,0.252
lightgbm,Light Gradient Boosting Machine,0.7689,0.8958,0.7689,0.7715,0.7669,0.6245,0.6268,0.53
xgboost,Extreme Gradient Boosting,0.7661,0.8948,0.7661,0.7677,0.7649,0.6207,0.6221,0.086
catboost,CatBoost Classifier,0.7577,0.8934,0.7577,0.7588,0.7559,0.6054,0.6068,1.09
lr,Logistic Regression,0.7494,0.8841,0.7494,0.7473,0.7465,0.5922,0.5937,0.84
et,Extra Trees Classifier,0.7494,0.8897,0.7494,0.7546,0.7455,0.591,0.5951,0.082
rf,Random Forest Classifier,0.7438,0.8902,0.7438,0.748,0.7404,0.5816,0.5848,0.1
ada,Ada Boost Classifier,0.7419,0.8422,0.7419,0.7414,0.7408,0.5826,0.5833,0.062
ridge,Ridge Classifier,0.741,0.0,0.741,0.7422,0.735,0.5748,0.5786,0.036
lda,Linear Discriminant Analysis,0.7372,0.8869,0.7372,0.7377,0.7358,0.5762,0.5775,0.034


## Feature 추가 & Feature Selection

변수들끼리 사칙연산을 통해 새 Feature들을 생성하고 top N개만 사용하여 모델링

새 변수들 생성 후 결과:  
CV F1 Score (all features) : 0.76 그러나 편차가 심해보임 (0.79 ~ 0.71)  
-> Pycaret highest F1 Score : 0.7559

In [122]:
def feature_transform(data):
    data['lesion_2'] = data['lesion_2'].apply(lambda x: 1 if x > 0 else 0)
    data["deviation_from_normal_temp"] = data["rectal_temp"].apply(lambda x: abs(x - 37.8))
    
    return data

In [123]:
df_train = train.copy()
df_test = test.copy()

df_train = feature_transform(df_train)
df_test = feature_transform(df_test)

tar_train, tar_test = target_encode(df_train, df_test, "outcome", categorical_features, alpha=0.5)
df_train = pd.concat([df_train.drop(categorical_features, axis=1), tar_train], axis=1) # concat with numerics

df_train = (KNNImputer(n_neighbors=11).set_output(transform="pandas")
            .fit_transform(df_train))

In [124]:
cols = df_train.columns.drop('outcome')
X = df_train.drop(['outcome'], axis=1)
y = df_train.outcome
for idx in range(0, len(cols) -1):
    col1 = cols[idx]
    col2 = cols[idx + 1]
    
    # 곱하기
    X[col1 + "*" + col2] = X[col1] + X[col2]
    
    # 나누기
    X[col1 + "/" + col2] = X[col1] / (X[col2] + 1e-6)
    
    # 빼기
    X[col1 + "-" + col2] = X[col1] - X[col2]
    
    # 더하기
    X[col1 + "+" + col2] = X[col1] + X[col2]

    

  X[col1 + "+" + col2] = X[col1] + X[col2]
  X[col1 + "*" + col2] = X[col1] + X[col2]
  X[col1 + "/" + col2] = X[col1] / (X[col2] + 1e-6)
  X[col1 + "-" + col2] = X[col1] - X[col2]
  X[col1 + "+" + col2] = X[col1] + X[col2]
  X[col1 + "*" + col2] = X[col1] + X[col2]
  X[col1 + "/" + col2] = X[col1] / (X[col2] + 1e-6)
  X[col1 + "-" + col2] = X[col1] - X[col2]
  X[col1 + "+" + col2] = X[col1] + X[col2]


In [127]:
from lightgbm import LGBMClassifier

lgbm = LGBMClassifier()
lgbm.fit(X, y)

scores = cross_val_score(lgbm, X, y,
                         cv=folds, scoring='f1_micro',
                         n_jobs=-1)

print(np.mean(scores))
print(scores)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001607 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14681
[LightGBM] [Info] Number of data points in the train set: 1534, number of used features: 134
[LightGBM] [Info] Start training from score -1.147370
[LightGBM] [Info] Start training from score -1.648659
[LightGBM] [Info] Start training from score -0.712898
0.7620489238040493
[0.79478827 0.71661238 0.76221498 0.79153094 0.74509804]


In [128]:
top_cols = pd.DataFrame(lgbm.feature_importances_, index=lgbm.feature_name_).sort_values(0, ascending=False).head(60).index

In [129]:
X = X[top_cols]
X['outcome'] = y

s = setup(data=X, target='outcome', normalize=True, fold=5, session_id=42)
compare_models()

Unnamed: 0,Description,Value
0,Session id,42
1,Target,outcome
2,Target type,Multiclass
3,Original data shape,"(1534, 61)"
4,Transformed data shape,"(1534, 61)"
5,Transformed train set shape,"(1073, 61)"
6,Transformed test set shape,"(461, 61)"
7,Numeric features,60
8,Preprocess,True
9,Imputation type,simple


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.7596,0.8978,0.7596,0.7619,0.7559,0.6067,0.6096,0.082
gbc,Gradient Boosting Classifier,0.7521,0.888,0.7521,0.7525,0.7495,0.5964,0.5982,0.96
rf,Random Forest Classifier,0.7503,0.8936,0.7503,0.7527,0.7453,0.5896,0.5934,0.096
lightgbm,Light Gradient Boosting Machine,0.7503,0.8915,0.7503,0.7519,0.7482,0.594,0.5959,0.398
catboost,CatBoost Classifier,0.7493,0.8937,0.7493,0.7516,0.7459,0.5904,0.5935,6.362
lr,Logistic Regression,0.7475,0.8848,0.7475,0.7456,0.7449,0.5898,0.5911,0.02
xgboost,Extreme Gradient Boosting,0.7475,0.8836,0.7475,0.748,0.7444,0.5875,0.5894,0.244
ridge,Ridge Classifier,0.7335,0.0,0.7335,0.7326,0.7278,0.5633,0.5666,0.01
lda,Linear Discriminant Analysis,0.7326,0.883,0.7326,0.733,0.731,0.5685,0.5698,0.016
ada,Ada Boost Classifier,0.6972,0.8412,0.6972,0.6986,0.6955,0.512,0.5135,0.08


## Submission

최종적으로는 Feature를 추가하고 상위 60개만 사용  
결과:  
(ExtraTree) Test Score는 0.7481로 V1보다는 상승했지만 실제 제출 후 score는 0.72로 하락하였음. ?
(LGBM) Test Score는 0.7590

In [16]:
def feature_transform(data):
    data['lesion_2'] = data['lesion_2'].apply(lambda x: 1 if x > 0 else 0)
    data["deviation_from_normal_temp"] = data["rectal_temp"].apply(lambda x: abs(x - 37.8))
    
    return data

def target_encode(train_data, test_data, target, columns, alpha=0.5):
    n_rows = train_data.shape[0]
    global_mean = train_data[target].mean()
    encoded_cols_train = []
    encoded_cols_test = []
    
    for col in columns:
        target_mean = train_data.groupby(col)[target].mean()
        smoothing = (target_mean * n_rows + global_mean * alpha) / (n_rows + alpha)
        
        encoded_col_train = train_data[col].map(smoothing)
        encoded_col_test = test_data[col].map(smoothing)
        
        encoded_cols_train.append(pd.DataFrame({col:encoded_col_train}))
        encoded_cols_test.append(pd.DataFrame({col:encoded_col_test}))
        
    return pd.concat(encoded_cols_train, axis=1), pd.concat(encoded_cols_test, axis=1)

def feature_engineering(data):
    cols = data.columns
    X = data.copy()

    for idx in range(0, len(cols) -1):
        col1 = cols[idx]
        col2 = cols[idx + 1]
        
        # 곱하기
        X[col1 + "*" + col2] = X[col1] + X[col2]
        
        # 나누기
        X[col1 + "/" + col2] = X[col1] / (X[col2] + 1e-6)
        
        # 빼기
        X[col1 + "-" + col2] = X[col1] - X[col2]
        
        # 더하기
        X[col1 + "+" + col2] = X[col1] + X[col2]
    
    return X


# get train, test
df_train = train.copy()
df_test = test.copy()

# Add New Features
df_train = feature_transform(df_train)
df_test = feature_transform(df_test)

# Encoding
tar_train, tar_test = target_encode(df_train, df_test, "outcome", columns = categorical_features, alpha=0.5)
df_train = pd.concat([df_train.drop(categorical_features, axis=1), tar_train], axis=1) # concat with numerics

df_test = pd.concat([df_test.drop(categorical_features, axis=1), tar_test], axis=1)

# Split data, target
y = df_train.outcome
df_train = df_train.drop(['outcome'], axis=1)

# Imputation
knn = KNNImputer(n_neighbors=11).set_output(transform="pandas")
df_train = knn.fit_transform(df_train)

df_test = knn.transform(df_test)

df_train = feature_engineering(df_train)
df_test = feature_engineering(df_test)


from lightgbm import LGBMClassifier

lgbm = LGBMClassifier()
folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
lgbm.fit(df_train, y)

scores = cross_val_score(lgbm, df_train, y,
                         cv=folds, scoring='f1_micro',
                         n_jobs=-1)

print(np.mean(scores))
print(scores)

top_cols = pd.DataFrame(lgbm.feature_importances_, index=lgbm.feature_name_).sort_values(0, ascending=False).head(60).index

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.003045 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 14676
[LightGBM] [Info] Number of data points in the train set: 1534, number of used features: 134
[LightGBM] [Info] Start training from score -1.147370
[LightGBM] [Info] Start training from score -1.648659
[LightGBM] [Info] Start training from score -0.712898
0.7666070554171722
[0.7980456  0.71661238 0.75895765 0.81758958 0.74183007]


In [17]:
X = df_train[top_cols]
X['outcome'] = y

s = setup(data=X, target='outcome', normalize=True, fold=5, session_id=42, verbose=False)
top1 = compare_models(sort='f1')

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.7568,0.899,0.7568,0.7589,0.7537,0.6037,0.606,0.12
rf,Random Forest Classifier,0.7568,0.8916,0.7568,0.76,0.7532,0.6017,0.6049,0.178
catboost,CatBoost Classifier,0.7549,0.8937,0.7549,0.7569,0.7521,0.6,0.6022,7.656
gbc,Gradient Boosting Classifier,0.7512,0.8893,0.7512,0.7523,0.7486,0.595,0.5971,1.752
xgboost,Extreme Gradient Boosting,0.7512,0.8866,0.7512,0.7514,0.7486,0.595,0.5969,0.306
lr,Logistic Regression,0.7438,0.8846,0.7438,0.7421,0.7414,0.5844,0.5856,0.088
lightgbm,Light Gradient Boosting Machine,0.7419,0.8907,0.7419,0.7434,0.7393,0.5799,0.5821,0.662
lda,Linear Discriminant Analysis,0.7326,0.8828,0.7326,0.7327,0.7308,0.568,0.5694,0.024
ridge,Ridge Classifier,0.7344,0.0,0.7344,0.7336,0.7286,0.5648,0.5682,0.018
ada,Ada Boost Classifier,0.7139,0.8463,0.7139,0.7129,0.7123,0.5367,0.5375,0.144


In [30]:
model = create_model('lightgbm')
tuned_model = tune_model(model, fold=10, optimize='f1', search_library='optuna')
predict_model(tuned_model)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7721,0.9185,0.7721,0.7708,0.771,0.6299,0.6304
1,0.7163,0.879,0.7163,0.7176,0.7058,0.5249,0.5319
2,0.707,0.8513,0.707,0.708,0.7064,0.5306,0.5315
3,0.7523,0.892,0.7523,0.753,0.7515,0.5993,0.6001
4,0.7617,0.9127,0.7617,0.7674,0.7617,0.6147,0.6165
Mean,0.7419,0.8907,0.7419,0.7434,0.7393,0.5799,0.5821
Std,0.0256,0.0243,0.0256,0.0258,0.0278,0.0437,0.0422


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7963,0.9196,0.7963,0.793,0.794,0.6689,0.6696
1,0.8241,0.9316,0.8241,0.828,0.8254,0.72,0.7207
2,0.7315,0.8476,0.7315,0.7523,0.7268,0.5559,0.5624
3,0.757,0.8905,0.757,0.7543,0.7522,0.5965,0.6001
4,0.757,0.8899,0.757,0.7616,0.7554,0.6073,0.6105
5,0.6729,0.8092,0.6729,0.6649,0.6675,0.4628,0.4637
6,0.757,0.8993,0.757,0.7564,0.7562,0.6067,0.6069
7,0.729,0.872,0.729,0.7295,0.7269,0.5638,0.5655
8,0.8037,0.9086,0.8037,0.8113,0.8023,0.6865,0.6912
9,0.8037,0.9087,0.8037,0.8043,0.8027,0.6839,0.6848


[I 2023-09-24 14:15:39,586] Searching the best hyperparameters using 1073 samples...
[I 2023-09-24 14:16:55,448] Finished hyperparameter search!


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Light Gradient Boosting Machine,0.7592,0.8923,0.7592,0.7595,0.759,0.6135,0.6137




Unnamed: 0,hospital_number*temp_of_extremities,packed_cell_volume/total_protein,hospital_number,deviation_from_normal_temp*hospital_number,lesion_1/deviation_from_normal_temp,abdomo_protein/lesion_1,nasogastric_reflux_ph/packed_cell_volume,hospital_number/temp_of_extremities,deviation_from_normal_temp-hospital_number,total_protein*abdomo_protein,...,nasogastric_reflux_ph,capillary_refill_time/pain,pain/peristalsis,deviation_from_normal_temp,lesion_1,pain,pain-peristalsis,outcome,prediction_label,prediction_score
601,1.560281,5.072463,0.562699,0.762699,6.999965e+03,0.002071,0.128571,0.564062,-0.362699,9.800000,...,4.500000,1.362350,0.807161,0.2,1400.0,0.987765,-0.235986,0,0,0.9481
840,2.597443,7.833332,1.599861,3.299861,3.594704e+03,0.000638,0.091489,1.603737,0.100139,9.900000,...,4.300000,1.432821,0.593600,1.7,6111.0,0.555757,-0.380491,2,0,0.9942
422,2.282359,7.428570,0.750138,0.750138,3.205000e+09,0.000624,0.019231,0.489575,-0.750138,9.000000,...,1.000000,1.362350,0.807161,0.0,3205.0,0.987765,-0.235986,1,1,0.8999
966,2.047622,7.230768,1.050040,1.550040,4.417991e+03,0.001856,0.095745,1.052584,-0.550040,10.600000,...,4.500000,2.421349,0.593600,0.5,2209.0,0.555757,-0.380491,0,0,0.6661
1414,1.664413,6.266666,0.666832,1.266832,3.833327e+03,0.001443,0.138298,0.668447,-0.066832,10.818182,...,6.500000,1.135559,0.573025,0.6,2300.0,0.701241,-0.522510,0,0,0.9641
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1320,2.412677,7.258063,0.857246,1.057246,1.999990e+03,0.008886,0.102424,0.551130,-0.657246,9.754545,...,4.609091,0.819010,0.877658,0.2,400.0,1.526201,-0.212745,2,2,0.9078
856,2.045836,0.792683,1.153852,2.953852,2.388887e+03,0.000465,0.030769,1.293579,0.646148,84.000000,...,2.000000,0.806163,1.055025,1.8,4300.0,0.987765,0.051518,2,1,0.7646
1442,2.747394,0.719101,1.749812,1.749812,3.205000e+09,0.000604,0.031960,1.754052,-1.749812,90.936363,...,2.045455,0.814279,0.799117,0.0,3205.0,0.977921,-0.245830,1,1,0.8684
503,2.960709,7.058823,1.428488,2.128488,3.154281e+03,0.001042,0.072917,0.932298,-0.728488,9.100000,...,3.500000,0.881720,1.247149,0.7,2208.0,1.526201,0.302450,0,2,0.7407


In [31]:
final_model = finalize_model(tuned_model)

X_test = df_test[top_cols]
prediction = predict_model(final_model, data = X_test)

decode_map = {
    0 : 'died',
    1 : 'euthanized',
    2 : 'lived'
}

sample_submission = pd.read_csv(config.SUBMISSION_FILE)
sample_submission['outcome'] = prediction['prediction_label'].map(decode_map)
sample_submission.to_csv("../output/sample_submission_V2(2nd).csv", index=False)



In [None]:
# models = {
#     'et' : create_model("et", fold=5),
#     'catboost' : create_model("catboost", fold=5),
#     'gbc' : create_model("gbc", fold=5),
#     'xgboost' : create_model("xgboost", fold=5),
#     'lightgbm' : create_model("lightgbm", fold=5)
# }


# for model_name, model in models.items():
#     try:
#         models[model_name] = tune_model(model, 
#                                         optimize="F1",
#                                         fold=5,
#                                         search_library="optuna")
        
#     except:
#         models[model_name] = tune_model(model, 
#                                         optimize="F1",
#                                         fold=5,
#                                         search_library="scikit-learn")
        


In [None]:
blended_models = blend_models(models, fold=5, optimize="f1")

In [None]:
def fit(X, y):
        n_rows = X.shape[0]
        columns = X.columns.values
        global_mean = y.mean()
        encoded_cols_X = []
        
        for col in columns:
            target_mean = X.groupby(col)[y.name].mean()
            smoothing = (target_mean * n_rows + global_mean * alpha) / (n_rows + alpha)
            self._smoothing[col] = smoothing
            
            encoded_col_X = X[col].map(smoothing)
            
            encoded_cols_X.append(pd.DataFrame({col:encoded_col_X}))
        
        return None

[]

In [16]:
train.outcome

0             died
1       euthanized
2            lived
3            lived
4            lived
           ...    
1230         lived
1231          died
1232         lived
1233         lived
1234         lived
Name: outcome, Length: 1235, dtype: object