In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from catboost import  CatBoostClassifier
from sklearn.preprocessing import  StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from imblearn.under_sampling import OneSidedSelection, TomekLinks
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from imblearn.combine import SMOTETomek, SMOTEENN

from lightgbm import LGBMClassifier
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import VotingClassifier
from pycaret.classification import *

In [4]:
# 데이터 불러오기
train = pd.read_csv('./smhrd_data/Train.csv', index_col = 'ID')
test = pd.read_csv('./smhrd_data/test.csv', index_col = 'ID')

# ?를 널값으로 변환
train_df = train.replace('?', pd.NA)
test_df = test.replace('?', pd.NA)

# Gender 컬럼 삭제
train_df.drop('Gender', axis = 1, inplace = True)
test_df.drop('Gender', axis = 1, inplace = True)

# 이상치 제거
train_df = train_df[(train_df['Customer_rating']!=99) & (train_df['Cost_of_the_Product']!=9999)]

# 이상치 변경
test_df['Customer_rating'] = test_df['Customer_rating'].replace(99, 3.0)
test_df['Cost_of_the_Product'] = test_df['Cost_of_the_Product'].replace(9999, 213)

# Product_importance 오타 수정
train_df['Product_importance'] = train_df['Product_importance'].str.replace('mediumm', 'medium')
train_df['Product_importance'] = train_df['Product_importance'].str.replace('loww', 'low')
train_df['Product_importance'] = train_df['Product_importance'].str.replace('highh', 'high')

test_df['Product_importance'] = test_df['Product_importance'].str.replace('mediumm', 'medium')

# Mode_of_Shipment 오타 수정
train_df['Mode_of_Shipment'] = train_df['Mode_of_Shipment'].str.replace('Shipzk', 'Ship')
train_df['Mode_of_Shipment'] = train_df['Mode_of_Shipment'].str.replace('Flightzk', 'Flight')
train_df['Mode_of_Shipment'] = train_df['Mode_of_Shipment'].str.replace('Roadzk', 'Road')

test_df['Mode_of_Shipment'] = test_df['Mode_of_Shipment'].str.replace('Shipzk', 'Ship')
test_df['Mode_of_Shipment'] = test_df['Mode_of_Shipment'].str.replace('Roadzk', 'Road')

# Weight_in_gms 컬럼 int형 변환을 위해 널값을 '0'으로 변환
train_df['Weight_in_gms'] = train_df['Weight_in_gms'].fillna('0')
train_df['Weight_in_gms'] = train_df['Weight_in_gms'].astype('int64')

test_df['Weight_in_gms'] = test_df['Weight_in_gms'].fillna('0')
test_df['Weight_in_gms'] = test_df['Weight_in_gms'].astype('int64')

# Weight_in_gms 결측치 평균값으로 대체
train_df['Weight_in_gms'] = train_df['Weight_in_gms'].replace(0, 3424)
test_df['Weight_in_gms'] = test_df['Weight_in_gms'].replace(0, 3424)

# Discounted_offerd 결측치 평균값으로 채우기
train_df['Discount_offered'] = train_df['Discount_offered'].fillna(13.0)
test_df['Discount_offered'] = test_df['Discount_offered'].fillna(13.0)

# Mode_of_Shipment 결측치 최빈값으로 대체
train_df['Mode_of_Shipment'] = train_df['Mode_of_Shipment'].fillna('Ship')
test_df['Mode_of_Shipment'] = test_df['Mode_of_Shipment'].fillna('Ship')

# Prior_purchases 결측치 평균값으로 대체
train_df['Prior_purchases'] = train_df['Prior_purchases'].fillna(3.5)
test_df['Prior_purchases'] = test_df['Prior_purchases'].fillna(3.5)

# Customer_care_calls 결측치 최빈값이자 평균, 중위값으로 대체
train_df['Customer_care_calls'] = train_df['Customer_care_calls'].fillna(4.0)
test_df['Customer_care_calls'] = test_df['Customer_care_calls'].fillna(4.0)

# Product_importance 최빈값으로 대체
train_df['Product_importance'].fillna('low', inplace = True)
test_df['Product_importance'].fillna('low', inplace = True)

# 라벨 인코딩
from sklearn.preprocessing import LabelEncoder
object_columns = train_df.select_dtypes('object').columns

for i in object_columns:

    lb = LabelEncoder()
    lb.fit(train_df[i])
    train_df[i] = lb.transform(train_df[i])
    
    print(f'category : {np.unique(train_df[i])}\nclasses : {lb.classes_}\n')

for i in object_columns:

    lb = LabelEncoder()
    lb.fit(test_df[i])
    test_df[i] = lb.transform(test_df[i])
    
    print(f'category : {np.unique(test_df[i])}\nclasses : {lb.classes_}\n')
    
# ship 공백 제거
train_df['Mode_of_Shipment'] = train_df['Mode_of_Shipment'].replace(3, 2)
test_df['Mode_of_Shipment'] = test_df['Mode_of_Shipment'].replace(3, 2)

X_train = train_df.drop('Reached.on.Time_Y.N', axis = 1)    
y_train = train_df['Reached.on.Time_Y.N']    
X_test = test_df

category : [0 1 2 3 4]
classes : ['A' 'B' 'C' 'D' 'F']

category : [0 1 2 3]
classes : [' Flight' ' Road' ' Ship' 'Ship']

category : [0 1 2]
classes : ['high' 'low' 'medium']

category : [0 1 2 3 4]
classes : ['A' 'B' 'C' 'D' 'F']

category : [0 1 2 3]
classes : [' Flight' ' Road' ' Ship' 'Ship']

category : [0 1 2]
classes : ['high' 'low' 'medium']



In [5]:
exp_clf = setup(data = X_train, target = y_train, session_id=123, normalize_method='minmax', 
                categorical_features=['Warehouse_block ', 'Mode_of_Shipment', 'Product_importance'])
# 65.61
best_model = compare_models()

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Reached.on.Time_Y.N
2,Target type,Binary
3,Original data shape,"(6994, 10)"
4,Transformed data shape,"(6994, 18)"
5,Transformed train set shape,"(4895, 18)"
6,Transformed test set shape,"(2099, 18)"
7,Numeric features,6
8,Categorical features,3
9,Preprocess,True


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.6635,0.7289,0.6,0.7839,0.6791,0.3383,0.3522,1.255
catboost,CatBoost Classifier,0.6515,0.7272,0.6309,0.7443,0.6825,0.3019,0.3074,1.294
lightgbm,Light Gradient Boosting Machine,0.6502,0.7302,0.6491,0.7327,0.6878,0.2933,0.2966,1.25
rf,Random Forest Classifier,0.6443,0.7242,0.6577,0.7203,0.6871,0.2769,0.2789,1.261
xgboost,Extreme Gradient Boosting,0.6437,0.7238,0.6704,0.7133,0.6909,0.2712,0.2722,1.264
et,Extra Trees Classifier,0.6433,0.706,0.6952,0.7018,0.6981,0.2621,0.2626,1.293
knn,K Neighbors Classifier,0.6431,0.7178,0.6804,0.7079,0.6937,0.2665,0.2669,1.448
ada,Ada Boost Classifier,0.6394,0.7243,0.6804,0.7039,0.6916,0.2577,0.2583,1.271
ridge,Ridge Classifier,0.6357,0.0,0.7577,0.6715,0.7119,0.2213,0.2246,1.245
lda,Linear Discriminant Analysis,0.6347,0.7024,0.7536,0.6718,0.7102,0.2204,0.2233,1.245


In [7]:
# 방법 1
catboost = create_model('catboost')
tuned_catboost = tune_model(catboost, search_library='optuna', optimize='f1' )
gbc = create_model('gbc')
tuned_gbc = tune_model(gbc, search_library='optuna' , optimize='f1')
lightgbm = create_model('lightgbm')
tuned_lightgbm = tune_model(lightgbm, search_library='optuna', optimize='f1' )
rf = create_model('rf')
tuned_rf = tune_model(rf, search_library='optuna' , optimize='f1')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6367,0.7214,0.6495,0.7132,0.6799,0.2622,0.2637
1,0.649,0.7361,0.6117,0.7511,0.6742,0.3022,0.3098
2,0.6429,0.7053,0.5842,0.7589,0.6602,0.297,0.3084
3,0.6776,0.7607,0.6838,0.7509,0.7158,0.3451,0.3471
4,0.6694,0.7498,0.6632,0.751,0.7044,0.3326,0.3359
5,0.6789,0.7345,0.6392,0.7815,0.7032,0.3611,0.3698
6,0.6585,0.7237,0.6495,0.7441,0.6936,0.3119,0.3156
7,0.636,0.7255,0.6289,0.7233,0.6728,0.2671,0.2705
8,0.6217,0.7038,0.5876,0.7246,0.649,0.2483,0.2548
9,0.6442,0.7112,0.6117,0.7448,0.6717,0.2914,0.2981


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.649,0.7211,0.6357,0.7371,0.6827,0.2947,0.2988
1,0.6653,0.7217,0.6357,0.7613,0.6929,0.3316,0.3382
2,0.6347,0.6908,0.6186,0.7258,0.6679,0.2677,0.2719
3,0.7102,0.7611,0.6838,0.7992,0.737,0.4186,0.425
4,0.6776,0.727,0.6598,0.7649,0.7085,0.3521,0.357
5,0.6646,0.718,0.6529,0.751,0.6985,0.3248,0.3288
6,0.6544,0.7179,0.6495,0.7383,0.691,0.3026,0.3058
7,0.6748,0.7137,0.6701,0.7558,0.7104,0.3428,0.346
8,0.6258,0.6769,0.5842,0.7328,0.6501,0.2587,0.2665
9,0.6442,0.6972,0.6357,0.7312,0.6801,0.2836,0.2871


[I 2023-08-10 12:33:05,857] Searching the best hyperparameters using 4895 samples...
[I 2023-08-10 12:34:33,308] Finished hyperparemeter search!


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6347,0.7165,0.5876,0.7435,0.6564,0.2777,0.2865
1,0.6959,0.7498,0.6117,0.8318,0.705,0.4059,0.4265
2,0.649,0.7116,0.5533,0.7931,0.6518,0.3199,0.3412
3,0.6939,0.7576,0.6495,0.7975,0.7159,0.3915,0.4012
4,0.6878,0.7443,0.6426,0.7924,0.7097,0.3798,0.3896
5,0.6626,0.734,0.6048,0.7788,0.6809,0.3347,0.3469
6,0.6483,0.7262,0.622,0.7449,0.6779,0.2973,0.3032
7,0.6789,0.7266,0.622,0.7939,0.6975,0.366,0.3785
8,0.6421,0.7062,0.5498,0.7843,0.6465,0.3061,0.3261
9,0.6421,0.7165,0.5567,0.7788,0.6493,0.304,0.3221


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5939,0.7205,1.0,0.5939,0.7452,0.0,0.0
1,0.5939,0.7584,1.0,0.5939,0.7452,0.0,0.0
2,0.5939,0.7099,1.0,0.5939,0.7452,0.0,0.0
3,0.5939,0.7664,1.0,0.5939,0.7452,0.0,0.0
4,0.5939,0.7457,1.0,0.5939,0.7452,0.0,0.0
5,0.5951,0.7318,1.0,0.5951,0.7462,0.0,0.0
6,0.5951,0.7303,1.0,0.5951,0.7462,0.0,0.0
7,0.5951,0.7367,1.0,0.5951,0.7462,0.0,0.0
8,0.5951,0.7101,1.0,0.5951,0.7462,0.0,0.0
9,0.5951,0.7195,1.0,0.5951,0.7462,0.0,0.0


[I 2023-08-10 12:35:21,466] Searching the best hyperparameters using 4895 samples...
[I 2023-08-10 12:37:47,552] Finished hyperparemeter search!


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6388,0.7169,0.6632,0.7096,0.6856,0.2623,0.2631
1,0.649,0.7348,0.6289,0.7409,0.6803,0.2969,0.3018
2,0.6388,0.7229,0.5979,0.7436,0.6629,0.2836,0.2915
3,0.6939,0.7632,0.7079,0.7601,0.7331,0.3753,0.3766
4,0.6673,0.7697,0.6907,0.7336,0.7115,0.3196,0.3204
5,0.6708,0.7381,0.6598,0.7559,0.7046,0.3366,0.3406
6,0.6564,0.7341,0.6632,0.7338,0.6968,0.3029,0.3049
7,0.6196,0.7155,0.6598,0.6882,0.6737,0.2183,0.2186
8,0.6442,0.7111,0.6151,0.7427,0.6729,0.2903,0.2965
9,0.6237,0.6958,0.6048,0.7184,0.6567,0.2472,0.2516


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5939,0.7206,1.0,0.5939,0.7452,0.0,0.0
1,0.5939,0.7326,1.0,0.5939,0.7452,0.0,0.0
2,0.5939,0.6936,1.0,0.5939,0.7452,0.0,0.0
3,0.5939,0.7688,1.0,0.5939,0.7452,0.0,0.0
4,0.5939,0.7365,1.0,0.5939,0.7452,0.0,0.0
5,0.5951,0.7225,1.0,0.5951,0.7462,0.0,0.0
6,0.5951,0.7558,1.0,0.5951,0.7462,0.0,0.0
7,0.5951,0.7247,1.0,0.5951,0.7462,0.0,0.0
8,0.5951,0.7339,1.0,0.5951,0.7462,0.0,0.0
9,0.5951,0.7109,1.0,0.5951,0.7462,0.0,0.0


[I 2023-08-10 12:38:33,525] Searching the best hyperparameters using 4895 samples...
[I 2023-08-10 12:40:18,584] Finished hyperparemeter search!


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6224,0.7094,0.6564,0.692,0.6737,0.2265,0.227
1,0.649,0.7328,0.646,0.7315,0.6861,0.2914,0.2943
2,0.6163,0.6926,0.5911,0.7137,0.6466,0.235,0.24
3,0.6531,0.7466,0.6873,0.7168,0.7018,0.2876,0.2879
4,0.6673,0.7644,0.6907,0.7336,0.7115,0.3196,0.3204
5,0.683,0.7226,0.6632,0.772,0.7135,0.3633,0.3686
6,0.6626,0.7538,0.6976,0.725,0.711,0.306,0.3063
7,0.6421,0.7258,0.6942,0.7014,0.6978,0.2592,0.2592
8,0.6115,0.6897,0.622,0.6935,0.6558,0.2128,0.2145
9,0.636,0.7041,0.6289,0.7233,0.6728,0.2671,0.2705


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.649,0.7177,0.6357,0.7371,0.6827,0.2947,0.2988
1,0.6653,0.7258,0.6323,0.7635,0.6917,0.3327,0.3398
2,0.6347,0.6913,0.6186,0.7258,0.6679,0.2677,0.2719
3,0.7082,0.7743,0.6804,0.7984,0.7347,0.415,0.4216
4,0.6755,0.7349,0.6529,0.7661,0.705,0.3495,0.3551
5,0.6646,0.7261,0.6529,0.751,0.6985,0.3248,0.3288
6,0.6544,0.7263,0.6495,0.7383,0.691,0.3026,0.3058
7,0.6728,0.7365,0.6667,0.7549,0.708,0.3392,0.3426
8,0.6217,0.7014,0.5773,0.7304,0.6449,0.2518,0.2598
9,0.6442,0.7099,0.6357,0.7312,0.6801,0.2836,0.2871


[I 2023-08-10 12:41:07,337] Searching the best hyperparameters using 4895 samples...
[I 2023-08-10 12:43:12,825] Finished hyperparemeter search!


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5939,0.7289,1.0,0.5939,0.7452,0.0,0.0
1,0.5939,0.7206,1.0,0.5939,0.7452,0.0,0.0
2,0.5939,0.7017,1.0,0.5939,0.7452,0.0,0.0
3,0.5939,0.7757,1.0,0.5939,0.7452,0.0,0.0
4,0.5939,0.7342,1.0,0.5939,0.7452,0.0,0.0
5,0.5951,0.731,1.0,0.5951,0.7462,0.0,0.0
6,0.5951,0.7395,1.0,0.5951,0.7462,0.0,0.0
7,0.5951,0.7326,1.0,0.5951,0.7462,0.0,0.0
8,0.5951,0.7158,1.0,0.5951,0.7462,0.0,0.0
9,0.5951,0.7188,1.0,0.5951,0.7462,0.0,0.0


In [8]:
blender_3 = blend_models(estimator_list = [tuned_catboost, gbc, tuned_rf])

final_model = finalize_model(blender_3)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6449,0.7277,0.6323,0.7331,0.679,0.2865,0.2904
1,0.6714,0.7448,0.6426,0.7664,0.6991,0.3434,0.3498
2,0.6388,0.7117,0.622,0.7298,0.6716,0.2759,0.2803
3,0.7102,0.772,0.6942,0.7922,0.7399,0.4159,0.4206
4,0.6816,0.7416,0.6735,0.7626,0.7153,0.3574,0.3609
5,0.6687,0.7386,0.6667,0.749,0.7055,0.3299,0.3328
6,0.6503,0.7281,0.6495,0.7326,0.6885,0.2932,0.296
7,0.6769,0.7339,0.677,0.7548,0.7138,0.3454,0.3481
8,0.6237,0.6992,0.5842,0.7296,0.6489,0.2541,0.2614
9,0.6524,0.7264,0.646,0.7373,0.6886,0.299,0.3023
