In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from catboost import  CatBoostClassifier
from sklearn.preprocessing import  StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from imblearn.under_sampling import OneSidedSelection, TomekLinks
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from imblearn.combine import SMOTETomek, SMOTEENN

from lightgbm import LGBMClassifier
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import VotingClassifier
from pycaret.classification import *

In [7]:
# 데이터 불러오기
train = pd.read_csv('./smhrd_data/Train.csv', index_col = 'ID')
test = pd.read_csv('./smhrd_data/test.csv', index_col = 'ID')

# ?를 널값으로 변환
train_df = train.replace('?', pd.NA)
test_df = test.replace('?', pd.NA)

# Gender 컬럼 삭제
train_df.drop('Gender', axis = 1, inplace = True)
test_df.drop('Gender', axis = 1, inplace = True)

# 이상치 제거
train_df = train_df[(train_df['Customer_rating']!=99) & (train_df['Cost_of_the_Product']!=9999)]

# 이상치 변경
test_df['Customer_rating'] = test_df['Customer_rating'].replace(99, 3.0)
test_df['Cost_of_the_Product'] = test_df['Cost_of_the_Product'].replace(9999, 213)

# Product_importance 오타 수정
train_df['Product_importance'] = train_df['Product_importance'].str.replace('mediumm', 'medium')
train_df['Product_importance'] = train_df['Product_importance'].str.replace('loww', 'low')
train_df['Product_importance'] = train_df['Product_importance'].str.replace('highh', 'high')

test_df['Product_importance'] = test_df['Product_importance'].str.replace('mediumm', 'medium')

# Mode_of_Shipment 오타 수정
train_df['Mode_of_Shipment'] = train_df['Mode_of_Shipment'].str.replace('Shipzk', 'Ship')
train_df['Mode_of_Shipment'] = train_df['Mode_of_Shipment'].str.replace('Flightzk', 'Flight')
train_df['Mode_of_Shipment'] = train_df['Mode_of_Shipment'].str.replace('Roadzk', 'Road')

test_df['Mode_of_Shipment'] = test_df['Mode_of_Shipment'].str.replace('Shipzk', 'Ship')
test_df['Mode_of_Shipment'] = test_df['Mode_of_Shipment'].str.replace('Roadzk', 'Road')

# Weight_in_gms 컬럼 int형 변환을 위해 널값을 '0'으로 변환
train_df['Weight_in_gms'] = train_df['Weight_in_gms'].fillna('0')
train_df['Weight_in_gms'] = train_df['Weight_in_gms'].astype('int64')

test_df['Weight_in_gms'] = test_df['Weight_in_gms'].fillna('0')
test_df['Weight_in_gms'] = test_df['Weight_in_gms'].astype('int64')

# Weight_in_gms 결측치 평균값으로 대체
train_df['Weight_in_gms'] = train_df['Weight_in_gms'].replace(0, 3424)
test_df['Weight_in_gms'] = test_df['Weight_in_gms'].replace(0, 3424)

# Mode_of_Shipment 결측치 최빈값으로 대체
train_df['Mode_of_Shipment'] = train_df['Mode_of_Shipment'].fillna(' Ship')
test_df['Mode_of_Shipment'] = test_df['Mode_of_Shipment'].fillna(' Ship')

X_train = train_df.drop('Reached.on.Time_Y.N', axis = 1)    
y_train = train_df['Reached.on.Time_Y.N']    
X_test = test_df

In [8]:
exp_clf = setup(data = X_train, target = y_train, session_id=123, normalize_method='minmax', 
                categorical_features=['Warehouse_block ', 'Mode_of_Shipment', 'Product_importance'])
# 65.61
best_model = compare_models()

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Reached.on.Time_Y.N
2,Target type,Binary
3,Original data shape,"(6994, 10)"
4,Transformed data shape,"(6994, 18)"
5,Transformed train set shape,"(4895, 18)"
6,Transformed test set shape,"(2099, 18)"
7,Numeric features,6
8,Categorical features,3
9,Rows with missing values,66.9%


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.665,0.7297,0.6,0.7865,0.68,0.3415,0.3558,1.01
rf,Random Forest Classifier,0.6523,0.7244,0.6595,0.7304,0.6922,0.2946,0.2975,1.028
lightgbm,Light Gradient Boosting Machine,0.6509,0.7307,0.6526,0.7322,0.6894,0.2935,0.2967,1.077
xgboost,Extreme Gradient Boosting,0.6502,0.7288,0.6715,0.721,0.6951,0.2861,0.2873,1.092
catboost,CatBoost Classifier,0.6472,0.7255,0.6351,0.7362,0.6812,0.2907,0.2953,1.386
knn,K Neighbors Classifier,0.6439,0.7189,0.6818,0.7083,0.6946,0.2679,0.2683,1.201
ada,Ada Boost Classifier,0.6404,0.7243,0.6818,0.7046,0.6926,0.2596,0.2602,0.989
et,Extra Trees Classifier,0.6394,0.7084,0.6938,0.6978,0.6954,0.2534,0.2538,1.009
lda,Linear Discriminant Analysis,0.6353,0.7023,0.7536,0.6724,0.7106,0.2219,0.2248,0.943
ridge,Ridge Classifier,0.6349,0.0,0.7574,0.6708,0.7114,0.2195,0.2228,0.952


In [12]:
# 방법 1
gbc = create_model('gbc')
tuned_gbc = tune_model(gbc, search_library='optuna', optimize='f1')
xgboost = create_model('xgboost')
tuned_xgboost = tune_model(xgboost, search_library='optuna', optimize='f1')
lightgbm = create_model('lightgbm')
tuned_lightgbm = tune_model(lightgbm, search_library='optuna', optimize='f1' )
rf = create_model('rf')
tuned_rf = tune_model(rf, search_library='optuna', optimize='f1' )

blender_4 = blend_models(estimator_list = [tuned_gbc, tuned_xgboost, tuned_lightgbm, tuned_rf])

final_model = finalize_model(blender_4)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6347,0.7039,0.5498,0.7692,0.6413,0.2895,0.3067
1,0.6714,0.7311,0.5739,0.8186,0.6747,0.3629,0.3865
2,0.6286,0.7008,0.5223,0.7795,0.6255,0.2846,0.3073
3,0.7163,0.7662,0.6151,0.8689,0.7203,0.4491,0.477
4,0.6755,0.7378,0.6048,0.8,0.6888,0.3632,0.3789
5,0.6544,0.7159,0.567,0.7933,0.6613,0.3279,0.3473
6,0.6585,0.7357,0.6048,0.7719,0.6782,0.3256,0.3367
7,0.6667,0.7157,0.5876,0.7991,0.6772,0.3488,0.3666
8,0.6462,0.7201,0.5464,0.795,0.6477,0.3161,0.3388
9,0.6564,0.7116,0.5533,0.809,0.6571,0.3364,0.3611


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6469,0.729,0.6082,0.75,0.6717,0.2987,0.3064
1,0.6776,0.7275,0.6323,0.783,0.6996,0.36,0.3696
2,0.6408,0.7063,0.5842,0.7556,0.6589,0.2925,0.3033
3,0.7163,0.7741,0.6838,0.8089,0.7412,0.4322,0.4397
4,0.6714,0.7298,0.6529,0.76,0.7024,0.3403,0.3452
5,0.6524,0.7171,0.6117,0.7574,0.6768,0.3098,0.3181
6,0.6421,0.722,0.6392,0.7266,0.6801,0.2778,0.2807
7,0.6667,0.7329,0.6392,0.7623,0.6953,0.3336,0.3399
8,0.6524,0.7023,0.6048,0.7619,0.6743,0.3119,0.3216
9,0.638,0.7243,0.6082,0.7375,0.6667,0.2786,0.2848


[I 2023-08-10 12:14:05,256] Searching the best hyperparameters using 4895 samples...
[I 2023-08-10 12:16:21,388] Finished hyperparemeter search!


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6531,0.7297,0.7182,0.7037,0.7109,0.2773,0.2774
1,0.6224,0.724,0.6392,0.6992,0.6679,0.2325,0.2338
2,0.6531,0.7195,0.6564,0.7318,0.692,0.2975,0.2998
3,0.6714,0.7451,0.7148,0.7273,0.721,0.3215,0.3216
4,0.6776,0.7561,0.701,0.7418,0.7208,0.3399,0.3407
5,0.6564,0.7476,0.6976,0.7173,0.7073,0.2917,0.2918
6,0.6544,0.7254,0.6804,0.7226,0.7009,0.2926,0.2933
7,0.6339,0.7124,0.6564,0.7074,0.6809,0.2531,0.2541
8,0.6176,0.6905,0.6151,0.7047,0.6569,0.2295,0.2322
9,0.6053,0.6796,0.646,0.6763,0.6608,0.1895,0.1898


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.5939,0.7125,1.0,0.5939,0.7452,0.0,0.0
1,0.5939,0.7339,1.0,0.5939,0.7452,0.0,0.0
2,0.5939,0.6838,1.0,0.5939,0.7452,0.0,0.0
3,0.5939,0.7382,1.0,0.5939,0.7452,0.0,0.0
4,0.5939,0.7392,1.0,0.5939,0.7452,0.0,0.0
5,0.5951,0.74,1.0,0.5951,0.7462,0.0,0.0
6,0.5951,0.7506,1.0,0.5951,0.7462,0.0,0.0
7,0.5951,0.7174,1.0,0.5951,0.7462,0.0,0.0
8,0.5951,0.7111,1.0,0.5951,0.7462,0.0,0.0
9,0.5951,0.7027,1.0,0.5951,0.7462,0.0,0.0


[I 2023-08-10 12:17:04,271] Searching the best hyperparameters using 4895 samples...
[I 2023-08-10 12:18:52,736] Finished hyperparemeter search!


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6531,0.7351,0.6701,0.7249,0.6964,0.2931,0.2944
1,0.6429,0.7348,0.6323,0.7302,0.6777,0.2819,0.2855
2,0.6245,0.704,0.5601,0.7443,0.6392,0.2637,0.2753
3,0.6531,0.7392,0.6907,0.7153,0.7028,0.2864,0.2867
4,0.6878,0.7721,0.7113,0.75,0.7302,0.3603,0.361
5,0.6605,0.733,0.6254,0.7615,0.6868,0.324,0.3315
6,0.6687,0.7502,0.6701,0.7471,0.7065,0.3288,0.3314
7,0.6789,0.7459,0.6529,0.7724,0.7076,0.3571,0.3633
8,0.6483,0.7023,0.6151,0.749,0.6755,0.2995,0.3065
9,0.6339,0.6994,0.6048,0.7333,0.6629,0.2704,0.2765


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6469,0.7258,0.7285,0.6928,0.7102,0.2592,0.2598
1,0.6245,0.7251,0.7079,0.6754,0.6913,0.2128,0.2132
2,0.6306,0.7059,0.6838,0.691,0.6874,0.2361,0.2361
3,0.651,0.7579,0.7388,0.6935,0.7155,0.2655,0.2663
4,0.6592,0.743,0.732,0.7053,0.7184,0.2872,0.2875
5,0.6339,0.7321,0.7216,0.6818,0.7012,0.2299,0.2305
6,0.6646,0.7643,0.7973,0.6884,0.7389,0.2772,0.2831
7,0.6299,0.7303,0.7182,0.6786,0.6978,0.2212,0.2218
8,0.6401,0.7155,0.7113,0.6923,0.7017,0.2483,0.2484
9,0.6299,0.7087,0.6598,0.7007,0.6796,0.2423,0.2429


[I 2023-08-10 12:19:38,658] Searching the best hyperparameters using 4895 samples...
[I 2023-08-10 12:21:28,030] Finished hyperparemeter search!


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6245,0.6922,0.646,0.6989,0.6714,0.2349,0.2359
1,0.6469,0.7231,0.6495,0.7269,0.686,0.2857,0.288
2,0.6245,0.6884,0.6082,0.7166,0.658,0.2478,0.2519
3,0.6776,0.7491,0.6873,0.7491,0.7168,0.344,0.3458
4,0.6735,0.7538,0.6701,0.7529,0.7091,0.3399,0.3429
5,0.6585,0.7199,0.6598,0.7385,0.6969,0.3086,0.3112
6,0.6871,0.7543,0.6942,0.7594,0.7253,0.3636,0.3656
7,0.6401,0.7147,0.6838,0.7032,0.6934,0.2579,0.2581
8,0.6278,0.6884,0.5979,0.728,0.6566,0.2588,0.2648
9,0.6135,0.6904,0.6151,0.6992,0.6545,0.22,0.2223


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6531,0.6928,0.6117,0.7574,0.6768,0.3114,0.3197
1,0.6633,0.7022,0.6151,0.7716,0.6845,0.3332,0.3431
2,0.6286,0.6679,0.5911,0.7319,0.654,0.2628,0.2698
3,0.702,0.7434,0.6564,0.8059,0.7235,0.4077,0.4179
4,0.6694,0.711,0.6357,0.7676,0.6955,0.3408,0.3481
5,0.6646,0.7019,0.6323,0.7635,0.6917,0.3311,0.3382
6,0.6667,0.6928,0.6289,0.7689,0.6919,0.3367,0.3448
7,0.6748,0.7155,0.6426,0.7727,0.7017,0.351,0.3582
8,0.6176,0.6521,0.5601,0.7342,0.6355,0.2483,0.2585
9,0.6421,0.6755,0.6151,0.7397,0.6717,0.2856,0.2915


[I 2023-08-10 12:22:14,982] Searching the best hyperparameters using 4895 samples...
[I 2023-08-10 12:24:28,145] Finished hyperparemeter search!


Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.649,0.7092,0.6082,0.7532,0.673,0.3033,0.3114
1,0.6816,0.7209,0.6357,0.7872,0.7034,0.3681,0.378
2,0.6306,0.7026,0.5704,0.7477,0.6472,0.274,0.2852
3,0.7041,0.7615,0.701,0.7786,0.7378,0.4004,0.4033
4,0.6735,0.7455,0.6529,0.7631,0.7037,0.3449,0.3501
5,0.6605,0.719,0.6323,0.7572,0.6891,0.3219,0.3282
6,0.6728,0.743,0.6529,0.7631,0.7037,0.3433,0.3485
7,0.6483,0.7185,0.6598,0.7245,0.6906,0.2851,0.2868
8,0.6299,0.6901,0.5739,0.7455,0.6485,0.2713,0.2818
9,0.6258,0.7084,0.5911,0.7288,0.6528,0.2564,0.2631
