In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from catboost import  CatBoostClassifier
from sklearn.preprocessing import  StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from imblearn.under_sampling import OneSidedSelection, TomekLinks
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from imblearn.combine import SMOTETomek, SMOTEENN

from lightgbm import LGBMClassifier
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import VotingClassifier
from pycaret.classification import *

In [7]:
# 데이터 불러오기
train = pd.read_csv('./smhrd_data/Train.csv', index_col = 'ID')
test = pd.read_csv('./smhrd_data/test.csv', index_col = 'ID')

# ?를 널값으로 변환
train_df = train.replace('?', pd.NA)
test_df = test.replace('?', pd.NA)

# Gender 컬럼 삭제
train_df.drop('Gender', axis = 1, inplace = True)
test_df.drop('Gender', axis = 1, inplace = True)

# 이상치 제거
train_df = train_df[(train_df['Customer_rating']!=99) & (train_df['Cost_of_the_Product']!=9999)]

# 이상치 변경
test_df['Customer_rating'] = test_df['Customer_rating'].replace(99, 3.0)
test_df['Cost_of_the_Product'] = test_df['Cost_of_the_Product'].replace(9999, 213)

# Product_importance 오타 수정
train_df['Product_importance'] = train_df['Product_importance'].str.replace('mediumm', 'medium')
train_df['Product_importance'] = train_df['Product_importance'].str.replace('loww', 'low')
train_df['Product_importance'] = train_df['Product_importance'].str.replace('highh', 'high')

test_df['Product_importance'] = test_df['Product_importance'].str.replace('mediumm', 'medium')

# Mode_of_Shipment 오타 수정
train_df['Mode_of_Shipment'] = train_df['Mode_of_Shipment'].str.replace('Shipzk', 'Ship')
train_df['Mode_of_Shipment'] = train_df['Mode_of_Shipment'].str.replace('Flightzk', 'Flight')
train_df['Mode_of_Shipment'] = train_df['Mode_of_Shipment'].str.replace('Roadzk', 'Road')

test_df['Mode_of_Shipment'] = test_df['Mode_of_Shipment'].str.replace('Shipzk', 'Ship')
test_df['Mode_of_Shipment'] = test_df['Mode_of_Shipment'].str.replace('Roadzk', 'Road')

# Weight_in_gms 컬럼 int형 변환을 위해 널값을 '0'으로 변환
train_df['Weight_in_gms'] = train_df['Weight_in_gms'].fillna('0')
train_df['Weight_in_gms'] = train_df['Weight_in_gms'].astype('int64')

test_df['Weight_in_gms'] = test_df['Weight_in_gms'].fillna('0')
test_df['Weight_in_gms'] = test_df['Weight_in_gms'].astype('int64')

# Weight_in_gms 결측치 평균값으로 대체
train_df['Weight_in_gms'] = train_df['Weight_in_gms'].replace(0, 3424)
test_df['Weight_in_gms'] = test_df['Weight_in_gms'].replace(0, 3424)

# Mode_of_Shipment 결측치 최빈값으로 대체
train_df['Mode_of_Shipment'] = train_df['Mode_of_Shipment'].fillna(' Ship')
test_df['Mode_of_Shipment'] = test_df['Mode_of_Shipment'].fillna(' Ship')

X_train = train_df.drop('Reached.on.Time_Y.N', axis = 1)    
y_train = train_df['Reached.on.Time_Y.N']    
X_test = test_df

In [8]:
exp_clf = setup(data = X_train, target = y_train, session_id=123, normalize_method='minmax', 
                categorical_features=['Warehouse_block ', 'Mode_of_Shipment', 'Product_importance'])
# 65.61
best_model = compare_models()

# 방법 1
gbc = create_model('gbc')
tuned_gbc = tune_model(gbc, search_library='optuna', optimize='f1')
xgboost = create_model('xgboost')
tuned_xgboost = tune_model(xgboost, search_library='optuna', optimize='f1')
lightgbm = create_model('lightgbm')
tuned_lightgbm = tune_model(lightgbm, search_library='optuna', optimize='f1' )
rf = create_model('rf')
tuned_rf = tune_model(rf, search_library='optuna', optimize='f1' )

blender_4 = blend_models(estimator_list = [tuned_gbc, tuned_xgboost, tuned_lightgbm, tuned_rf])

final_model = finalize_model(blender_4)

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Reached.on.Time_Y.N
2,Target type,Binary
3,Original data shape,"(6994, 10)"
4,Transformed data shape,"(6994, 18)"
5,Transformed train set shape,"(4895, 18)"
6,Transformed test set shape,"(2099, 18)"
7,Numeric features,6
8,Categorical features,3
9,Rows with missing values,66.9%


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.665,0.7297,0.6,0.7865,0.68,0.3415,0.3558,1.01
rf,Random Forest Classifier,0.6523,0.7244,0.6595,0.7304,0.6922,0.2946,0.2975,1.028
lightgbm,Light Gradient Boosting Machine,0.6509,0.7307,0.6526,0.7322,0.6894,0.2935,0.2967,1.077
xgboost,Extreme Gradient Boosting,0.6502,0.7288,0.6715,0.721,0.6951,0.2861,0.2873,1.092
catboost,CatBoost Classifier,0.6472,0.7255,0.6351,0.7362,0.6812,0.2907,0.2953,1.386
knn,K Neighbors Classifier,0.6439,0.7189,0.6818,0.7083,0.6946,0.2679,0.2683,1.201
ada,Ada Boost Classifier,0.6404,0.7243,0.6818,0.7046,0.6926,0.2596,0.2602,0.989
et,Extra Trees Classifier,0.6394,0.7084,0.6938,0.6978,0.6954,0.2534,0.2538,1.009
lda,Linear Discriminant Analysis,0.6353,0.7023,0.7536,0.6724,0.7106,0.2219,0.2248,0.943
ridge,Ridge Classifier,0.6349,0.0,0.7574,0.6708,0.7114,0.2195,0.2228,0.952
