In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from catboost import  CatBoostClassifier
from sklearn.preprocessing import  StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from imblearn.under_sampling import OneSidedSelection, TomekLinks
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from imblearn.combine import SMOTETomek, SMOTEENN

from lightgbm import LGBMClassifier
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import VotingClassifier
from pycaret.classification import *

In [3]:
train = pd.read_csv("./smhrd_data/Train.csv")
train.drop(['ID', 'Gender', 'Discount_offered'], axis=1, inplace=True)
# 결측치의 형태가 ?인 데이터를 NA로 대체
train = train.replace('?', pd.NA)

# Warehouse_block 컬럼 이름에 공백들어가 있어서 재설정
train.columns = ['Warehouse_block', 'Mode_of_Shipment', 'Customer_care_calls',
       'Customer_rating', 'Cost_of_the_Product', 'Prior_purchases',
       'Product_importance', 'Weight_in_gms', 'Reached.on.Time_Y.N']

# 연속형 변수중에서 이상치 제거 (개수가 많지 않을뿐더러 5개를 채우려고 하는 것은 비효율적이라고 생각하기 때문)
train = train[(train['Customer_rating']!=99) & (train['Cost_of_the_Product']!=9999)]

# Mode_of_Shipment에서 끝에 zk가 붙은 이상치 수정
train['Mode_of_Shipment'] = train['Mode_of_Shipment'].apply(lambda x: x.replace('zk', '') if pd.notna(x) else x)
train = train[train['Product_importance'].notnull()]

# Product_importance에서 끝에 m이 하나씩 더붙은 이상치 수정
train['Product_importance'] = train['Product_importance'].apply(lambda x: x[:-1] if x[-1]==x[-2] else x)

train['Weight_in_gms'] = train['Weight_in_gms'].apply(lambda x: int(x) if str(x).isdigit() else x)
train['Mode_of_Shipment'].fillna(' Ship', inplace=True)

def fill_calls(row) :
    if np.isnan(row['Customer_care_calls']):
        return pt1.loc[(row['Customer_rating'], row['Mode_of_Shipment'])]
    else:
        return row['Customer_care_calls']

pt1 = train[['Customer_care_calls', 'Customer_rating', 'Mode_of_Shipment']].groupby(['Customer_rating', 'Mode_of_Shipment']).agg(lambda x: x.mode().iloc[0])
train['Customer_care_calls'] = train.apply(fill_calls, axis = 1).astype('int64')

train['Weight_in_gms'].fillna(3424, inplace=True)

train['Weight_in_gms'] = train['Weight_in_gms'].apply(lambda x:int(x))

train['Prior_purchases'] = train['Prior_purchases'].apply(lambda x: 3 if pd.isnull(x) else x)
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6897 entries, 0 to 6998
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Warehouse_block      6897 non-null   object 
 1   Mode_of_Shipment     6897 non-null   object 
 2   Customer_care_calls  6897 non-null   int64  
 3   Customer_rating      6897 non-null   int64  
 4   Cost_of_the_Product  6897 non-null   int64  
 5   Prior_purchases      6897 non-null   float64
 6   Product_importance   6897 non-null   object 
 7   Weight_in_gms        6897 non-null   int64  
 8   Reached.on.Time_Y.N  6897 non-null   int64  
dtypes: float64(1), int64(5), object(3)
memory usage: 538.8+ KB


In [4]:
test = pd.read_csv("./smhrd_data/test.csv")

test.drop(['ID', 'Gender', 'Discount_offered'], axis=1, inplace=True)
# 결측치의 형태가 ?인 데이터를 NA로 대체
test = test.replace('?', pd.NA)

# Warehouse_block 컬럼 이름에 공백들어가 있어서 재설정
test.columns = ['Warehouse_block', 'Mode_of_Shipment', 'Customer_care_calls',
       'Customer_rating', 'Cost_of_the_Product', 'Prior_purchases',
       'Product_importance', 'Weight_in_gms']

# 연속형 변수중에서 이상치 제거 (개수가 많지 않을뿐더러 5개를 채우려고 하는 것은 비효율적이라고 생각하기 때문)
# test = test[(test['Customer_rating']!=99) & (test['Cost_of_the_Product']!=9999)]

# Mode_of_Shipment에서 끝에 zk가 붙은 이상치 수정
test['Mode_of_Shipment'] = test['Mode_of_Shipment'].apply(lambda x: x.replace('zk', '') if pd.notna(x) else x)
test['Product_importance'].fillna('low', inplace=True)

# Product_importance에서 끝에 m이 하나씩 더붙은 이상치 수정
test['Product_importance'] = test['Product_importance'].apply(lambda x: x[:-1] if x[-1]==x[-2] else x)

test['Weight_in_gms'] = test['Weight_in_gms'].apply(lambda x: int(x) if str(x).isdigit() else x)
test['Mode_of_Shipment'].fillna(' Ship', inplace=True)

def fill_calls(row) :
    if np.isnan(row['Customer_care_calls']):
        return pt1.loc[(row['Customer_rating'], row['Mode_of_Shipment'])]
    else:
        return row['Customer_care_calls']

pt1 = train[['Customer_care_calls', 'Customer_rating', 'Mode_of_Shipment']].groupby(['Customer_rating', 'Mode_of_Shipment']).agg(lambda x: x.mode().iloc[0])
test['Customer_care_calls'] = test.apply(fill_calls, axis = 1).astype('int64')

test['Weight_in_gms'].fillna(3424, inplace=True)

test['Weight_in_gms'] = test['Weight_in_gms'].apply(lambda x:int(x))

test['Prior_purchases'] = test['Prior_purchases'].apply(lambda x: 3 if pd.isnull(x) else x)
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Warehouse_block      4000 non-null   object 
 1   Mode_of_Shipment     4000 non-null   object 
 2   Customer_care_calls  4000 non-null   int64  
 3   Customer_rating      4000 non-null   int64  
 4   Cost_of_the_Product  4000 non-null   int64  
 5   Prior_purchases      4000 non-null   float64
 6   Product_importance   4000 non-null   object 
 7   Weight_in_gms        4000 non-null   int64  
dtypes: float64(1), int64(4), object(3)
memory usage: 250.1+ KB


In [5]:
input = train.iloc[:,:-1]
target = train.iloc[:,-1]

In [12]:
exp_clf = setup(data = input, target = target, session_id=123, normalize_method='minmax',  fix_imbalance = True)
# 65.61
best_model = compare_models()


Unnamed: 0,Description,Value
0,Session id,123
1,Target,Reached.on.Time_Y.N
2,Target type,Binary
3,Original data shape,"(6897, 9)"
4,Transformed data shape,"(7810, 17)"
5,Transformed train set shape,"(5740, 17)"
6,Transformed test set shape,"(2070, 17)"
7,Numeric features,5
8,Categorical features,3
9,Preprocess,True


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.6613,0.7299,0.5826,0.7937,0.6715,0.3383,0.3561,0.73
ada,Ada Boost Classifier,0.6517,0.7266,0.67,0.7237,0.6957,0.29,0.2913,0.669
lightgbm,Light Gradient Boosting Machine,0.6462,0.7216,0.6488,0.7273,0.6856,0.2839,0.2866,0.803
catboost,CatBoost Classifier,0.6462,0.7247,0.6279,0.7383,0.6785,0.2907,0.2956,1.421
rf,Random Forest Classifier,0.646,0.7205,0.6714,0.7157,0.6928,0.276,0.2769,0.718
xgboost,Extreme Gradient Boosting,0.6408,0.7217,0.6718,0.7088,0.6896,0.264,0.2646,0.693
et,Extra Trees Classifier,0.636,0.6974,0.7,0.6918,0.6957,0.2428,0.243,0.723
knn,K Neighbors Classifier,0.635,0.7041,0.599,0.7386,0.6611,0.2746,0.2821,0.641
dt,Decision Tree Classifier,0.6331,0.6201,0.6889,0.6926,0.6906,0.2398,0.2399,0.641
lr,Logistic Regression,0.6198,0.6639,0.5944,0.718,0.6499,0.2419,0.2474,0.713


In [16]:
# 방법 1
ada = create_model('ada')
tuned_ada = tune_model(ada, search_library='optuna' )
gbc = create_model('gbc')
tuned_gbc = tune_model(gbc, search_library='optuna' )

blender_2 = blend_models(estimator_list = [tuned_ada, tuned_gbc])

final_model = finalize_model(blender_2)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6708,0.7375,0.6551,0.7581,0.7028,0.3383,0.3428
1,0.6708,0.7323,0.7073,0.7302,0.7186,0.3223,0.3226
2,0.617,0.6963,0.6411,0.6917,0.6655,0.219,0.2199
3,0.6294,0.7031,0.6446,0.7061,0.674,0.2468,0.2481
4,0.6149,0.6953,0.5958,0.7095,0.6477,0.2301,0.2344
5,0.6625,0.7272,0.6725,0.7366,0.7031,0.3141,0.3158
6,0.6936,0.7557,0.7073,0.7603,0.7329,0.3747,0.3761
7,0.6826,0.7679,0.6899,0.7557,0.7213,0.3544,0.3564
8,0.6349,0.7084,0.6551,0.7094,0.6812,0.2556,0.2567
9,0.6432,0.7257,0.6551,0.7203,0.6861,0.2748,0.2765


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6874,0.7022,0.6237,0.8063,0.7033,0.3841,0.3984
1,0.6874,0.6957,0.6516,0.7857,0.7124,0.3764,0.3844
2,0.6253,0.6305,0.6028,0.7208,0.6565,0.2514,0.2563
3,0.6563,0.6647,0.6202,0.7574,0.682,0.3161,0.3236
4,0.6439,0.6542,0.5993,0.7511,0.6667,0.2947,0.3034
5,0.6708,0.6826,0.6202,0.7807,0.6913,0.3485,0.3591
6,0.6915,0.6984,0.662,0.7851,0.7183,0.3828,0.3896
7,0.7012,0.7089,0.669,0.7967,0.7273,0.4025,0.41
8,0.639,0.6443,0.6167,0.7344,0.6705,0.278,0.2832
9,0.6535,0.6663,0.5993,0.7679,0.6732,0.3163,0.3273


[I 2023-08-10 11:27:09,572] Searching the best hyperparameters using 4827 samples...
[I 2023-08-10 11:28:58,758] Finished hyperparemeter search!


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6708,0.7622,0.5679,0.8232,0.6722,0.3632,0.3888
1,0.6687,0.7459,0.5889,0.8009,0.6787,0.3529,0.3708
2,0.6232,0.673,0.5679,0.7376,0.6417,0.2582,0.2681
3,0.6729,0.7377,0.5889,0.8086,0.6815,0.3619,0.3813
4,0.6522,0.7129,0.547,0.8051,0.6515,0.3287,0.3534
5,0.6646,0.7048,0.5854,0.7962,0.6747,0.3448,0.3623
6,0.6729,0.7496,0.6307,0.7768,0.6962,0.3501,0.359
7,0.695,0.7473,0.5993,0.8431,0.7006,0.4074,0.4323
8,0.6411,0.7036,0.5784,0.7615,0.6574,0.295,0.3074
9,0.6784,0.7479,0.5889,0.8204,0.6856,0.3742,0.396


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6729,0.7809,0.5226,0.8772,0.655,0.3799,0.4266
1,0.6936,0.753,0.5784,0.8601,0.6917,0.4095,0.4418
2,0.6377,0.6856,0.5157,0.8043,0.6285,0.3065,0.3357
3,0.6957,0.745,0.5749,0.8684,0.6918,0.4148,0.4497
4,0.6377,0.7026,0.5017,0.8182,0.622,0.3106,0.3454
5,0.6708,0.7135,0.5401,0.8516,0.661,0.3708,0.4077
6,0.6791,0.7559,0.5749,0.8333,0.6804,0.3792,0.4059
7,0.6888,0.7572,0.5366,0.9006,0.6725,0.4103,0.461
8,0.6598,0.6987,0.5366,0.8324,0.6525,0.3484,0.3811
9,0.6784,0.7386,0.5366,0.875,0.6652,0.3883,0.432


[I 2023-08-10 11:29:24,795] Searching the best hyperparameters using 4827 samples...
[I 2023-08-10 11:32:05,852] Finished hyperparemeter search!


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6874,0.7836,0.6237,0.8063,0.7033,0.3841,0.3984
1,0.6874,0.7559,0.6516,0.7857,0.7124,0.3764,0.3844
2,0.6253,0.6849,0.6028,0.7208,0.6565,0.2514,0.2563
3,0.6563,0.7384,0.6202,0.7574,0.682,0.3161,0.3236
4,0.6439,0.7118,0.5993,0.7511,0.6667,0.2947,0.3034
5,0.6708,0.723,0.6202,0.7807,0.6913,0.3485,0.3591
6,0.6915,0.7653,0.662,0.7851,0.7183,0.3828,0.3896
7,0.7012,0.7675,0.669,0.7967,0.7273,0.4025,0.41
8,0.639,0.6992,0.6167,0.7344,0.6705,0.278,0.2832
9,0.6535,0.7297,0.5993,0.7679,0.6732,0.3163,0.3273


In [19]:
prediction = predict_model(tuned_gbc, data = test)
prediction

Unnamed: 0,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Weight_in_gms,prediction_label,prediction_score
0,F,Ship,3,1,274,3.0,high,4352,0,0.5365
1,F,Ship,3,4,136,2.0,medium,1056,1,0.6492
2,A,Flight,3,5,140,3.0,low,5383,0,0.5218
3,C,Ship,3,1,291,4.0,low,1880,1,0.5060
4,F,Ship,4,2,147,3.0,low,5174,0,0.5533
...,...,...,...,...,...,...,...,...,...,...
3995,A,Ship,4,1,204,4.0,low,1667,1,0.5812
3996,C,Ship,4,3,195,2.0,medium,3869,1,0.6381
3997,B,Flight,4,3,206,2.0,medium,4531,0,0.5247
3998,C,Ship,6,4,255,4.0,low,1869,0,0.5164


In [11]:
result = pd.read_csv('./smhrd_data/sampleSubmission.csv')
result['Reached.on.Time_Y.N'] = prediction['prediction_label']
result.to_csv('result_pycaret07.csv', index=False)


In [20]:
result = pd.read_csv('result_pycaret07.csv')
count = 0
for i in range(len(prediction['prediction_label'])):
    if result['Reached.on.Time_Y.N'][i]==prediction['prediction_label'][i]:
        count += 1
        
print(count)


3849


In [10]:
final_model

In [22]:
target.value_counts()

1    4101
0    2796
Name: Reached.on.Time_Y.N, dtype: int64