In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from catboost import  CatBoostClassifier
from sklearn.preprocessing import  StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from imblearn.under_sampling import OneSidedSelection, TomekLinks
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from imblearn.combine import SMOTETomek, SMOTEENN
from lightgbm import LGBMClassifier
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import VotingClassifier
from pycaret.classification import *

In [3]:
# Load the training data and drop the columns that are not used as features.
train = pd.read_csv("./smhrd_data/Train.csv")
train = train.drop(columns=['ID', 'Gender', 'Discount_offered'])
# Missing values are encoded as '?' in the raw file; replace them with NA.
train = train.replace('?', pd.NA)

# Reset the column names because the raw 'Warehouse_block' header contains whitespace.
train.columns = ['Warehouse_block', 'Mode_of_Shipment', 'Customer_care_calls',
       'Customer_rating', 'Cost_of_the_Product', 'Prior_purchases',
       'Product_importance', 'Weight_in_gms', 'Reached.on.Time_Y.N']

# Drop the rows holding outlier values in the numeric columns (Customer_rating == 99,
# Cost_of_the_Product == 9999): there are only a few of them, so imputing was judged
# not worth the effort.
# .copy() detaches the filtered frame so the column assignments below write to a real
# frame instead of a slice of the original (avoids SettingWithCopyWarning).
train = train[(train['Customer_rating']!=99) & (train['Cost_of_the_Product']!=9999)].copy()

# Remove the stray 'zk' suffix found on some Mode_of_Shipment values; NA stays NA.
train['Mode_of_Shipment'] = train['Mode_of_Shipment'].apply(lambda x: x.replace('zk', '') if pd.notna(x) else x)
# Rows without a Product_importance value are dropped from the training set.
train = train[train['Product_importance'].notnull()].copy()

# Some Product_importance values carry a duplicated last character (an extra 'm');
# drop it. The length guard prevents an IndexError on one-character values.
train['Product_importance'] = train['Product_importance'].apply(
    lambda x: x[:-1] if len(x) > 1 and x[-1] == x[-2] else x
)

# Convert digit strings to int; anything else (e.g. NA) is left untouched for later imputation.
train['Weight_in_gms'] = train['Weight_in_gms'].apply(lambda x: int(x) if str(x).isdigit() else x)
# Assign the result back instead of `train[col].fillna(..., inplace=True)`: the chained
# in-place form operates on a temporary and does not update `train` under Copy-on-Write.
# NOTE(review): the fill value ' Ship' has a leading space - presumably it matches the
# raw category spelling; confirm against the data.
train['Mode_of_Shipment'] = train['Mode_of_Shipment'].fillna(' Ship')

def fill_calls(row) :
    """Return the row's Customer_care_calls, imputing it when it is missing.

    A missing value is replaced by the entry stored in the module-level lookup
    table ``pt1`` for the row's (Customer_rating, Mode_of_Shipment) pair.

    Parameters
    ----------
    row : pandas.Series
        One DataFrame row containing 'Customer_care_calls', 'Customer_rating'
        and 'Mode_of_Shipment'.

    Returns
    -------
    scalar
        The original value when present, otherwise the looked-up value.

    Raises
    ------
    KeyError
        If the (Customer_rating, Mode_of_Shipment) pair is not present in ``pt1``.
    """
    # pd.isna handles both float NaN and pd.NA (np.isnan does not accept pd.NA).
    if pd.isna(row['Customer_care_calls']):
        # Select the column explicitly so a scalar is returned; `pt1.loc[key]` alone
        # yields a one-element Series, which makes DataFrame.apply's result shape
        # depend on which row happens to come first.
        return pt1.loc[(row['Customer_rating'], row['Mode_of_Shipment']), 'Customer_care_calls']
    return row['Customer_care_calls']

# Lookup table: most frequent Customer_care_calls for every
# (Customer_rating, Mode_of_Shipment) pair. fill_calls reads it as a global.
pt1 = (
    train[['Customer_care_calls', 'Customer_rating', 'Mode_of_Shipment']]
    .groupby(['Customer_rating', 'Mode_of_Shipment'])
    .agg(lambda x: x.mode().iloc[0])
)
train['Customer_care_calls'] = train.apply(fill_calls, axis = 1).astype('int64')

# Assign the filled column back instead of `train[col].fillna(..., inplace=True)`:
# the chained in-place form works on a temporary and does not update `train`
# under Copy-on-Write.
# NOTE(review): 3424 (weight) and 3 (prior purchases) are hard-coded fill values -
# presumably typical values taken from earlier exploration; confirm their origin.
train['Weight_in_gms'] = train['Weight_in_gms'].fillna(3424)

train['Weight_in_gms'] = train['Weight_in_gms'].apply(lambda x:int(x))

train['Prior_purchases'] = train['Prior_purchases'].apply(lambda x: 3 if pd.isnull(x) else x)
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6897 entries, 0 to 6998
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Warehouse_block      6897 non-null   object 
 1   Mode_of_Shipment     6897 non-null   object 
 2   Customer_care_calls  6897 non-null   int64  
 3   Customer_rating      6897 non-null   int64  
 4   Cost_of_the_Product  6897 non-null   int64  
 5   Prior_purchases      6897 non-null   float64
 6   Product_importance   6897 non-null   object 
 7   Weight_in_gms        6897 non-null   int64  
 8   Reached.on.Time_Y.N  6897 non-null   int64  
dtypes: float64(1), int64(5), object(3)
memory usage: 538.8+ KB


In [4]:
# Load the test data and drop the columns that are not used as features.
test = pd.read_csv("./smhrd_data/test.csv")

test = test.drop(columns=['ID', 'Gender', 'Discount_offered'])
# Missing values are encoded as '?' in the raw file; replace them with NA.
test = test.replace('?', pd.NA)

# Reset the column names because the raw 'Warehouse_block' header contains whitespace.
test.columns = ['Warehouse_block', 'Mode_of_Shipment', 'Customer_care_calls',
       'Customer_rating', 'Cost_of_the_Product', 'Prior_purchases',
       'Product_importance', 'Weight_in_gms']

# Unlike the training set, outlier rows (Customer_rating == 99,
# Cost_of_the_Product == 9999) are NOT removed here: the predictions are later
# written into the sample submission row by row, so no test row may be dropped.

# Remove the stray 'zk' suffix found on some Mode_of_Shipment values; NA stays NA.
test['Mode_of_Shipment'] = test['Mode_of_Shipment'].apply(lambda x: x.replace('zk', '') if pd.notna(x) else x)
# Assign the result back instead of `test[col].fillna(..., inplace=True)`: the chained
# in-place form operates on a temporary and does not update `test` under Copy-on-Write.
test['Product_importance'] = test['Product_importance'].fillna('low')

# Some Product_importance values carry a duplicated last character (an extra 'm');
# drop it. The length guard prevents an IndexError on one-character values.
test['Product_importance'] = test['Product_importance'].apply(
    lambda x: x[:-1] if len(x) > 1 and x[-1] == x[-2] else x
)

# Convert digit strings to int; anything else (e.g. NA) is left untouched for later imputation.
test['Weight_in_gms'] = test['Weight_in_gms'].apply(lambda x: int(x) if str(x).isdigit() else x)
# NOTE(review): the fill value ' Ship' has a leading space - presumably it matches the
# raw category spelling; confirm against the data.
test['Mode_of_Shipment'] = test['Mode_of_Shipment'].fillna(' Ship')

def fill_calls(row) :
    """Return the row's Customer_care_calls, imputing it when it is missing.

    A missing value is replaced by the entry stored in the module-level lookup
    table ``pt1`` for the row's (Customer_rating, Mode_of_Shipment) pair.

    Parameters
    ----------
    row : pandas.Series
        One DataFrame row containing 'Customer_care_calls', 'Customer_rating'
        and 'Mode_of_Shipment'.

    Returns
    -------
    scalar
        The original value when present, otherwise the looked-up value.

    Raises
    ------
    KeyError
        If the (Customer_rating, Mode_of_Shipment) pair is not present in ``pt1``.
    """
    # pd.isna handles both float NaN and pd.NA (np.isnan does not accept pd.NA).
    if pd.isna(row['Customer_care_calls']):
        # Select the column explicitly so a scalar is returned; `pt1.loc[key]` alone
        # yields a one-element Series, which makes DataFrame.apply's result shape
        # depend on which row happens to come first.
        return pt1.loc[(row['Customer_rating'], row['Mode_of_Shipment']), 'Customer_care_calls']
    return row['Customer_care_calls']

# Lookup table built from the TRAINING data: most frequent Customer_care_calls for
# every (Customer_rating, Mode_of_Shipment) pair. fill_calls reads it as a global,
# so the test set is imputed with statistics learned from train only.
pt1 = (
    train[['Customer_care_calls', 'Customer_rating', 'Mode_of_Shipment']]
    .groupby(['Customer_rating', 'Mode_of_Shipment'])
    .agg(lambda x: x.mode().iloc[0])
)
test['Customer_care_calls'] = test.apply(fill_calls, axis = 1).astype('int64')

# Assign the filled column back instead of `test[col].fillna(..., inplace=True)`:
# the chained in-place form works on a temporary and does not update `test`
# under Copy-on-Write.
# NOTE(review): 3424 (weight) and 3 (prior purchases) are hard-coded fill values -
# presumably typical values taken from earlier exploration; confirm their origin.
test['Weight_in_gms'] = test['Weight_in_gms'].fillna(3424)

test['Weight_in_gms'] = test['Weight_in_gms'].apply(lambda x:int(x))

test['Prior_purchases'] = test['Prior_purchases'].apply(lambda x: 3 if pd.isnull(x) else x)
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Warehouse_block      4000 non-null   object 
 1   Mode_of_Shipment     4000 non-null   object 
 2   Customer_care_calls  4000 non-null   int64  
 3   Customer_rating      4000 non-null   int64  
 4   Cost_of_the_Product  4000 non-null   int64  
 5   Prior_purchases      4000 non-null   float64
 6   Product_importance   4000 non-null   object 
 7   Weight_in_gms        4000 non-null   int64  
dtypes: float64(1), int64(4), object(3)
memory usage: 250.1+ KB


In [5]:
# Split the cleaned training frame: every column but the last is a feature,
# the last column ('Reached.on.Time_Y.N') is the label.
# NOTE: `input` shadows the Python builtin; the name is kept because the
# PyCaret setup cell reads it.
input, target = train.iloc[:, :-1], train.iloc[:, -1]

In [6]:
# Initialise the PyCaret experiment with a fixed session_id for reproducibility.
# normalize=True is required for normalize_method to take effect; PyCaret ignores
# normalize_method when normalize is left at its default (False), so the original
# call silently skipped the min-max scaling it asked for.
exp_clf = setup(
    data = input,
    target = target,
    session_id=123,
    normalize=True,
    normalize_method='minmax',
)

# Cross-validate the available classifiers and keep the top-ranked one.
best_model = compare_models()

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Reached.on.Time_Y.N
2,Target type,Binary
3,Original data shape,"(6897, 9)"
4,Transformed data shape,"(6897, 17)"
5,Transformed train set shape,"(4827, 17)"
6,Transformed test set shape,"(2070, 17)"
7,Numeric features,5
8,Categorical features,3
9,Preprocess,True


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.6561,0.73,0.6021,0.7699,0.6753,0.3211,0.3325,0.332
catboost,CatBoost Classifier,0.6453,0.7248,0.6432,0.7295,0.6833,0.2837,0.287,0.387
rf,Random Forest Classifier,0.6429,0.7218,0.6774,0.7093,0.6929,0.2667,0.2672,0.34
ada,Ada Boost Classifier,0.6424,0.727,0.7132,0.6939,0.7032,0.2535,0.254,0.324
xgboost,Extreme Gradient Boosting,0.642,0.7214,0.6739,0.7094,0.6911,0.2661,0.2667,0.458
dt,Decision Tree Classifier,0.6412,0.629,0.6934,0.7006,0.6967,0.2573,0.2577,0.323
lightgbm,Light Gradient Boosting Machine,0.6387,0.7226,0.6481,0.7177,0.6809,0.2668,0.269,0.335
ridge,Ridge Classifier,0.6325,0.0,0.7787,0.6625,0.7158,0.2052,0.2108,0.322
lda,Linear Discriminant Analysis,0.6312,0.6766,0.7732,0.6629,0.7137,0.2042,0.2092,0.325
et,Extra Trees Classifier,0.6279,0.6895,0.7063,0.6803,0.6929,0.2213,0.2217,0.366


In [7]:
# Approach 1: build and tune a CatBoost model and a Gradient Boosting model,
# blend the two tuned estimators, then finalize the blend for prediction.
# NOTE(review): per the cell output, tune_model may hand back the untuned model
# when tuning does not improve the score, so a "tuned_*" name does not guarantee
# changed hyperparameters.
catboost = create_model('catboost')
tuned_catboost = tune_model(catboost)
gbc = create_model('gbc')
tuned_gbc = tune_model(gbc)

# Combine both estimators into a single blended classifier.
blender_2 = blend_models(estimator_list = [tuned_catboost, tuned_gbc])

# NOTE(review): finalize_model presumably refits on the full data (train +
# hold-out) - confirm against the PyCaret version in use.
final_model = finalize_model(blender_2)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6687,0.7578,0.6481,0.7592,0.6992,0.3357,0.3409
1,0.6439,0.7192,0.6585,0.7186,0.6873,0.2756,0.2771
2,0.6253,0.6953,0.6446,0.7008,0.6715,0.2371,0.2382
3,0.6812,0.7509,0.6829,0.7568,0.7179,0.3535,0.356
4,0.6066,0.6923,0.6272,0.6844,0.6545,0.1998,0.2009
5,0.6253,0.7094,0.5993,0.7227,0.6552,0.2526,0.2579
6,0.6439,0.7332,0.6551,0.7203,0.6861,0.2768,0.2785
7,0.6867,0.7535,0.6376,0.7957,0.7079,0.3789,0.3897
8,0.6307,0.708,0.6411,0.7104,0.674,0.2507,0.2525
9,0.6411,0.7281,0.6376,0.7262,0.679,0.2758,0.2789


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6749,0.7806,0.5122,0.8963,0.6519,0.387,0.4412
1,0.6894,0.768,0.5505,0.8827,0.6781,0.4078,0.4508
2,0.6605,0.6826,0.5017,0.8727,0.6372,0.3592,0.4086
3,0.6936,0.7384,0.547,0.8971,0.6797,0.4174,0.465
4,0.648,0.6951,0.4948,0.8503,0.6256,0.3347,0.3791
5,0.6667,0.725,0.5017,0.8889,0.6414,0.3723,0.4263
6,0.6708,0.7592,0.5401,0.8516,0.661,0.3708,0.4077
7,0.6929,0.7584,0.5226,0.9317,0.6696,0.4225,0.4852
8,0.6432,0.7095,0.4843,0.8528,0.6178,0.3278,0.3748
9,0.6805,0.7288,0.5192,0.903,0.6593,0.3973,0.4521


Fitting 10 folds for each of 10 candidates, totalling 100 fits


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6667,0.7585,0.5784,0.8058,0.6734,0.3513,0.3716
1,0.6687,0.7472,0.6341,0.7679,0.6947,0.3398,0.3473
2,0.6335,0.6967,0.6028,0.7331,0.6616,0.2702,0.2764
3,0.6667,0.7488,0.6202,0.7739,0.6886,0.3393,0.3489
4,0.6356,0.695,0.5889,0.7445,0.6576,0.2794,0.2882
5,0.6542,0.7164,0.5819,0.7804,0.6667,0.323,0.3382
6,0.6812,0.7506,0.6585,0.7714,0.7105,0.3606,0.3662
7,0.666,0.7459,0.6028,0.7864,0.6824,0.3429,0.3565
8,0.6286,0.7071,0.561,0.7523,0.6427,0.2728,0.2857
9,0.6598,0.7334,0.5923,0.7834,0.6746,0.3322,0.3466


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.7019,0.7803,0.784,0.7329,0.7576,0.3716,0.373
1,0.6273,0.7278,0.7526,0.6646,0.7059,0.2027,0.2057
2,0.6004,0.688,0.7073,0.6506,0.6778,0.1543,0.1552
3,0.646,0.7405,0.7282,0.6921,0.7097,0.2568,0.2574
4,0.6377,0.6949,0.7143,0.6879,0.7009,0.2419,0.2422
5,0.6273,0.717,0.6934,0.6838,0.6886,0.2247,0.2248
6,0.6501,0.7431,0.777,0.6799,0.7252,0.2496,0.2538
7,0.6929,0.7588,0.7317,0.7473,0.7394,0.3658,0.3659
8,0.6349,0.7071,0.7387,0.6773,0.7067,0.2255,0.227
9,0.6411,0.7459,0.7247,0.6887,0.7063,0.2457,0.2462


Fitting 10 folds for each of 10 candidates, totalling 100 fits
Original model was better than the tuned model, hence it will be returned. NOTE: The display metrics are for the tuned model (not the original one).


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6729,0.7617,0.5749,0.8209,0.6762,0.3658,0.3898
1,0.6667,0.7487,0.6167,0.7763,0.6874,0.3403,0.3507
2,0.6377,0.6963,0.5923,0.7456,0.6602,0.2829,0.2916
3,0.6646,0.7495,0.6167,0.7729,0.686,0.3357,0.3456
4,0.6439,0.6956,0.5819,0.7626,0.6601,0.3001,0.3123
5,0.6522,0.7167,0.5714,0.7847,0.6613,0.3215,0.3388
6,0.6729,0.7512,0.6376,0.7722,0.6985,0.348,0.3557
7,0.6701,0.748,0.5923,0.8019,0.6814,0.3551,0.3727
8,0.6307,0.7066,0.5575,0.7583,0.6426,0.2786,0.2928
9,0.6598,0.7332,0.5889,0.786,0.6733,0.3332,0.3485


In [8]:
# Score the cleaned test frame with the finalized blend; the returned frame adds
# 'prediction_label' and 'prediction_score' columns to the test features.
prediction = predict_model(final_model, data = test)
prediction

Unnamed: 0,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Weight_in_gms,prediction_label,prediction_score
0,F,Ship,3,1,274,3.0,high,4352,0,0.5154
1,F,Ship,3,4,136,2.0,medium,1056,1,0.7760
2,A,Flight,3,5,140,3.0,low,5383,0,0.5466
3,C,Ship,3,1,291,4.0,low,1880,1,0.5079
4,F,Ship,4,2,147,3.0,low,5174,0,0.5633
...,...,...,...,...,...,...,...,...,...,...
3995,A,Ship,4,1,204,4.0,low,1667,1,0.7170
3996,C,Ship,4,3,195,2.0,medium,3869,1,0.7600
3997,B,Flight,4,3,206,2.0,medium,4531,0,0.5425
3998,C,Ship,6,4,255,4.0,low,1869,0,0.5140


In [10]:
# Use the sample submission as a template, overwrite its label column with the
# model's predicted labels (aligned on the row index) and save the submission.
submission_template = pd.read_csv('./smhrd_data/sampleSubmission.csv')
result = submission_template.assign(
    **{'Reached.on.Time_Y.N': prediction['prediction_label']}
)
result.to_csv('result_pycaret03.csv', index=False)

In [9]:
# Count how many labels of the current prediction agree with an earlier submission.
# The earlier file is loaded under its own name so that `result` (the submission
# frame built from the current prediction) is not overwritten when the notebook is
# run top to bottom.
previous_result = pd.read_csv('result_pycaret.csv')
previous_labels = previous_result['Reached.on.Time_Y.N'].to_numpy()
current_labels = prediction['prediction_label'].to_numpy()
if len(previous_labels) < len(current_labels):
    raise ValueError(
        "previous submission has fewer rows than the current prediction: "
        f"{len(previous_labels)} < {len(current_labels)}"
    )

# Vectorized positional comparison; extra rows in the previous file are ignored,
# matching the original loop over range(len(prediction)).
count = int((previous_labels[:len(current_labels)] == current_labels).sum())

print(count)

3835


In [11]:
# Display `result` for a visual check of its ID and 'Reached.on.Time_Y.N' columns.
result

Unnamed: 0,ID,Reached.on.Time_Y.N
0,7000,0
1,7001,1
2,7002,0
3,7003,1
4,7004,0
...,...,...
3995,10995,1
3996,10996,1
3997,10997,0
3998,10998,0
