In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from catboost import  CatBoostClassifier
from sklearn.preprocessing import  StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from imblearn.under_sampling import OneSidedSelection, TomekLinks
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from imblearn.combine import SMOTETomek, SMOTEENN
from lightgbm import LGBMClassifier
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import VotingClassifier
from pycaret.classification import *
from sklearn.linear_model import LinearRegression

In [24]:
train = pd.read_csv("./smhrd_data/Train.csv")
temp_dis = train['Discount_offered']
train.drop(['ID', 'Gender'], axis=1, inplace=True)
# 결측치의 형태가 ?인 데이터를 NA로 대체
train = train.replace('?', pd.NA)

# Warehouse_block 컬럼 이름에 공백들어가 있어서 재설정
train.columns = ['Warehouse_block', 'Mode_of_Shipment', 'Customer_care_calls',
       'Customer_rating', 'Cost_of_the_Product', 'Prior_purchases',
       'Product_importance', 'Discount_offered', 'Weight_in_gms', 'Reached.on.Time_Y.N']

# 연속형 변수중에서 이상치 제거 (개수가 많지 않을뿐더러 5개를 채우려고 하는 것은 비효율적이라고 생각하기 때문)
train = train[(train['Customer_rating']!=99) & (train['Cost_of_the_Product']!=9999)]

# Mode_of_Shipment에서 끝에 zk가 붙은 이상치 수정
train['Mode_of_Shipment'] = train['Mode_of_Shipment'].apply(lambda x: x.replace('zk', '') if pd.notna(x) else x)
train = train[train['Product_importance'].notnull()]

# Product_importance에서 끝에 m이 하나씩 더붙은 이상치 수정
train['Product_importance'] = train['Product_importance'].apply(lambda x: x[:-1] if x[-1]==x[-2] else x)

train['Weight_in_gms'] = train['Weight_in_gms'].apply(lambda x: int(x) if str(x).isdigit() else x)
train['Mode_of_Shipment'].fillna(' Ship', inplace=True)

def fill_calls(row) :
    if np.isnan(row['Customer_care_calls']):
        return pt1.loc[(row['Customer_rating'], row['Mode_of_Shipment'])]
    else:
        return row['Customer_care_calls']

pt1 = train[['Customer_care_calls', 'Customer_rating', 'Mode_of_Shipment']].groupby(['Customer_rating', 'Mode_of_Shipment']).agg(lambda x: x.mode().iloc[0])
train['Customer_care_calls'] = train.apply(fill_calls, axis = 1).astype('int64')

train['Weight_in_gms'].fillna(3424, inplace=True)

train['Weight_in_gms'] = train['Weight_in_gms'].apply(lambda x:int(x))

train['Prior_purchases'] = train['Prior_purchases'].apply(lambda x: 3 if pd.isnull(x) else x)
model = LinearRegression()
def stochastic_regression_imputation(df, target_column, feature_column):
    global model
    # 결측값과 비결측값을 분리하여 학습 데이터와 예측 데이터 생성
    train_data = df.dropna(subset=[target_column])
    test_data = df[df[target_column].isnull()]
    
    # 선형 회귀 모델 학습
    model.fit(train_data[feature_column], train_data[target_column])
    
    # 예측값 생성
    predictions = model.predict(test_data[feature_column])

    # 예측값으로 결측값 대체
    df.loc[df[target_column].isnull(), target_column] = predictions

# Stochastic regression imputation 실행
stochastic_regression_imputation(train, 'Discount_offered', ['Weight_in_gms', 'Cost_of_the_Product'])

train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6897 entries, 0 to 6998
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Warehouse_block      6897 non-null   object 
 1   Mode_of_Shipment     6897 non-null   object 
 2   Customer_care_calls  6897 non-null   int64  
 3   Customer_rating      6897 non-null   int64  
 4   Cost_of_the_Product  6897 non-null   int64  
 5   Prior_purchases      6897 non-null   float64
 6   Product_importance   6897 non-null   object 
 7   Discount_offered     6897 non-null   float64
 8   Weight_in_gms        6897 non-null   int64  
 9   Reached.on.Time_Y.N  6897 non-null   int64  
dtypes: float64(2), int64(5), object(3)
memory usage: 592.7+ KB


In [25]:
test = pd.read_csv("./smhrd_data/test.csv")
test.drop(['ID', 'Gender'], axis=1, inplace=True)

# 결측치의 형태가 ?인 데이터를 NA로 대체
test = test.replace('?', pd.NA)

# Warehouse_block 컬럼 이름에 공백들어가 있어서 재설정
test.columns = ['Warehouse_block', 'Mode_of_Shipment', 'Customer_care_calls',
       'Customer_rating', 'Cost_of_the_Product', 'Prior_purchases',
       'Product_importance', 'Discount_offered', 'Weight_in_gms']


# Mode_of_Shipment에서 끝에 zk가 붙은 이상치 수정
test['Mode_of_Shipment'] = test['Mode_of_Shipment'].apply(lambda x: x.replace('zk', '') if pd.notna(x) else x)
test['Product_importance'].fillna('low', inplace=True)

# Product_importance에서 끝에 m이 하나씩 더붙은 이상치 수정
test['Product_importance'] = test['Product_importance'].apply(lambda x: x[:-1] if x[-1]==x[-2] else x)

test['Weight_in_gms'] = test['Weight_in_gms'].apply(lambda x: int(x) if str(x).isdigit() else x)
test['Mode_of_Shipment'].fillna(' Ship', inplace=True)

def fill_calls(row) :
    if np.isnan(row['Customer_care_calls']):
        return pt1.loc[(row['Customer_rating'], row['Mode_of_Shipment'])]
    else:
        return row['Customer_care_calls']

pt1 = train[['Customer_care_calls', 'Customer_rating', 'Mode_of_Shipment']].groupby(['Customer_rating', 'Mode_of_Shipment']).agg(lambda x: x.mode().iloc[0])
test['Customer_care_calls'] = test.apply(fill_calls, axis = 1).astype('int64')

test['Weight_in_gms'].fillna(3424, inplace=True)

test['Weight_in_gms'] = test['Weight_in_gms'].apply(lambda x:int(x))

test['Prior_purchases'] = test['Prior_purchases'].apply(lambda x: 3 if pd.isnull(x) else x)
def stochastic_regression_imputation(df, target_column, feature_column):
    global model
    # 결측값과 비결측값을 분리하여 학습 데이터와 예측 데이터 생성
    test_data = df[df[target_column].isnull()]
    
    # 예측값 생성
    predictions = model.predict(test_data[feature_column])

    # 예측값으로 결측값 대체
    df.loc[df[target_column].isnull(), target_column] = predictions

# Stochastic regression imputation 실행
stochastic_regression_imputation(test, 'Discount_offered', ['Weight_in_gms', 'Cost_of_the_Product'])

test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Warehouse_block      4000 non-null   object 
 1   Mode_of_Shipment     4000 non-null   object 
 2   Customer_care_calls  4000 non-null   int64  
 3   Customer_rating      4000 non-null   int64  
 4   Cost_of_the_Product  4000 non-null   int64  
 5   Prior_purchases      4000 non-null   float64
 6   Product_importance   4000 non-null   object 
 7   Discount_offered     4000 non-null   float64
 8   Weight_in_gms        4000 non-null   int64  
dtypes: float64(2), int64(4), object(3)
memory usage: 281.4+ KB


In [26]:
input = train.iloc[:,:-1]
target = train.iloc[:,-1]

In [27]:
exp_clf = setup(data = input, target = target, session_id=123, normalize_method='minmax', fix_imbalance = True)

best_model = compare_models()

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Reached.on.Time_Y.N
2,Target type,Binary
3,Original data shape,"(6897, 10)"
4,Transformed data shape,"(7810, 18)"
5,Transformed train set shape,"(5740, 18)"
6,Transformed test set shape,"(2070, 18)"
7,Numeric features,6
8,Categorical features,3
9,Preprocess,True


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.6675,0.7333,0.5906,0.7988,0.6786,0.3496,0.3674,0.593
ada,Ada Boost Classifier,0.6573,0.7247,0.578,0.7911,0.6671,0.3309,0.3493,0.499
catboost,CatBoost Classifier,0.6557,0.7296,0.6324,0.7501,0.686,0.3108,0.3167,1.658
rf,Random Forest Classifier,0.6545,0.7331,0.6638,0.7307,0.6955,0.2981,0.3001,0.465
lightgbm,Light Gradient Boosting Machine,0.6495,0.7276,0.6404,0.7361,0.6847,0.2943,0.2981,0.789
xgboost,Extreme Gradient Boosting,0.6439,0.7237,0.6638,0.7168,0.689,0.2737,0.2751,0.538
et,Extra Trees Classifier,0.6387,0.7156,0.6822,0.702,0.6919,0.2554,0.2556,0.519
knn,K Neighbors Classifier,0.637,0.7035,0.6,0.7408,0.6627,0.2789,0.2865,0.385
dt,Decision Tree Classifier,0.636,0.6229,0.6923,0.6949,0.6935,0.2454,0.2455,0.379
ridge,Ridge Classifier,0.6331,0.0,0.5481,0.7692,0.6397,0.2866,0.3042,0.373


In [28]:
# 방법 1
ada = create_model('ada')
tuned_ada = tune_model(ada)
gbc = create_model('gbc')
tuned_gbc = tune_model(gbc, search_library = 'optuna')

blender_2 = blend_models(estimator_list = [tuned_ada, tuned_gbc])

final_model = finalize_model(blender_2)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6791,0.7504,0.5958,0.8143,0.6881,0.3736,0.3931
1,0.677,0.7538,0.6132,0.7964,0.6929,0.3642,0.3781
2,0.6335,0.7069,0.5505,0.767,0.641,0.2868,0.3034
3,0.6542,0.7124,0.6063,0.7632,0.6757,0.3157,0.3253
4,0.6294,0.6918,0.547,0.7621,0.6369,0.2788,0.2949
5,0.646,0.7129,0.5645,0.7788,0.6545,0.31,0.327
6,0.6646,0.7466,0.6237,0.7682,0.6885,0.3336,0.3422
7,0.6743,0.7547,0.6028,0.8009,0.6879,0.3612,0.3773
8,0.6473,0.716,0.5714,0.7773,0.6586,0.311,0.3269
9,0.6515,0.7161,0.5679,0.7874,0.6599,0.3212,0.3394


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6874,0.7362,0.6237,0.8063,0.7033,0.3841,0.3984
1,0.6894,0.7282,0.6551,0.7866,0.7148,0.3801,0.3878
2,0.6253,0.6704,0.6028,0.7208,0.6565,0.2514,0.2563
3,0.6563,0.7041,0.6202,0.7574,0.682,0.3161,0.3236
4,0.6439,0.6922,0.5993,0.7511,0.6667,0.2947,0.3034
5,0.6708,0.7106,0.6202,0.7807,0.6913,0.3485,0.3591
6,0.6915,0.7309,0.662,0.7851,0.7183,0.3828,0.3896
7,0.7012,0.7441,0.669,0.7967,0.7273,0.4025,0.41
8,0.639,0.6815,0.6167,0.7344,0.6705,0.278,0.2832
9,0.6535,0.716,0.5993,0.7679,0.6732,0.3163,0.3273


[I 2023-08-10 09:40:20,218] Searching the best hyperparameters using 4827 samples...
[I 2023-08-10 09:41:58,920] Finished hyperparemeter search!


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6853,0.7851,0.5645,0.8571,0.6807,0.3954,0.4293
1,0.6708,0.7468,0.5958,0.7991,0.6826,0.3554,0.3721
2,0.6398,0.6913,0.5923,0.7489,0.6615,0.2876,0.2967
3,0.6749,0.7355,0.6063,0.7982,0.6891,0.3616,0.3767
4,0.6439,0.7002,0.5505,0.7861,0.6475,0.3096,0.3299
5,0.677,0.7143,0.5889,0.8164,0.6842,0.371,0.3919
6,0.6791,0.759,0.6411,0.7797,0.7036,0.3609,0.3692
7,0.6701,0.7448,0.5854,0.8077,0.6788,0.3571,0.3768
8,0.6432,0.7143,0.5784,0.765,0.6587,0.2996,0.3126
9,0.6784,0.7476,0.5854,0.8235,0.6843,0.3752,0.3981


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6729,0.7719,0.5122,0.8909,0.6504,0.3826,0.4352
1,0.6936,0.7475,0.5575,0.884,0.6838,0.4148,0.4569
2,0.6522,0.7,0.5192,0.8324,0.6395,0.3367,0.3722
3,0.6812,0.7434,0.5366,0.88,0.6667,0.3938,0.4387
4,0.6542,0.7147,0.4913,0.8704,0.6281,0.3489,0.3995
5,0.6563,0.7109,0.5087,0.8538,0.6376,0.3485,0.3914
6,0.6832,0.7492,0.5436,0.8764,0.671,0.3964,0.439
7,0.7033,0.7379,0.5505,0.9186,0.6885,0.4374,0.4905
8,0.6639,0.7072,0.5157,0.8655,0.6463,0.3631,0.408
9,0.6805,0.7483,0.5331,0.8844,0.6652,0.3937,0.4405


[I 2023-08-10 09:42:15,418] Searching the best hyperparameters using 4827 samples...
[I 2023-08-10 09:44:05,172] Finished hyperparemeter search!


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6874,0.7707,0.6237,0.8063,0.7033,0.3841,0.3984
1,0.6894,0.7381,0.6551,0.7866,0.7148,0.3801,0.3878
2,0.6253,0.6963,0.6028,0.7208,0.6565,0.2514,0.2563
3,0.6563,0.7253,0.6202,0.7574,0.682,0.3161,0.3236
4,0.6439,0.7043,0.5993,0.7511,0.6667,0.2947,0.3034
5,0.6708,0.7214,0.6202,0.7807,0.6913,0.3485,0.3591
6,0.6915,0.7533,0.662,0.7851,0.7183,0.3828,0.3896
7,0.7012,0.7548,0.669,0.7967,0.7273,0.4025,0.41
8,0.639,0.6955,0.6167,0.7344,0.6705,0.278,0.2832
9,0.6535,0.7339,0.5993,0.7679,0.6732,0.3163,0.3273


In [30]:
gbc = create_model('gbc')
tuned_gbc = tune_model(gbc, search_library = 'optuna')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6853,0.7851,0.5645,0.8571,0.6807,0.3954,0.4293
1,0.6708,0.7468,0.5958,0.7991,0.6826,0.3554,0.3721
2,0.6398,0.6913,0.5923,0.7489,0.6615,0.2876,0.2967
3,0.6749,0.7355,0.6063,0.7982,0.6891,0.3616,0.3767
4,0.6439,0.7002,0.5505,0.7861,0.6475,0.3096,0.3299
5,0.6749,0.7217,0.5819,0.8186,0.6802,0.3684,0.3908
6,0.6791,0.759,0.6411,0.7797,0.7036,0.3609,0.3692
7,0.6701,0.7448,0.5854,0.8077,0.6788,0.3571,0.3768
8,0.6432,0.7143,0.5784,0.765,0.6587,0.2996,0.3126
9,0.6784,0.7476,0.5854,0.8235,0.6843,0.3752,0.3981


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6853,0.7702,0.5331,0.8947,0.6681,0.4034,0.4531
1,0.6853,0.7515,0.5645,0.8571,0.6807,0.3954,0.4293
2,0.6667,0.6957,0.5052,0.8841,0.643,0.3713,0.4234
3,0.6874,0.7382,0.5366,0.8953,0.671,0.4069,0.4561
4,0.646,0.7115,0.4948,0.8452,0.6242,0.3303,0.3733
5,0.6605,0.722,0.4983,0.8773,0.6356,0.3601,0.4115
6,0.6874,0.7451,0.5436,0.8864,0.6739,0.4051,0.4505
7,0.6784,0.7574,0.5052,0.9177,0.6517,0.3965,0.4585
8,0.666,0.6929,0.5122,0.875,0.6462,0.3685,0.4166
9,0.6784,0.7446,0.5296,0.8837,0.6623,0.3902,0.4375


[I 2023-08-10 09:45:39,598] Searching the best hyperparameters using 4827 samples...
[I 2023-08-10 09:48:07,558] Finished hyperparemeter search!


In [11]:
# 66.58 
# exp_clf = setup(data = input, target = target, session_id=123, normalize_method='sta')

# best_model = compare_models()

# tuned_ada = tune_model(ada)

# prediction = predict_model(tuned_ada, data = test)
# prediction

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6874,0.73,0.6237,0.8063,0.7033,0.3841,0.3984
1,0.6894,0.7244,0.6551,0.7866,0.7148,0.3801,0.3878
2,0.6253,0.6692,0.6028,0.7208,0.6565,0.2514,0.2563
3,0.6563,0.702,0.6202,0.7574,0.682,0.3161,0.3236
4,0.6439,0.6861,0.5993,0.7511,0.6667,0.2947,0.3034
5,0.6708,0.7127,0.6202,0.7807,0.6913,0.3485,0.3591
6,0.6915,0.7318,0.662,0.7851,0.7183,0.3828,0.3896
7,0.7012,0.7397,0.669,0.7967,0.7273,0.4025,0.41
8,0.639,0.6821,0.6167,0.7344,0.6705,0.278,0.2832
9,0.6535,0.6977,0.5993,0.7679,0.6732,0.3163,0.3273


Fitting 10 folds for each of 10 candidates, totalling 100 fits


In [31]:
prediction1 = predict_model(tuned_gbc, data = test)
prediction1

Unnamed: 0,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Discount_offered,Weight_in_gms,prediction_label,prediction_score
0,F,Ship,3,1,274,3.0,high,6.567113,4352,0,0.5001
1,F,Ship,3,4,136,2.0,medium,27.881275,1056,1,0.5002
2,A,Flight,3,5,140,3.0,low,7.000000,5383,0,0.5001
3,C,Ship,3,1,291,4.0,low,15.082297,1880,0,0.5000
4,F,Ship,4,2,147,3.0,low,5.000000,5174,0,0.5001
...,...,...,...,...,...,...,...,...,...,...,...
3995,A,Ship,4,1,204,4.0,low,21.300413,1667,0,0.5000
3996,C,Ship,4,3,195,2.0,medium,13.334435,3869,1,0.5001
3997,B,Flight,4,3,206,2.0,medium,7.000000,4531,0,0.5001
3998,C,Ship,6,4,255,4.0,low,7.000000,1869,0,0.5000


In [37]:
prediction1['prediction_label']

0       0
1       1
2       0
3       0
4       0
       ..
3995    0
3996    1
3997    0
3998    0
3999    0
Name: prediction_label, Length: 4000, dtype: int32

In [32]:
result = pd.read_csv('./smhrd_data/sampleSubmission.csv')
result['Reached.on.Time_Y.N'] = prediction1['prediction_label']
result.to_csv('result_pycaret05.csv', index=False)

In [36]:
result = pd.read_csv('result_pycaret03.csv')
count = 0
for i in range(len(prediction['prediction_label'])):
    if result['Reached.on.Time_Y.N'][i]==prediction['prediction_label'][i]:
        count += 1
        
print(count)

3365
