In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from catboost import  CatBoostClassifier
from sklearn.preprocessing import  StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, f1_score,recall_score, precision_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from imblearn.under_sampling import OneSidedSelection, TomekLinks
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from imblearn.combine import SMOTETomek, SMOTEENN
from lightgbm import LGBMClassifier
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import VotingClassifier
from pycaret.classification import *
from sklearn.linear_model import LinearRegression

In [2]:
train = pd.read_csv("./smhrd_data/Train.csv")
train.drop(['ID', 'Gender', 'Discount_offered'], axis=1, inplace=True)
# 결측치의 형태가 ?인 데이터를 NA로 대체
train = train.replace('?', pd.NA)

# Warehouse_block 컬럼 이름에 공백들어가 있어서 재설정
train.columns = ['Warehouse_block', 'Mode_of_Shipment', 'Customer_care_calls',
       'Customer_rating', 'Cost_of_the_Product', 'Prior_purchases',
       'Product_importance', 'Weight_in_gms', 'Reached.on.Time_Y.N']

# 연속형 변수중에서 이상치 제거 (개수가 많지 않을뿐더러 5개를 채우려고 하는 것은 비효율적이라고 생각하기 때문)
train = train[(train['Customer_rating']!=99) & (train['Cost_of_the_Product']!=9999)]

# Mode_of_Shipment에서 끝에 zk가 붙은 이상치 수정
train['Mode_of_Shipment'] = train['Mode_of_Shipment'].apply(lambda x: x.replace('zk', '') if pd.notna(x) else x)
train = train[train['Product_importance'].notnull()]

# Product_importance에서 끝에 m이 하나씩 더붙은 이상치 수정
train['Product_importance'] = train['Product_importance'].apply(lambda x: x[:-1] if x[-1]==x[-2] else x)

train['Weight_in_gms'] = train['Weight_in_gms'].apply(lambda x: int(x) if str(x).isdigit() else x)
train['Mode_of_Shipment'].fillna(' Ship', inplace=True)

def fill_calls(row) :
    if np.isnan(row['Customer_care_calls']):
        return pt1.loc[(row['Customer_rating'], row['Mode_of_Shipment'])]
    else:
        return row['Customer_care_calls']

pt1 = train[['Customer_care_calls', 'Customer_rating', 'Mode_of_Shipment']].groupby(['Customer_rating', 'Mode_of_Shipment']).agg(lambda x: x.mode().iloc[0])
train['Customer_care_calls'] = train.apply(fill_calls, axis = 1).astype('int64')

train['Weight_in_gms'].fillna(3424, inplace=True)

train['Weight_in_gms'] = train['Weight_in_gms'].apply(lambda x:int(x))

train['Prior_purchases'] = train['Prior_purchases'].apply(lambda x: 3 if pd.isnull(x) else x)
train.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 6897 entries, 0 to 6998
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Warehouse_block      6897 non-null   object 
 1   Mode_of_Shipment     6897 non-null   object 
 2   Customer_care_calls  6897 non-null   int64  
 3   Customer_rating      6897 non-null   int64  
 4   Cost_of_the_Product  6897 non-null   int64  
 5   Prior_purchases      6897 non-null   float64
 6   Product_importance   6897 non-null   object 
 7   Weight_in_gms        6897 non-null   int64  
 8   Reached.on.Time_Y.N  6897 non-null   int64  
dtypes: float64(1), int64(5), object(3)
memory usage: 538.8+ KB


In [3]:
test = pd.read_csv("./smhrd_data/test.csv")

test.drop(['ID', 'Gender', 'Discount_offered'], axis=1, inplace=True)
# 결측치의 형태가 ?인 데이터를 NA로 대체
test = test.replace('?', pd.NA)

# Warehouse_block 컬럼 이름에 공백들어가 있어서 재설정
test.columns = ['Warehouse_block', 'Mode_of_Shipment', 'Customer_care_calls',
       'Customer_rating', 'Cost_of_the_Product', 'Prior_purchases',
       'Product_importance', 'Weight_in_gms']

# 연속형 변수중에서 이상치 제거 (개수가 많지 않을뿐더러 5개를 채우려고 하는 것은 비효율적이라고 생각하기 때문)
# test = test[(test['Customer_rating']!=99) & (test['Cost_of_the_Product']!=9999)]

# Mode_of_Shipment에서 끝에 zk가 붙은 이상치 수정
test['Mode_of_Shipment'] = test['Mode_of_Shipment'].apply(lambda x: x.replace('zk', '') if pd.notna(x) else x)
test['Product_importance'].fillna('low', inplace=True)

# Product_importance에서 끝에 m이 하나씩 더붙은 이상치 수정
test['Product_importance'] = test['Product_importance'].apply(lambda x: x[:-1] if x[-1]==x[-2] else x)

test['Weight_in_gms'] = test['Weight_in_gms'].apply(lambda x: int(x) if str(x).isdigit() else x)
test['Mode_of_Shipment'].fillna(' Ship', inplace=True)

def fill_calls(row) :
    if np.isnan(row['Customer_care_calls']):
        return pt1.loc[(row['Customer_rating'], row['Mode_of_Shipment'])]
    else:
        return row['Customer_care_calls']

pt1 = train[['Customer_care_calls', 'Customer_rating', 'Mode_of_Shipment']].groupby(['Customer_rating', 'Mode_of_Shipment']).agg(lambda x: x.mode().iloc[0])
test['Customer_care_calls'] = test.apply(fill_calls, axis = 1).astype('int64')

test['Weight_in_gms'].fillna(3424, inplace=True)

test['Weight_in_gms'] = test['Weight_in_gms'].apply(lambda x:int(x))

test['Prior_purchases'] = test['Prior_purchases'].apply(lambda x: 3 if pd.isnull(x) else x)
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Warehouse_block      4000 non-null   object 
 1   Mode_of_Shipment     4000 non-null   object 
 2   Customer_care_calls  4000 non-null   int64  
 3   Customer_rating      4000 non-null   int64  
 4   Cost_of_the_Product  4000 non-null   int64  
 5   Prior_purchases      4000 non-null   float64
 6   Product_importance   4000 non-null   object 
 7   Weight_in_gms        4000 non-null   int64  
dtypes: float64(1), int64(4), object(3)
memory usage: 250.1+ KB


In [4]:
input = train.iloc[:,:-1]
target = train.iloc[:,-1]

### 0.68892 GBM 

In [19]:
object_columns = train.select_dtypes('object').columns
for column in object_columns:
    dummy = pd.get_dummies(input[column], prefix=column)
    input = pd.concat([input, dummy], axis = 1)
    input.drop(column, axis = 1, inplace=True)
    dummy = pd.get_dummies(test[column], prefix=column)
    test = pd.concat([test, dummy], axis = 1)
    test.drop(column, axis = 1, inplace=True)


ss = MinMaxScaler()
input_ss = ss.fit_transform(input)
test_ss = ss.transform(test)

gb = GradientBoostingClassifier(learning_rate=0.01, max_depth=4, n_estimators=500, subsample=0.7, random_state=42)
gb.fit(input_ss, target)
pred = gb.predict(test_ss)

In [7]:
result = pd.read_csv('./smhrd_data/sampleSubmission.csv')
result['Reached.on.Time_Y.N'] = pred
result.to_csv('result_gbm_yj.csv', index=False)

In [None]:
result = pd.read_csv('result_gbm_yj.csv')
count = 0
for i in range(len(pred)):
    if result['Reached.on.Time_Y.N'][i]==pred[i]:
        count += 1
        
print(count)

In [8]:
exp_clf = setup(data = input, target = target, session_id=123, normalize_method='minmax', fold=5, fix_imbalance=True,
                categorical_features=['Warehouse_block', 'Mode_of_Shipment', 'Product_importance'])
# 65.61
best_model = compare_models()

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Reached.on.Time_Y.N
2,Target type,Binary
3,Original data shape,"(6897, 9)"
4,Transformed data shape,"(7810, 17)"
5,Transformed train set shape,"(5740, 17)"
6,Transformed test set shape,"(2070, 17)"
7,Numeric features,5
8,Categorical features,3
9,Preprocess,True


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
gbc,Gradient Boosting Classifier,0.6602,0.73,0.5826,0.7916,0.671,0.336,0.3533,2.892
catboost,CatBoost Classifier,0.6534,0.7284,0.6366,0.7435,0.6859,0.3045,0.3091,5.372
ada,Ada Boost Classifier,0.6526,0.7253,0.6627,0.7289,0.6941,0.2941,0.296,2.848
lightgbm,Light Gradient Boosting Machine,0.6462,0.7204,0.6547,0.7239,0.6875,0.282,0.284,2.888
rf,Random Forest Classifier,0.6447,0.7238,0.6732,0.7131,0.6925,0.2726,0.2734,3.018
xgboost,Extreme Gradient Boosting,0.6391,0.7183,0.6732,0.7061,0.6892,0.2595,0.26,2.688
et,Extra Trees Classifier,0.6356,0.6982,0.7042,0.6894,0.6967,0.2404,0.2406,2.762
knn,K Neighbors Classifier,0.6335,0.6995,0.5972,0.7368,0.6595,0.2719,0.2792,2.616
dt,Decision Tree Classifier,0.6244,0.6105,0.684,0.6849,0.6841,0.2208,0.221,3.356
ridge,Ridge Classifier,0.6167,0.0,0.5927,0.7141,0.6477,0.2354,0.2404,2.666


In [19]:
# 방법 1
rf = create_model('rf')
tuned_rf = tune_model(rf)
print('rf')

lightgbm = create_model('lightgbm')
tuned_lightgbm = tune_model(lightgbm)
print('lightgbm')

catboost = create_model('catboost')
tuned_catboost = tune_model(catboost)
print('catboost')

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6439,0.7251,0.6812,0.7083,0.6945,0.2681,0.2684
1,0.6408,0.7191,0.6794,0.7052,0.6921,0.2614,0.2617
2,0.6249,0.7073,0.6341,0.7054,0.6679,0.2397,0.2415
3,0.6539,0.7334,0.6742,0.7247,0.6986,0.2935,0.2945
4,0.6601,0.7342,0.6969,0.722,0.7092,0.3006,0.3008
Mean,0.6447,0.7238,0.6732,0.7131,0.6925,0.2726,0.2734
Std,0.0121,0.01,0.0209,0.0085,0.0136,0.0221,0.0218


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6863,0.7413,0.601,0.8234,0.6949,0.388,0.4085
1,0.647,0.7018,0.5976,0.7572,0.668,0.3022,0.3119
2,0.6528,0.71,0.5784,0.7812,0.6647,0.321,0.3368
3,0.6943,0.7592,0.6272,0.8163,0.7094,0.3984,0.4139
4,0.6446,0.7115,0.6098,0.7463,0.6711,0.2929,0.3
Mean,0.665,0.7248,0.6028,0.7849,0.6816,0.3405,0.3542
Std,0.021,0.0218,0.0159,0.0308,0.0175,0.0441,0.0481


Fitting 5 folds for each of 10 candidates, totalling 50 fits
rf


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6346,0.7187,0.6376,0.7162,0.6747,0.2611,0.2634
1,0.6594,0.7301,0.6742,0.7316,0.7017,0.3064,0.3078
2,0.6155,0.7004,0.6341,0.6933,0.6624,0.218,0.2192
3,0.6694,0.7355,0.6707,0.7476,0.7071,0.3303,0.3329
4,0.6518,0.7174,0.6568,0.7306,0.6917,0.2943,0.2965
Mean,0.6462,0.7204,0.6547,0.7239,0.6875,0.282,0.284
Std,0.0191,0.0121,0.0165,0.0182,0.0167,0.039,0.0393


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6749,0.749,0.5314,0.8714,0.6602,0.382,0.4256
1,0.6718,0.7042,0.5261,0.8703,0.6558,0.3767,0.421
2,0.656,0.7051,0.5017,0.8623,0.6344,0.3499,0.3964
3,0.6788,0.7428,0.5261,0.8882,0.6608,0.3916,0.4408
4,0.6684,0.7096,0.5157,0.8757,0.6491,0.3724,0.4201
Mean,0.67,0.7221,0.5202,0.8736,0.6521,0.3745,0.4208
Std,0.0078,0.0196,0.0105,0.0085,0.0098,0.0139,0.0143


Fitting 5 folds for each of 10 candidates, totalling 50 fits
lightgbm


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6532,0.7352,0.6359,0.7434,0.6854,0.3042,0.3089
1,0.646,0.7224,0.6359,0.7329,0.681,0.2878,0.2914
2,0.6301,0.7012,0.6063,0.7265,0.661,0.2611,0.2663
3,0.6798,0.7546,0.662,0.7677,0.7109,0.3564,0.3613
4,0.658,0.7285,0.6429,0.747,0.691,0.313,0.3174
Mean,0.6534,0.7284,0.6366,0.7435,0.6859,0.3045,0.3091
Std,0.0162,0.0174,0.0179,0.0141,0.0161,0.0314,0.0314


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6801,0.7603,0.5296,0.8863,0.663,0.3934,0.4414
1,0.6698,0.7106,0.5244,0.8674,0.6536,0.3728,0.4166
2,0.658,0.6999,0.4895,0.8836,0.63,0.3576,0.4125
3,0.6798,0.7518,0.5244,0.8932,0.6608,0.3942,0.4452
4,0.6663,0.7235,0.5209,0.8642,0.65,0.3666,0.4102
Mean,0.6708,0.7292,0.5178,0.8789,0.6515,0.3769,0.4252
Std,0.0084,0.0233,0.0144,0.0112,0.0117,0.0146,0.015


Fitting 5 folds for each of 10 candidates, totalling 50 fits
catboost


In [15]:
blender_4 = blend_models(estimator_list = [tuned_rf, tuned_catboost, tuned_et, tuned_gbc])

final_model = finalize_model(blender_4)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6874,0.7476,0.6725,0.772,0.7188,0.3706,0.3751
1,0.6532,0.7234,0.6551,0.7329,0.6918,0.2982,0.3007
2,0.6394,0.7163,0.6202,0.7325,0.6717,0.2778,0.2825
3,0.6684,0.7491,0.662,0.751,0.7037,0.3306,0.334
4,0.6477,0.7339,0.6533,0.7267,0.6881,0.2859,0.2881
Mean,0.6592,0.7341,0.6526,0.743,0.6948,0.3126,0.3161
Std,0.017,0.013,0.0175,0.0166,0.0158,0.0341,0.0345


In [16]:
prediction = predict_model(final_model, data = test)
prediction

Unnamed: 0,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Weight_in_gms,prediction_label,prediction_score
0,F,Ship,3,1,274,3.0,high,4352,0,0.6441
1,F,Ship,3,4,136,2.0,medium,1056,1,0.8035
2,A,Flight,3,5,140,3.0,low,5383,0,0.6672
3,C,Ship,3,1,291,4.0,low,1880,1,0.6166
4,F,Ship,4,2,147,3.0,low,5174,0,0.5824
...,...,...,...,...,...,...,...,...,...,...
3995,A,Ship,4,1,204,4.0,low,1667,1,0.6825
3996,C,Ship,4,3,195,2.0,medium,3869,1,0.7952
3997,B,Flight,4,3,206,2.0,medium,4531,0,0.5287
3998,C,Ship,6,4,255,4.0,low,1869,1,0.5389


In [17]:
result = pd.read_csv('./smhrd_data/sampleSubmission.csv')
result['Reached.on.Time_Y.N'] = prediction['prediction_label']
result.to_csv('result_caret_yj.csv', index=False)

In [18]:
result = pd.read_csv('result_caret_yj.csv')
count = 0
for i in range(len(prediction['prediction_label'])):
    if result['Reached.on.Time_Y.N'][i]==prediction['prediction_label'][i]:
        count += 1
        
print(count)

4000


In [22]:
result = pd.read_csv('./result_gbm_yj.csv')
result

Unnamed: 0,ID,Reached.on.Time_Y.N
0,7000,1
1,7001,1
2,7002,0
3,7003,1
4,7004,0
...,...,...
3995,10995,1
3996,10996,1
3997,10997,0
3998,10998,1


In [23]:
result['Reached.on.Time_Y.N'].value_counts()

0    2246
1    1754
Name: Reached.on.Time_Y.N, dtype: int64