In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from catboost import  CatBoostClassifier
from sklearn.preprocessing import  StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, f1_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from imblearn.under_sampling import OneSidedSelection, TomekLinks
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from imblearn.combine import SMOTETomek, SMOTEENN
from lightgbm import LGBMClassifier
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import VotingClassifier
from pycaret.classification import *
from sklearn.linear_model import LinearRegression

In [5]:
train = pd.read_csv("./smhrd_data/Train.csv")

temp_dis = train['Discount_offered']
train.drop(['ID', 'Gender'], axis=1, inplace=True)
# 결측치의 형태가 ?인 데이터를 NA로 대체
train = train.replace('?', pd.NA)

# Warehouse_block 컬럼 이름에 공백들어가 있어서 재설정
train.columns = ['Warehouse_block', 'Mode_of_Shipment', 'Customer_care_calls',
       'Customer_rating', 'Cost_of_the_Product', 'Prior_purchases',
       'Product_importance', 'Discount_offered', 'Weight_in_gms', 'Reached.on.Time_Y.N']

# 연속형 변수중에서 이상치 제거 (개수가 많지 않을뿐더러 5개를 채우려고 하는 것은 비효율적이라고 생각하기 때문)
train = train[(train['Customer_rating']!=99) & (train['Cost_of_the_Product']!=9999)]

# Mode_of_Shipment에서 끝에 zk가 붙은 이상치 수정
train['Mode_of_Shipment'] = train['Mode_of_Shipment'].apply(lambda x: x.replace('zk', '') if pd.notna(x) else x)
train = train[train['Product_importance'].notnull()]

# Product_importance에서 끝에 m이 하나씩 더붙은 이상치 수정
train['Product_importance'] = train['Product_importance'].apply(lambda x: x[:-1] if x[-1]==x[-2] else x)

train['Weight_in_gms'] = train['Weight_in_gms'].apply(lambda x: int(x) if str(x).isdigit() else x)
train['Mode_of_Shipment'].fillna(' Ship', inplace=True)

def fill_calls(row) :
    if np.isnan(row['Customer_care_calls']):
        return pt1.loc[(row['Customer_rating'], row['Mode_of_Shipment'])]
    else:
        return row['Customer_care_calls']

pt1 = train[['Customer_care_calls', 'Customer_rating', 'Mode_of_Shipment']].groupby(['Customer_rating', 'Mode_of_Shipment']).agg(lambda x: x.mode().iloc[0])
train['Customer_care_calls'] = train.apply(fill_calls, axis = 1).astype('int64')

train['Weight_in_gms'].fillna(3424, inplace=True)

train['Weight_in_gms'] = train['Weight_in_gms'].apply(lambda x:int(x))

train['Prior_purchases'] = train['Prior_purchases'].apply(lambda x: 3 if pd.isnull(x) else x)
model = LinearRegression()
def stochastic_regression_imputation(df, target_column, feature_column):
    global model
    # 결측치와 비결측치 데이터 분리
    train_data = df.dropna(subset=[target_column])
    train_data = train_data[train_data[target_column] >= 0]
    test_data = df[(df[target_column].isnull()) | (df[target_column] < 0)]
    
    # 모델 학습
    model.fit(train_data[feature_column], train_data[target_column])
    
    # 예측값 생성
    predictions = model.predict(test_data[feature_column])

    # 예측값으로 결측치 및 0 미만 값 대체
    df.loc[(df[target_column].isnull()) | (df[target_column] < 0), target_column] = predictions

# Stochastic regression imputation 실행
stochastic_regression_imputation(train, 'Discount_offered', ['Weight_in_gms', 'Cost_of_the_Product'])
train['Discount_offered'] = train['Discount_offered'].apply(lambda x : train['Discount_offered'].mean() if x < 0 else x)

train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 6897 entries, 0 to 6998
Data columns (total 10 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Warehouse_block      6897 non-null   object 
 1   Mode_of_Shipment     6897 non-null   object 
 2   Customer_care_calls  6897 non-null   int64  
 3   Customer_rating      6897 non-null   int64  
 4   Cost_of_the_Product  6897 non-null   int64  
 5   Prior_purchases      6897 non-null   float64
 6   Product_importance   6897 non-null   object 
 7   Discount_offered     6897 non-null   float64
 8   Weight_in_gms        6897 non-null   int64  
 9   Reached.on.Time_Y.N  6897 non-null   int64  
dtypes: float64(2), int64(5), object(3)
memory usage: 592.7+ KB


In [6]:
test = pd.read_csv('./smhrd_data/test.csv')
test.drop(['ID', 'Gender'], axis=1, inplace=True)
# 결측치의 형태가 ?인 데이터를 NA로 대체
test = test.replace('?', pd.NA)

# Warehouse_block 컬럼 이름에 공백들어가 있어서 재설정
test.columns = ['Warehouse_block', 'Mode_of_Shipment', 'Customer_care_calls',
       'Customer_rating', 'Cost_of_the_Product', 'Prior_purchases',
       'Product_importance', 'Discount_offered', 'Weight_in_gms']

# 연속형 변수중에서 이상치 제거 (개수가 많지 않을뿐더러 5개를 채우려고 하는 것은 비효율적이라고 생각하기 때문)
# test = test[(test['Customer_rating']!=99) & (test['Cost_of_the_Product']!=9999)]

# Mode_of_Shipment에서 끝에 zk가 붙은 이상치 수정
test['Mode_of_Shipment'] = test['Mode_of_Shipment'].apply(lambda x: x.replace('zk', '') if pd.notna(x) else x)
test['Product_importance'].fillna('low', inplace=True)

# Product_importance에서 끝에 m이 하나씩 더붙은 이상치 수정
test['Product_importance'] = test['Product_importance'].apply(lambda x: x[:-1] if x[-1]==x[-2] else x)

test['Weight_in_gms'] = test['Weight_in_gms'].apply(lambda x: int(x) if str(x).isdigit() else x)
test['Mode_of_Shipment'].fillna(' Ship', inplace=True)

def fill_calls(row) :
    if np.isnan(row['Customer_care_calls']):
        return pt1.loc[(row['Customer_rating'], row['Mode_of_Shipment'])]
    else:
        return row['Customer_care_calls']

pt1 = train[['Customer_care_calls', 'Customer_rating', 'Mode_of_Shipment']].groupby(['Customer_rating', 'Mode_of_Shipment']).agg(lambda x: x.mode().iloc[0])
test['Customer_care_calls'] = test.apply(fill_calls, axis = 1).astype('int64')

test['Weight_in_gms'].fillna(3424, inplace=True)

test['Weight_in_gms'] = test['Weight_in_gms'].apply(lambda x:int(x))

test['Prior_purchases'] = test['Prior_purchases'].apply(lambda x: 3 if pd.isnull(x) else x)
def stochastic_regression_imputation(df, target_column, feature_column):
    global model
    test_data = df[(df[target_column].isnull()) | (df[target_column] < 0)]
    print(len(test_data))
    # 예측값 생성
    predictions = model.predict(test_data[feature_column])
    print(len(predictions))
    # 예측값으로 결측치 및 0 미만 값 대체
    df.loc[(df[target_column].isnull()) | (df[target_column] < 0), target_column] = predictions

# Stochastic regression imputation 실행
stochastic_regression_imputation(test, 'Discount_offered', ['Weight_in_gms', 'Cost_of_the_Product'])
test['Discount_offered'] = test['Discount_offered'].apply(lambda x : train['Discount_offered'].mean() if x < 0 else x)

test.info()

1915
1915
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Warehouse_block      4000 non-null   object 
 1   Mode_of_Shipment     4000 non-null   object 
 2   Customer_care_calls  4000 non-null   int64  
 3   Customer_rating      4000 non-null   int64  
 4   Cost_of_the_Product  4000 non-null   int64  
 5   Prior_purchases      4000 non-null   float64
 6   Product_importance   4000 non-null   object 
 7   Discount_offered     4000 non-null   float64
 8   Weight_in_gms        4000 non-null   int64  
dtypes: float64(2), int64(4), object(3)
memory usage: 281.4+ KB


In [7]:
input = train.iloc[:,:-1]
target = train.iloc[:,-1]

In [11]:
input.columns

Index(['Warehouse_block', 'Mode_of_Shipment', 'Customer_care_calls',
       'Customer_rating', 'Cost_of_the_Product', 'Prior_purchases',
       'Product_importance', 'Discount_offered', 'Weight_in_gms'],
      dtype='object')

In [15]:
exp_clf = setup(data = input, target = target, session_id=123, normalize_method='minmax',  fix_imbalance = True, 
                categorical_features = ['Warehouse_block', 'Mode_of_Shipment', 'Product_importance', 'Customer_rating'],)
# 65.61
best_model = compare_models()

Unnamed: 0,Description,Value
0,Session id,123
1,Target,Reached.on.Time_Y.N
2,Target type,Binary
3,Original data shape,"(6897, 10)"
4,Transformed data shape,"(7810, 22)"
5,Transformed train set shape,"(5740, 22)"
6,Transformed test set shape,"(2070, 22)"
7,Numeric features,5
8,Categorical features,4
9,Preprocess,True


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
ada,Ada Boost Classifier,0.6621,0.7327,0.6098,0.7752,0.682,0.3321,0.3437,0.606
gbc,Gradient Boosting Classifier,0.6604,0.7374,0.5927,0.7843,0.6748,0.3335,0.3483,0.685
catboost,CatBoost Classifier,0.6553,0.7321,0.6334,0.749,0.6862,0.3095,0.3152,1.535
rf,Random Forest Classifier,0.6495,0.7259,0.6711,0.7206,0.6949,0.2842,0.2854,0.645
lightgbm,Light Gradient Boosting Machine,0.6491,0.7293,0.6369,0.7374,0.6833,0.2944,0.2986,0.723
xgboost,Extreme Gradient Boosting,0.6449,0.727,0.6666,0.7168,0.6906,0.2752,0.2764,0.75
dt,Decision Tree Classifier,0.6381,0.6251,0.6937,0.6969,0.6951,0.2498,0.25,0.562
knn,K Neighbors Classifier,0.6372,0.705,0.6059,0.7378,0.665,0.2775,0.2843,0.57
lda,Linear Discriminant Analysis,0.6354,0.7033,0.5526,0.7696,0.6431,0.2903,0.3073,0.568
et,Extra Trees Classifier,0.635,0.7094,0.6927,0.6935,0.6929,0.2429,0.2431,0.639


In [16]:
# 방법 1
ada = create_model('ada')
tuned_ada = tune_model(ada, search_library = 'optuna')
print('hu')
gbc = create_model('gbc')
tuned_gbc = tune_model(gbc, search_library = 'optuna')
print('hu')
blender_2 = blend_models(estimator_list = [tuned_catboost, tuned_gbc])

final_model = finalize_model(blender_2)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6874,0.7618,0.6028,0.8238,0.6962,0.3897,0.4101
1,0.6812,0.7589,0.6272,0.793,0.7004,0.3695,0.3811
2,0.6315,0.6917,0.5819,0.7422,0.6523,0.2723,0.2815
3,0.6522,0.711,0.6307,0.7449,0.683,0.3035,0.3087
4,0.6273,0.6898,0.547,0.7585,0.6356,0.2742,0.2897
5,0.648,0.7072,0.5923,0.7623,0.6667,0.3061,0.3171
6,0.6708,0.7444,0.6272,0.7759,0.6936,0.3465,0.3557
7,0.6867,0.7595,0.6098,0.8178,0.6986,0.3866,0.4048
8,0.6452,0.7203,0.5923,0.7589,0.6654,0.2999,0.3104
9,0.6577,0.7205,0.5923,0.7798,0.6733,0.3276,0.3414


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6874,0.7351,0.6237,0.8063,0.7033,0.3841,0.3984
1,0.6894,0.7241,0.6551,0.7866,0.7148,0.3801,0.3878
2,0.6253,0.6692,0.6028,0.7208,0.6565,0.2514,0.2563
3,0.6563,0.7044,0.6202,0.7574,0.682,0.3161,0.3236
4,0.6439,0.687,0.5993,0.7511,0.6667,0.2947,0.3034
5,0.6708,0.7169,0.6202,0.7807,0.6913,0.3485,0.3591
6,0.6894,0.7314,0.6585,0.7842,0.7159,0.3791,0.3862
7,0.7012,0.7397,0.669,0.7967,0.7273,0.4025,0.41
8,0.639,0.6762,0.6167,0.7344,0.6705,0.278,0.2832
9,0.6535,0.6984,0.5993,0.7679,0.6732,0.3163,0.3273


[I 2023-08-10 10:14:34,085] Searching the best hyperparameters using 4827 samples...
[I 2023-08-10 10:16:27,465] Finished hyperparemeter search!


hu


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6874,0.7892,0.5714,0.8542,0.6848,0.398,0.43
1,0.6749,0.7536,0.6132,0.7928,0.6916,0.3596,0.373
2,0.6253,0.6919,0.5784,0.7345,0.6472,0.2595,0.268
3,0.6667,0.7456,0.6063,0.7838,0.6837,0.3433,0.3561
4,0.6418,0.7212,0.5679,0.7689,0.6533,0.2998,0.3146
5,0.6708,0.7194,0.5958,0.7991,0.6826,0.3554,0.3721
6,0.6687,0.7498,0.6132,0.7822,0.6875,0.3459,0.3576
7,0.6722,0.7493,0.5784,0.8177,0.6776,0.3636,0.3863
8,0.6577,0.7199,0.5784,0.7905,0.668,0.3318,0.3492
9,0.6701,0.7402,0.5784,0.8137,0.6762,0.359,0.381


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6749,0.8068,0.5157,0.8916,0.6534,0.3861,0.4382
1,0.6915,0.7645,0.554,0.8833,0.6809,0.4113,0.4538
2,0.6563,0.7018,0.5226,0.838,0.6438,0.3446,0.381
3,0.6749,0.7587,0.5331,0.8693,0.6609,0.3815,0.4242
4,0.6522,0.722,0.4948,0.8606,0.6283,0.3435,0.3908
5,0.6625,0.7093,0.4948,0.8875,0.6353,0.3654,0.4204
6,0.6874,0.743,0.561,0.8656,0.6808,0.4007,0.4374
7,0.6971,0.7441,0.5296,0.9325,0.6756,0.4294,0.4909
8,0.6494,0.716,0.5052,0.843,0.6318,0.3351,0.3758
9,0.6763,0.7486,0.5296,0.8786,0.6609,0.3858,0.4317


[I 2023-08-10 10:16:52,750] Searching the best hyperparameters using 4827 samples...
[I 2023-08-10 10:18:52,552] Finished hyperparemeter search!


hu


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.6812,0.8076,0.5226,0.8982,0.6608,0.3973,0.4501
1,0.6894,0.7687,0.5575,0.8743,0.6809,0.406,0.4455
2,0.6605,0.6977,0.5052,0.8683,0.6388,0.3582,0.4057
3,0.6812,0.7506,0.5401,0.8757,0.6681,0.3929,0.436
4,0.646,0.7186,0.4843,0.858,0.6192,0.3333,0.3817
5,0.6605,0.708,0.4983,0.8773,0.6356,0.3601,0.4115
6,0.677,0.747,0.5401,0.8659,0.6652,0.3841,0.4246
7,0.6846,0.7545,0.5226,0.9091,0.6637,0.4051,0.4611
8,0.6556,0.7114,0.5017,0.8623,0.6344,0.3493,0.3959
9,0.6784,0.7425,0.5192,0.8976,0.6578,0.3929,0.4462


In [20]:
result = pd.read_csv('./smhrd_data/sampleSubmission.csv')
result['Reached.on.Time_Y.N'] = prediction['prediction_label']
result.to_csv('result_pycaret06.csv', index=False)

In [23]:
result = pd.read_csv('result_pycaret.csv')
count = 0
for i in range(len(prediction['prediction_label'])):
    if result['Reached.on.Time_Y.N'][i]==prediction['prediction_label'][i]:
        count += 1
        
print(count)

3846
