In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score
from catboost import  CatBoostClassifier
from sklearn.preprocessing import  StandardScaler, MinMaxScaler
from sklearn.metrics import accuracy_score, f1_score,recall_score, precision_score
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from imblearn.under_sampling import OneSidedSelection, TomekLinks
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE, ADASYN, BorderlineSMOTE
from imblearn.combine import SMOTETomek, SMOTEENN
from lightgbm import LGBMClassifier
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import VotingClassifier
from pycaret.classification import *

In [2]:
# Load the training data and drop the columns that are not used as features.
train = pd.read_csv("./smhrd_data/Train.csv")
train = train.drop(columns=['ID', 'Gender', 'Discount_offered'])
# Missing values appear as '?' in the raw file; replace them with NA.
train = train.replace('?', pd.NA)

# The raw Warehouse_block header contains whitespace, so all column names are reset.
train.columns = ['Warehouse_block', 'Mode_of_Shipment', 'Customer_care_calls',
       'Customer_rating', 'Cost_of_the_Product', 'Prior_purchases',
       'Product_importance', 'Weight_in_gms', 'Reached.on.Time_Y.N']

# Remove outlier rows among the continuous variables (there are only a few of
# them, so imputing them was judged not worth the effort).
# .copy() makes the filtered result an independent frame, so the column
# assignments below do not operate on a slice of the original.
train = train[(train['Customer_rating']!=99) & (train['Cost_of_the_Product']!=9999)].copy()

# Strip the stray 'zk' found in some Mode_of_Shipment values; NA is passed through.
train['Mode_of_Shipment'] = train['Mode_of_Shipment'].apply(lambda x: x.replace('zk', '') if pd.notna(x) else x)
# Rows without a Product_importance value are dropped.
train = train[train['Product_importance'].notnull()].copy()

# Product_importance values that end in a doubled character lose the extra one.
# The length guard keeps one-character strings from raising IndexError.
train['Product_importance'] = train['Product_importance'].apply(
    lambda x: x[:-1] if len(x) > 1 and x[-1] == x[-2] else x
)

# Digit-only weights become int; anything else (e.g. NA) is left for later imputation.
train['Weight_in_gms'] = train['Weight_in_gms'].apply(lambda x: int(x) if str(x).isdigit() else x)
# Assign the filled column back instead of a chained `inplace=True` call, which
# acts on an intermediate object and is not guaranteed to update `train`.
train['Mode_of_Shipment'] = train['Mode_of_Shipment'].fillna(' Ship')

def fill_calls(row, lookup=None):
    """Return the row's Customer_care_calls, imputing it when it is missing.

    Parameters
    ----------
    row : mapping-like
        A record (e.g. the Series passed by ``DataFrame.apply(axis=1)``) with
        the keys 'Customer_care_calls', 'Customer_rating' and
        'Mode_of_Shipment'.
    lookup : pandas.DataFrame, optional
        Table indexed by (Customer_rating, Mode_of_Shipment) that holds the
        replacement value in a 'Customer_care_calls' column. When omitted, the
        notebook-level ``pt1`` table is used.

    Returns
    -------
    scalar
        The existing value when present, otherwise the value looked up for the
        row's (Customer_rating, Mode_of_Shipment) pair.

    Raises
    ------
    KeyError
        If the value is missing and the pair is not present in the lookup table.
    """
    calls = row['Customer_care_calls']
    # pd.isna also recognises pd.NA and None, which np.isnan cannot evaluate
    # inside an `if`.
    if not pd.isna(calls):
        return calls
    if lookup is None:
        lookup = pt1
    # Select the column explicitly so that a scalar is returned rather than a
    # one-element Series.
    return lookup.loc[(row['Customer_rating'], row['Mode_of_Shipment']), 'Customer_care_calls']

# Lookup table: most frequent Customer_care_calls for every
# (Customer_rating, Mode_of_Shipment) group; fill_calls reads it as `pt1`.
pt1 = (
    train[['Customer_care_calls', 'Customer_rating', 'Mode_of_Shipment']]
    .groupby(['Customer_rating', 'Mode_of_Shipment'])
    .agg(lambda x: x.mode().iloc[0])
)
train['Customer_care_calls'] = train.apply(fill_calls, axis = 1).astype('int64')

# Missing weights get the fixed value 3424 and everything is cast to int in one
# assigning pass. This replaces a chained `fillna(..., inplace=True)`, which
# acts on an intermediate object and is not guaranteed to update `train`.
# NOTE(review): 3424 is a hard-coded fill value; its origin is not shown here.
train['Weight_in_gms'] = train['Weight_in_gms'].apply(lambda x: 3424 if pd.isna(x) else int(x))

# Missing Prior_purchases values are replaced by the fixed value 3.
# NOTE(review): the choice of 3 is not explained in this notebook.
train['Prior_purchases'] = train['Prior_purchases'].apply(lambda x: 3 if pd.isnull(x) else x)
train.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 6897 entries, 0 to 6998
Data columns (total 9 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Warehouse_block      6897 non-null   object 
 1   Mode_of_Shipment     6897 non-null   object 
 2   Customer_care_calls  6897 non-null   int64  
 3   Customer_rating      6897 non-null   int64  
 4   Cost_of_the_Product  6897 non-null   int64  
 5   Prior_purchases      6897 non-null   float64
 6   Product_importance   6897 non-null   object 
 7   Weight_in_gms        6897 non-null   int64  
 8   Reached.on.Time_Y.N  6897 non-null   int64  
dtypes: float64(1), int64(5), object(3)
memory usage: 538.8+ KB


In [3]:
# Load the test data and drop the columns that are not used as features.
test = pd.read_csv("./smhrd_data/test.csv")

test = test.drop(columns=['ID', 'Gender', 'Discount_offered'])
# Missing values appear as '?' in the raw file; replace them with NA.
test = test.replace('?', pd.NA)

# The raw Warehouse_block header contains whitespace, so all column names are reset.
test.columns = ['Warehouse_block', 'Mode_of_Shipment', 'Customer_care_calls',
       'Customer_rating', 'Cost_of_the_Product', 'Prior_purchases',
       'Product_importance', 'Weight_in_gms']

# The outlier-row filter used for the training data is deliberately disabled
# here, so no test rows are removed.
# test = test[(test['Customer_rating']!=99) & (test['Cost_of_the_Product']!=9999)]

# Strip the stray 'zk' found in some Mode_of_Shipment values; NA is passed through.
test['Mode_of_Shipment'] = test['Mode_of_Shipment'].apply(lambda x: x.replace('zk', '') if pd.notna(x) else x)
# Missing Product_importance is filled with 'low'. The result is assigned back
# rather than using a chained `inplace=True` call: if that call did not update
# `test`, the string indexing below would fail on NA.
test['Product_importance'] = test['Product_importance'].fillna('low')

# Product_importance values that end in a doubled character lose the extra one.
# The length guard keeps one-character strings from raising IndexError.
test['Product_importance'] = test['Product_importance'].apply(
    lambda x: x[:-1] if len(x) > 1 and x[-1] == x[-2] else x
)

# Digit-only weights become int; anything else (e.g. NA) is left for later imputation.
test['Weight_in_gms'] = test['Weight_in_gms'].apply(lambda x: int(x) if str(x).isdigit() else x)
test['Mode_of_Shipment'] = test['Mode_of_Shipment'].fillna(' Ship')

def fill_calls(row, lookup=None):
    """Return the row's Customer_care_calls, imputing it when it is missing.

    Parameters
    ----------
    row : mapping-like
        A record (e.g. the Series passed by ``DataFrame.apply(axis=1)``) with
        the keys 'Customer_care_calls', 'Customer_rating' and
        'Mode_of_Shipment'.
    lookup : pandas.DataFrame, optional
        Table indexed by (Customer_rating, Mode_of_Shipment) that holds the
        replacement value in a 'Customer_care_calls' column. When omitted, the
        notebook-level ``pt1`` table is used.

    Returns
    -------
    scalar
        The existing value when present, otherwise the value looked up for the
        row's (Customer_rating, Mode_of_Shipment) pair.

    Raises
    ------
    KeyError
        If the value is missing and the pair is not present in the lookup table.
    """
    calls = row['Customer_care_calls']
    # pd.isna also recognises pd.NA and None, which np.isnan cannot evaluate
    # inside an `if`.
    if not pd.isna(calls):
        return calls
    if lookup is None:
        lookup = pt1
    # Select the column explicitly so that a scalar is returned rather than a
    # one-element Series.
    return lookup.loc[(row['Customer_rating'], row['Mode_of_Shipment']), 'Customer_care_calls']

# Lookup table built from the TRAINING data: most frequent Customer_care_calls
# for every (Customer_rating, Mode_of_Shipment) group. fill_calls reads it as
# `pt1`, so missing test values are imputed from training statistics.
pt1 = (
    train[['Customer_care_calls', 'Customer_rating', 'Mode_of_Shipment']]
    .groupby(['Customer_rating', 'Mode_of_Shipment'])
    .agg(lambda x: x.mode().iloc[0])
)
test['Customer_care_calls'] = test.apply(fill_calls, axis = 1).astype('int64')

# Missing weights get the fixed value 3424 and everything is cast to int in one
# assigning pass. This replaces a chained `fillna(..., inplace=True)`, which
# acts on an intermediate object and is not guaranteed to update `test`.
# NOTE(review): 3424 is a hard-coded fill value; its origin is not shown here.
test['Weight_in_gms'] = test['Weight_in_gms'].apply(lambda x: 3424 if pd.isna(x) else int(x))

# Missing Prior_purchases values are replaced by the fixed value 3.
# NOTE(review): the choice of 3 is not explained in this notebook.
test['Prior_purchases'] = test['Prior_purchases'].apply(lambda x: 3 if pd.isnull(x) else x)
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4000 entries, 0 to 3999
Data columns (total 8 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Warehouse_block      4000 non-null   object 
 1   Mode_of_Shipment     4000 non-null   object 
 2   Customer_care_calls  4000 non-null   int64  
 3   Customer_rating      4000 non-null   int64  
 4   Cost_of_the_Product  4000 non-null   int64  
 5   Prior_purchases      4000 non-null   float64
 6   Product_importance   4000 non-null   object 
 7   Weight_in_gms        4000 non-null   int64  
dtypes: float64(1), int64(4), object(3)
memory usage: 250.1+ KB


In [4]:
# Features are every column except the last; the last column is the label.
# NOTE: `input` shadows the Python builtin of the same name; the name is kept
# because later cells refer to it.
input, target = train.iloc[:, :-1], train.iloc[:, -1]

### 0.68892 GBM 

In [5]:
# Hold-out evaluation of the gradient boosting model on a 70/30 split.
object_columns = train.select_dtypes('object').columns
x_train, x_test, y_train, y_test = train_test_split(input, target, test_size=0.3, random_state=42)

# One-hot encode the categorical columns. The two splits are encoded
# separately, so the hold-out frame is re-indexed to the training columns:
# a category missing from the hold-out split becomes an all-zero column and a
# category unseen in training is dropped. Without this, the scaler and model
# could receive differently shaped or ordered inputs.
x_train = pd.get_dummies(x_train, columns=list(object_columns))
x_test = pd.get_dummies(x_test, columns=list(object_columns))
x_test = x_test.reindex(columns=x_train.columns, fill_value=0)

# The scaler is fitted on the training split only and then applied to the hold-out split.
ss = MinMaxScaler()
x_train_ss = ss.fit_transform(x_train)
x_test_ss = ss.transform(x_test)
gb = GradientBoostingClassifier(learning_rate=0.01, max_depth=4, n_estimators=500, subsample=0.7, random_state=42)
gb.fit(x_train_ss, y_train)
pred = gb.predict(x_test_ss)
print(accuracy_score(y_test, pred))
print(f1_score(y_test, pred))
print(recall_score(y_test, pred))
print(precision_score(y_test, pred))
# confusion_matrix comes from the notebook's import cell; the duplicate
# in-cell import was removed.
confusion_matrix(y_test, pred)

0.6710144927536232
0.6792275082430522
0.5823909531502424
0.8146892655367232


array([[668, 164],
       [517, 721]], dtype=int64)

In [19]:
# Fit the gradient boosting model on the full training data and predict the test set.
object_columns = train.select_dtypes('object').columns

# One-hot encode only the categorical columns that are still present, so the
# cell can be re-run (or run after another cell already encoded the frames)
# without raising KeyError on a column that no longer exists.
train_pending = [column for column in object_columns if column in input.columns]
if train_pending:
    input = pd.get_dummies(input, columns=train_pending)
test_pending = [column for column in object_columns if column in test.columns]
if test_pending:
    test = pd.get_dummies(test, columns=test_pending)

# Align the test columns to the training columns: a category absent from the
# test data becomes an all-zero column and a category unseen in training is
# dropped, so the scaler and model always see the layout they were fitted on.
test = test.reindex(columns=input.columns, fill_value=0)

# The scaler is fitted on the training features only and then applied to the test features.
ss = MinMaxScaler()
input_ss = ss.fit_transform(input)
test_ss = ss.transform(test)

gb = GradientBoostingClassifier(learning_rate=0.01, max_depth=4, n_estimators=500, subsample=0.7, random_state=42)
gb.fit(input_ss, target)
pred = gb.predict(test_ss)

In [7]:
# Use the sample submission as a template, overwrite its label column with the
# current predictions, and save the result without the index.
result = pd.read_csv('./smhrd_data/sampleSubmission.csv').assign(
    **{'Reached.on.Time_Y.N': pred}
)
result.to_csv('result_gbm_yj.csv', index=False)

In [None]:
# Sanity check: reload the saved submission and count the rows whose stored
# label equals the in-memory prediction at the same position.
result = pd.read_csv('result_gbm_yj.csv')
count = sum(
    1
    for row_idx in range(len(pred))
    if result['Reached.on.Time_Y.N'][row_idx] == pred[row_idx]
)

print(count)

## 여기 아래부터는 recall-precision 관련 테스트용 모델이므로 신경 쓰지 않아도 됩니다.

In [12]:
# Experimental gradient boosting variant fitted on the full training data.
object_columns = train.select_dtypes('object').columns

# One-hot encode only the categorical columns that are still present. An
# earlier cell may already have encoded `input` and `test`; without this guard
# a top-to-bottom run would raise KeyError on the missing original columns.
train_pending = [column for column in object_columns if column in input.columns]
if train_pending:
    input = pd.get_dummies(input, columns=train_pending)
test_pending = [column for column in object_columns if column in test.columns]
if test_pending:
    test = pd.get_dummies(test, columns=test_pending)

# Align the test columns to the training columns: a category absent from the
# test data becomes an all-zero column and a category unseen in training is
# dropped, so the scaler and model always see the layout they were fitted on.
test = test.reindex(columns=input.columns, fill_value=0)

# The scaler is fitted on the training features only and then applied to the test features.
ss = MinMaxScaler()
input_ss = ss.fit_transform(input)
test_ss = ss.transform(test)

# Relative to the configuration that was previously kept here as commented-out
# code (min_samples_leaf=1, min_samples_split=2, n_estimators=100), this
# variant uses min_samples_leaf=5, min_samples_split=5 and n_estimators=50;
# every other argument is the same.
gb = GradientBoostingClassifier(ccp_alpha=0.0, criterion='friedman_mse', init=None,
                           learning_rate=0.1, loss='log_loss', max_depth=3,
                           max_features=None, max_leaf_nodes=None,
                           min_impurity_decrease=0.0, min_samples_leaf=5,
                           min_samples_split=5, min_weight_fraction_leaf=0.0,
                           n_estimators=50, n_iter_no_change=None,
                           random_state=42, subsample=1.0, tol=0.0001,
                           validation_fraction=0.1, verbose=0,
                           warm_start=False)
gb.fit(input_ss, target)
pred = gb.predict(test_ss)

In [13]:
# Use the sample submission as a template, overwrite its label column with the
# current predictions, and save the result without the index.
result = pd.read_csv('./smhrd_data/sampleSubmission.csv').assign(
    **{'Reached.on.Time_Y.N': pred}
)
result.to_csv('result_gbm_yj02.csv', index=False)

In [14]:
# Sanity check: reload the saved submission and count the rows whose stored
# label equals the in-memory prediction at the same position.
result = pd.read_csv('result_gbm_yj02.csv')
count = sum(
    1
    for row_idx in range(len(pred))
    if result['Reached.on.Time_Y.N'][row_idx] == pred[row_idx]
)

print(count)

4000
