<img src='logo.png'><br>
* ref : https://www.kaggle.com/prachi13/customer-analytics

In [1]:
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import roc_auc_score
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings(action='ignore')

## data load

In [2]:
df = pd.read_csv("./Quiz4/shipping.csv")
print(df.shape)
df.info()
df.head()
df.columns

(10999, 12)
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10999 entries, 0 to 10998
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   ID                   10999 non-null  int64 
 1   Warehouse_block      10999 non-null  object
 2   Mode_of_Shipment     10999 non-null  object
 3   Customer_care_calls  10999 non-null  int64 
 4   Customer_rating      10999 non-null  int64 
 5   Cost_of_the_Product  10999 non-null  int64 
 6   Prior_purchases      10999 non-null  int64 
 7   Product_importance   10999 non-null  object
 8   Gender               10999 non-null  object
 9   Discount_offered     10999 non-null  int64 
 10  Weight_in_gms        10999 non-null  int64 
 11  Reached.on.Time_Y.N  10999 non-null  int64 
dtypes: int64(8), object(4)
memory usage: 1.0+ MB


Index(['ID', 'Warehouse_block', 'Mode_of_Shipment', 'Customer_care_calls',
       'Customer_rating', 'Cost_of_the_Product', 'Prior_purchases',
       'Product_importance', 'Gender', 'Discount_offered', 'Weight_in_gms',
       'Reached.on.Time_Y.N'],
      dtype='object')

In [3]:
df.head()

Unnamed: 0,ID,Warehouse_block,Mode_of_Shipment,Customer_care_calls,Customer_rating,Cost_of_the_Product,Prior_purchases,Product_importance,Gender,Discount_offered,Weight_in_gms,Reached.on.Time_Y.N
0,1,D,Flight,4,2,177,3,low,F,44,1233,1
1,2,F,Flight,4,5,216,2,low,M,59,3088,1
2,3,A,Flight,2,2,183,4,low,M,48,3374,1
3,4,B,Flight,3,3,176,4,medium,M,10,1177,1
4,5,C,Flight,2,2,184,3,medium,F,46,2484,1


## 결측처리

In [4]:
df.isna().sum()

ID                     0
Warehouse_block        0
Mode_of_Shipment       0
Customer_care_calls    0
Customer_rating        0
Cost_of_the_Product    0
Prior_purchases        0
Product_importance     0
Gender                 0
Discount_offered       0
Weight_in_gms          0
Reached.on.Time_Y.N    0
dtype: int64

## 인코딩

In [5]:
df.select_dtypes('object').columns

Index(['Warehouse_block', 'Mode_of_Shipment', 'Product_importance', 'Gender'], dtype='object')

In [6]:
obj_cols = df.select_dtypes('object').columns
for col in obj_cols:
    le = LabelEncoder()
    encoder_data = le.fit_transform(df[col])  #array-like
    df[col] = encoder_data
#     df[col+"_e"] = encoder
# df.drop(obj_cols, axis=1, inplace=True)


## tarin test 분리

In [7]:
y = df['Reached.on.Time_Y.N']
X = df.drop(['Reached.on.Time_Y.N'], axis=1)

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y , test_size=0.2, random_state=1414, shuffle=True, stratify=y)

In [9]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((8799, 11), (2200, 11), (8799,), (2200,))

## 스케일링

In [10]:
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaler = scaler.transform(X_train)
X_test_scaler = scaler.transform(X_test)

## 1차 점수 확인

In [11]:
model = RandomForestClassifier(random_state=1414)
model.fit(X_train_scaler, y_train)
proba = model.predict_proba(X_test_scaler)
auc = roc_auc_score(y_test, proba[:, 1])
print(auc)   #0.7506465891301206

0.7476977686494692


## 2차 점수 확인
* 모델변경

In [12]:
from sklearn.ensemble import BaggingClassifier 
from xgboost import  XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import StackingClassifier

In [13]:
models = [RandomForestClassifier(random_state=1414), 
          BaggingClassifier(),  XGBClassifier(),  LGBMClassifier(),  DecisionTreeClassifier()]
for model in models:
    model.fit(X_train_scaler, y_train)  
    proba = model.predict_proba(X_test_scaler)
    auc = roc_auc_score(y_test, proba[:, 1])
    print(model.__class__.__name__, auc)   #0.7506465891301206

RandomForestClassifier 0.7476977686494692
BaggingClassifier 0.7475831400675407
XGBClassifier 0.7349151791425783
LGBMClassifier 0.7443739690940736
DecisionTreeClassifier 0.6498466037740709


* 최적의 모델 우선 결정
*  RandomForestClassifier

In [14]:
model = RandomForestClassifier(random_state=1414)

### cross_val_score(model,X,y), GridSearchCV(model,X,y)  --> X, y 통째로 다 주기위해 train, test 합치기
* df
* X,y  
* X_train, X_test, y_train, y_test
* X_train_scaler, X_test_scaler, y_train, y_test   ----------> XX_scaler, yy


In [1]:
# array  + array 
XX_scaler = np.concatenate([X_train_scaler, X_test_scaler])
print(X_train_scaler.shape, X_test_scaler.shape, XX_scaler.shape)

# Series + Series
yy = pd.concat([y_train, y_test], axis=0)


NameError: name 'np' is not defined

## 3차 점수 확인 : cross_val_score

In [16]:
from sklearn.model_selection import cross_val_score
scores_arr = cross_val_score(model, XX_scaler, yy, scoring='roc_auc', cv=5)
print(scores_arr)
print(scores_arr.mean())


[0.72622444 0.75446858 0.72082574 0.76020208 0.74707582]
0.7417593332762049


## 4차 점수 확인 : GridSearchCV
* 하이퍼파라미터 튜닝을 통한 모델 성능 개선

In [18]:
model = RandomForestClassifier(random_state=1414)
myparam = {"n_estimators":[100]}     # model.get_params().keys()
gcv_model = GridSearchCV(model,param_grid=myparam, scoring='roc_auc', refit=True, cv=5)
gcv_model.fit(XX_scaler, yy)
print(gcv_model.best_score_)
print(gcv_model.best_params_)


0.7417593332762049
{'n_estimators': 100}


In [20]:
# from xgboost import DMatrix, cv
# dmatrix_df = DMatrix(data=X, label=y, enable_categorical=True)
# xgb_cv_res = cv(dtrain=dmatrix_df, 

#------------- 0.8140157799999999 --------------          
import xgboost as xgb
dmatrix_df = xgb.DMatrix(data=XX_scaler, label=yy, enable_categorical=True)
xgb_cv_res = xgb.cv(dtrain=dmatrix_df, 
               params={"n_estimators": 300, "objective":"binary:logistic"}, 
               nfold=5, 
               metrics="auc",        #--- 이진분류 점수 중 하나 
               as_pandas=True)
xgb_cv_res['train-auc-mean'].mean()  #------------ 이거 accuracy 점수 아님......


Parameters: { "n_estimators" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "n_estimators" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "n_estimators" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "n_estimators" } might not be used.

  This may not be accurate due to some parameters are only used in language bin

0.8140157799999999