<a href="https://colab.research.google.com/github/yanghyeon408/Insurance/blob/master/Insurance_part2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 5절 Modeling

SMOTE를 이용해서 오버샘플링 한 후 RandomForest, Support Vector Machine, Extreme Gradient Boosting, Light GBM 등의 방법을 이용해서 모델을 생성하고 평가해 본다.

In [None]:
import pandas as pd

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
cust = pd.read_csv('drive/My Drive/Project/Insu/CUST_DATA_전처리완료.csv', encoding='utf-8')

##5.1 Train 과 Test(제출용 데이터) 분리

In [None]:
train = cust[cust['DIVIDED_SET']==1].reset_index(drop=True)
test = cust[cust['DIVIDED_SET']==2].reset_index(drop=True)

In [None]:
train.drop('DIVIDED_SET', axis=1, inplace=True)
test.drop('DIVIDED_SET', axis=1, inplace=True)

In [None]:
train_X = train.drop(['CUST_ID', 'SIU_CUST_YN'], axis=1)
train_y = train['SIU_CUST_YN']

In [None]:
test_X = test.drop(['CUST_ID', 'SIU_CUST_YN'], axis=1)

In [None]:
train['SIU_CUST_YN'].value_counts()

0.0    18801
1.0     1806
Name: SIU_CUST_YN, dtype: int64

In [None]:
train_X.shape, train_y.shape

((20607, 89), (20607,))

## 5.2 SMOTE를 활용한 오버샘플링. 
일반인인 경우가 훨씬 많기 때문에, SMOTE를 활용하여 일반인과 사기자를 1:1 비율로 샘플링한다

In [None]:
from imblearn.over_sampling import SMOTE
sm = SMOTE()
X_resampled, y_resampled = sm.fit_sample(train_X, list(train_y))



In [None]:
sum(y_resampled==1), sum(y_resampled==0)

(18801, 18801)

In [None]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X_resampled,
              y_resampled, test_size=0.3, shuffle=True, random_state=42)
X_train.shape, X_val.shape, y_train.shape, y_val.shape

((26321, 89), (11281, 89), (26321,), (11281,))

## 5.3 Random Forest

랜덤포레스트 분류기를 이용해서 모형을 만들고 평가한다

In [None]:
from sklearn.ensemble import RandomForestClassifier
rf_model = RandomForestClassifier(n_estimators=100, max_features=16,
                                  random_state=42)
rf_model.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=16,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=42, verbose=0,
                       warm_start=False)

In [None]:
y_pred = rf_model.predict(X_val)
y_pred

array([0., 1., 0., ..., 1., 0., 1.])

In [None]:
pd.crosstab(y_val, y_pred) #교차분류표

col_0,0.0,1.0
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,5549,116
1.0,46,5570


In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

         0.0       0.99      0.98      0.99      5665
         1.0       0.98      0.99      0.99      5616

    accuracy                           0.99     11281
   macro avg       0.99      0.99      0.99     11281
weighted avg       0.99      0.99      0.99     11281



In [None]:
from sklearn.model_selection import cross_val_score
scores = cross_val_score(rf_model, X_resampled, y_resampled, cv=10, scoring='f1_macro')
scores.mean()

0.9856312257687542

## 5.4 인공신경망

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
def model_fit_assessment(X, y, model):
    X_resampled, y_resampled = sm.fit_sample(X, y)
    X_train, X_val, y_train, y_val = train_test_split(X_resampled,
            y_resampled, test_size=0.3, shuffle=True, random_state=42)
    model.fit(X_train, y_train)
    pred = model.predict(X)
    print(classification_report(y, pred))

In [None]:
from sklearn.neural_network import MLPClassifier
mlp_model = MLPClassifier(hidden_layer_sizes=(30, 30, 20, 20))
model_fit_assessment(X_resampled, y_resampled, mlp_model)

              precision    recall  f1-score   support

         0.0       1.00      0.99      1.00     18801
         1.0       0.99      1.00      1.00     18801

    accuracy                           1.00     37602
   macro avg       1.00      1.00      1.00     37602
weighted avg       1.00      1.00      1.00     37602



## 5.5 SVM

In [None]:
from sklearn.svm import SVC
svm_clf = SVC(random_state=42)
svm_clf.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=42, shrinking=True, tol=0.001,
    verbose=False)

In [None]:
y_pred = rf_model.predict(X_val)
pd.crosstab(y_val, y_pred)

col_0,0.0,1.0
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,5549,116
1.0,46,5570


In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

         0.0       0.99      0.98      0.99      5665
         1.0       0.98      0.99      0.99      5616

    accuracy                           0.99     11281
   macro avg       0.99      0.99      0.99     11281
weighted avg       0.99      0.99      0.99     11281



## 5.6 XGBOOST

In [None]:
from xgboost import XGBClassifier
xgb_model = XGBClassifier(max_depth=10, learning_rate=0.01,
                          n_estimators=100)
xgb_model.fit(X_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.01, max_delta_step=0, max_depth=10,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [None]:
y_pred = xgb_model.predict(X_val)
pd.crosstab(y_val, y_pred)

col_0,0.0,1.0
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0.0,5494,171
1.0,97,5519


In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_val, y_pred))

              precision    recall  f1-score   support

         0.0       0.98      0.97      0.98      5665
         1.0       0.97      0.98      0.98      5616

    accuracy                           0.98     11281
   macro avg       0.98      0.98      0.98     11281
weighted avg       0.98      0.98      0.98     11281



## 5.7 LIGHTGBM

In [None]:
import numpy as np
X_train.columns

In [None]:
from lightgbm import LGBMClassifier
lgbm_model = LGBMClassifier(n_estimators=100)
lgbm_model.fit(X_train.to_numpy(), y_train)

# 6절 Ensemble

In [None]:
from sklearn.ensemble import VotingClassifier
voting_model = VotingClassifier(
    estimators = [('rf', rf_model), ('xgb', xgb_model)],
    voting='hard')
voting_model.fit(X_train.to_numpy(), y_train)

# 7절 결과 파일 생성

## 7.1 제출 파일

In [None]:
cust = pd.read_csv("drive/My Drive/Project/Insu/CUST_DATA_전처리완료.csv", encoding="utf-8-sig")
test = cust[cust['DIVIDED_SET']==2].reset_index(drop=True)

In [None]:
X_test = test.drop(["CUST_ID", "DIVIDED_SET", "SIU_CUST_YN"], axis=1)
X_test.shape

(1793, 89)

In [None]:
predict_answer = voting_model.predict(X_test.to_numpy())
len(predict_answer)

NotFittedError: ignored

In [None]:
import pandas as pd
import numpy as np
result = pd.DataFrame(data=np.c_[test.CUST_ID, predict_answer.astype(int)], 
                      columns=["CUST_ID", "사기자여부"])
result.head(10)

NameError: ignored