# 추가_Ensemble

## 1.환경설정

### (1) 라이브러리 로딩

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, VotingClassifier
from xgboost import XGBClassifier

from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.metrics import *

### (2) 데이터 로드

In [None]:
# Mobile customer churn data, read directly from the remote CSV.
path = "https://raw.githubusercontent.com/DA4BAM/dataset/master/mobile_cust_churn.csv"

# Load, discard the columns that are not used below, and shorten the long
# column names in a single chain.
data = (
    pd.read_csv(path)
    .drop(columns=['id', 'REPORTED_USAGE_LEVEL', 'OVER_15MINS_CALLS_PER_MONTH'])
    .rename(columns={
        'HANDSET_PRICE': 'H_PRICE',
        'AVERAGE_CALL_DURATION': 'DURATION',
        'REPORTED_SATISFACTION': 'SATISFACTION',
        'CONSIDERING_CHANGE_OF_PLAN': 'CHANGE',
    })
)
data.head()

## 2.모델링

### (1) 데이터준비

In [None]:
# Split 1: separate the label column from the feature columns.
target = 'CHURN'
y = data[target]
x = data.drop(columns=target)

# One-hot encode the categorical features (first level of each is dropped).
dumm_cols = ['SATISFACTION', 'CHANGE']
x = pd.get_dummies(x, columns=dumm_cols, drop_first=True)

# Split 2: hold out 30% of the rows for validation, with a fixed seed.
x_train, x_val, y_train, y_val = train_test_split(
    x, y, test_size=0.3, random_state=100
)

# Scaling: fit the min-max scaler on the training rows only, then apply the
# same fitted scaler to both splits.
scaler = MinMaxScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_val = scaler.transform(x_val)

### (2) 기본모델 준비

In [None]:
# Named base learners shared by the voting and stacking ensembles below.
# NOTE(review): no random_state is passed to any of these constructors, so
# results may differ from run to run.
estimators = [
    ('lr', LogisticRegression()),
    ('dt', DecisionTreeClassifier(max_depth=5)),
    ('knn', KNeighborsClassifier(n_neighbors=5, metric='manhattan')),
    ('rf', RandomForestClassifier()),
    ('xgb', XGBClassifier()),
]

# Keep the individual handles m1..m5 bound to the same objects as in the list.
m1, m2, m3, m4, m5 = (base_model for _, base_model in estimators)

### (3) 앙상블1 : Voting

In [None]:
# Declare the hard-voting and soft-voting ensembles over the same base learners.
hv_mode, sv_mode = (
    VotingClassifier(estimators=estimators, voting=scheme)
    for scheme in ('hard', 'soft')
)

In [None]:
# Train the hard-voting ensemble on the training split.
hv_mode.fit(x_train, y_train)

In [None]:
# Train the soft-voting ensemble on the same training split.
sv_mode.fit(x_train, y_train)

In [None]:
# Predict on the validation split: pred1 from hard voting, pred2 from soft voting.
pred1 = hv_mode.predict(x_val)
pred2 = sv_mode.predict(x_val)

In [None]:
# Evaluate the hard-voting predictions: accuracy, then the classification report.
print(accuracy_score(y_val, pred1))
print(classification_report(y_val, pred1))

In [None]:
# Evaluate the soft-voting predictions: accuracy, then the classification report.
print(accuracy_score(y_val, pred2))
print(classification_report(y_val, pred2))

In [None]:
# One of the fitted members of the ensemble can also be pulled out and used
# on its own.
model = hv_mode.named_estimators_['dt']
pred = model.predict(x_val)
# VotingClassifier label-encodes y before fitting its members, so the member
# predicts integer codes. Index into the ensemble's classes_ to recover the
# original labels instead of hard-coding the 'LEAVE'/'STAY' mapping.
pred = hv_mode.classes_[pred]
print(accuracy_score(y_val, pred))

### (4) 앙상블2 : Stacking

In [None]:
# Stacking ensemble: the shared base learners feed a logistic-regression
# final estimator.
model_stack = StackingClassifier(
    estimators=estimators,
    final_estimator=LogisticRegression(),
)

In [None]:
# Train the stacking ensemble on the training split.
model_stack.fit(x_train, y_train)

In [None]:
# Predict on the validation split with the stacking ensemble.
pred = model_stack.predict(x_val)

In [None]:
# Evaluate the stacking predictions: accuracy, then the classification report.
print(accuracy_score(y_val, pred))
print(classification_report(y_val, pred))

In [None]:
# One of the fitted base learners of the stacking ensemble can also be pulled
# out and used on its own.
model = model_stack.named_estimators_['dt']
pred = model.predict(x_val)
# StackingClassifier label-encodes y before fitting its base learners, so the
# learner predicts integer codes. Index into the ensemble's classes_ to
# recover the original labels instead of hard-coding the 'LEAVE'/'STAY' mapping.
pred = model_stack.classes_[pred]
print(accuracy_score(y_val, pred))