# Stacking

- 최적의 분류기를 찾기 위한 마지막 기법
- boosting, bagging, voting에 더 나아가서 결과 값을 응용하는 과정
- 다시 말하자면 여러 분류기를 이용하여 예측값을 만들고
    - 이를 새로운 인풋변수로 활용하여 최종 결과 도출
- self 버전과 sklearn의 라이브러리를 활용해보자

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import warnings
warnings.filterwarnings(action='ignore')

data = pd.read_csv('../1.clustering/titanic.csv')

target = data['Survived']
data = data.drop(['PassengerId', 'Survived'], axis=1)

def sex(a):
    if a == 'male':
        return 0
    else:
        return 1
    
def emb(a):
    if a == 'S':
        return 0
    elif a == 'Q':
        return 1
    else:
        return 2
    
data['Sex'] = data['Sex'].map(sex)
data['Embarked'] = data['Embarked'].map(emb)

train_X, test_X, train_y, test_y = train_test_split(data, target, test_size=0.3, random_state=2019, shuffle=True)

In [2]:
## self 버전 시작합니다.
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

knn = KNeighborsClassifier(n_neighbors=3).fit(train_X, train_y)
lr = LogisticRegression(random_state=0).fit(train_X, train_y)
tree = DecisionTreeClassifier(random_state=0).fit(train_X, train_y)

new_input = pd.DataFrame()
new_test = pd.DataFrame()

new_input['knn'] = knn.predict(train_X)
new_test['knn'] = knn.predict(test_X)
new_input['lr'] = lr.predict(train_X)
new_test['lr'] = lr.predict(test_X)
new_input['tree'] = tree.predict(train_X)
new_test['tree'] = tree.predict(test_X)


In [3]:
new_input

Unnamed: 0,knn,lr,tree
0,0,0,0
1,1,0,0
2,0,0,0
3,0,0,0
4,0,1,1
...,...,...,...
493,0,1,0
494,1,1,1
495,0,0,0
496,0,0,0


In [4]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(random_state=0, max_depth=6).fit(new_input, train_y)

In [5]:
print(accuracy_score(test_y, rf.predict(new_test)))

0.6869158878504673


In [6]:
print(accuracy_score(test_y, lr.predict(test_X)))
print(accuracy_score(test_y, knn.predict(test_X)))
print(accuracy_score(test_y, tree.predict(test_X)))

0.7990654205607477
0.6915887850467289
0.6869158878504673


In [7]:
rf.feature_importances_

array([0.1578856 , 0.18822809, 0.6538863 ])

In [8]:
## sklearn wrapper
from sklearn.ensemble import StackingClassifier

knn = KNeighborsClassifier(n_neighbors=3)
lr = LogisticRegression(random_state=0)
tree = DecisionTreeClassifier(random_state=0)
rf = RandomForestClassifier(random_state=0, max_depth=6)

estimators = [
    ('lr', lr),
    ('knn', knn),
    ('tree', tree)
]

stacking = StackingClassifier(estimators, rf, stack_method='predict').fit(train_X, train_y)

print(stacking.score(test_X, test_y))

0.7990654205607477


- 결과가 다르다. 왜일까
- 이유를 찾다보니 다음과 같은 문구를 발견
    -  To generalize and avoid over-fitting, the final_estimator is trained on out-samples using sklearn.model_selection.cross_val_predict internally.
    
- 그럼 이를 적용해보면 같을까?
    - 아래 결과를 비교해보면 거의 같다고 볼 수 있을 것 같다.

In [9]:
from sklearn.model_selection import cross_val_score

np.mean(cross_val_score(rf, new_test, test_y, cv=5))

0.7803986710963455

In [10]:
## predict_proba 예시
knn = KNeighborsClassifier(n_neighbors=3).fit(train_X, train_y)
lr = LogisticRegression(random_state=0).fit(train_X, train_y)
tree = DecisionTreeClassifier(random_state=0).fit(train_X, train_y)

new_input = pd.DataFrame()
new_test = pd.DataFrame()

new_input['knn'] = list(zip(*knn.predict_proba(train_X)))[0]
new_test['knn'] = list(zip(*knn.predict_proba(test_X)))[0]
new_input['lr'] = list(zip(*lr.predict_proba(train_X)))[0]
new_test['lr'] = list(zip(*lr.predict_proba(test_X)))[0]
new_input['tree'] = list(zip(*tree.predict_proba(train_X)))[0]
new_test['tree'] = list(zip(*tree.predict_proba(test_X)))[0]

rf = RandomForestClassifier(random_state=0, max_depth=6).fit(new_input, train_y)
rf.score(new_test, test_y)

0.6728971962616822

In [11]:
from sklearn.model_selection import cross_val_score

np.mean(cross_val_score(rf, new_test, test_y, cv=5))

0.7990033222591361