### 11) 모형결합(다수결)

In [1]:
# 모형 결합(model combining)
# 앙상블 방법론(ensemble methods)

# 예측 성능을 향상시키기 위하여 하나의 모형이 아닌 복수의 모형을 결합하는 방법
# 단일 모형을 사용하는 것보다 계산량이 증가하지만 성능이 향상될 수 있음(과적합 방지 등)

# 취합(aggregation)과 부스팅(boosting)
# 취합: 사용할 모형의 집합을 처음부터 고정
#     다수결(Majority Voting), 배깅(Bagging), 랜덤포레스트(Random Forest)

# 부스팅: 사용할 모형을 점진적으로 늘려가는 방법
#     에이다부스트(AdaBoost), 그레디언트 부스트(Gradient Boost)  

#다수결 방법
# Hard Voting: 단순 투표, 가장 많이 나온 결과를 채택(디폴트)
# Soft Voting: 가중치 투표, 개별 모형의 조건부 확률들을 합한 것들 중 가장 큰 것을 채택

In [2]:
import pandas as pd

# Load the original credit-card transaction data from its CSV file.
df = pd.read_csv(filepath_or_buffer="c:/workspace3/data/creditcard/creditcard.csv")

# Preview the first five rows as the cell output.
df.head(5)

Unnamed: 0,Time,V1,V2,V3,V4,V5,V6,V7,V8,V9,...,V21,V22,V23,V24,V25,V26,V27,V28,Amount,Class
0,0.0,-1.359807,-0.072781,2.536347,1.378155,-0.338321,0.462388,0.239599,0.098698,0.363787,...,-0.018307,0.277838,-0.110474,0.066928,0.128539,-0.189115,0.133558,-0.021053,149.62,0
1,0.0,1.191857,0.266151,0.16648,0.448154,0.060018,-0.082361,-0.078803,0.085102,-0.255425,...,-0.225775,-0.638672,0.101288,-0.339846,0.16717,0.125895,-0.008983,0.014724,2.69,0
2,1.0,-1.358354,-1.340163,1.773209,0.37978,-0.503198,1.800499,0.791461,0.247676,-1.514654,...,0.247998,0.771679,0.909412,-0.689281,-0.327642,-0.139097,-0.055353,-0.059752,378.66,0
3,1.0,-0.966272,-0.185226,1.792993,-0.863291,-0.010309,1.247203,0.237609,0.377436,-1.387024,...,-0.1083,0.005274,-0.190321,-1.175575,0.647376,-0.221929,0.062723,0.061458,123.5,0
4,2.0,-1.158233,0.877737,1.548718,0.403034,-0.407193,0.095921,0.592941,-0.270533,0.817739,...,-0.009431,0.798278,-0.137458,0.141267,-0.20601,0.502292,0.219422,0.215153,69.99,0


In [12]:
# Feature columns: every column of df except the first and the last one.
train_cols = df.columns[1:-1]
print(train_cols)

# Independent variables (features) and the "Class" target column.
X = df.loc[:, train_cols]
y = df.loc[:, "Class"]

print(X)
print(y)

# Class distribution of the target, shown as the cell output.
y.value_counts()

Index(['V1', 'V2', 'V3', 'V4', 'V5', 'V6', 'V7', 'V8', 'V9', 'V10', 'V11',
       'V12', 'V13', 'V14', 'V15', 'V16', 'V17', 'V18', 'V19', 'V20', 'V21',
       'V22', 'V23', 'V24', 'V25', 'V26', 'V27', 'V28', 'Amount'],
      dtype='object')
               V1         V2        V3        V4        V5        V6  \
0       -1.359807  -0.072781  2.536347  1.378155 -0.338321  0.462388   
1        1.191857   0.266151  0.166480  0.448154  0.060018 -0.082361   
2       -1.358354  -1.340163  1.773209  0.379780 -0.503198  1.800499   
3       -0.966272  -0.185226  1.792993 -0.863291 -0.010309  1.247203   
4       -1.158233   0.877737  1.548718  0.403034 -0.407193  0.095921   
...           ...        ...       ...       ...       ...       ...   
284802 -11.881118  10.071785 -9.834783 -2.066656 -5.364473 -2.606837   
284803  -0.732789  -0.055080  2.035030 -0.738589  0.868229  1.058415   
284804   1.919565  -0.301254 -3.249640 -0.557828  2.630515  3.031260   
284805  -0.240440   0.530483  0.702510 

Class
0    284315
1       492
Name: count, dtype: int64

In [4]:
# Under-sampling

from imblearn.under_sampling import RandomUnderSampler

# A fixed random_state keeps the drawn subset reproducible between runs.
under_sampler = RandomUnderSampler(random_state=0)
X_sample, y_sample = under_sampler.fit_resample(X, y)

# Wrap the resampled features and labels as DataFrames with explicit columns.
X_samp = pd.DataFrame(X_sample, columns=train_cols)
y_samp = pd.DataFrame(y_sample, columns=["Class"])

# Join features and label side by side and show the class counts.
df2 = pd.concat([X_samp, y_samp], axis=1)
df2["Class"].value_counts()

Class
0    492
1    492
Name: count, dtype: int64

In [5]:
# Rebind X / y to the under-sampled data.
X = X_samp.loc[:, train_cols]  # independent variables (features)
y = y_samp.loc[:, "Class"]  # target labels as a Series

In [6]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=10,
    test_size=0.2,
)

In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
import numpy as np

# Three base classifiers that are combined by the voting ensemble.
model1 = LogisticRegression(max_iter=1000, random_state=1)
model2 = DecisionTreeClassifier(random_state=1)
model3 = KNeighborsClassifier(n_neighbors=2)

# VotingClassifier parameters:
#     estimators: the base models, given as a list of (alias, model) pairs
#     voting: "hard" or "soft"; the default is hard, soft is requested here
# The ensemble combines logistic regression, a decision tree and KNN.
named_models = [("lr", model1), ("tree", model2), ("knn", model3)]
ensemble = VotingClassifier(estimators=named_models, voting="soft")

# Convert both feature splits to NumPy arrays before fitting.
X_train, X_test = np.array(X_train), np.array(X_test)

# Fit each model in turn and print its score on the training and test splits.
for model in [model1, model2, model3, ensemble]:
    print(model)

    model.fit(X_train, y_train)

    train_score = model.score(X_train, y_train)
    print("학습용:", train_score)
    test_score = model.score(X_test, y_test)
    print("검증용:", test_score)

    print()

# In general, the voting ensemble tends to perform better than the individual models.

LogisticRegression(max_iter=1000, random_state=1)
학습용: 0.9529860228716646
검증용: 0.9289340101522843

DecisionTreeClassifier(random_state=1)
학습용: 1.0
검증용: 0.9289340101522843

KNeighborsClassifier(n_neighbors=2)
학습용: 0.9428208386277002
검증용: 0.9289340101522843

VotingClassifier(estimators=[('lr',
                              LogisticRegression(max_iter=1000,
                                                 random_state=1)),
                             ('tree', DecisionTreeClassifier(random_state=1)),
                             ('knn', KNeighborsClassifier(n_neighbors=2))],
                 voting='soft')
학습용: 1.0
검증용: 0.9441624365482234



In [None]:
from sklearn.metrics import classification_report

# Get predictions from the fitted voting ensemble and inspect per-class
# precision / recall / F1 on the held-out test split.
# scikit-learn estimators are not callable, so use .predict(), which already
# returns class labels (no argmax over scores is needed).
# X_test is the NumPy array the models were scored on, so its layout matches
# the training data.
y_pred = ensemble.predict(X_test)

# classification report: true labels first, predicted labels second
print(classification_report(y_test, y_pred))