#### 붓꽃 품종 분류
- 목표 : 붓꽃의 3개 품종을 분류하기
- 데이터셋 : 내장 데이터셋 사용법
- 피쳐 : 4개
- 타겟 : 품종 1개
- 학습 : 지도학습 > 분류

1. 데이터 준비

In [110]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

In [111]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [112]:
# 내장 데이터 로딩
data=load_iris(as_frame=True)

In [113]:
# Bunch 인스턴스 -> dict와 유사한 형태
data.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [114]:
featuredf=data['data']
target = data['target']

In [115]:
featuredf.shape, target.shape

((150, 4), (150,))

2. 학습을 위한 데이터 셋 준비 -> 학습용, 검증용, 테스트용

In [116]:
# 학습용 & 테스트용 분리
X_train, X_test, y_train, y_test = train_test_split(featuredf, target, stratify=target)

In [117]:

# 학습용과 검증용 분리
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, stratify=y_train)

In [118]:
print(f'Train Ds : {X_train.shape[0]}   {X_train.shape[0]/featuredf.shape[0]}%')
print(f'Train Ds : {X_val.shape[0]}   {X_val.shape[0]/featuredf.shape[0]}%')
print(f'Train Ds : {X_test.shape[0]}   {X_test.shape[0]/featuredf.shape[0]}%')

Train Ds : 84   0.56%
Train Ds : 28   0.18666666666666668%
Train Ds : 38   0.25333333333333335%


In [119]:
# 교차 검증 사용 (데이터셋이 적을 때 과대적합방지하기위해 사용)
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.tree import DecisionTreeClassifier

In [120]:
# KFold 기반
# 모델 인스턴스 생성
dtc_model=DecisionTreeClassifier()

# 정확도 저장 리스트
accuracys=[]

#KFold 인스턴스 생성 기본값으로 5개 설정
kfold = KFold()

In [121]:
for idx, (train_index, val_index) in enumerate(kfold.split(featuredf),1):
   print(f'train_index: {train_index.tolist()}')

   # X_train, X_val 설정
   X_train, y_train = featuredf.iloc[train_index.tolist()], target[train_index.tolist()]
   X_val, y_val= featuredf.iloc[val_index.tolist()], target[val_index.tolist()]

   # 학습 진행
   dtc_model.fit(X_train, y_train)

   # 평가 -> 분류일 경우 score() 메서드는 정확도를 반환해준다.
   train_ac=dtc_model.score(X_train, y_train)
   val_ac=dtc_model.score(X_val, y_val)
   
   accuracys.append([train_ac, val_ac])
   print(f'[{idx}번째] train정확도 : {train_ac}   val 정확도 : {val_ac}')

train_index: [30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149]
[1번째] train정확도 : 1.0   val 정확도 : 1.0
train_index: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 

In [122]:
train_mean = sum([value[0] for value in accuracys])/kfold.n_splits
# 학습용 정확도 평균값
test_mean = sum([value[1] for value in accuracys])/kfold.n_splits
print(f'Train 정확도 : {train_mean:.2f}     val 정확도 : {test_mean:.2f}')

Train 정확도 : 1.00     val 정확도 : 0.91


#### 3-2. straitifiedKFold : 정답/레이블/타겟의 데이터 비율을 고려해서 데이터 나눠줌

In [123]:
acc = []

skfold=StratifiedKFold()

for idx, (train_index, val_index) in enumerate(skfold.split(featuredf, target),1):
   print(f'train_index: {train_index.tolist()}')

   # X_train, X_val 설정
   X_train, y_train = featuredf.iloc[train_index.tolist()], target[train_index.tolist()]
   X_val, y_val= featuredf.iloc[val_index.tolist()], target[val_index.tolist()]

   # 학습 진행
   dtc_model.fit(X_train, y_train)

   # 평가 -> 분류일 경우 score() 메서드는 정확도를 반환해준다.
   train_ac=dtc_model.score(X_train, y_train)
   val_ac=dtc_model.score(X_val, y_val)
   
   accuracys.append([train_ac, val_ac])
   print(f'[{idx}번째] train정확도 : {train_ac}   val 정확도 : {val_ac}')

train_index: [10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 142, 143, 144, 145, 146, 147, 148, 149]
[1번째] train정확도 : 1.0   val 정확도 : 0.9666666666666667
train_index: [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 120, 121, 122, 123, 124, 125, 126, 127, 

In [124]:
train_mean = sum([value[0] for value in accuracys])/skfold.n_splits
# 학습용 정확도 평균값
test_mean = sum([value[1] for value in accuracys])/skfold.n_splits
print(f'Train 정확도 : {train_mean:.2f}     val 정확도 : {test_mean:.2f}')

Train 정확도 : 2.00     val 정확도 : 1.86


#### 교차검증 및 성능평가 동시 진행 함수 
--> cross_val_score, cross_val_predict  
--> cross_validate

In [125]:
from sklearn.model_selection import cross_val_predict, cross_val_score, cross_validate

In [126]:
# 전체 DS -> 학습용과 테스트용 DS 분리
X_train, X_test, y_train, y_test = train_test_split(featuredf, target, stratify=target)


In [127]:
### cross_val_predict
predict = cross_val_predict(dtc_model, X_train, y_train, cv=3)

In [128]:
print(f'predict : {predict}')

predict : [1 1 1 2 2 2 0 1 1 1 2 1 2 1 1 0 2 2 2 2 0 1 0 0 2 1 0 0 2 0 0 0 2 1 2 1 0
 0 2 0 2 2 0 1 0 0 1 2 1 1 0 2 1 2 0 1 2 2 0 2 1 0 1 0 0 1 1 2 0 1 1 0 0 0
 2 0 1 1 0 2 0 1 2 2 1 2 2 2 2 0 2 1 0 2 2 2 2 0 1 1 1 1 1 2 0 0 0 0 0 2 2
 1]


In [129]:
# croos_val_score
cross_val_score(dtc_model, X_train, y_train)

array([0.95652174, 0.95652174, 0.95454545, 0.90909091, 1.        ])

In [130]:
### cross_validate
res = cross_validate(dtc_model, X_train, y_train,
                     return_train_score=True,
                     return_estimator=True)
resdf = pd.DataFrame(res).loc[:,['test_score', 'train_score']]

In [131]:
resdf

Unnamed: 0,test_score,train_score
0,0.956522,1.0
1,0.956522,1.0
2,0.954545,1.0
3,0.909091,1.0
4,1.0,1.0


#### 최적화 된 모델추출

In [132]:
best_model = res['estimator'][4]
best_model

In [133]:
best_model.predict(X_test)

array([0, 2, 2, 2, 0, 1, 1, 0, 0, 0, 2, 2, 1, 0, 2, 2, 1, 1, 2, 2, 0, 0,
       1, 0, 1, 2, 0, 2, 0, 1, 0, 1, 1, 0, 1, 1, 1, 2])

In [134]:
best_model.score(X_test, y_test)

0.9473684210526315