# [E2] scikit-learn의 학습 및 활용 

>학습과정 및 목표: Scikit-learn을 활용한 Iris의 세 가지 품종 분류모델을 학습하고, scikit-learn의 예제 데이터를 활용하여, 다음의 3가지 프로젝트를 수행하고자 한다. 
* 손글씨 분류 
* 와인 분류
* 유방암 분류

>[note] 응용프로그램에서는 Iris 분류기에서 사용한 '의사결정트리' 대신 '랜덤포레스트'를 이용하여 데이터를 분류하였습니다. 의사결정트리 모델보다 좀 더 많은 경우의 수를 가늠해 볼 수 있는 랜덤포레스트가 좀 더 좋은 학습 결과를 가져올 수 있을 것이라고 생각했습니다.

---------------------------------------

## [E2-11] 프로젝트 (1) 손글씨 분류(load_digits)

### 1. 데이터 가져오기

In [5]:
# Load the digits dataset that ships with scikit-learn.
from sklearn.datasets import load_digits

digits = load_digits()

# Pull the feature data and the target labels out of the loaded dataset.
digits_data, digits_label = digits.data, digits.target

In [6]:
# List the keys available on the loaded dataset object.
digits.keys()

dict_keys(['data', 'target', 'frame', 'feature_names', 'target_names', 'images', 'DESCR'])

In [7]:
# Use pandas to view the data as a table, with the labels appended as a
# trailing "label" column.
import pandas as pd

digits_df = pd.DataFrame(data=digits_data).assign(label=digits_label)
digits_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,55,56,57,58,59,60,61,62,63,label
0,0.0,0.0,5.0,13.0,9.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.0,13.0,10.0,0.0,0.0,0.0,0
1,0.0,0.0,0.0,12.0,13.0,5.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,11.0,16.0,10.0,0.0,0.0,1
2,0.0,0.0,0.0,4.0,15.0,12.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,3.0,11.0,16.0,9.0,0.0,2
3,0.0,0.0,7.0,15.0,13.0,1.0,0.0,0.0,0.0,8.0,...,0.0,0.0,0.0,7.0,13.0,13.0,9.0,0.0,0.0,3
4,0.0,0.0,0.0,1.0,11.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,2.0,16.0,4.0,0.0,0.0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1792,0.0,0.0,4.0,10.0,13.0,6.0,0.0,0.0,0.0,1.0,...,0.0,0.0,0.0,2.0,14.0,15.0,9.0,0.0,0.0,9
1793,0.0,0.0,6.0,16.0,13.0,11.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,6.0,16.0,14.0,6.0,0.0,0.0,0
1794,0.0,0.0,1.0,11.0,15.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,2.0,9.0,13.0,6.0,0.0,0.0,8
1795,0.0,0.0,2.0,10.0,7.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,5.0,12.0,16.0,12.0,0.0,0.0,9


### 2. 데이터 분리

In [8]:
# Split the data into (1) a training set and (2) a test set.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    digits_data,
    digits_label,
    test_size=0.2,
    random_state=3,
)

# Print the number of samples in each split, one per line.
print(len(X_train), len(X_test), sep="\n")

1437
360


### 3. 랜덤포레스트 모델 생성 및 학습

In [13]:
# Import scikit-learn's random forest classifier.
from sklearn.ensemble import RandomForestClassifier

# Build the model and train it on the training split; .fit() returns the
# fitted estimator, so construction and training are chained.
random_forest = RandomForestClassifier(random_state=5).fit(X_train, y_train)

# Classify the test split with .predict().
y_pred = random_forest.predict(X_test)

### 4. 모델 성능 평가

In [10]:
# Metrics used to evaluate the model's performance.
from sklearn.metrics import accuracy_score, classification_report

# Print the classification report for the test-set predictions.
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        47
           1       0.97      1.00      0.99        35
           2       1.00      1.00      1.00        31
           3       1.00      0.97      0.99        40
           4       0.98      0.95      0.97        44
           5       0.97      0.92      0.95        38
           6       1.00      0.92      0.96        25
           7       0.95      1.00      0.98        42
           8       0.97      1.00      0.98        29
           9       0.90      0.97      0.93        29

    accuracy                           0.97       360
   macro avg       0.97      0.97      0.97       360
weighted avg       0.98      0.97      0.97       360



In [11]:
# Print the test accuracy with four decimals. The space flag in " .4f" reserves
# a sign slot, so a positive value is printed with an extra leading space.
print("테스트 정확도: {0: .4f}".format(accuracy_score(y_test, y_pred)))

테스트 정확도:  0.9750


## 총평

##### 모델의 성능을 평가함에 있어, 손글씨 분류기의 경우 랜덤포레스트 모델 사용 시 accuracy 지표를 통해 매우 높은 정확도(0.9750)를 확인할 수 있었습니다. 데이터의 특성에 따라 모델의 성능을 평가하는 기준을 다르게 봐야 할 필요가 있는데, 손글씨 분류의 경우에는 정밀도(precision), 재현율(recall), 그리고 이 두 지표의 조화평균(f1-score)이 모두 비교적 잘 나온 것이라 생각됩니다. 다만, 9번 label의 경우 정밀도(0.90)와 재현율(0.97)의 차이가 다른 label에 비해 크게 나타나는데, 이것이 원본 데이터의 품질 문제인지, 학습 데이터와 테스트 데이터의 분리 문제인지, 아니면 또 다른 조건의 변경으로 고칠 수 있는 것인지 궁금합니다.