# 프로젝트 (1) load_digits : 손글씨를 분류해 봅시다
- `load_digits` : 손글씨 이미지 데이터 ([링크](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_digits.html#sklearn.datasets.load_digits))

## (1) 필요한 모듈 import하기

In [1]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score

import pandas as pd
import numpy as np

## (2) 데이터 준비

In [2]:
digits = load_digits()

print(dir(digits))

['DESCR', 'data', 'feature_names', 'frame', 'images', 'target', 'target_names']


In [3]:
def print_data_info(scikit_data):
    print("data의 type: ", type(scikit_data))
    print("data의 shape: ", scikit_data.data.shape)
    print()
    
    print("feature_name 출력: ")
    print(scikit_data.feature_names)
    print()
    print("feature_name의 type: ", type(scikit_data.feature_names))
    print("feature_name의 shape: ", len(scikit_data.feature_names))
    print()

    print("target_name: ", scikit_data.target_names)
    print("target_name의 type: ", type(scikit_data.target_names))
    print("target_name의 shape: ", len(scikit_data.target_names))
    print()


    print("target의 type: ", type(scikit_data.target))
    print("target의 shape: ", scikit_data.target.shape)
    print()

## (3) 데이터 이해하기
- Feature Data 지정하기
- Label Data 지정하기
- Target Names 출력해 보기
- 데이터 Describe 해 보기

In [4]:
print_data_info(digits)

data의 type:  <class 'sklearn.utils.Bunch'>
data의 shape:  (1797, 64)

feature_name 출력: 
['pixel_0_0', 'pixel_0_1', 'pixel_0_2', 'pixel_0_3', 'pixel_0_4', 'pixel_0_5', 'pixel_0_6', 'pixel_0_7', 'pixel_1_0', 'pixel_1_1', 'pixel_1_2', 'pixel_1_3', 'pixel_1_4', 'pixel_1_5', 'pixel_1_6', 'pixel_1_7', 'pixel_2_0', 'pixel_2_1', 'pixel_2_2', 'pixel_2_3', 'pixel_2_4', 'pixel_2_5', 'pixel_2_6', 'pixel_2_7', 'pixel_3_0', 'pixel_3_1', 'pixel_3_2', 'pixel_3_3', 'pixel_3_4', 'pixel_3_5', 'pixel_3_6', 'pixel_3_7', 'pixel_4_0', 'pixel_4_1', 'pixel_4_2', 'pixel_4_3', 'pixel_4_4', 'pixel_4_5', 'pixel_4_6', 'pixel_4_7', 'pixel_5_0', 'pixel_5_1', 'pixel_5_2', 'pixel_5_3', 'pixel_5_4', 'pixel_5_5', 'pixel_5_6', 'pixel_5_7', 'pixel_6_0', 'pixel_6_1', 'pixel_6_2', 'pixel_6_3', 'pixel_6_4', 'pixel_6_5', 'pixel_6_6', 'pixel_6_7', 'pixel_7_0', 'pixel_7_1', 'pixel_7_2', 'pixel_7_3', 'pixel_7_4', 'pixel_7_5', 'pixel_7_6', 'pixel_7_7']

feature_name의 type:  <class 'list'>
feature_name의 shape:  64

target_name:  [0 

In [5]:
digits_df = pd.DataFrame(data=digits.data, columns=digits.feature_names)
digits_df['label'] = digits.target
digits_df['label'].value_counts()

3    183
1    182
5    182
4    181
6    181
9    180
7    179
0    178
2    177
8    174
Name: label, dtype: int64

- 레이블 값은 비슷한 분포로 있다.

In [6]:
# digits_data[0]

In [7]:
# from sklearn.preprocessing import MinMaxScaler

# scaler = MinMaxScaler()
# scaler.fit(digits_data)
# digits_scaled = scaler.transform(digits_data)
# digits_scaled[0]

In [8]:
# len(digits_label)

## (4) train, test 데이터 분리

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(digits.data,
                                                   digits.target,
                                                   test_size=0.2,
                                                   random_state=42)

## (5) 다양한 모델로 학습시켜보기
- Decision Tree 사용해 보기
- Random Forest 사용해 보기
- SVM 사용해 보기
- SGD Classifier 사용해 보기
- Logistic Regression 사용해 보기

어떤 모델이 가장 좋은 성능을 보일까?

### Decision Tree

In [10]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score, cross_validate

dt_clf = DecisionTreeClassifier() 

dt_clf.fit(X_train, y_train)

pred = dt_clf.predict(X_test)
accuracy = accuracy_score(y_test, pred)
accuracy
# scores = cross_val_score(dt_clf, data, label, scoring='accuracy', cv=3)
# print("교차 검증별 정확도: ", np.round(scores, 4))
# print("평균 검증 정확도: ", np.round(np.mean(scores), 4))

0.8555555555555555

In [11]:
from sklearn.model_selection import GridSearchCV

# hyperparameters
parameters = {'max_depth': [8, 9, 10], 'min_samples_split': [8, 9, 10, 11]}

grid_dtree = GridSearchCV(dt_clf, param_grid=parameters, cv=3, refit=True)

In [12]:
# 순차적으로 학습
grid_dtree.fit(X_train, y_train)

# GridSearchCV 결과 추출해 DataFrame으로 변환
scores_df = pd.DataFrame(grid_dtree.cv_results_)
scores_df[['params', 'mean_test_score', 'rank_test_score',
          'split0_test_score', 'split1_test_score', 'split2_test_score']]

Unnamed: 0,params,mean_test_score,rank_test_score,split0_test_score,split1_test_score,split2_test_score
0,"{'max_depth': 8, 'min_samples_split': 8}",0.819068,8,0.843424,0.797495,0.816284
1,"{'max_depth': 8, 'min_samples_split': 9}",0.821155,6,0.845511,0.799582,0.818372
2,"{'max_depth': 8, 'min_samples_split': 10}",0.814196,12,0.837161,0.795407,0.810021
3,"{'max_depth': 8, 'min_samples_split': 11}",0.817676,11,0.843424,0.795407,0.814196
4,"{'max_depth': 9, 'min_samples_split': 8}",0.821851,5,0.843424,0.803758,0.818372
5,"{'max_depth': 9, 'min_samples_split': 9}",0.823243,3,0.845511,0.807933,0.816284
6,"{'max_depth': 9, 'min_samples_split': 10}",0.819068,8,0.835073,0.805846,0.816284
7,"{'max_depth': 9, 'min_samples_split': 11}",0.819068,8,0.835073,0.803758,0.818372
8,"{'max_depth': 10, 'min_samples_split': 8}",0.823243,4,0.837161,0.803758,0.82881
9,"{'max_depth': 10, 'min_samples_split': 9}",0.823939,2,0.835073,0.805846,0.830898


### Random Forest

In [13]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(random_state=32)

rf_clf.fit(X_train, y_train)
pred = rf_clf.predict(X_test)
accuracy = accuracy_score(y_test, pred)
print("예측 정확도: {0:.4f}".format(accuracy_score(y_test, pred)))

예측 정확도: 0.9806


In [14]:
scores = cross_val_score(rf_clf, digits.data, digits.target, scoring='accuracy', cv=5)
print("교차 검증별 정확도: ", np.round(scores, 4))
print("평균 검증 정확도: ", np.round(np.mean(scores), 4))

교차 검증별 정확도:  [0.9306 0.9083 0.9666 0.961  0.922 ]
평균 검증 정확도:  0.9377


### Support Vector Machine (SVM)

In [15]:
from sklearn import svm
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)
train_scaled = scaler.transform(X_train) # train data 표준화

svm_model = svm.SVC()

svm_model.fit(train_scaled, y_train)

test_scaled = scaler.transform(X_test) # test data 표준화
pred = svm_model.predict(test_scaled)
accuracy = accuracy_score(y_test, pred)
print("예측 정확도: {0:.4f}".format(accuracy_score(y_test, pred)))

예측 정확도: 0.9806


### Stochastic Gradient Descent Classifier (SGDClassifier)

In [16]:
from sklearn.linear_model import SGDClassifier

sgc_clf = SGDClassifier()

sgc_clf.fit(X_train, y_train)

pred = sgc_clf.predict(X_test)
accuracy = accuracy_score(y_test, pred)
print("예측 정확도: {0:.4f}".format(accuracy_score(y_test, pred)))

예측 정확도: 0.9667


### Logistic Regression

In [17]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)
train_scaled = scaler.transform(X_train) # train data 표준화

lr = LogisticRegression(max_iter=100)
lr.fit(train_scaled, y_train)

test_scaled = scaler.transform(X_test) # test data 표준화
pred = lr.predict(test_scaled)
accuracy = accuracy_score(y_test, pred)
print("예측 정확도: {0:.4f}".format(accuracy_score(y_test, pred)))

예측 정확도: 0.9722


## (6) 모델을 평가해 보기
- 학습된 모델들의 테스트데이터 예측 결과를 어떻게 해석해야 할까요? 
- 모델의 성능을 평가하는 지표로는 무엇이 좋을까요? 
- sklearn.metrics 에서 제공하는 평가지표 중 적절한 것을 선택해 보세요. 선택하신 이유도 설명해 주세요.

In [18]:
from sklearn.metrics import confusion_matrix

print(classification_report(y_test, pred))
print()
confusion_matrix(y_test, pred)

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        33
           1       0.97      1.00      0.98        28
           2       1.00      1.00      1.00        33
           3       0.97      0.97      0.97        34
           4       1.00      0.98      0.99        46
           5       0.94      0.94      0.94        47
           6       0.97      0.97      0.97        35
           7       1.00      0.97      0.99        34
           8       0.97      0.97      0.97        30
           9       0.93      0.95      0.94        40

    accuracy                           0.97       360
   macro avg       0.97      0.97      0.97       360
weighted avg       0.97      0.97      0.97       360




array([[33,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0, 28,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0, 33,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0,  0, 33,  0,  1,  0,  0,  0,  0],
       [ 0,  1,  0,  0, 45,  0,  0,  0,  0,  0],
       [ 0,  0,  0,  0,  0, 44,  1,  0,  0,  2],
       [ 0,  0,  0,  0,  0,  1, 34,  0,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0, 33,  0,  1],
       [ 0,  0,  0,  0,  0,  1,  0,  0, 29,  0],
       [ 0,  0,  0,  1,  0,  0,  0,  0,  1, 38]])

# 2-12. 프로젝트 (2) load_wine : 와인을 분류해 봅시다
- `load_wine` : 와인 데이터 ([링크](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_wine.html#sklearn.datasets.load_wine))

## (1) 필요한 모듈 import하기

In [19]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

## (2) 데이터 준비

In [20]:
wine = load_wine()

## (3) 데이터 이해하기
- Feature Data 지정하기
- Label Data 지정하기
- Target Names 출력해 보기
- 데이터 Describe 해 보기

In [21]:
print_data_info(wine)

data의 type:  <class 'sklearn.utils.Bunch'>
data의 shape:  (178, 13)

feature_name 출력: 
['alcohol', 'malic_acid', 'ash', 'alcalinity_of_ash', 'magnesium', 'total_phenols', 'flavanoids', 'nonflavanoid_phenols', 'proanthocyanins', 'color_intensity', 'hue', 'od280/od315_of_diluted_wines', 'proline']

feature_name의 type:  <class 'list'>
feature_name의 shape:  13

target_name:  ['class_0' 'class_1' 'class_2']
target_name의 type:  <class 'numpy.ndarray'>
target_name의 shape:  3

target의 type:  <class 'numpy.ndarray'>
target의 shape:  (178,)



In [22]:
wine_df = pd.DataFrame(data=wine.data, columns=wine.feature_names)
wine_df['label'] = wine.target
wine_df['label'].value_counts()

1    71
0    59
2    48
Name: label, dtype: int64

In [23]:
wine_data = wine.data
wine_target = wine.target

## (4) train, test 데이터 분리

In [24]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(wine_data,
                                                   wine_target,
                                                   test_size=0.2,
                                                   random_state=42)

## (5) 다양한 모델로 학습시켜보기
- Decision Tree 사용해 보기
- Random Forest 사용해 보기
- SVM 사용해 보기
- SGD Classifier 사용해 보기
- Logistic Regression 사용해 보기

어떤 모델이 가장 좋은 성능을 보일까?

### Decision Tree

In [25]:
from sklearn.tree import DecisionTreeClassifier

dt_clf = DecisionTreeClassifier() 

dt_clf.fit(X_train, y_train)

pred = dt_clf.predict(X_test)
accuracy = accuracy_score(y_test, pred)
accuracy

0.9444444444444444

### Random Forest

In [26]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(random_state=32)

rf_clf.fit(X_train, y_train)
pred = rf_clf.predict(X_test)
accuracy = accuracy_score(y_test, pred)
accuracy = accuracy_score(y_test, pred)
accuracy

1.0

### Support Vector Machine (SVM)

In [27]:
from sklearn import svm
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)
train_scaled = scaler.transform(X_train) # train data 표준화

svm_model = svm.SVC()

svm_model.fit(train_scaled, y_train)

test_scaled = scaler.transform(X_test) # test data 표준화
pred = svm_model.predict(test_scaled)
accuracy = accuracy_score(y_test, pred)
print("예측 정확도: {0:.4f}".format(accuracy_score(y_test, pred)))

예측 정확도: 1.0000


### Stochastic Gradient Descent Classifier (SGDClassifier)

In [28]:
from sklearn.linear_model import SGDClassifier

sgc_clf = SGDClassifier()

sgc_clf.fit(X_train, y_train)

pred = sgc_clf.predict(X_test)
accuracy = accuracy_score(y_test, pred)
print("예측 정확도: {0:.4f}".format(accuracy_score(y_test, pred)))

예측 정확도: 0.7222


### Logistic Regression

In [29]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)
train_scaled = scaler.transform(X_train) # train data 표준화

lr = LogisticRegression(max_iter=100)
lr.fit(train_scaled, y_train)

test_scaled = scaler.transform(X_test) # test data 표준화
pred = lr.predict(test_scaled)
accuracy = accuracy_score(y_test, pred)
print("예측 정확도: {0:.4f}".format(accuracy_score(y_test, pred)))

예측 정확도: 1.0000


## (6) 모델을 평가해 보기
- 학습된 모델들의 테스트데이터 예측 결과를 어떻게 해석해야 할까요? 
- 모델의 성능을 평가하는 지표로는 무엇이 좋을까요? 
- sklearn.metrics 에서 제공하는 평가지표 중 적절한 것을 선택해 보세요. 선택하신 이유도 설명해 주세요.

# 2-13. 프로젝트 (3) load_breast_cancer : 유방암 여부를 진단해 봅시다


- `load_breast_cancer` : 유방암 데이터 ([링크](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_breast_cancer.html#sklearn.datasets.load_breast_cancer))

- 재현율이 중요

## (1) 필요한 모듈 import하기

In [30]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

## (2) 데이터 준비

In [31]:
breast_cancer = load_breast_cancer()

## (3) 데이터 이해하기
- Feature Data 지정하기
- Label Data 지정하기
- Target Names 출력해 보기
- 데이터 Describe 해 보기

In [32]:
print_data_info(breast_cancer)

data의 type:  <class 'sklearn.utils.Bunch'>
data의 shape:  (569, 30)

feature_name 출력: 
['mean radius' 'mean texture' 'mean perimeter' 'mean area'
 'mean smoothness' 'mean compactness' 'mean concavity'
 'mean concave points' 'mean symmetry' 'mean fractal dimension'
 'radius error' 'texture error' 'perimeter error' 'area error'
 'smoothness error' 'compactness error' 'concavity error'
 'concave points error' 'symmetry error' 'fractal dimension error'
 'worst radius' 'worst texture' 'worst perimeter' 'worst area'
 'worst smoothness' 'worst compactness' 'worst concavity'
 'worst concave points' 'worst symmetry' 'worst fractal dimension']

feature_name의 type:  <class 'numpy.ndarray'>
feature_name의 shape:  30

target_name:  ['malignant' 'benign']
target_name의 type:  <class 'numpy.ndarray'>
target_name의 shape:  2

target의 type:  <class 'numpy.ndarray'>
target의 shape:  (569,)



In [33]:
cancer_df = pd.DataFrame(data=breast_cancer.data, columns=breast_cancer.feature_names)
cancer_df['label'] = breast_cancer.target
cancer_df['label'].value_counts()

1    357
0    212
Name: label, dtype: int64

In [34]:
cancer_data = breast_cancer.data
cancer_target = breast_cancer.target

## (4) train, test 데이터 분리

In [35]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(cancer_data,
                                                   cancer_target,
                                                   test_size=0.2,
                                                   random_state=42)

## (5) 다양한 모델로 학습시켜보기
- Decision Tree 사용해 보기
- Random Forest 사용해 보기
- SVM 사용해 보기
- SGD Classifier 사용해 보기
- Logistic Regression 사용해 보기

어떤 모델이 가장 좋은 성능을 보일까?

### Decision Tree

In [36]:
from sklearn.tree import DecisionTreeClassifier

dt_clf = DecisionTreeClassifier() 

dt_clf.fit(X_train, y_train)

pred = dt_clf.predict(X_test)
accuracy = accuracy_score(y_test, pred)
accuracy

0.9473684210526315

### Random Forest

In [37]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(random_state=32)

rf_clf.fit(X_train, y_train)
pred = rf_clf.predict(X_test)
accuracy = accuracy_score(y_test, pred)
accuracy = accuracy_score(y_test, pred)
accuracy

0.9649122807017544

### Support Vector Machine (SVM)

In [38]:
from sklearn import svm
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)
train_scaled = scaler.transform(X_train) # train data 표준화

svm_model = svm.SVC()

svm_model.fit(train_scaled, y_train)

test_scaled = scaler.transform(X_test) # test data 표준화
pred = svm_model.predict(test_scaled)
accuracy = accuracy_score(y_test, pred)
print("예측 정확도: {0:.4f}".format(accuracy_score(y_test, pred)))

예측 정확도: 0.9825


### Stochastic Gradient Descent Classifier (SGDClassifier)

In [39]:
from sklearn.linear_model import SGDClassifier

sgc_clf = SGDClassifier()

sgc_clf.fit(X_train, y_train)

pred = sgc_clf.predict(X_test)
accuracy = accuracy_score(y_test, pred)
print("예측 정확도: {0:.4f}".format(accuracy_score(y_test, pred)))

예측 정확도: 0.9474


### Logistic Regression

In [40]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train)
train_scaled = scaler.transform(X_train) # train data 표준화

lr = LogisticRegression(max_iter=100)
lr.fit(train_scaled, y_train)

test_scaled = scaler.transform(X_test) # test data 표준화
pred = lr.predict(test_scaled)
accuracy = accuracy_score(y_test, pred)
print("예측 정확도: {0:.4f}".format(accuracy_score(y_test, pred)))

예측 정확도: 0.9737


## (6) 모델을 평가해 보기
- 학습된 모델들의 테스트데이터 예측 결과를 어떻게 해석해야 할까요? 
- 모델의 성능을 평가하는 지표로는 무엇이 좋을까요? 
- sklearn.metrics 에서 제공하는 평가지표 중 적절한 것을 선택해 보세요. 선택하신 이유도 설명해 주세요.

## 참고 자료

- [[Python] 어떤 스케일러를 쓸 것인가?](https://mkjjo.github.io/python/2019/01/10/scaler.html)
- [로지스틱회귀(Logistic Regression) – 파이썬 코드 예제](http://hleecaster.com/ml-logistic-regression-example/)