# 1. Digit  Classifier

## (1) 필요한 모듈 import하기

In [1]:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

## (2) 데이터 준비

In [2]:
# Load scikit-learn's bundled handwritten-digits dataset.
digits = load_digits()

In [3]:
# List the attribute names available on the loaded dataset object.
print(dir(digits))

['DESCR', 'data', 'feature_names', 'frame', 'images', 'target', 'target_names']


## (3) 데이터 이해하기

In [4]:
# Select the feature data (the dataset's `data` attribute).
digits_data = digits.data

In [5]:
# Select the label data (the dataset's `target` attribute).
digits_label = digits.target

In [6]:
# Display the names of the target classes (shown as the cell's output).
digits.target_names

array([0, 1, 2, 3, 4, 5, 6, 7, 8, 9])

In [7]:
# Print the description text that ships with the dataset.
print(digits.DESCR)

.. _digits_dataset:

Optical recognition of handwritten digits dataset
--------------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 5620
    :Number of Attributes: 64
    :Attribute Information: 8x8 image of integer pixels in the range 0..16.
    :Missing Attribute Values: None
    :Creator: E. Alpaydin (alpaydin '@' boun.edu.tr)
    :Date: July; 1998

This is a copy of the test set of the UCI ML hand-written digits datasets
https://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits

The data set contains images of hand-written digits: 10 classes where
each class refers to a digit.

Preprocessing programs made available by NIST were used to extract
normalized bitmaps of handwritten digits from a preprinted form. From a
total of 43 people, 30 contributed to the training set and different 13
to the test set. 32x32 bitmaps are divided into nonoverlapping blocks of
4x4 and the number of on pixels are counted in each blo

## (4) train, test 데이터 분리

In [8]:
# Hold out 20% of the digit samples for testing; the fixed random_state
# makes the shuffle, and therefore the split, repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    digits_data,
    digits_label,
    test_size=0.2,
    random_state=9,
)

In [9]:
# Report how many samples ended up in the training and test splits.
print('X_train 개수: ', len(X_train), ', X_test 개수: ', len(X_test))

X_train 개수:  1437 , X_test 개수:  360


## (5) 다양한 모델로 학습시켜보기

### Decision Tree 사용해 보기

In [20]:
# Decision tree: train on the training split, then score on the test split.
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.tree import DecisionTreeClassifier

decision_tree = DecisionTreeClassifier(random_state=32)
# fit() returns the fitted estimator, so predict() can be chained onto it.
y_pred = decision_tree.fit(X_train, y_train).predict(X_test)

# Overall accuracy, then the confusion matrix (rows: true class,
# columns: predicted class), then per-class precision/recall/F1.
print('accuracy_score =', accuracy_score(y_test, y_pred))
print('\n[confusion matrix]\n', confusion_matrix(y_test, y_pred))
print('\n[classification_report]\n', classification_report(y_test, y_pred))

accuracy_score = 0.8555555555555555

[confusion matrix]
 [[29  0  1  0  0  2  0  0  1  0]
 [ 0 37  3  0  0  0  0  0  0  0]
 [ 0  3 39  0  0  1  1  0  1  0]
 [ 0  0  0 33  0  0  0  0  0  1]
 [ 0  1  0  0 30  0  3  4  1  2]
 [ 0  1  0  2  0 34  0  1  1  0]
 [ 0  0  0  0  1  0 28  0  0  0]
 [ 0  0  2  0  1  0  0 24  1  0]
 [ 0  1  1  2  0  0  2  2 28  0]
 [ 0  4  0  0  1  0  0  1  3 26]]

[classification_report]
               precision    recall  f1-score   support

           0       1.00      0.88      0.94        33
           1       0.79      0.93      0.85        40
           2       0.85      0.87      0.86        45
           3       0.89      0.97      0.93        34
           4       0.91      0.73      0.81        41
           5       0.92      0.87      0.89        39
           6       0.82      0.97      0.89        29
           7       0.75      0.86      0.80        28
           8       0.78      0.78      0.78        36
           9       0.90      0.74      0.81  

### Random Forest 사용해 보기

In [21]:
# Random forest on the current train/test split.
from sklearn.ensemble import RandomForestClassifier

# Construct and fit in one expression; fit() hands back the estimator itself.
random_forest = RandomForestClassifier(random_state=32).fit(X_train, y_train)
y_pred = random_forest.predict(X_test)

# Same three-part evaluation as the other models: accuracy, confusion
# matrix, and the per-class classification report.
print('accuracy_score =', accuracy_score(y_test, y_pred))
print('\n[confusion matrix]\n', confusion_matrix(y_test, y_pred))
print('\n[classification_report]\n', classification_report(y_test, y_pred))

accuracy_score = 0.9833333333333333

[confusion matrix]
 [[32  0  0  0  1  0  0  0  0  0]
 [ 0 40  0  0  0  0  0  0  0  0]
 [ 0  0 45  0  0  0  0  0  0  0]
 [ 0  0  0 34  0  0  0  0  0  0]
 [ 0  0  0  0 40  0  0  1  0  0]
 [ 0  0  0  0  0 38  0  0  0  1]
 [ 0  0  0  0  0  0 29  0  0  0]
 [ 0  0  0  0  0  0  0 27  0  1]
 [ 0  0  0  0  0  0  0  2 34  0]
 [ 0  0  0  0  0  0  0  0  0 35]]

[classification_report]
               precision    recall  f1-score   support

           0       1.00      0.97      0.98        33
           1       1.00      1.00      1.00        40
           2       1.00      1.00      1.00        45
           3       1.00      1.00      1.00        34
           4       0.98      0.98      0.98        41
           5       1.00      0.97      0.99        39
           6       1.00      1.00      1.00        29
           7       0.90      0.96      0.93        28
           8       1.00      0.94      0.97        36
           9       0.95      1.00      0.97  

### SVM 사용해 보기

In [22]:
# Support vector classifier; only random_state is set, everything else is
# left at the library defaults.
from sklearn import svm

svm_model = svm.SVC(random_state=32)
# Train and predict in one chained call (fit() returns the estimator).
y_pred = svm_model.fit(X_train, y_train).predict(X_test)

# Accuracy, confusion matrix, and per-class precision/recall/F1.
print('accuracy_score =', accuracy_score(y_test, y_pred))
print('\n[confusion matrix]\n', confusion_matrix(y_test, y_pred))
print('\n[classification_report]\n', classification_report(y_test, y_pred))

accuracy_score = 0.9888888888888889

[confusion matrix]
 [[32  0  0  0  1  0  0  0  0  0]
 [ 0 40  0  0  0  0  0  0  0  0]
 [ 0  0 45  0  0  0  0  0  0  0]
 [ 0  0  0 34  0  0  0  0  0  0]
 [ 0  0  0  0 40  0  0  0  0  1]
 [ 0  0  0  0  0 38  0  0  0  1]
 [ 0  0  0  0  0  0 29  0  0  0]
 [ 0  0  0  0  0  0  0 27  0  1]
 [ 0  0  0  0  0  0  0  0 36  0]
 [ 0  0  0  0  0  0  0  0  0 35]]

[classification_report]
               precision    recall  f1-score   support

           0       1.00      0.97      0.98        33
           1       1.00      1.00      1.00        40
           2       1.00      1.00      1.00        45
           3       1.00      1.00      1.00        34
           4       0.98      0.98      0.98        41
           5       1.00      0.97      0.99        39
           6       1.00      1.00      1.00        29
           7       1.00      0.96      0.98        28
           8       1.00      1.00      1.00        36
           9       0.92      1.00      0.96  

### SGD Classifier 사용해 보기

In [23]:
# Linear classifier trained with stochastic gradient descent.
from sklearn.linear_model import SGDClassifier

# Construct and fit together; fit() returns the fitted estimator.
sgd_model = SGDClassifier(random_state=32).fit(X_train, y_train)
y_pred = sgd_model.predict(X_test)

# Accuracy, confusion matrix, and per-class precision/recall/F1.
print('accuracy_score =', accuracy_score(y_test, y_pred))
print('\n[confusion matrix]\n', confusion_matrix(y_test, y_pred))
print('\n[classification_report]\n', classification_report(y_test, y_pred))

accuracy_score = 0.9444444444444444

[confusion matrix]
 [[32  0  0  0  1  0  0  0  0  0]
 [ 0 34  0  0  0  0  1  0  3  2]
 [ 0  0 45  0  0  0  0  0  0  0]
 [ 0  0  0 29  0  2  0  0  3  0]
 [ 0  0  0  0 40  0  0  1  0  0]
 [ 0  0  1  0  0 37  0  0  0  1]
 [ 0  0  0  0  0  0 29  0  0  0]
 [ 0  0  0  0  0  0  0 25  1  2]
 [ 0  0  0  0  1  0  0  1 34  0]
 [ 0  0  0  0  0  0  0  0  0 35]]

[classification_report]
               precision    recall  f1-score   support

           0       1.00      0.97      0.98        33
           1       1.00      0.85      0.92        40
           2       0.98      1.00      0.99        45
           3       1.00      0.85      0.92        34
           4       0.95      0.98      0.96        41
           5       0.95      0.95      0.95        39
           6       0.97      1.00      0.98        29
           7       0.93      0.89      0.91        28
           8       0.83      0.94      0.88        36
           9       0.88      1.00      0.93  

### Logistic Regression 사용해 보기

In [24]:
# Logistic regression using the liblinear solver.
from sklearn.linear_model import LogisticRegression

logistic_model = LogisticRegression(solver='liblinear', random_state=32)
# Train and predict in one chained call (fit() returns the estimator).
y_pred = logistic_model.fit(X_train, y_train).predict(X_test)

# Accuracy, confusion matrix, and per-class precision/recall/F1.
print('accuracy_score =', accuracy_score(y_test, y_pred))
print('\n[confusion matrix]\n', confusion_matrix(y_test, y_pred))
print('\n[classification_report]\n', classification_report(y_test, y_pred))

accuracy_score = 0.9611111111111111

[confusion matrix]
 [[32  0  0  0  1  0  0  0  0  0]
 [ 0 38  0  0  0  0  0  0  1  1]
 [ 0  0 45  0  0  0  0  0  0  0]
 [ 0  0  0 33  0  1  0  0  0  0]
 [ 0  1  0  0 39  0  0  1  0  0]
 [ 0  0  0  0  0 38  0  0  0  1]
 [ 0  0  0  0  0  0 29  0  0  0]
 [ 0  0  0  0  0  0  0 26  1  1]
 [ 0  0  0  1  1  0  0  1 32  1]
 [ 0  0  0  0  0  0  0  0  1 34]]

[classification_report]
               precision    recall  f1-score   support

           0       1.00      0.97      0.98        33
           1       0.97      0.95      0.96        40
           2       1.00      1.00      1.00        45
           3       0.97      0.97      0.97        34
           4       0.95      0.95      0.95        41
           5       0.97      0.97      0.97        39
           6       1.00      1.00      1.00        29
           7       0.93      0.93      0.93        28
           8       0.91      0.89      0.90        36
           9       0.89      0.97      0.93  

## (6) 모델을 평가해 보기

##### 학습된 모델들의 테스트데이터 예측 결과를 어떻게 해석해야 할까요? 

각 모델별 정확도(accuracy)를 정리하면 다음과 같다.   
Decision Tree       0.856   
Random Forest       0.983   
SVM                 0.989   
SGD                 0.944   
Logistic Regression 0.961   
    
     
정확도 순서를 정리하면 다음과 같다.    
SVM(0.989) > Random Forest(0.983) > Logistic Regression(0.961) > SGD(0.944) > Decision Tree(0.856)

##### 모델의 성능을 평가하는 지표로는 무엇이 좋을까요?      
sklearn.metrics 에서 제공하는 평가지표 중 적절한 것을 선택해 보세요. 선택하신 이유도 설명해 주세요.

손글씨 숫자 분류기는 이미지를 숫자로 분류하는 것이다. sklearn.metrics의 평가지표 중 정밀도(precision)가 적절하다. 0을 0으로 분류하지 못하는 것보다 1을 0으로 분류하는 것이 더 큰 문제이기 때문이다.    
   
오차행렬의 정밀도 지표로 성능을 비교해 보면 SVM(0.99), Random Forest(0.98)로 우수하다.

# 2. Wine Classifier

## (1) 필요한 모듈 import하기

In [25]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

## (2) 데이터 준비

In [26]:
# Load scikit-learn's bundled wine recognition dataset.
wine = load_wine()

## (3) 데이터 이해하기

In [27]:
# Select the feature data (the dataset's `data` attribute).
wine_data = wine.data

In [28]:
# Select the label data (the dataset's `target` attribute).
wine_label = wine.target

In [29]:
# Display the names of the target classes (shown as the cell's output).
wine.target_names

array(['class_0', 'class_1', 'class_2'], dtype='<U7')

In [30]:
# Print the description text that ships with the dataset.
print(wine.DESCR)

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178 (50 in each of three classes)
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:                0

## (4) train, test 데이터 분리

In [31]:
# Hold out 20% of the wine samples for testing; the fixed random_state
# makes the shuffle, and therefore the split, repeatable across runs.
X_train, X_test, y_train, y_test = train_test_split(
    wine_data,
    wine_label,
    test_size=0.2,
    random_state=9,
)

In [32]:
# Report how many samples ended up in the training and test splits.
print('X_train 개수: ', len(X_train), ', X_test 개수: ', len(X_test))

X_train 개수:  142 , X_test 개수:  36


## (5) 다양한 모델로 학습시켜보기

### Decision Tree 사용해 보기

In [33]:
# Decision tree on the wine split.
#
# accuracy_score and confusion_matrix are imported here as well: the cells of
# this section use them, and previously they were only in scope if the digits
# section had already been executed in the same kernel (NameError otherwise).
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report

decision_tree = DecisionTreeClassifier(random_state=32)
decision_tree.fit(X_train, y_train)
y_pred = decision_tree.predict(X_test)

# Overall accuracy, then the confusion matrix (rows: true class,
# columns: predicted class), then per-class precision/recall/F1.
print('accuracy_score =', accuracy_score(y_test, y_pred))
print('\n[confusion matrix]\n',confusion_matrix(y_test, y_pred))
print('\n[classification_report]\n',classification_report(y_test, y_pred))

accuracy_score = 0.9722222222222222

[confusion matrix]
 [[16  1  0]
 [ 0 11  0]
 [ 0  0  8]]

[classification_report]
               precision    recall  f1-score   support

           0       1.00      0.94      0.97        17
           1       0.92      1.00      0.96        11
           2       1.00      1.00      1.00         8

    accuracy                           0.97        36
   macro avg       0.97      0.98      0.98        36
weighted avg       0.97      0.97      0.97        36



### Random Forest 사용해 보기

In [34]:
# Random forest on the current train/test split.
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(random_state=32)
# Train and predict in one chained call (fit() returns the estimator).
y_pred = random_forest.fit(X_train, y_train).predict(X_test)

# Accuracy, confusion matrix, and per-class precision/recall/F1.
print('accuracy_score =', accuracy_score(y_test, y_pred))
print('\n[confusion matrix]\n', confusion_matrix(y_test, y_pred))
print('\n[classification_report]\n', classification_report(y_test, y_pred))

accuracy_score = 1.0

[confusion matrix]
 [[17  0  0]
 [ 0 11  0]
 [ 0  0  8]]

[classification_report]
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        17
           1       1.00      1.00      1.00        11
           2       1.00      1.00      1.00         8

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36



###  SVM 사용해 보기

In [35]:
# Support vector classifier with library defaults apart from random_state.
# NOTE(review): the features are passed in unscaled; kernel SVMs are usually
# sensitive to feature scale, which may explain the low score here - confirm
# by comparing against a standardized version of the data.
from sklearn import svm

# Construct and fit together; fit() returns the fitted estimator.
svm_model = svm.SVC(random_state=32).fit(X_train, y_train)
y_pred = svm_model.predict(X_test)

# Accuracy, confusion matrix, and per-class precision/recall/F1.
print('accuracy_score =', accuracy_score(y_test, y_pred))
print('\n[confusion matrix]\n', confusion_matrix(y_test, y_pred))
print('\n[classification_report]\n', classification_report(y_test, y_pred))

accuracy_score = 0.75

[confusion matrix]
 [[16  0  1]
 [ 0  8  3]
 [ 1  4  3]]

[classification_report]
               precision    recall  f1-score   support

           0       0.94      0.94      0.94        17
           1       0.67      0.73      0.70        11
           2       0.43      0.38      0.40         8

    accuracy                           0.75        36
   macro avg       0.68      0.68      0.68        36
weighted avg       0.74      0.75      0.75        36



### SGD Classifier 사용해 보기

In [36]:
# Linear classifier trained with stochastic gradient descent.
# NOTE(review): the features are passed in unscaled; SGD-trained linear models
# are usually sensitive to feature scale, which may be why one class is never
# predicted in the recorded output - confirm with standardized features.
from sklearn.linear_model import SGDClassifier

sgd_model = SGDClassifier(random_state=32)
# Train and predict in one chained call (fit() returns the estimator).
y_pred = sgd_model.fit(X_train, y_train).predict(X_test)

# Accuracy, confusion matrix, and per-class precision/recall/F1.
print('accuracy_score =', accuracy_score(y_test, y_pred))
print('\n[confusion matrix]\n', confusion_matrix(y_test, y_pred))
print('\n[classification_report]\n', classification_report(y_test, y_pred))

accuracy_score = 0.6666666666666666

[confusion matrix]
 [[17  0  0]
 [ 4  7  0]
 [ 6  2  0]]

[classification_report]
               precision    recall  f1-score   support

           0       0.63      1.00      0.77        17
           1       0.78      0.64      0.70        11
           2       0.00      0.00      0.00         8

    accuracy                           0.67        36
   macro avg       0.47      0.55      0.49        36
weighted avg       0.53      0.67      0.58        36



  _warn_prf(average, modifier, msg_start, len(result))


###  Logistic Regression 사용해 보기

In [37]:
# Logistic regression using the liblinear solver.
from sklearn.linear_model import LogisticRegression

# Construct and fit together; fit() returns the fitted estimator.
logistic_model = LogisticRegression(
    solver='liblinear', random_state=32
).fit(X_train, y_train)
y_pred = logistic_model.predict(X_test)

# Accuracy, confusion matrix, and per-class precision/recall/F1.
print('accuracy_score =', accuracy_score(y_test, y_pred))
print('\n[confusion matrix]\n', confusion_matrix(y_test, y_pred))
print('\n[classification_report]\n', classification_report(y_test, y_pred))

accuracy_score = 1.0

[confusion matrix]
 [[17  0  0]
 [ 0 11  0]
 [ 0  0  8]]

[classification_report]
               precision    recall  f1-score   support

           0       1.00      1.00      1.00        17
           1       1.00      1.00      1.00        11
           2       1.00      1.00      1.00         8

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36



## (6) 모델을 평가해 보기

##### 학습된 모델들의 테스트데이터 예측 결과를 어떻게 해석해야 할까요?

각 모델별 정확도accuracy를 정리하면 다음과 같다.   
Decision Tree       0.972   
Random Forest      1.0   
SVM                 0.75   
SGD                 0.667   
Logistic Regression  1.0   
    
     
따라서 모델의 성능순서는 다음과 같다.    
Random Forest/Logistic Regression(1.0) > Decision Tree(0.972) > SVM (0.75) > SGD (0.667)

##### 모델의 성능을 평가하는 지표로는 무엇이 좋을까요?      
sklearn.metrics 에서 제공하는 평가지표 중 적절한 것을 선택해 보세요. 선택하신 이유도 설명해 주세요.

와인 분류기에서는 정밀도(precision)가 적절한 평가지표이다.
즉 3가지 카테고리로 분류함에 있어 class_1을 class_1로 분류하지 못하는 것은 괜찮지만, class_2를 class_1로 분류하는 것이 더 큰 문제이기 때문이다.

정밀도(precision)에서도 Random Forest와 Logistic Regression이 우수하다.   

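특이한 것은 SGD 모델에서는 class_2의 TP가 0이라는 점이다. class_2를 제대로 맞춘 것이 하나도 없다는 의미이다. 
때문에 정확도는 0.67이고 macro avg 정밀도는 0.47(macro avg 재현율은 0.55)에 그치는데, 이것은 class_2의 precision과 recall이 모두 0이기 때문이다.
SGDClassifier는 선, 평면, 초평면을 이용해 클래스를 구분하는 선형 분류기이지만, 다중 클래스 문제에서는 one-vs-all(OvA) 방식으로 클래스별 이진 분류기를 결합하므로 3개 이상의 분류 문제도 다룰 수 있다. 실제로 위 Digit 분류(10개 클래스)에서는 0.944의 정확도를 보였다. 따라서 이 결과는 다중 분류 자체의 한계라기보다, 스케일이 크게 다른 특성들(예: Magnesium 70~162, Malic Acid 0.74~5.80)을 표준화하지 않고 학습시킨 영향일 가능성이 높다.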

# 3. Breast Cancer Classifier

## (1) 필요한 모듈 import하기

In [38]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

## (2) 데이터 준비

In [39]:
# Load scikit-learn's bundled breast cancer (Wisconsin diagnostic) dataset.
breast_cancer = load_breast_cancer()

In [40]:
# List the attribute names available on the loaded dataset object.
print(dir(breast_cancer))

['DESCR', 'data', 'feature_names', 'filename', 'frame', 'target', 'target_names']


## (3) 데이터 이해하기

In [41]:
# Select the feature data (the dataset's `data` attribute).
breast_cancer_data = breast_cancer.data

In [42]:
# Select the label data (the dataset's `target` attribute).
breast_cancer_label = breast_cancer.target

In [43]:
# Display the names of the target classes (shown as the cell's output).
breast_cancer.target_names

array(['malignant', 'benign'], dtype='<U9')

In [44]:
# Print the description text that ships with the dataset.
print(breast_cancer.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

## (4) train, test 데이터 분리

In [45]:
# Hold out 20% of the breast cancer samples for testing; the fixed
# random_state makes the shuffle, and therefore the split, repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    breast_cancer_data,
    breast_cancer_label,
    test_size=0.2,
    random_state=9,
)

In [46]:
# Report how many samples ended up in the training and test splits.
print('X_train 개수: ', len(X_train), ', X_test 개수: ', len(X_test))  

X_train 개수:  455 , X_test 개수:  114


## (5) 다양한 모델로 학습시켜보기

### Decision Tree 사용해 보기

In [47]:
# Decision tree on the breast cancer split.
#
# accuracy_score and confusion_matrix are imported here as well: the cells of
# this section use them, and previously they were only in scope if the digits
# section had already been executed in the same kernel (NameError otherwise).
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix, classification_report

decision_tree = DecisionTreeClassifier(random_state=32)
decision_tree.fit(X_train, y_train)
y_pred = decision_tree.predict(X_test)

# Overall accuracy, then the confusion matrix (rows: true class,
# columns: predicted class), then per-class precision/recall/F1.
print('accuracy_score =', accuracy_score(y_test, y_pred))
print('\n[confusion matrix]\n',confusion_matrix(y_test, y_pred))
print('\n[classification_report]\n',classification_report(y_test, y_pred))

accuracy_score = 0.9210526315789473

[confusion matrix]
 [[34  6]
 [ 3 71]]

[classification_report]
               precision    recall  f1-score   support

           0       0.92      0.85      0.88        40
           1       0.92      0.96      0.94        74

    accuracy                           0.92       114
   macro avg       0.92      0.90      0.91       114
weighted avg       0.92      0.92      0.92       114



### Random Forest 사용해 보기

In [48]:
# Random forest on the current train/test split.
from sklearn.ensemble import RandomForestClassifier

# Construct and fit together; fit() returns the fitted estimator.
random_forest = RandomForestClassifier(random_state=32).fit(X_train, y_train)
y_pred = random_forest.predict(X_test)

# Accuracy, confusion matrix, and per-class precision/recall/F1.
print('accuracy_score =', accuracy_score(y_test, y_pred))
print('\n[confusion matrix]\n', confusion_matrix(y_test, y_pred))
print('\n[classification_report]\n', classification_report(y_test, y_pred))

accuracy_score = 0.956140350877193

[confusion matrix]
 [[35  5]
 [ 0 74]]

[classification_report]
               precision    recall  f1-score   support

           0       1.00      0.88      0.93        40
           1       0.94      1.00      0.97        74

    accuracy                           0.96       114
   macro avg       0.97      0.94      0.95       114
weighted avg       0.96      0.96      0.96       114



### SVM 사용해 보기

In [49]:
# Support vector classifier with library defaults apart from random_state.
from sklearn import svm

svm_model = svm.SVC(random_state=32)
# Train and predict in one chained call (fit() returns the estimator).
y_pred = svm_model.fit(X_train, y_train).predict(X_test)

# Accuracy, confusion matrix, and per-class precision/recall/F1.
print('accuracy_score =', accuracy_score(y_test, y_pred))
print('\n[confusion matrix]\n', confusion_matrix(y_test, y_pred))
print('\n[classification_report]\n', classification_report(y_test, y_pred))

accuracy_score = 0.9122807017543859

[confusion matrix]
 [[30 10]
 [ 0 74]]

[classification_report]
               precision    recall  f1-score   support

           0       1.00      0.75      0.86        40
           1       0.88      1.00      0.94        74

    accuracy                           0.91       114
   macro avg       0.94      0.88      0.90       114
weighted avg       0.92      0.91      0.91       114



### SGD Classifier 사용해 보기

In [50]:
# Linear classifier trained with stochastic gradient descent.
from sklearn.linear_model import SGDClassifier

# Construct and fit together; fit() returns the fitted estimator.
sgd_model = SGDClassifier(random_state=32).fit(X_train, y_train)
y_pred = sgd_model.predict(X_test)

# Accuracy, confusion matrix, and per-class precision/recall/F1.
print('accuracy_score =', accuracy_score(y_test, y_pred))
print('\n[confusion matrix]\n', confusion_matrix(y_test, y_pred))
print('\n[classification_report]\n', classification_report(y_test, y_pred))

accuracy_score = 0.9298245614035088

[confusion matrix]
 [[34  6]
 [ 2 72]]

[classification_report]
               precision    recall  f1-score   support

           0       0.94      0.85      0.89        40
           1       0.92      0.97      0.95        74

    accuracy                           0.93       114
   macro avg       0.93      0.91      0.92       114
weighted avg       0.93      0.93      0.93       114



###  Logistic Regression 사용해 보기

In [51]:
# Logistic regression using the liblinear solver.
from sklearn.linear_model import LogisticRegression

logistic_model = LogisticRegression(solver='liblinear', random_state=32)
# Train and predict in one chained call (fit() returns the estimator).
y_pred = logistic_model.fit(X_train, y_train).predict(X_test)

# Accuracy, confusion matrix, and per-class precision/recall/F1.
print('accuracy_score =', accuracy_score(y_test, y_pred))
print('\n[confusion matrix]\n', confusion_matrix(y_test, y_pred))
print('\n[classification_report]\n', classification_report(y_test, y_pred))

accuracy_score = 0.9385964912280702

[confusion matrix]
 [[34  6]
 [ 1 73]]

[classification_report]
               precision    recall  f1-score   support

           0       0.97      0.85      0.91        40
           1       0.92      0.99      0.95        74

    accuracy                           0.94       114
   macro avg       0.95      0.92      0.93       114
weighted avg       0.94      0.94      0.94       114



## (6) 모델을 평가해 보기

##### 학습된 모델들의 테스트데이터 예측 결과를 어떻게 해석해야 할까요?

각 모델별 정확도accuracy를 정리하면 다음과 같다.     
Decision Tree        0.921   
Random Forest        0.956   
SVM                  0.912   
SGD                  0.930    
Logistic Regression  0.939    
   
모델의 성능은 다음과 같다.    
Random Forest(0.956) > Logistic Regression (0.939) > SGD(0.930) > Decision Tree(0.921) > SVM(0.912)

##### 모델의 성능을 평가하는 지표로는 무엇이 좋을까요?    
    
sklearn.metrics 에서 제공하는 평가지표 중 적절한 것을 선택해 보세요. 선택하신 이유도 설명해 주세요.

유방암 분류기의 경우는 recall 지표가 적절하다. 유방암 양성을 정확히 진단하는 것이 중요하다. 양성인데 음성으로 진단하면 치명적이기 때문이다. 반면 음성을 양성으로 진단하는 것은 용인될 수 있다.     
       
recall 지표(macro avg 기준)로 보았을 때도 Random Forest(0.94)와 Logistic Regression(0.92)이 뛰어나다. 악성(malignant, 라벨 0) 클래스의 recall만 보아도 Random Forest(0.88)가 가장 높고, SVM(0.75)이 가장 낮다.