In [1]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

In [2]:
# 데이터 준비
wine = load_wine()

In [3]:
# 데이터 이해하기
data = wine.data
label = wine.target
target_name = wine.target_names
desc = wine.DESCR

In [4]:
print(desc)

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178 (50 in each of three classes)
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:                0

In [5]:
print(target_name)

['class_0' 'class_1' 'class_2']


In [6]:
# train, test 분리
random_seed = 25

x_train, x_test, y_train, y_test = train_test_split(
    data,
    label,
    test_size=0.2,
    random_state=random_seed
)

In [7]:
# 다양한 모델로 학습
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.linear_model import SGDClassifier
from sklearn.linear_model import LogisticRegression

decision = DecisionTreeClassifier(random_state=random_seed)
random_forest = RandomForestClassifier(random_state=random_seed)
svm_model = svm.SVC(random_state=random_seed)
sgd = SGDClassifier(random_state=random_seed)
logisitic = LogisticRegression(random_state=random_seed)

decision.fit(x_train, y_train)
random_forest.fit(x_train, y_train)
svm_model.fit(x_train, y_train)
sgd.fit(x_train, y_train)
logisitic.fit(x_train, y_train)

decision_pred = decision.predict(x_test)
random_pred = random_forest.predict(x_test)
svm_pred = svm_model.predict(x_test)
sgd_pred = sgd.predict(x_test)
logistic_pred = logisitic.predict(x_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [8]:
# 평가하기
print(classification_report(y_test, decision_pred))
print(classification_report(y_test, random_pred))
print(classification_report(y_test, svm_pred))
print(classification_report(y_test, sgd_pred))
print(classification_report(y_test, logistic_pred))

              precision    recall  f1-score   support

           0       0.91      0.91      0.91        11
           1       0.94      0.83      0.88        18
           2       0.78      1.00      0.88         7

    accuracy                           0.89        36
   macro avg       0.87      0.91      0.89        36
weighted avg       0.90      0.89      0.89        36

              precision    recall  f1-score   support

           0       1.00      0.91      0.95        11
           1       0.95      1.00      0.97        18
           2       1.00      1.00      1.00         7

    accuracy                           0.97        36
   macro avg       0.98      0.97      0.98        36
weighted avg       0.97      0.97      0.97        36

              precision    recall  f1-score   support

           0       0.89      0.73      0.80        11
           1       0.82      0.78      0.80        18
           2       0.40      0.57      0.47         7

    accuracy        

  _warn_prf(average, modifier, msg_start, len(result))


In [9]:
print(f'data.shape : {data.shape}\n'
      f'label.shape : {label.shape}')

data.shape : (178, 13)
label.shape : (178,)


### 데이터셋

|데이터셋 크기|feature 크기|
|:---:|:--:|
|178|13|

### 모델 성능 비교 (macro avg, f1 score)

|randome_forest|logistic_regression|decision_tree|support vector machine|stochastic gradient descent|
|:---:|:---:|:---:|:---:|:---:|
|0.98|0.98|0.89|0.69|0.54|

### 분석

데이터셋 수 : 적음   
데이터셋 밸런스 : 불균형   
특성 수 : 보통   

### metrics = macro avg, f1 score

- f1 score : 정밀도와 재현율을 모두 따지므로 모든 지표를 볼 수 있어 더 정확하다고 생각합니다.
- macro avg : 각 클래스의 데이터 수와 관계없이 정확도를 반영할 수 있어서 좋다고 생각합니다.   
    weighted avg는 수가 많은 데이터의 영향을 너무 많이 받아, 적은 데이터의 정확도가 무시됩니다.
   