In [1]:
# import required libraries
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# set data to a variable
wine = load_wine()
print(wine.keys())

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names'])


In [2]:
# set data to a variable
wine_data = wine.data
wine_label = wine.target

# check names of the labels and details of data
print(wine.target_names)
print(wine.DESCR)

['class_0' 'class_1' 'class_2']
.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178 (50 in each of three classes)
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
   

In [3]:
# use train test split method
X_train, X_test, y_train, y_test = train_test_split(wine_data,
                                                    wine_label,
                                                    test_size=0.3,
                                                    random_state=33)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(124, 13) (54, 13) (124,) (54,)


In [4]:
# use decision tree classifier
from sklearn.tree import DecisionTreeClassifier

decision_tree = DecisionTreeClassifier(random_state=11)
decision_tree.fit(X_train, y_train)
y_pred = decision_tree.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.71      1.00      0.83        15
           1       0.95      0.90      0.93        21
           2       0.85      0.61      0.71        18

    accuracy                           0.83        54
   macro avg       0.84      0.84      0.82        54
weighted avg       0.85      0.83      0.83        54

[[15  0  0]
 [ 0 19  2]
 [ 6  1 11]]


In [5]:
# use random forest classifier
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(random_state=11)
random_forest.fit(X_train, y_train)
y_pred2 = random_forest.predict(X_test)

print(classification_report(y_test, y_pred2))
print(confusion_matrix(y_test, y_pred2))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       1.00      1.00      1.00        21
           2       1.00      1.00      1.00        18

    accuracy                           1.00        54
   macro avg       1.00      1.00      1.00        54
weighted avg       1.00      1.00      1.00        54

[[15  0  0]
 [ 0 21  0]
 [ 0  0 18]]


In [6]:
# use support vector machine
from sklearn import svm

svm_model = svm.SVC(random_state=11)
svm_model.fit(X_train, y_train)
y_pred3 = svm_model.predict(X_test)

print(classification_report(y_test, y_pred3, zero_division=0))
print(confusion_matrix(y_test, y_pred3))
print(y_test)
print(y_pred3)

              precision    recall  f1-score   support

           0       0.71      1.00      0.83        15
           1       0.61      0.95      0.74        21
           2       0.00      0.00      0.00        18

    accuracy                           0.65        54
   macro avg       0.44      0.65      0.52        54
weighted avg       0.43      0.65      0.52        54

[[15  0  0]
 [ 1 20  0]
 [ 5 13  0]]
[0 0 1 0 2 0 0 0 2 2 0 2 0 1 2 1 1 1 0 2 2 1 1 1 1 1 1 2 2 1 1 2 2 1 0 2 0
 1 2 0 2 1 0 1 0 1 0 1 2 2 2 1 2 1]
[0 0 1 0 1 0 0 0 1 1 0 1 0 1 1 1 1 1 0 0 1 1 0 1 1 1 1 0 0 1 1 0 1 1 0 0 0
 1 1 0 1 1 0 1 0 1 0 1 1 1 1 1 1 1]


In [7]:
# use stochastic gradient descent
from sklearn.linear_model import SGDClassifier

sgd_model = SGDClassifier(random_state=33) # random state를 train test split random state와 같은 숫자로 지정했을 때
sgd_model.fit(X_train, y_train)
y_pred4 = sgd_model.predict(X_test)

print(classification_report(y_test, y_pred4, zero_division=0))
print(confusion_matrix(y_test, y_pred4))
print(y_test)
print(y_pred4)

              precision    recall  f1-score   support

           0       1.00      0.67      0.80        15
           1       0.68      0.71      0.70        21
           2       0.50      0.61      0.55        18

    accuracy                           0.67        54
   macro avg       0.73      0.66      0.68        54
weighted avg       0.71      0.67      0.68        54

[[10  0  5]
 [ 0 15  6]
 [ 0  7 11]]
[0 0 1 0 2 0 0 0 2 2 0 2 0 1 2 1 1 1 0 2 2 1 1 1 1 1 1 2 2 1 1 2 2 1 0 2 0
 1 2 0 2 1 0 1 0 1 0 1 2 2 2 1 2 1]
[0 2 1 0 1 0 0 0 1 2 2 1 0 2 1 1 1 1 0 2 2 2 1 1 1 1 1 2 2 1 2 2 2 2 0 2 2
 1 1 0 2 1 2 2 0 1 2 2 2 1 1 1 2 1]


In [8]:
# use stochastic gradient descent
from sklearn.linear_model import SGDClassifier

sgd_model = SGDClassifier(random_state=98) # random state를 98로 지정했을 때
sgd_model.fit(X_train, y_train)
y_pred4 = sgd_model.predict(X_test)

print(classification_report(y_test, y_pred4, zero_division=0))
print(confusion_matrix(y_test, y_pred4))
print(y_test)
print(y_pred4)

              precision    recall  f1-score   support

           0       1.00      0.53      0.70        15
           1       0.00      0.00      0.00        21
           2       0.39      1.00      0.56        18

    accuracy                           0.48        54
   macro avg       0.46      0.51      0.42        54
weighted avg       0.41      0.48      0.38        54

[[ 8  0  7]
 [ 0  0 21]
 [ 0  0 18]]
[0 0 1 0 2 0 0 0 2 2 0 2 0 1 2 1 1 1 0 2 2 1 1 1 1 1 1 2 2 1 1 2 2 1 0 2 0
 1 2 0 2 1 0 1 0 1 0 1 2 2 2 1 2 1]
[0 2 2 0 2 0 0 0 2 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 2
 2 2 2 2 2 2 2 0 2 2 2 2 2 2 2 2 2]


In [9]:
# use logistic regression
from sklearn.linear_model import LogisticRegression

logistic_model = LogisticRegression(random_state=11, max_iter=3000)
logistic_model.fit(X_train, y_train)
y_pred5 = logistic_model.predict(X_test)

print(classification_report(y_test, y_pred5))
print(confusion_matrix(y_test, y_pred5))

              precision    recall  f1-score   support

           0       0.94      1.00      0.97        15
           1       0.91      0.95      0.93        21
           2       1.00      0.89      0.94        18

    accuracy                           0.94        54
   macro avg       0.95      0.95      0.95        54
weighted avg       0.95      0.94      0.94        54

[[15  0  0]
 [ 1 20  0]
 [ 0  2 16]]


Classification Report와 confusion matrix:
- 베스트 모델: Random Forest Classifier
- 워스트 모델: SGD Classifier

SVM과 SGD 는 분류 2 와인을 전혀 맞추지 못해서 다른 모델들 보다 전체적으로 지표가 낮았다.

다만 SGD 모델에서는 random state를 98로 맞추니 반대로 분류 1을 전혀 못 맞추고 대부분 분류 2로 예측하는 것을 볼수가 있었다.

그리고 SGD random state를 잘 맞추면 한 분류를 아예 배제하지 않지만 평가지표에서 낮은 점수가 나오는 것을 볼수가 있었다.

왜 데이터를 그렇게 분류하는 지는 아직 잘 모르겠다...

데이터가 불균형하기에, classification report에서 accuracy 말고도 f1-score가 높은 모델을 골랐다.

또한 confusion matrix로 데이터 분류 상황을 봤는데, 모든 모델들이 분류 0은 제대로 맞췄고, 분류 1은 틀리는 경우도 보았지만, 분류 2를 더 많이 틀리는 걸 볼수가 있었다.