In [1]:
# import required libraries
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# set data to a variable
wine = load_wine()
print(wine.keys())

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names'])


In [2]:
# set data to a variable
wine_data = wine.data
wine_label = wine.target

# check names of the labels and details of data
print(wine.target_names)
print(wine.DESCR)

['class_0' 'class_1' 'class_2']
.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178 (50 in each of three classes)
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
   

In [13]:
# use train test split method
X_train, X_test, y_train, y_test = train_test_split(wine_data,
                                                    wine_label,
                                                    test_size=0.3,
                                                    random_state=33)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(124, 13) (54, 13) (124,) (54,)


In [14]:
# use decision tree classifier
from sklearn.tree import DecisionTreeClassifier

decision_tree = DecisionTreeClassifier(random_state=11)
decision_tree.fit(X_train, y_train)
y_pred = decision_tree.predict(X_test)

print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.71      1.00      0.83        15
           1       0.95      0.90      0.93        21
           2       0.85      0.61      0.71        18

    accuracy                           0.83        54
   macro avg       0.84      0.84      0.82        54
weighted avg       0.85      0.83      0.83        54

[[15  0  0]
 [ 0 19  2]
 [ 6  1 11]]


In [12]:
# use random forest classifier
from sklearn.ensemble import RandomForestClassifier

random_forest = RandomForestClassifier(random_state=11)
random_forest.fit(X_train, y_train)
y_pred2 = random_forest.predict(X_test)

print(classification_report(y_test, y_pred2))
print(confusion_matrix(y_test, y_pred2))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        10
           1       1.00      1.00      1.00        14
           2       1.00      1.00      1.00        12

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36

[[10  0  0]
 [ 0 14  0]
 [ 0  0 12]]


In [6]:
# use support vector machine
from sklearn import svm

svm_model = svm.SVC(random_state=11)
svm_model.fit(X_train, y_train)
y_pred3 = svm_model.predict(X_test)

print(classification_report(y_test, y_pred3, zero_division=0))
print(confusion_matrix(y_test, y_pred3))

              precision    recall  f1-score   support

           0       0.62      1.00      0.77        10
           1       0.65      0.93      0.76        14
           2       0.00      0.00      0.00        12

    accuracy                           0.64        36
   macro avg       0.42      0.64      0.51        36
weighted avg       0.43      0.64      0.51        36

[[10  0  0]
 [ 1 13  0]
 [ 5  7  0]]


In [7]:
# use stochastic gradient descent
from sklearn.linear_model import SGDClassifier

sgd_model = SGDClassifier(random_state=11)
sgd_model.fit(X_train, y_train)
y_pred4 = sgd_model.predict(X_test)

print(classification_report(y_test, y_pred4, zero_division=0))
print(confusion_matrix(y_test, y_pred4))

              precision    recall  f1-score   support

           0       0.42      1.00      0.59        10
           1       0.67      0.57      0.62        14
           2       0.00      0.00      0.00        12

    accuracy                           0.50        36
   macro avg       0.36      0.52      0.40        36
weighted avg       0.38      0.50      0.40        36

[[10  0  0]
 [ 6  8  0]
 [ 8  4  0]]


In [8]:
# use logistic regression
from sklearn.linear_model import LogisticRegression

logistic_model = LogisticRegression(random_state=11, max_iter=2500)
logistic_model.fit(X_train, y_train)
y_pred5 = logistic_model.predict(X_test)

print(classification_report(y_test, y_pred5))
print(confusion_matrix(y_test, y_pred5))

              precision    recall  f1-score   support

           0       0.91      1.00      0.95        10
           1       0.87      0.93      0.90        14
           2       1.00      0.83      0.91        12

    accuracy                           0.92        36
   macro avg       0.93      0.92      0.92        36
weighted avg       0.92      0.92      0.92        36

[[10  0  0]
 [ 1 13  0]
 [ 0  2 10]]


Classification Report와 confusion matrix:
- 베스트 모델: Random Forest Classifier
- 워스트 모델: SGD Classifier

SVM과 SGD 는 분류 2 와인을 전혀 맞추지 못해서 다른 모델들 보다 전체적으로 지표가 낮았다.

데이터가 불균형하기에, classification report에서 accuracy 말고도 f1-score가 높은 모델을 골랐다.

또한 confusion matrix로 데이터 분류 상황을 봤는데, 모든 모델들이 분류 0은 제대로 맞췄고, 분류 1은 틀리는 경우도 보았지만, 분류 2를 더 많이 틀리는 걸 볼수가 있었다.