### 필요한 모델 불러오기

In [1]:
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

### 데이터 준비하기 

In [2]:
from sklearn.datasets import load_wine

wine = load_wine()
wine.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names'])

In [3]:
wine_data = wine.data

print(wine_data.shape) 
#shape는 배열의 형상정보를 출력

(178, 13)


In [4]:
wine_data[0]

array([1.423e+01, 1.710e+00, 2.430e+00, 1.560e+01, 1.270e+02, 2.800e+00,
       3.060e+00, 2.800e-01, 2.290e+00, 5.640e+00, 1.040e+00, 3.920e+00,
       1.065e+03])

In [5]:
wine_label = wine.target
print(wine_label.shape)
wine_label

(178,)


array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2])

In [6]:
wine.target_names

array(['class_0', 'class_1', 'class_2'], dtype='<U7')

In [7]:
print(wine.DESCR)

.. _wine_dataset:

Wine recognition dataset
------------------------

**Data Set Characteristics:**

    :Number of Instances: 178 (50 in each of three classes)
    :Number of Attributes: 13 numeric, predictive attributes and the class
    :Attribute Information:
 		- Alcohol
 		- Malic acid
 		- Ash
		- Alcalinity of ash  
 		- Magnesium
		- Total phenols
 		- Flavanoids
 		- Nonflavanoid phenols
 		- Proanthocyanins
		- Color intensity
 		- Hue
 		- OD280/OD315 of diluted wines
 		- Proline

    - class:
            - class_0
            - class_1
            - class_2
		
    :Summary Statistics:
    
                                   Min   Max   Mean     SD
    Alcohol:                      11.0  14.8    13.0   0.8
    Malic Acid:                   0.74  5.80    2.34  1.12
    Ash:                          1.36  3.23    2.36  0.27
    Alcalinity of Ash:            10.6  30.0    19.5   3.3
    Magnesium:                    70.0 162.0    99.7  14.3
    Total Phenols:                0

In [8]:
wine.feature_names

['alcohol',
 'malic_acid',
 'ash',
 'alcalinity_of_ash',
 'magnesium',
 'total_phenols',
 'flavanoids',
 'nonflavanoid_phenols',
 'proanthocyanins',
 'color_intensity',
 'hue',
 'od280/od315_of_diluted_wines',
 'proline']

### train 데이터와 test 데이터 분리하기

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(wine_data, 
                                                    wine_label, 
                                                    test_size=0.2, 
                                                    random_state=4)

print('X_train 개수: ', len(X_train),', X_test 개수: ', len(X_test))
print(y_test)

X_train 개수:  142 , X_test 개수:  36
[2 2 0 0 1 2 0 1 0 1 1 0 2 2 0 1 0 1 1 2 1 2 1 2 0 2 1 1 2 2 0 1 0 1 2 2]


In [10]:
X_train.shape, y_train.shape

((142, 13), (142,))

In [11]:
X_test.shape, y_test.shape

((36, 13), (36,))

In [12]:
y_train, y_test

(array([0, 0, 0, 2, 2, 0, 0, 1, 1, 0, 1, 2, 0, 2, 1, 1, 1, 0, 0, 1, 1, 1,
        2, 2, 0, 2, 1, 1, 1, 0, 2, 1, 1, 0, 1, 1, 1, 0, 1, 2, 0, 0, 1, 1,
        2, 2, 2, 1, 0, 0, 2, 0, 0, 1, 0, 1, 0, 1, 2, 2, 1, 1, 1, 1, 1, 0,
        2, 2, 1, 1, 1, 0, 0, 1, 0, 0, 0, 1, 1, 2, 2, 0, 2, 1, 2, 0, 2, 2,
        1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 2, 2, 0, 1, 0, 2, 1, 2, 1, 1, 1, 1,
        0, 2, 0, 2, 1, 0, 2, 0, 1, 0, 2, 0, 0, 2, 0, 1, 0, 0, 2, 0, 0, 1,
        1, 1, 0, 2, 0, 1, 1, 1, 2, 1]),
 array([2, 2, 0, 0, 1, 2, 0, 1, 0, 1, 1, 0, 2, 2, 0, 1, 0, 1, 1, 2, 1, 2,
        1, 2, 0, 2, 1, 1, 2, 2, 0, 1, 0, 1, 2, 2]))

### 모델 학습시키기

In [13]:
from sklearn.tree import DecisionTreeClassifier

decision_tree = DecisionTreeClassifier(random_state=30)
print(decision_tree._estimator_type)

classifier


In [14]:
decision_tree.fit(X_train, y_train)

DecisionTreeClassifier(random_state=30)

### test 데이터로 예측하기

In [15]:
y_pred = decision_tree.predict(X_test)
y_pred

array([2, 0, 0, 0, 1, 2, 0, 1, 0, 1, 1, 0, 2, 2, 0, 1, 0, 1, 1, 2, 1, 2,
       1, 2, 0, 2, 1, 1, 0, 2, 0, 1, 0, 1, 2, 1])

### 실제 답과 비교

In [16]:
y_test

array([2, 2, 0, 0, 1, 2, 0, 1, 0, 1, 1, 0, 2, 2, 0, 1, 0, 1, 1, 2, 1, 2,
       1, 2, 0, 2, 1, 1, 2, 2, 0, 1, 0, 1, 2, 2])

### Accuracy 확인하기

In [17]:
from sklearn.metrics import accuracy_score

accuracy = accuracy_score(y_test, y_pred)
accuracy

0.9166666666666666

### 다른 모델들 활용하기

In [18]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

X_train, X_test, y_train, y_test = train_test_split(wine_data,
                                                    wine_label,
                                                    test_size=0.2,
                                                    random_state=30)

decision_tree = DecisionTreeClassifier(random_state=30)
decision_tree.fit(X_train, y_train)
y_pred = decision_tree.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
accuracy

0.8333333333333334

In [19]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train, y_test = train_test_split(wine_data,
                                                    wine_label,
                                                    test_size=0.2,
                                                    random_state=30)

random_forest = RandomForestClassifier(random_state=30)
random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
accuracy

0.9722222222222222

In [20]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn import svm

X_train, X_test, y_train, y_test = train_test_split(wine_data,
                                                    wine_label,
                                                    test_size=0.2,
                                                    random_state=30)

svm_model = svm.SVC()
svm_model.fit(X_train, y_train)
y_pred=svm_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
accuracy

0.6944444444444444

In [21]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import SGDClassifier

X_train, X_test, y_train, y_test = train_test_split(wine_data,
                                                    wine_label,
                                                    test_size=0.2,
                                                    random_state=30)

sgd_model = SGDClassifier()
sgd_model.fit(X_train,y_train)
y_pred = sgd_model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
accuracy

0.6111111111111112

#### Accuracy 확인해도 정답의 분포에 따라 모델의 성능을 잘 평가하지 못할수 있기 때문에 오차 행렬로 표현해보기

In [22]:
from sklearn.metrics import classification_report
from sklearn.ensemble import RandomForestClassifier

X_train, X_test, y_train, y_test = train_test_split(wine_data, 
                                                    wine_label, 
                                                    test_size=0.2, 
                                                    random_state=21)

random_forest = RandomForestClassifier(random_state=32)
random_forest.fit(X_train, y_train)
y_pred = random_forest.predict(X_test)


svm_model = svm.SVC()
svm_model.fit(X_train, y_train)
y_pred=svm_model.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.86      0.80      0.83        15
           1       0.45      1.00      0.62        10
           2       0.00      0.00      0.00        11

    accuracy                           0.61        36
   macro avg       0.44      0.60      0.48        36
weighted avg       0.48      0.61      0.52        36



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [23]:
from sklearn.metrics import classification_report
from sklearn import svm
X_train, X_test, y_train, y_test = train_test_split(wine_data, 
                                                    wine_label, 
                                                    test_size=0.2, 
                                                    random_state=21)
svm_model = svm.SVC()
svm_model.fit(X_train, y_train)
y_pred=svm_model.predict(X_test)
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.86      0.80      0.83        15
           1       0.45      1.00      0.62        10
           2       0.00      0.00      0.00        11

    accuracy                           0.61        36
   macro avg       0.44      0.60      0.48        36
weighted avg       0.48      0.61      0.52        36



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [24]:
from sklearn.metrics import classification_report
from sklearn.linear_model import SGDClassifier

X_train, X_test, y_train, y_test = train_test_split(wine_data, 
                                                    wine_label, 
                                                    test_size=0.2, 
                                                    random_state=21)

sgd_model = SGDClassifier()
sgd_model.fit(X_train,y_train)
y_pred = sgd_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.67      0.80        15
           1       0.53      1.00      0.69        10
           2       0.57      0.36      0.44        11

    accuracy                           0.67        36
   macro avg       0.70      0.68      0.64        36
weighted avg       0.74      0.67      0.66        36



In [25]:
from sklearn.metrics import classification_report
from sklearn.linear_model import LogisticRegression
X_train, X_test, y_train, y_test = train_test_split(wine_data, 
                                                    wine_label, 
                                                    test_size=0.2, 
                                                    random_state=21)
logistic_model = LogisticRegression()
logistic_model.fit(X_train, y_train)
y_pred = logistic_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        15
           1       1.00      1.00      1.00        10
           2       1.00      1.00      1.00        11

    accuracy                           1.00        36
   macro avg       1.00      1.00      1.00        36
weighted avg       1.00      1.00      1.00        36



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


(6)와인을 분류하는 일은 logistic_model을 사용해야 겠다는 생각이 들었습니다 그 이유는 평가한 모델중에 정확도가 높게 나온 모델이 
   logistic_model이라서 그렇게 생각했습니다.