# Scikit-Learn 맛보기
## Iris 분류 - 결정트리, 서포트벡터머신(SVM), 로지스틱 회귀

### 1. 데이터 전처리

In [1]:
# Load the Iris dataset that ships with scikit-learn.
from sklearn.datasets import load_iris
iris = load_iris()

In [2]:
# Check what kind of container load_iris() returned.
type(iris)

sklearn.utils.Bunch

In [3]:
# Feature data - the X values
iris.data.shape

(150, 4)

In [4]:
# Feature names - used further down as the column labels for iris.data
iris.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [5]:
# Target data - the y values
iris.target.shape

(150,)

In [6]:
import pandas as pd

# Build a table with one named column per measurement, append the class
# label as a trailing "target" column, and preview the first rows.
df = pd.DataFrame(data=iris.data, columns=iris.feature_names).assign(
    target=iris.target
)
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [7]:
# Target (class) names
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [8]:
# Description - print the descriptive text bundled with the dataset
print(iris.DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

### 2. 학습용 데이터와 테스트용 데이터 분리

In [9]:
# First split: only random_state is fixed; neither test_size nor stratify is
# passed. The recorded outputs below show a 112/38 row split with uneven
# class counts in the training part.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, random_state=2021
)

In [10]:
# Confirm the sizes of the four arrays produced by the split.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((112, 4), (38, 4), (112,), (38,))

In [11]:
# Peek at the held-out labels.
y_test

array([0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 2, 2, 1, 2, 1, 1, 0, 1, 1, 2,
       2, 0, 2, 1, 1, 1, 0, 0, 1, 1, 0, 2, 1, 1, 2, 0])

In [12]:
# Count how many training samples fall into each class.
import numpy as np
np.unique(y_train, return_counts=True)

(array([0, 1, 2]), array([34, 36, 42], dtype=int64))

In [14]:
# Re-split with an explicit 20% test fraction, stratified on the labels so
# the classes stay balanced in the training part (see the counts below).
# This overwrites the arrays from the first split.
X_train, X_test, y_train, y_test = train_test_split(
    iris.data, iris.target, random_state=2021,
    test_size=0.2, stratify=iris.target
)

In [15]:
# Re-check the per-class counts in the training set after the stratified split.
np.unique(y_train, return_counts=True)

(array([0, 1, 2]), array([40, 40, 40], dtype=int64))

### 3. 학습

- Decision Tree

In [16]:
# Decision-tree classifier from scikit-learn.
from sklearn.tree import DecisionTreeClassifier

In [17]:
# Create the model - this instantiates the object; the arguments in the
# parentheses are the hyperparameters
dtc = DecisionTreeClassifier(random_state=2021)

In [18]:
# Train the model on the training split
dtc.fit(X_train, y_train)

DecisionTreeClassifier(random_state=2021)

### 4. 예측

In [19]:
# Predict class labels for the held-out test features.
pred_dt = dtc.predict(X_test)

In [20]:
# Put the true test labels next to the decision-tree predictions.
# NOTE: this rebinds `df`, replacing the feature table built earlier.
df = pd.DataFrame(dict(y=y_test, DT=pred_dt))
df.head()

Unnamed: 0,y,DT
0,0,0
1,1,1
2,1,2
3,2,2
4,0,0


### 5. 평가 - 예측값과 실제값을 비교해서 모델 성능 평가

In [21]:
# Compare the decision-tree predictions with the true test labels.
from sklearn.metrics import accuracy_score
accuracy_score(y_test, pred_dt)

0.9

In [22]:
# Perform steps 4 and 5 (predict and evaluate) in a single call
dtc.score(X_test, y_test)

0.9

### SVM(Support Vector Machine)으로 3,4,5 과정을 수행

In [25]:
# Same workflow with a support vector classifier: create it, then fit it on
# the training split.
from sklearn.svm import SVC
svc = SVC(random_state=2021)
svc.fit(X_train, y_train)

SVC(random_state=2021)

In [26]:
# Predict with the fitted SVC and score the predictions against the test labels.
pred_sv = svc.predict(X_test)
accuracy_score(y_test, pred_sv)

0.9

### Logistic Regression으로 3,4,5 과정을 수행

In [27]:
# Logistic regression with default settings on the raw features. The recorded
# output below shows the solver stopping at its iteration limit.
from sklearn.linear_model import LogisticRegression
lrc = LogisticRegression(random_state=2021)
lrc.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression(random_state=2021)

In [28]:
# Solution 1 - raise the max_iter hyperparameter
lrc2 = LogisticRegression(random_state=2021, max_iter=1000)
lrc2.fit(X_train, y_train)

LogisticRegression(max_iter=1000, random_state=2021)

In [33]:
# Evaluate the max_iter=1000 model on the test split.
pred_lr2 = lrc2.predict(X_test)
accuracy_score(y_test, pred_lr2)

0.9333333333333333

In [30]:
# Solution 2 - standardize the features before fitting.
# The scaler's statistics are learned from the training rows only, so nothing
# from the held-out test rows leaks into preprocessing. The whole array is
# then transformed with those training statistics; splitting it with the same
# random_state / test_size / stratify arguments as the stratified split above
# reproduces the same train/test rows.
# NOTE: outputs recorded below were produced with a scaler fit on all rows;
# re-run the following cells to refresh them.
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X_train)
iris_std = scaler.transform(iris.data)

In [31]:
# Split the standardized features using the same random_state, test_size and
# stratify arguments as the earlier stratified split.
X_train2, X_test2, y_train2, y_test2 = train_test_split(
    iris_std, iris.target, random_state=2021, test_size=0.2, stratify=iris.target
)

In [32]:
# Default LogisticRegression again, this time trained on the standardized features.
lrc3 = LogisticRegression(random_state=2021)
lrc3.fit(X_train2, y_train2)

LogisticRegression(random_state=2021)

In [36]:
# Evaluate the standardized-feature model on its own test split.
pred_lr3 = lrc3.predict(X_test2)
accuracy_score(y_test2, pred_lr3)

0.9

### 예측결과 비교

In [37]:
# Collect every model's test predictions side by side. y2 holds the labels
# from the standardized split that LR3 is evaluated against.
df = df.assign(SVC=pred_sv, LR2=pred_lr2, y2=y_test2, LR3=pred_lr3)
df

Unnamed: 0,y,DT,SVC,LR2,y2,LR3
0,0,0,0,0,0,0
1,1,1,1,1,1,1
2,1,2,1,1,1,2
3,2,2,2,2,2,2
4,0,0,0,0,0,0
5,1,1,1,1,1,1
6,0,0,0,0,0,0
7,1,1,1,1,1,1
8,2,2,2,2,2,2
9,0,0,0,0,0,0
