# iris 분류
## - test비율 20%, random_state 2021
## - MinMaxScaler 적용
## - cv=5
## - 결정트리: max_depth, min_samples_split
## - 최적의 파라미터와 정확도 계산

In [2]:
import numpy as np
import pandas as pd

In [3]:
# Load the bundled iris dataset; the rest of the notebook reads
# iris.data, iris.target and iris.feature_names from this object.
from sklearn.datasets import load_iris
iris = load_iris()

In [4]:
# Show the names of the feature columns.
iris.feature_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

## step1) minmaxscaler 적용해서 전처리

In [9]:
# Rescale every feature with MinMaxScaler.
# NOTE(review): the scaler is fitted on all of iris.data, i.e. before any
# train/test split, so test rows contribute to the fitted min/max.
from sklearn.preprocessing import MinMaxScaler
mm_scaler = MinMaxScaler().fit(iris.data)
iris_scaled = mm_scaler.transform(iris.data)

In [10]:
# Wrap the scaled array in a DataFrame labelled with the original feature
# names and preview the first rows.
df2 = pd.DataFrame(iris_scaled, columns=iris.feature_names)
df2.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,0.222222,0.625,0.067797,0.041667
1,0.166667,0.416667,0.067797,0.041667
2,0.111111,0.5,0.050847,0.041667
3,0.083333,0.458333,0.084746,0.041667
4,0.194444,0.666667,0.067797,0.041667


## step2) 학습용 데이터와 테스트용 데이터 분리

In [22]:
from sklearn.model_selection import train_test_split
# Split the MinMax-scaled features (not the raw iris.data) so that the
# preprocessing from step 1 is actually what the model is trained on.
# 20% of the rows are held out for testing; the seed makes the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    iris_scaled, iris.target, test_size=0.2, random_state=2021
)

In [12]:
# Check the shapes of the train/test features and labels.
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((120, 4), (30, 4), (120,), (30,))

In [13]:
# Inspect the held-out test labels.
y_test

array([1, 0, 0, 1, 2, 2, 1, 2, 1, 0, 0, 0, 0, 0, 2, 0, 1, 2, 1, 1, 1, 2,
       1, 2, 1, 2, 0, 1, 2, 1])

## step3) 머신러닝 알고리즘 적용해 학습

In [33]:
# Decision tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
# GridSearchCV performs model fitting and hyper-parameter tuning together

In [35]:
# Hyper-parameter grid handed to GridSearchCV: candidate values for the
# tree's max_depth and min_samples_split.
params = dict(
    max_depth=[2, 4, 6],
    min_samples_split=[2, 3, 4],
)

In [37]:
# Create the model object; the constructor arguments are its hyper-parameters.
dtc = DecisionTreeClassifier(random_state=2021)
# Wrap the tree in a 5-fold grid search over `params`, scored by accuracy.
grid_dtc = GridSearchCV(
    estimator=dtc,
    param_grid=params,
    scoring='accuracy',
    cv=5,
)

## step4) 예측값과 실제값 비교해 모델 성능 평가

In [38]:
# Run the grid search on the training split, then show the best
# cross-validation score it found.
grid_dtc.fit(X_train,y_train)
grid_dtc.best_score_

0.9416666666666667

In [39]:
# Parameter combination selected by the grid search.
grid_dtc.best_params_

{'max_depth': 2, 'min_samples_split': 2}

In [40]:
# Keep a handle on the best estimator found by the search.
estimator = grid_dtc.best_estimator_

In [41]:
# Score the best estimator on the held-out test split.
from sklearn.metrics import accuracy_score
pred = estimator.predict(X_test)
accuracy_score(y_true=y_test, y_pred=pred)

0.9333333333333333

# one-hot encoding

In [42]:
# Shape of the raw target vector before one-hot encoding.
iris.target.shape

(150,)

In [43]:
# One-hot encode the class labels. The labels are reshaped into a single
# 2-D column before being passed to the encoder; later cells call
# .toarray() on the result, which is a sparse matrix.
from sklearn.preprocessing import OneHotEncoder
encoder = OneHotEncoder()
target_column = iris.target.reshape(-1, 1)
classes = encoder.fit_transform(target_column)
classes.shape

(150, 3)

In [44]:
# Stratified split of the scaled features and the one-hot targets.
# The targets are 2-D here, hence the capital Y in the variable names.
# stratify also accepts the 1-D labels (iris.target).
X_train, X_test, Y_train, Y_test = train_test_split(
    iris_scaled,
    classes,
    test_size=0.2,
    random_state=2021,
    stratify=classes.toarray(),
)

In [45]:
# Check the shapes again; the targets now have one column per class.
X_train.shape, X_test.shape, Y_train.shape, Y_test.shape

((120, 4), (30, 4), (120, 3), (30, 3))

In [46]:
# Preview the first five one-hot rows as a dense array.
Y_test[:5].toarray()

array([[0., 0., 1.],
       [0., 1., 0.],
       [0., 1., 0.],
       [1., 0., 0.],
       [0., 0., 1.]])

In [47]:
# Displaying the slice directly only shows the sparse-matrix summary.
Y_test[:5]
# Do not use the sparse matrix as-is; call .toarray()

<5x3 sparse matrix of type '<class 'numpy.float64'>'
	with 5 stored elements in Compressed Sparse Row format>

In [49]:
# Fresh tree and grid search for the one-hot-target experiment, reusing
# the same parameter grid, fold count and scoring.
dtc = DecisionTreeClassifier(random_state=2021)
grid_dtc = GridSearchCV(
    estimator=dtc,
    param_grid=params,
    scoring='accuracy',
    cv=5,
)

In [50]:
# Fit the grid search on the one-hot targets, converted to a dense array.
grid_dtc.fit(X_train, Y_train.toarray())

GridSearchCV(cv=5, estimator=DecisionTreeClassifier(random_state=2021),
             param_grid={'max_depth': [2, 4, 6],
                         'min_samples_split': [2, 3, 4]},
             scoring='accuracy')

In [51]:
# Predict with the best estimator found by the search. Calling `estimator`
# directly matches the earlier label-encoded experiment and puts the
# variable to use; with GridSearchCV's default refit, grid_dtc.predict
# delegates to this same fitted model, so the predictions are unchanged.
estimator = grid_dtc.best_estimator_
pred = estimator.predict(X_test)

In [52]:
# Predictions are one-hot rows, one column per class.
pred.shape

(30, 3)

In [53]:
# Compare dense one-hot targets with the predictions.
# NOTE(review): for 2-D indicator targets scikit-learn documents this as
# subset accuracy (a row counts only if every column matches) — confirm
# this is the intended metric.
accuracy_score(Y_test.toarray(),pred)

0.9333333333333333