# iris 데이터로 예측 정확도 추출하기 (예시 : 0.9556)

In [24]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

iris = load_iris()
dt_clf = DecisionTreeClassifier()

iris_data = iris.data
iris_label = iris.target

X_train, X_test, y_train, y_test = train_test_split(iris.data, iris.target)

dt_clf.fit(X_train, y_train)
pred = dt_clf.predict(X_test)
accuracy_score(y_test, pred)

0.9473684210526315

# iris교차검증하여 개별과 평균 검증 정확도 나타내기/5번 검증

In [25]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold
import numpy as np

iris = load_iris()

iris_data = iris.data
iris_label = iris.target

dt_clf = DecisionTreeClassifier()
KFold = KFold(n_splits = 5)
cv_accurancy = []

for train_index, test_index in KFold.split(iris_data):
    X_train, X_test = iris_data[train_index], iris_data[test_index]
    y_train, y_test = iris_label[train_index], iris_label[test_index]
    
    dt_clf.fit(X_train, y_train)
    pred = dt_clf.predict(X_test)
    accurancy = accuracy_score(y_test, pred)
    cv_accurancy.append(accurancy)
    
    print('개별 검증 정확도: ', cv_accurancy)

print('평균 검증 정확도: ', np.mean(cv_accurancy))

개별 검증 정확도:  [1.0]
개별 검증 정확도:  [1.0, 0.9666666666666667]
개별 검증 정확도:  [1.0, 0.9666666666666667, 0.8666666666666667]
개별 검증 정확도:  [1.0, 0.9666666666666667, 0.8666666666666667, 0.9333333333333333]
개별 검증 정확도:  [1.0, 0.9666666666666667, 0.8666666666666667, 0.9333333333333333, 0.8]
평균 검증 정확도:  0.9133333333333333


# 제대로 분배가 아닐땐 뭘 사용?

In [26]:
from sklearn.model_selection import StratifiedKFold

# KFlod보다 더 간단한 방법

In [35]:
from sklearn.model_selection import cross_val_score, cross_validate

iris = load_iris()
iris_data = iris.data
iris_label = iris.target

dt_clf = DecisionTreeClassifier()

score = cross_val_score(dt_clf, data, label, scoring = 'accuracy', cv=3)

np.mean(scores).round(4)

0.9667

# GridSearchCV 테스트 데이터 세트 정확도 구하기

In [44]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV, train_test_split

iris = load_iris()

iris_data = iris.data
iris_label = iris.target

dt_clf = DecisionTreeClassifier()

X_train, X_test, y_train, y_test = train_test_split(iris_data, iris_label, test_size=0.2,
                                                   random_state=121)

grid_parameters = {'max_depth':[1,2,3],
                  'min_samples_split':[2,3]}

grid_tree = GridSearchCV(dt_clf, param_grid=grid_parameters, cv=3, refit=True)
grid_tree.fit(X_train, y_train)

pred = grid_tree.predict(X_test)
accuracy_score(y_test, pred)

0.9666666666666667

# 변수를 숫자식으로 인코딩 하기
  'TV', '냉장고', '전자레인지'

In [58]:
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()

encoder.fit_transform(['TV', '냉장고', '전자레인지'])

array([0, 1, 2], dtype=int64)

# 원-핫 인코딩 만들기 1
items = ['TV','냉장고','전자레인지','컴퓨터','선풍기','선풍기','믹서','믹서']

In [62]:
from sklearn.preprocessing import OneHotEncoder
import numpy as np

items = ['TV','냉장고','전자레인지','컴퓨터','선풍기','선풍기','믹서','믹서']

items = np.array(items).reshape(-1,1)

encoder = OneHotEncoder()

one_hot_labels = encoder.fit_transform(items)
one_hot_labels.toarray()

array([[1., 0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0.],
       [0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 0., 1., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.]])

# array로 바꿀 필요 없이 원-핫 인코딩 만들기2

In [63]:
import pandas as pd

items = ['TV','냉장고','전자레인지','컴퓨터','선풍기','선풍기','믹서','믹서']
pd.get_dummies(items)

Unnamed: 0,TV,냉장고,믹서,선풍기,전자레인지,컴퓨터
0,1,0,0,0,0,0
1,0,1,0,0,0,0
2,0,0,0,0,1,0
3,0,0,0,0,0,1
4,0,0,0,1,0,0
5,0,0,0,1,0,0
6,0,0,1,0,0,0
7,0,0,1,0,0,0


# 근본 표준화 만들기

In [51]:
iris_data = iris.data
iris_df = pd.DataFrame(iris.data)

def standardize(x):
    return (x - x.mean()/x.std())

iris_df.apply(standardize).mean()

0   -1.213269
1   -3.957050
2    1.629181
3   -0.374104
dtype: float64

# 피처 스케일링1 : 한 번에 표준화 하기

In [47]:
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler
import pandas as pd

iris = load_iris()
iris_data = iris.data
pd.DataFrame(iris_data)

scaler = StandardScaler()
scaler.fit(iris_data)

iris_scaler = scaler.transform(iris_data)
iris_scaler

array([[-9.00681170e-01,  1.01900435e+00, -1.34022653e+00,
        -1.31544430e+00],
       [-1.14301691e+00, -1.31979479e-01, -1.34022653e+00,
        -1.31544430e+00],
       [-1.38535265e+00,  3.28414053e-01, -1.39706395e+00,
        -1.31544430e+00],
       [-1.50652052e+00,  9.82172869e-02, -1.28338910e+00,
        -1.31544430e+00],
       [-1.02184904e+00,  1.24920112e+00, -1.34022653e+00,
        -1.31544430e+00],
       [-5.37177559e-01,  1.93979142e+00, -1.16971425e+00,
        -1.05217993e+00],
       [-1.50652052e+00,  7.88807586e-01, -1.34022653e+00,
        -1.18381211e+00],
       [-1.02184904e+00,  7.88807586e-01, -1.28338910e+00,
        -1.31544430e+00],
       [-1.74885626e+00, -3.62176246e-01, -1.34022653e+00,
        -1.31544430e+00],
       [-1.14301691e+00,  9.82172869e-02, -1.28338910e+00,
        -1.44707648e+00],
       [-5.37177559e-01,  1.47939788e+00, -1.28338910e+00,
        -1.31544430e+00],
       [-1.26418478e+00,  7.88807586e-01, -1.22655167e+00,
      

# 피처 스케일링2 : 데이터값을 0,1로 변환하기(표준화)

In [49]:
from sklearn.datasets import load_iris
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import MinMaxScaler

iris = load_iris()
iris_data = iris.data

scaler = MinMaxScaler()
scaler.fit(iris_data)

iris_scaler = scaler.transform(iris_data)
iris_scaler

array([[0.22222222, 0.625     , 0.06779661, 0.04166667],
       [0.16666667, 0.41666667, 0.06779661, 0.04166667],
       [0.11111111, 0.5       , 0.05084746, 0.04166667],
       [0.08333333, 0.45833333, 0.08474576, 0.04166667],
       [0.19444444, 0.66666667, 0.06779661, 0.04166667],
       [0.30555556, 0.79166667, 0.11864407, 0.125     ],
       [0.08333333, 0.58333333, 0.06779661, 0.08333333],
       [0.19444444, 0.58333333, 0.08474576, 0.04166667],
       [0.02777778, 0.375     , 0.06779661, 0.04166667],
       [0.16666667, 0.45833333, 0.08474576, 0.        ],
       [0.30555556, 0.70833333, 0.08474576, 0.04166667],
       [0.13888889, 0.58333333, 0.10169492, 0.04166667],
       [0.13888889, 0.41666667, 0.06779661, 0.        ],
       [0.        , 0.41666667, 0.01694915, 0.        ],
       [0.41666667, 0.83333333, 0.03389831, 0.04166667],
       [0.38888889, 1.        , 0.08474576, 0.125     ],
       [0.30555556, 0.79166667, 0.05084746, 0.125     ],
       [0.22222222, 0.625     ,

# 근본 정규식 표현과 람다식을 이용한 표현을 만들기

In [55]:
# 근본 정규식
from sklearn.datasets import load_iris

iris = load_iris()
iris_data = iris.data
iris_df = pd.DataFrame(iris_data)

def min_max(x):
    return (x - x.min()) / (x.max() - x.min())

iris_df.apply(min_max)

Unnamed: 0,0,1,2,3
0,0.222222,0.625,0.067797,0.041667
1,0.166667,0.416667,0.067797,0.041667
2,0.111111,0.5,0.050847,0.041667
3,0.083333,0.458333,0.084746,0.041667
4,0.194444,0.666667,0.067797,0.041667
5,0.305556,0.791667,0.118644,0.125
6,0.083333,0.583333,0.067797,0.083333
7,0.194444,0.583333,0.084746,0.041667
8,0.027778,0.375,0.067797,0.041667
9,0.166667,0.458333,0.084746,0.0


In [56]:
# 람다식
iris_df.apply(lambda x : (x - x.min()) / (x.max() - x.min()))

Unnamed: 0,0,1,2,3
0,0.222222,0.625,0.067797,0.041667
1,0.166667,0.416667,0.067797,0.041667
2,0.111111,0.5,0.050847,0.041667
3,0.083333,0.458333,0.084746,0.041667
4,0.194444,0.666667,0.067797,0.041667
5,0.305556,0.791667,0.118644,0.125
6,0.083333,0.583333,0.067797,0.083333
7,0.194444,0.583333,0.084746,0.041667
8,0.027778,0.375,0.067797,0.041667
9,0.166667,0.458333,0.084746,0.0
