<a href="https://colab.research.google.com/github/yyj0128/MachineLearning/blob/main/Chapter7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **ch7. 모형평가**



## 7.3 파이프라인

In [8]:
#### 파이프라인 사용 전 ###

from sklearn import datasets
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

raw_boston = datasets.load_boston()


X = raw_boston.data
y = raw_boston.target

# 훈련/테스트 데이터 분할
X_tn, X_te, y_tn, y_te = train_test_split(X, y, random_state=7)

# 표준화 스케일링
std_scale = StandardScaler()
X_tn_std = std_scale.fit_transform(X_tn)
X_te_std = std_scale.transform(X_te)

# 모델 생성
lr = LinearRegression()
lr.fit(X_tn_std, y_tn)

# 예측
pred_lr = lr.predict(X_te_std)

# 평가
mean_squared_error(y_te, pred_lr)


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

29.515137790197567

In [9]:
### 파이프라인 사용 후 ###

# 훈련/테스트 분할
X_tn, X_te, y_tn, y_te = train_test_split(X, y, random_state=7)

# 파이프라인
lr_pipeline = Pipeline([
                            ('scaler', StandardScaler()),
                            ('linear_regression', LinearRegression())
])

# 모델 학습
lr_pipeline.fit(X_tn, y_tn)

# 예측
pred_lr2 = lr_pipeline.predict(X_te)

# 평가
mean_squared_error(y_te, pred_lr2)

29.515137790197567

## 7.4 그리드서치


In [10]:
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# 데이터 불러오기
raw_iris = datasets.load_iris()

# 피처/타깃 분리
X = raw_iris.data
y = raw_iris.target

# 훈련/테스트 데이터 분할
X_tn, X_te, y_tn, y_te = train_test_split(X, y, random_state=7)


# 표준화 스케일
std_scale = StandardScaler()
std_scale.fit(X_tn)
X_tn_std = std_scale.transform(X_tn)
X_te_std = std_scale.transform(X_te)


best_accuracy = 0

for k in [1,2,3,4,5,6,7,8,9,10]:
  knn = KNeighborsClassifier(n_neighbors=k)
  knn.fit(X_tn, y_tn)
  pred_knn = knn.predict(X_te_std)
  accuracy = accuracy_score(y_te, pred_knn)
  if accuracy > best_accuracy:
    best_accuracy = accuracy
    final_k = {'k' : k}

print(final_k)
print(accuracy)

{'k': 1}
0.2894736842105263
