In [None]:
# Jupyter notebook environment setup
import warnings

# Silence library warnings so the rendered notebook output stays readable.
warnings.filterwarnings('ignore')
warnings.simplefilter('ignore')

# Render inline figures at retina (2x) resolution.
# IPython.display.set_matplotlib_formats is deprecated, so the inline backend
# is configured through the equivalent magics instead. They are written as
# get_ipython() calls (what `%matplotlib inline` / `%config ...` expand to) so
# this cell stays valid plain Python. Activating the inline backend first
# makes the InlineBackend option available to `%config`.
get_ipython().run_line_magic('matplotlib', 'inline')
get_ipython().run_line_magic('config', "InlineBackend.figure_format = 'retina'")

# display / HTML belong to the public IPython.display module; importing them
# from IPython.core.display is deprecated.
from IPython.display import display, HTML

# Make notebook text bold and widen the page container to 98% of the window.
# (A font-family rule such as 'Malgun Gothic' can be added to the first style
# block if a specific font is wanted.)
display(HTML("<style>.container { font-weight: bold !important;}</style>"))
display(HTML("<style>.container { width: 98% !important; }</style>"))

In [None]:
import numpy as np
import pandas as pd
import os

import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

# 관련 라이브러리 임포트 
import matplotlib.font_manager as fm

# Global matplotlib defaults, applied in a single update call.
# plt.rcParams and matplotlib.rcParams are the same object, so one dict covers
# every setting.
plt.rcParams.update({
    # Base font size for all text in figures.
    'font.size': 11.0,
    # Switch to a Korean font so Hangul labels render
    # (alternative tried earlier: 'batang').
    'font.family': 'Malgun Gothic',
    # Work around the minus sign rendering as a broken glyph in graphs.
    'axes.unicode_minus': False,
    # Default figure size.
    'figure.figsize': [10, 6],
})

# Pipeline 

- https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html
- 데이터 사전 처리 및 분류의 모든 단계를 포함하는 단일 객체를 만들 때 사용한다. 
- train 데이터와 test 데이터 사이의 정보 누수(data leakage)를 피할 수 있다.
- 교차 검증 및 기타 모델 선택 유형을 쉽게 만든다.
- from sklearn.pipeline import Pipeline

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression, LogisticRegression

from sklearn.metrics import f1_score, recall_score, precision_score, classification_report
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

from sklearn.preprocessing import PolynomialFeatures, StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV

from sklearn.pipeline import Pipeline

In [None]:
# List every attribute and method name available on the Pipeline class.
pipeline_attrs = dir(Pipeline)
print(pipeline_attrs)

### 스케일 + 모델화 

In [None]:
from sklearn.datasets import load_iris

# Load the bundled iris dataset and show which fields it provides.
iris_data = load_iris()
print(iris_data.keys())

# Feature matrix and class labels, plus the readable names for both.
X, y = iris_data.data, iris_data.target
feature_names, target_names = iris_data.feature_names, iris_data.target_names

for names in (feature_names, target_names):
    print(names)

In [None]:
# Manual (non-Pipeline) workflow on iris: split -> scale -> fit -> evaluate.
#
# Split BEFORE scaling so the scaler only learns mean/std from the training
# rows; fitting it on all of X would leak test-set statistics into the
# preprocessing step.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, random_state=111, stratify=y)

scale = StandardScaler()
scale.fit(X_train_raw)
X_train = scale.transform(X_train_raw)
X_test = scale.transform(X_test_raw)
# Full data set transformed with the train-fitted scaler; kept so the
# X_scale name this cell has always defined is still available.
X_scale = scale.transform(X)

# X_test is now assigned before it is printed (printing it ahead of the split
# raised NameError on a fresh kernel).
print(X_test[:3])

model = DecisionTreeClassifier()
model.fit(X_train, y_train)
# Train accuracy, then test accuracy.
print(model.score(X_train, y_train) , model.score(X_test, y_test))
print()
print(classification_report(y_test, model.predict(X_test)))

In [None]:
# Display the first three rows of the unscaled iris feature matrix.
X[0:3]

In [None]:
# The same iris workflow expressed as a Pipeline: the raw features are split,
# and scaling + the decision tree are fitted together on the training rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, stratify=y, random_state=111
)

iris_steps = [
    ('scale', StandardScaler()),
    ('model_dt', DecisionTreeClassifier(max_depth=3)),
]
pipe = Pipeline(steps=iris_steps)
print(pipe)

pipe.fit(X_train, y_train)
print(pipe.score(X_test, y_test))

# Predict once and reuse the result for both the printout and the F1 score.
iris_pred = pipe.predict(X_test)
print(iris_pred)
print(pipe.get_params())
print(f1_score(y_test, iris_pred, average='macro'))


### 스케일 + Polynomial + 모델화 

In [None]:
# Perch lengths used as the single input feature for the regression example.
# reshape(-1, 1) turns the flat list into a 2-D column (one row per fish, one
# column), the layout scikit-learn estimators expect for X.
# NOTE(review): units are not stated here (presumably cm) — confirm against the data source.
perch_length = np.array(
    [8.4, 13.7, 15.0, 16.2, 17.4, 18.0, 18.7, 19.0, 19.6, 20.0, 
     21.0, 21.0, 21.0, 21.3, 22.0, 22.0, 22.0, 22.0, 22.0, 22.5, 
     22.5, 22.7, 23.0, 23.5, 24.0, 24.0, 24.6, 25.0, 25.6, 26.5, 
     27.3, 27.5, 27.5, 27.5, 28.0, 28.7, 30.0, 32.8, 34.5, 35.0, 
     36.5, 36.0, 37.0, 37.0, 39.0, 39.0, 39.0, 40.0, 40.0, 40.0, 
     40.0, 42.0, 43.0, 43.0, 43.5, 44.0]
     ).reshape(-1, 1)

# Perch weights used as the regression target, in the same order as
# perch_length. Also reshaped to a single column, so predictions made from it
# come back as a 2-D (n, 1) array.
# NOTE(review): units are not stated here (presumably g) — confirm against the data source.
perch_weight = np.array(
    [5.9, 32.0, 40.0, 51.5, 70.0, 100.0, 78.0, 80.0, 85.0, 85.0, 
     110.0, 115.0, 125.0, 130.0, 120.0, 120.0, 130.0, 135.0, 110.0, 
     130.0, 150.0, 145.0, 150.0, 170.0, 225.0, 145.0, 188.0, 180.0, 
     197.0, 218.0, 300.0, 260.0, 265.0, 250.0, 250.0, 300.0, 320.0, 
     514.0, 556.0, 840.0, 685.0, 700.0, 700.0, 690.0, 900.0, 650.0, 
     820.0, 850.0, 900.0, 1015.0, 820.0, 1100.0, 1000.0, 1100.0, 
     1000.0, 1000.0]
     ).reshape(-1, 1)

In [None]:
# Manual (non-Pipeline) workflow on the perch data:
# split -> scale -> polynomial features -> linear regression.
#
# The split uses perch_length / perch_weight directly. The target is left
# unscaled: the previous split referenced `y_scale`, which was never defined
# (NameError), and the feature scaler must not be reused on the target.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(perch_length, perch_weight, random_state=111)

# Fit the scaler on the training rows only so no test-set statistics leak
# into preprocessing, then apply it to both parts.
scale = StandardScaler()
scale.fit(X_train_raw)
X_train = scale.transform(X_train_raw)
X_test = scale.transform(X_test_raw)
# Full feature column transformed with the train-fitted scaler; kept so the
# X_scale name this cell has always defined is still available.
X_scale = scale.transform(perch_length)

# X_test now refers to the perch test rows (printing it before the split
# showed whatever X_test an earlier cell had left behind).
print(X_test[:3])

# Expand the scaled length into degree-3 polynomial terms (no bias column;
# LinearRegression fits its own intercept).
poly = PolynomialFeatures(degree=3, include_bias=False)
poly.fit(X_train)
X_train_poly = poly.transform(X_train)
X_test_poly = poly.transform(X_test)

model_lr = LinearRegression()
model_lr.fit(X_train_poly, y_train)
print()
# R^2 on the held-out rows.
print(model_lr.score(X_test_poly, y_test))
print()
print(model_lr.predict(X_test_poly))


In [None]:
# The perch regression expressed as a Pipeline: scaling, polynomial expansion
# and the linear model are fitted together on the raw training rows.
X_train, X_test, y_train, y_test = train_test_split(
    perch_length, perch_weight, random_state=111
)

perch_steps = [
    ('scale', StandardScaler()),
    ('poly', PolynomialFeatures(degree=3, include_bias=False)),
    ('model_lr', LinearRegression()),
]
pipe2 = Pipeline(steps=perch_steps)
print(pipe2.get_params())

pipe2.fit(X_train, y_train)
# R^2 on the held-out rows, followed by the predicted weights.
print(pipe2.score(X_test, y_test))
print(pipe2.predict(X_test))
