In [5]:
from sklearn.preprocessing import PolynomialFeatures
import numpy as np

# Build a small 2x2 matrix of plain (degree-1) features to expand.
x = np.arange(4).reshape(2,2)

# Expand to degree-2 polynomial features, fitting and transforming in one call.
poly = PolynomialFeatures(degree=2)
poly_f = poly.fit_transform(x)
print(f"변환 이전 단항식 계수 feature:\n {x}")
print()
print(f"변환된 2차 다항식 계수 feature:\n {poly_f}")

변환 이전 단항식 계수 feature:
 [[0 1]
 [2 3]]

변환된 2차 다항식 계수 feature:
 [[1. 0. 1. 0. 0. 1.]
 [1. 2. 3. 4. 6. 9.]]


# Pipeline + LinearRegression을 이용한 다항회귀 구현

In [10]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline

def polynomial_func(X):
    """Evaluate the cubic target 1 + 2*x0 + 3*x0**2 + 4*x1**3 for each row.

    Parameters
    ----------
    X : 2-D array-like supporting ``X[:, i]`` indexing
        Column 0 is used as ``x0`` and column 1 as ``x1``.

    Returns
    -------
    One value per row of ``X``.
    """
    first_col = X[:, 0]
    second_col = X[:, 1]
    return 1 + 2 * first_col + 3 * first_col ** 2 + 4 * second_col ** 3

# Chain the degree-3 polynomial expansion and the linear regression into a
# single estimator so both steps run with one fit call.
model = Pipeline([
    ('poly', PolynomialFeatures(degree=3)),
    ('linear', LinearRegression()),
])
x = np.arange(4).reshape(2,2)
y = polynomial_func(x)

# fit() returns the pipeline itself, so there is no need to rebind `model`.
model.fit(x, y)

# Pull the regression step back out of the pipeline by the name it was given
# above to inspect the fitted coefficients.
# NOTE: only two samples are fitted against ten polynomial terms, so the
# fitted coefficients need not match the ones used in polynomial_func.
print('Polynominal 회귀계수:\n',
      np.round(model.named_steps['linear'].coef_, 2))

Polynominal 회귀계수:
 [0.   0.18 0.18 0.36 0.54 0.72 0.72 1.08 1.62 2.34]


In [16]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error , r2_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.datasets import load_boston
import pandas as pd
import numpy as np

# Load the Boston housing dataset.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell only runs on scikit-learn < 1.2; a different data source is
# needed on newer versions.
boston = load_boston()

# Wrap the feature matrix in a DataFrame with named columns.
bostonDF = pd.DataFrame(boston.data , columns = boston.feature_names)

# The dataset's target array is the house price; attach it as the PRICE column.
bostonDF['PRICE'] = boston.target
print('Boston 데이타셋 크기 :',bostonDF.shape)

# Split the frame back into the regression target and the feature matrix.
y_target = bostonDF['PRICE']
X_data = bostonDF.drop(['PRICE'],axis=1,inplace=False)


# Hold out 30% of the rows for evaluation; the seed is fixed for reproducibility.
X_train , X_test , y_train , y_test = train_test_split(X_data , y_target ,test_size=0.3, random_state=156)

# Polynomial regression as a pipeline: degree-3 feature expansion followed by
# ordinary least squares.
# include_bias controls whether the constant (bias) column is generated; it is
# turned off here because LinearRegression fits its own intercept by default.
# NOTE(review): the recorded run of this cell shows a strongly negative R2 on
# the test split, i.e. the degree-3 expansion overfits badly on this data.
p_model = Pipeline([('poly',PolynomialFeatures(degree=3,
                                              include_bias=False)),
                   ('linear',LinearRegression())])
p_model.fit(X_train, y_train)
y_preds = p_model.predict(X_test)

# Evaluation metrics on the held-out split.
mse = mean_squared_error(y_test, y_preds)
rmse = np.sqrt(mse)
# Bind the score to `r2`, not `r2_score`: reusing the imported function's name
# would replace it with a float, so re-running this cell (or any later call to
# r2_score) would fail with "object is not callable".
r2 = r2_score(y_test, y_preds)

print(f"MSE : {mse:.4f}")
print(f"RMSE : {rmse:.4f}")
print(f"Variance Score(R2 score) {r2:.4f}")

Boston 데이타셋 크기 : (506, 14)
MSE : 79625.5938
RMSE : 282.1801
Variance Score(R2 score) -1116.5979
