In [1]:
from sklearn.datasets import load_boston
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

import pandas as pd

In [2]:
# Load the Boston housing data and inspect the raw feature matrix, its shape
# and the feature names. NOTE: load_boston emits the deprecation / ethics
# warning that appears in this cell's output.
boston_dataset = load_boston()
print(
    boston_dataset.data,
    "",  # blank separator line between the matrix and its shape
    boston_dataset.data.shape,
    boston_dataset.feature_names,
    sep="\n",
)

[[6.3200e-03 1.8000e+01 2.3100e+00 ... 1.5300e+01 3.9690e+02 4.9800e+00]
 [2.7310e-02 0.0000e+00 7.0700e+00 ... 1.7800e+01 3.9690e+02 9.1400e+00]
 [2.7290e-02 0.0000e+00 7.0700e+00 ... 1.7800e+01 3.9283e+02 4.0300e+00]
 ...
 [6.0760e-02 0.0000e+00 1.1930e+01 ... 2.1000e+01 3.9690e+02 5.6400e+00]
 [1.0959e-01 0.0000e+00 1.1930e+01 ... 2.1000e+01 3.9345e+02 6.4800e+00]
 [4.7410e-02 0.0000e+00 1.1930e+01 ... 2.1000e+01 3.9690e+02 7.8800e+00]]

(506, 13)
['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']



    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

In [3]:
# Expand the raw features into every polynomial term up to degree 2
# (bias column, the original features, squares and pairwise products).
polynomial_transformer = PolynomialFeatures(degree=2)
polynomial_data = polynomial_transformer.fit_transform(boston_dataset.data)
print(polynomial_data, polynomial_data.shape, sep="\n")

[[1.00000000e+00 6.32000000e-03 1.80000000e+01 ... 1.57529610e+05
  1.97656200e+03 2.48004000e+01]
 [1.00000000e+00 2.73100000e-02 0.00000000e+00 ... 1.57529610e+05
  3.62766600e+03 8.35396000e+01]
 [1.00000000e+00 2.72900000e-02 0.00000000e+00 ... 1.54315409e+05
  1.58310490e+03 1.62409000e+01]
 ...
 [1.00000000e+00 6.07600000e-02 0.00000000e+00 ... 1.57529610e+05
  2.23851600e+03 3.18096000e+01]
 [1.00000000e+00 1.09590000e-01 0.00000000e+00 ... 1.54802902e+05
  2.54955600e+03 4.19904000e+01]
 [1.00000000e+00 4.74100000e-02 0.00000000e+00 ... 1.57529610e+05
  3.12757200e+03 6.20944000e+01]]
(506, 105)


In [4]:
# Human-readable names for every generated polynomial column
# (e.g. '1', 'CRIM', 'CRIM^2', 'CRIM ZN', ...).
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out` is the supported replacement. It returns an ndarray,
# so it is converted back to a plain list to keep this variable's type (and
# the displayed output) the same as before.
polynomial_feature_names = list(
    polynomial_transformer.get_feature_names_out(boston_dataset.feature_names)
)
polynomial_feature_names



['1',
 'CRIM',
 'ZN',
 'INDUS',
 'CHAS',
 'NOX',
 'RM',
 'AGE',
 'DIS',
 'RAD',
 'TAX',
 'PTRATIO',
 'B',
 'LSTAT',
 'CRIM^2',
 'CRIM ZN',
 'CRIM INDUS',
 'CRIM CHAS',
 'CRIM NOX',
 'CRIM RM',
 'CRIM AGE',
 'CRIM DIS',
 'CRIM RAD',
 'CRIM TAX',
 'CRIM PTRATIO',
 'CRIM B',
 'CRIM LSTAT',
 'ZN^2',
 'ZN INDUS',
 'ZN CHAS',
 'ZN NOX',
 'ZN RM',
 'ZN AGE',
 'ZN DIS',
 'ZN RAD',
 'ZN TAX',
 'ZN PTRATIO',
 'ZN B',
 'ZN LSTAT',
 'INDUS^2',
 'INDUS CHAS',
 'INDUS NOX',
 'INDUS RM',
 'INDUS AGE',
 'INDUS DIS',
 'INDUS RAD',
 'INDUS TAX',
 'INDUS PTRATIO',
 'INDUS B',
 'INDUS LSTAT',
 'CHAS^2',
 'CHAS NOX',
 'CHAS RM',
 'CHAS AGE',
 'CHAS DIS',
 'CHAS RAD',
 'CHAS TAX',
 'CHAS PTRATIO',
 'CHAS B',
 'CHAS LSTAT',
 'NOX^2',
 'NOX RM',
 'NOX AGE',
 'NOX DIS',
 'NOX RAD',
 'NOX TAX',
 'NOX PTRATIO',
 'NOX B',
 'NOX LSTAT',
 'RM^2',
 'RM AGE',
 'RM DIS',
 'RM RAD',
 'RM TAX',
 'RM PTRATIO',
 'RM B',
 'RM LSTAT',
 'AGE^2',
 'AGE DIS',
 'AGE RAD',
 'AGE TAX',
 'AGE PTRATIO',
 'AGE B',
 'AGE LSTAT',
 'DI

In [5]:
# Input features: the polynomial feature matrix as a labelled DataFrame.
X = pd.DataFrame(
    data=polynomial_data,
    columns=polynomial_feature_names,
)
X

Unnamed: 0,1,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,...,TAX^2,TAX PTRATIO,TAX B,TAX LSTAT,PTRATIO^2,PTRATIO B,PTRATIO LSTAT,B^2,B LSTAT,LSTAT^2
0,1.0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,...,87616.0,4528.8,117482.40,1474.08,234.09,6072.570,76.194,157529.6100,1976.5620,24.8004
1,1.0,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,...,58564.0,4307.6,96049.80,2211.88,316.84,7064.820,162.692,157529.6100,3627.6660,83.5396
2,1.0,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,...,58564.0,4307.6,95064.86,975.26,316.84,6992.374,71.734,154315.4089,1583.1049,16.2409
3,1.0,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,...,49284.0,4151.4,87607.86,652.68,349.69,7379.581,54.978,155732.8369,1160.2122,8.6436
4,1.0,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,...,49284.0,4151.4,88111.80,1183.26,349.69,7422.030,99.671,157529.6100,2115.4770,28.4089
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,1.0,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,...,74529.0,5733.0,107013.27,2639.91,441.00,8231.790,203.070,153656.1601,3790.5433,93.5089
502,1.0,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,...,74529.0,5733.0,108353.70,2478.84,441.00,8334.900,190.680,157529.6100,3603.8520,82.4464
503,1.0,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,...,74529.0,5733.0,108353.70,1539.72,441.00,8334.900,118.440,157529.6100,2238.5160,31.8096
504,1.0,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,...,74529.0,5733.0,107411.85,1769.04,441.00,8262.450,136.080,154802.9025,2549.5560,41.9904


In [6]:
# Target variable: a single-column DataFrame named "MEDV" built from the
# dataset's target array.
y = pd.DataFrame({"MEDV": boston_dataset.target})
y

Unnamed: 0,MEDV
0,24.0
1,21.6
2,34.7
3,33.4
4,36.2
...,...
501,22.4
502,20.6
503,23.9
504,22.0


In [7]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=5,
)

In [8]:
# Fit ordinary least squares on the polynomial features, then show the
# learned coefficients followed by the intercept.
model = LinearRegression().fit(X_train, y_train)
print(model.coef_, model.intercept_, sep="\n")

[[ 2.55996369e-07 -5.09146959e+00 -1.65753983e-01 -5.97358604e+00
   2.43179261e+01  1.65180559e+02  2.19910116e+01  1.03167123e+00
  -5.66895775e+00  3.22443249e+00 -1.10055942e-02  5.35127787e+00
  -4.81524408e-02  7.53109325e-01  2.16774682e-03  2.69938772e-01
   5.87901385e-01  2.41731932e+00 -2.52413194e-02  8.92859572e-02
  -5.18832420e-03 -5.77807152e-02  3.55602049e-01 -3.86092281e-02
   5.43572101e-01 -3.18134358e-04  2.40035425e-02 -7.48850220e-04
  -7.16133310e-03 -1.06886010e-01 -1.27782609e+00  2.50137719e-02
   1.14111417e-04 -1.25254119e-02 -4.68024813e-03  6.05725185e-04
  -8.57873132e-03  1.85030053e-03 -4.64730601e-03  3.08484808e-02
  -2.09065897e-01  1.30035723e+00  3.13497405e-01  6.72540164e-04
   7.51823883e-02 -7.38014889e-03  4.23364348e-04 -6.72155117e-03
   6.42107774e-03 -5.32275093e-03  2.43179260e+01 -1.84845896e+01
  -6.89090796e+00  3.60375828e-02  3.05451225e+00 -4.09746374e-01
   2.34143012e-02 -8.47140007e-01  2.67079534e-02 -4.67786369e-01
  -4.67850

In [9]:
# Predict the target for the held-out test rows.
y_test_prediction = model.predict(X=X_test)

In [10]:
# Root mean squared error on the test set (square root of the MSE).
mean_squared_error(y_true=y_test, y_pred=y_test_prediction) ** 0.5

3.196527651373747

In [11]:
# Practice assignment - predicting diabetes with polynomial regression,
# part 1: building the problem (the polynomial feature table).

# Import the required libraries
from sklearn import datasets
from sklearn.preprocessing import PolynomialFeatures

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

import pandas as pd

diabetes_dataset = datasets.load_diabetes()  # load the dataset

# Write your code here
# Expand the raw features into every polynomial term up to degree 2.
polynomial_transform = PolynomialFeatures(2)
polynomial_data = polynomial_transform.fit_transform(diabetes_dataset.data)
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out` is the supported replacement. It returns an ndarray,
# so convert it back to a list to keep the variable's original type.
polynomial_feature_names = list(
    polynomial_transform.get_feature_names_out(diabetes_dataset.feature_names)
)

X = pd.DataFrame(polynomial_data, columns=polynomial_feature_names)

# Test code
X.head()



Unnamed: 0,1,age,sex,bmi,bp,s1,s2,s3,s4,s5,...,s3^2,s3 s4,s3 s5,s3 s6,s4^2,s4 s5,s4 s6,s5^2,s5 s6,s6^2
0,1.0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,...,0.001884,0.000113,-0.000864,0.000766,7e-06,-5.2e-05,4.6e-05,0.000396,-0.000351,0.000311
1,1.0,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,...,0.005537,-0.002939,-0.005085,-0.006861,0.00156,0.002699,0.003641,0.004669,0.0063,0.008502
2,1.0,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,...,0.001047,8.4e-05,-9.3e-05,0.000839,7e-06,-7e-06,6.7e-05,8e-06,-7.4e-05,0.000672
3,1.0,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,...,0.001299,-0.001236,-0.000818,0.000337,0.001177,0.000779,-0.000321,0.000515,-0.000212,8.8e-05
4,1.0,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,...,6.6e-05,-2.1e-05,-0.00026,-0.00038,7e-06,8.3e-05,0.000121,0.001023,0.001492,0.002175


In [12]:
# Practice assignment - predicting diabetes with polynomial regression,
# part 2: training the model.

# Import the required libraries
from sklearn import datasets
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

import pandas as pd

diabetes_dataset = datasets.load_diabetes()

# Code carried over from the previous assignment.
# Expand the raw features into every polynomial term up to degree 2.
polynomial_transform = PolynomialFeatures(2)
polynomial_data = polynomial_transform.fit_transform(diabetes_dataset.data)
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out` is the supported replacement. It returns an ndarray,
# so convert it back to a list to keep the variable's original type.
polynomial_feature_names = list(
    polynomial_transform.get_feature_names_out(diabetes_dataset.feature_names)
)

X = pd.DataFrame(polynomial_data, columns=polynomial_feature_names)

# Target variable
y = pd.DataFrame(diabetes_dataset.target, columns=['diabetes'])

# Write your code here

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=5)

# Fit ordinary least squares on the polynomial features.
model = LinearRegression()
model.fit(X_train, y_train)

y_test_predict = model.predict(X_test)

# Report the root mean squared error (square root of the test-set MSE).
mse = mean_squared_error(y_test, y_test_predict)
mse ** 0.5




57.87704902724854