In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import sklearn
import tensorflow as tf

In [4]:
# Load the Boston housing data straight from the CMU StatLib archive.
data_url = "http://lib.stat.cmu.edu/datasets/boston"

# Raw string for the regex separator: "\s" in a normal string literal is an
# invalid escape sequence (SyntaxWarning on Python 3.12+).
# The first 22 lines of the file are skipped (presumably a descriptive
# header -- confirm against the file).
raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)

# Each record is spread over two consecutive rows of raw_df:
#   - even rows supply the leading feature values,
#   - the first two values of the following odd row supply the last two features,
#   - the third value of that odd row is the regression target.
data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
target = raw_df.values[1::2, 2]

# Feature matrix with named columns, and the target as a named Series.
X = pd.DataFrame(data, columns=[
    "CRIM", "ZN", "INDUS", "CHAS", "NOX", "RM", "AGE", "DIS", "RAD", "TAX",
    "PTRATIO", "B", "LSTAT"
])
Y = pd.Series(target, name="MEDV")

In [5]:
from sklearn import datasets

In [7]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [9]:
# Partition the features and target into a training set and a held-out test set.
# random_state is fixed so the same partition is produced on every run.
X_tn, X_te, Y_tn, Y_te = train_test_split(
    X,
    Y,
    random_state=7,
)
print(X_tn, X_te, Y_tn, Y_te)

        CRIM    ZN  INDUS  CHAS    NOX     RM   AGE     DIS  RAD    TAX  \
41   0.12744   0.0   6.91   0.0  0.448  6.770   2.9  5.7209  3.0  233.0   
264  0.55007  20.0   3.97   0.0  0.647  7.206  91.6  1.9301  5.0  264.0   
193  0.02187  60.0   2.93   0.0  0.401  6.800   9.9  6.2196  1.0  265.0   
205  0.13642   0.0  10.59   0.0  0.489  5.891  22.3  3.9454  4.0  277.0   
79   0.08387   0.0  12.83   0.0  0.437  5.874  36.6  4.5026  5.0  398.0   
..       ...   ...    ...   ...    ...    ...   ...     ...  ...    ...   
67   0.05789  12.5   6.07   0.0  0.409  5.878  21.4  6.4980  4.0  345.0   
502  0.04527   0.0  11.93   0.0  0.573  6.120  76.7  2.2875  1.0  273.0   
25   0.84054   0.0   8.14   0.0  0.538  5.599  85.7  4.4546  4.0  307.0   
196  0.04011  80.0   1.52   0.0  0.404  7.287  34.1  7.3090  2.0  329.0   
175  0.06664   0.0   4.05   0.0  0.510  6.546  33.1  3.1323  5.0  296.0   

     PTRATIO       B  LSTAT  
41      17.9  385.41   4.84  
264     13.0  387.89   8.10  
193     1

In [10]:
# Learn the scaling statistics from the training features only, then apply
# that same fitted scaler to both partitions so the test set never
# influences the statistics.
std_scale = StandardScaler().fit(X_tn)
X_tn_std = std_scale.transform(X_tn)
X_te_std = std_scale.transform(X_te)

In [11]:
# Ordinary linear regression trained on the standardized training features.
clf_linear = LinearRegression()
clf_linear.fit(X=X_tn_std, y=Y_tn)

LinearRegression()

In [14]:
# Predict the target for the standardized test features and show the raw values.
pred_linear = clf_linear.predict(X=X_te_std)
print(pred_linear)

[23.1903541  18.97985889 19.82548836 19.00126197  4.39524325 11.90230303
 21.24870187 28.64449553 29.03550064 13.90644782  6.41422339 32.65356658
 18.99884691 20.01569489 37.15275422 22.80485488 29.04529555 33.04200949
 10.48602033 24.45472284 21.33069324 27.60222354 37.52118276 13.6113556
  9.56442243 15.03368415 35.5975585  26.01017573 25.52430154 27.06321433
 19.07680237 30.54746571 31.27561168 16.40132981 39.76707419 20.27263903
 18.94934061 17.12210014 21.6262832  28.15101424 26.95292863 19.14352801
 14.50664721 25.78075705 18.50460146 13.93439214 24.96593139 19.12431756
 20.6780475   6.23807397 27.71460362 26.74617711 11.83361779 40.10855118
 14.66523328 22.12023896 20.34305401 20.3786179  23.56685605 21.91582872
 20.79748126 35.43123681 17.32592458 20.92077502 24.1674162  43.38199388
 19.59747681 20.11624895 22.35462757 28.12506906 25.53832602 12.88949504
 13.1552648  33.3092473  26.12666965 22.54135443 12.14404271 16.61972119
 28.52703363 17.81932988 24.42637646 27.69824683 23.

In [15]:
# Mean squared error of the predictions against the held-out targets.
mean_squared_error(y_true=Y_te, y_pred=pred_linear)

29.515137790197574

In [16]:
# Same split / scale / fit / predict / evaluate workflow, expressed as a
# single sklearn Pipeline.

# Re-create the identical split (same random_state) so this cell is self-contained.
X_tn, X_te, Y_tn, Y_te = train_test_split(X, Y, random_state=7)

# Chain the scaler and the regressor: fitting the pipeline fits the scaler on
# the data passed to fit(), and predict() reuses that fitted scaler.
linear_pipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('linear_regression', LinearRegression()),
])

# Fit on the training partition.
linear_pipeline.fit(X_tn, Y_tn)

# Predict on the held-out partition.
pred_linear = linear_pipeline.predict(X_te)

# Mean squared error on the held-out partition.
mean_squared_error(y_true=Y_te, y_pred=pred_linear)

29.515137790197574