## Boston House Price Prediction Using Linear Regression

In [3]:
# Import libraries

import pandas as pd
import numpy as np

# Visualization Libraries:
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
# Import modules from sklearn library:

from sklearn import linear_model
from sklearn.model_selection import train_test_split

In [5]:
# Load Boston housing dataset (from sklearn.datasets)

from sklearn.datasets import load_boston
boston = load_boston()
print(boston)

{'data': array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
        4.9800e+00],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
        9.1400e+00],
       [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
        4.0300e+00],
       ...,
       [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        5.6400e+00],
       [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
        6.4800e+00],
       [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        7.8800e+00]]), 'target': array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
       18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
       15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
       13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
       21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
       35.4, 24.7, 31.6, 23.3, 19.6, 1

From looking at the output, the data listed after 'target' is the target varaible. 

In [6]:
# Print information about dataset:
print(boston.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [7]:
# Transform dataset into dataframe using Pandas:
# x values will be the independent variables
# y values will be the dependent variables (house price) (target)

dataf_x = pd.DataFrame(boston.data, columns=boston.feature_names)    # Dependent variables
dataf_y = pd.DataFrame(boston.target)                                # Target variable

In [8]:
# Explore the dataset (Dependent variables)
dataf_x.describe()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
count,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0,506.0
mean,3.613524,11.363636,11.136779,0.06917,0.554695,6.284634,68.574901,3.795043,9.549407,408.237154,18.455534,356.674032,12.653063
std,8.601545,23.322453,6.860353,0.253994,0.115878,0.702617,28.148861,2.10571,8.707259,168.537116,2.164946,91.294864,7.141062
min,0.00632,0.0,0.46,0.0,0.385,3.561,2.9,1.1296,1.0,187.0,12.6,0.32,1.73
25%,0.082045,0.0,5.19,0.0,0.449,5.8855,45.025,2.100175,4.0,279.0,17.4,375.3775,6.95
50%,0.25651,0.0,9.69,0.0,0.538,6.2085,77.5,3.20745,5.0,330.0,19.05,391.44,11.36
75%,3.677083,12.5,18.1,0.0,0.624,6.6235,94.075,5.188425,24.0,666.0,20.2,396.225,16.955
max,88.9762,100.0,27.74,1.0,0.871,8.78,100.0,12.1265,24.0,711.0,22.0,396.9,37.97


In [9]:
# Explore the dataset (Target variable)
dataf_y.describe()

Unnamed: 0,0
count,506.0
mean,22.532806
std,9.197104
min,5.0
25%,17.025
50%,21.2
75%,25.0
max,50.0


## Linear Regression

In [10]:
# Initialise the linear regression model.
reg = linear_model.LinearRegression()

### Model 1: 70% train and 30% test set

In [23]:
# Split dataset into 70% train and 30% test set
x_train, x_test, y_train, y_test = train_test_split(dataf_x, dataf_y)

In [24]:
# Train the model with the training data (using 'fit' function)
reg.fit(x_train, y_train)

LinearRegression()

In [25]:
# Print the coefficients for each feature/column
print(reg.coef_)

[[-9.63453639e-02  5.12348229e-02  2.65196905e-02  2.64787405e+00
  -1.79048126e+01  4.23485827e+00  4.75884187e-03 -1.50932968e+00
   3.19122896e-01 -1.34414589e-02 -9.51456052e-01  1.04055416e-02
  -5.40917372e-01]]


In [26]:
# Print predictions of the test data (y values)
y_pred = reg.predict(x_test)
print(y_pred)

[[21.06503878]
 [33.65867286]
 [12.92864468]
 [30.40133857]
 [25.30660094]
 [16.91370659]
 [18.69174556]
 [20.00580037]
 [24.07885223]
 [11.57107746]
 [12.03708501]
 [26.42096954]
 [21.04149235]
 [31.73304507]
 [11.07134842]
 [21.33903176]
 [27.50130503]
 [32.16207355]
 [16.35612152]
 [14.26237347]
 [20.25796916]
 [13.31792081]
 [ 8.24411082]
 [33.6602943 ]
 [18.53423763]
 [26.97657654]
 [31.77654481]
 [18.59950287]
 [22.10659822]
 [ 6.25365375]
 [15.05037164]
 [14.43896027]
 [31.1058949 ]
 [20.78430841]
 [31.65939567]
 [16.72347722]
 [35.6250064 ]
 [12.92198801]
 [30.17921946]
 [23.24582345]
 [36.67081588]
 [15.50340063]
 [19.65045997]
 [13.54345206]
 [19.14693805]
 [35.95492417]
 [ 9.01591887]
 [15.638049  ]
 [21.44987617]
 [12.13136956]
 [21.03593822]
 [20.92111539]
 [16.28198163]
 [21.90814505]
 [ 2.57804421]
 [41.00994018]
 [19.17912623]
 [20.1701806 ]
 [17.25653353]
 [21.99940275]
 [33.34889446]
 [27.78892952]
 [34.25852916]
 [27.25761871]
 [21.29816974]
 [25.68506589]
 [13.72694

In [27]:
# print the actual values
print(y_test)

        0
339  19.0
57   31.6
245  18.5
191  30.5
5    28.7
..    ...
414   7.0
331  17.1
209  20.0
112  18.8
311  22.1

[127 rows x 1 columns]


In [30]:
# Check model performance/accuracy, using MSE (Mean Squared Error)
print('Mean Squared Error = ',(np.mean((y_pred - y_test)**2)))

Mean Squared Error =  0    23.805078
dtype: float64


Model1 give an MSE of 23.8. We want this value to be as close to zero as possible.

### Model 2: 75% train and 25% test set

In [17]:
# Split dataset into 75% train and 25% test set
x_train2, x_test2, y_train2, y_test2 = train_test_split(dataf_x, dataf_y, test_size = 0.25) 

In [18]:
# Train the model with the training data (using 'fit' function)
reg.fit(x_train2, y_train2)

LinearRegression()

In [19]:
# Print the coefficients for each feature/column
print(reg.coef_)

[[-6.41186836e-02  3.16204929e-02 -3.14450451e-03  2.62180411e+00
  -1.59416968e+01  3.96152983e+00 -5.45588008e-03 -1.40004165e+00
   2.73590688e-01 -1.11597730e-02 -1.00592386e+00  1.16130505e-02
  -5.33654137e-01]]


In [20]:
# Print predictions of the test data (y values)
y_pred2 = reg.predict(x_test2)
print(y_pred2)

[[18.06608866]
 [15.28584105]
 [22.0399334 ]
 [15.13545507]
 [25.27000757]
 [20.88084128]
 [19.96165843]
 [24.1775286 ]
 [30.72347228]
 [ 8.29245428]
 [17.04301869]
 [19.35987465]
 [25.04784098]
 [20.35617162]
 [41.62531517]
 [18.8685812 ]
 [25.02291465]
 [18.61265594]
 [35.5663254 ]
 [19.7522843 ]
 [13.62708337]
 [34.61893835]
 [28.26243639]
 [20.86219012]
 [ 0.80575132]
 [36.43600825]
 [25.79423648]
 [14.87927404]
 [22.74278555]
 [29.59564115]
 [22.38949144]
 [15.0168343 ]
 [31.97387132]
 [27.29094851]
 [24.02442265]
 [15.7375045 ]
 [23.38618615]
 [34.89036859]
 [32.69065487]
 [22.1368794 ]
 [21.61882856]
 [24.72477027]
 [27.01107106]
 [33.17611945]
 [20.46826871]
 [16.71120677]
 [28.78226461]
 [22.65340312]
 [21.13288891]
 [28.55168662]
 [32.67353168]
 [25.24844476]
 [13.36313653]
 [20.34558074]
 [27.82195294]
 [42.82080838]
 [11.54141154]
 [17.14414091]
 [21.72219426]
 [ 6.5470888 ]
 [25.85049803]
 [33.33720344]
 [11.41324189]
 [25.65107675]
 [19.76645629]
 [16.04526431]
 [28.66314

In [21]:
# print the actual values
print(y_test2)

        0
380  10.4
149  15.4
328  19.3
455  14.1
321  23.1
..    ...
223  30.1
338  20.6
107  20.4
305  28.4
196  33.3

[127 rows x 1 columns]


In [22]:
# Check model performance/accuracy, using MSE (Mean Squared Error)
print('Mean Squared Error = ',(np.mean((y_pred2 - y_test2)**2)))

Mean Squared Error =  0    22.403907
dtype: float64


Above we can see that this 2nd model gives an MSE of 22.4

From this we can see that Model2 performs better than Model1, as it gives a lower MSE.