In [60]:
# import the basic/essential libraries

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [84]:
# check the present working directory

import os
print(os.getcwd())

C:\Users\vkondang\Python


In [85]:
# change the present working directory to the one containing the data (csv) files

import os
os.chdir("C:\\Users\\vkondang\\Python")

In [86]:
# read data from csv file in to a pandas data frame

BH = pd.read_csv("BostonHousing.csv")

In [87]:
# inspect the data frame for sanity

BH.head(6)

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT,MEDV,CAT. MEDV
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,4.98,24.0,0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,9.14,21.6,0
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,4.03,34.7,1
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,2.94,33.4,1
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,5.33,36.2,1
5,0.02985,0.0,2.18,0,0.458,6.43,58.7,6.0622,3,222,18.7,5.21,28.7,0


In [88]:
BH.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
CRIM         506 non-null float64
ZN           506 non-null float64
INDUS        506 non-null float64
CHAS         506 non-null int64
NOX          506 non-null float64
RM           506 non-null float64
AGE          506 non-null float64
DIS          506 non-null float64
RAD          506 non-null int64
TAX          506 non-null int64
PTRATIO      506 non-null float64
LSTAT        506 non-null float64
MEDV         506 non-null float64
CAT. MEDV    506 non-null int64
dtypes: float64(10), int64(4)
memory usage: 55.4 KB


In [89]:
# drop rows with one or more columns containing NA values

BH = BH.drop(["CAT. MEDV"], axis = 1)
BH = BH.dropna()

In [90]:
# check the count of rows in the data frame

BH.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 506 entries, 0 to 505
Data columns (total 13 columns):
CRIM       506 non-null float64
ZN         506 non-null float64
INDUS      506 non-null float64
CHAS       506 non-null int64
NOX        506 non-null float64
RM         506 non-null float64
AGE        506 non-null float64
DIS        506 non-null float64
RAD        506 non-null int64
TAX        506 non-null int64
PTRATIO    506 non-null float64
LSTAT      506 non-null float64
MEDV       506 non-null float64
dtypes: float64(10), int64(3)
memory usage: 55.3 KB


In [91]:
# test train split 70/30

from sklearn.model_selection import train_test_split
X_train, X_test = train_test_split(BH, test_size=0.3, random_state = 123)

In [92]:
# separating the outcome variable from predictors

y_train = X_train["MEDV"]
y_test = X_test["MEDV"]

X_train = X_train.drop(["MEDV"], axis = 1)
X_test = X_test.drop(["MEDV"], axis = 1)

X_train['CHAS'] = X_train['CHAS'].astype(float)
X_train['RAD'] = X_train['RAD'].astype(float)
X_train['TAX'] = X_train['TAX'].astype(float)

X_test['CHAS'] = X_test['CHAS'].astype(float)
X_test['RAD'] = X_test['RAD'].astype(float)
X_test['TAX'] = X_test['TAX'].astype(float)

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [93]:
# baseline value of the train dataset 22.762
# baseline value of the test dataset 21.999

print(y_train.mean(), y_test.mean())



22.761864406779655 21.999342105263153


In [94]:
X_train

array([[-0.35842932,  0.37320621, -0.57528914, ..., -1.07497958,
         0.07439155, -0.83365402],
       [-0.37674054,  0.41676539, -0.76854658, ..., -0.95456799,
        -0.77056919, -1.01390057],
       [-0.37507786, -0.49797742, -1.23412134, ..., -1.25559697,
        -0.30114655, -0.43572509],
       ...,
       [-0.34436402, -0.49797742, -0.51379813, ..., -0.68966249,
         0.54381419, -0.67836468],
       [ 0.61710364, -0.49797742,  1.05568658, ...,  1.59213716,
         0.82546777,  1.52618932],
       [ 0.11316957, -0.49797742,  1.05568658, ...,  1.59213716,
         0.82546777, -0.75878238]])

In [95]:
# Linear Regression Model

from sklearn.linear_model import LinearRegression
import math
LinReg = LinearRegression().fit(X_train, y_train)
y_pred = LinReg.predict(X_test)

print(LinReg.score(X_test, y_test), sum(y_pred - y_test)**2, math.sqrt(sum(y_pred - y_test)**2/y_test.count()))

0.6353089008165288 822.9967364499556 2.3268975511368244


In [None]:
# R2 of 0.6353 was obtained with a simple linear regression model

In [96]:
# Ridge (L2) Regression with GridSearchCV

from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
alphas = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
param_grid = {'alpha': alphas}
grid_searchRidge = GridSearchCV(Ridge(), param_grid, verbose=3, cv=5)
grid_searchRidge.fit(X_train, y_train)
print(grid_searchRidge.best_params_, grid_searchRidge.best_score_, grid_searchRidge.score(X_test, y_test))

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] alpha=0.0001 ....................................................
[CV] ........... alpha=0.0001, score=0.8042144407604729, total=   0.0s
[CV] alpha=0.0001 ....................................................
[CV] ........... alpha=0.0001, score=0.6796320604153903, total=   0.0s
[CV] alpha=0.0001 ....................................................
[CV] ............ alpha=0.0001, score=0.782731345620619, total=   0.0s
[CV] alpha=0.0001 ....................................................
[CV] ........... alpha=0.0001, score=0.6728969883214899, total=   0.0s
[CV] alpha=0.0001 ....................................................
[CV] ........... alpha=0.0001, score=0.7222993892017124, total=   0.0s
[CV] alpha=0.001 .....................................................
[CV] ............. alpha=0.001, score=0.804214900727496, total=   0.0s
[CV] alpha=0.001 .....................................................
[CV] ............

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:    0.0s finished


In [97]:
# With Ridge (L2 Regularization) an R2 of 0.6324 was obtained. This is inferior to the simple linear regression model.

In [98]:
# Lasso (L1) Regression with GridSearchCV

from sklearn.linear_model import Lasso
from sklearn.model_selection import GridSearchCV
alphas = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
param_grid = {'alpha': alphas}
grid_searchLasso = GridSearchCV(Lasso(), param_grid, verbose=3, cv=5)
grid_searchLasso.fit(X_train, y_train)
print(grid_searchLasso.best_params_, grid_searchLasso.best_score_, grid_searchLasso.score(X_test, y_test))

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] alpha=0.0001 ....................................................
[CV] ........... alpha=0.0001, score=0.8042182255106747, total=   0.0s
[CV] alpha=0.0001 ....................................................
[CV] ........... alpha=0.0001, score=0.6796343796880228, total=   0.0s
[CV] alpha=0.0001 ....................................................
[CV] ........... alpha=0.0001, score=0.7827484354672515, total=   0.0s
[CV] alpha=0.0001 ....................................................
[CV] ........... alpha=0.0001, score=0.6728886092965365, total=   0.0s
[CV] alpha=0.0001 ....................................................
[CV] ........... alpha=0.0001, score=0.7223011518694167, total=   0.0s
[CV] alpha=0.001 .....................................................
[CV] ............ alpha=0.001, score=0.8042528033963545, total=   0.0s
[CV] alpha=0.001 .....................................................
[CV] ............

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:    0.0s finished


In [None]:
# With Lasso (R1) Regularization, an R2 of 0.6344 was obtained. This is also inferior when compared to the simple linear
# regression model

In [99]:
# SVM Regression with GridSearchCV

from sklearn.svm import LinearSVR
from sklearn.model_selection import GridSearchCV
Cs = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000]
param_grid = {'C': alphas}
grid_searchLinSVR = GridSearchCV(LinearSVR(), param_grid, verbose=3, cv=5)
grid_searchLinSVR.fit(X_train, y_train)
print(grid_searchLinSVR.best_params_, grid_searchLinSVR.best_score_, grid_searchLinSVR.score(X_test, y_test))

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV] C=0.0001 ........................................................
[CV] ............... C=0.0001, score=-7.343058419634129, total=   0.0s
[CV] C=0.0001 ........................................................
[CV] ............... C=0.0001, score=-4.891426905640251, total=   0.0s
[CV] C=0.0001 ........................................................
[CV] ................ C=0.0001, score=-5.42954188770518, total=   0.0s
[CV] C=0.0001 ........................................................
[CV] ............... C=0.0001, score=-6.124752769737166, total=   0.0s
[CV] C=0.0001 ........................................................
[CV] ............... C=0.0001, score=-7.092955264131401, total=   0.0s
[CV] C=0.001 .........................................................
[CV] ................ C=0.001, score=-7.202925131079581, total=   0.0s
[CV] C=0.001 .........................................................
[CV] ............

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.0s remaining:    0.0s


[CV] .................. C=100, score=0.7388997825056396, total=   0.0s
[CV] C=1000 ..........................................................
[CV] ................. C=1000, score=0.5736237112862728, total=   0.0s
[CV] C=1000 ..........................................................
[CV] .................. C=1000, score=0.563240921048942, total=   0.0s
[CV] C=1000 ..........................................................
[CV] ................. C=1000, score=0.5059848517978052, total=   0.0s
[CV] C=1000 ..........................................................
[CV] ................. C=1000, score=0.5636904790043578, total=   0.0s
[CV] C=1000 ..........................................................
[CV] ................. C=1000, score=0.7070173001332414, total=   0.0s
{'C': 100} 0.7208878289013578 0.5859449117428996


[Parallel(n_jobs=1)]: Done  40 out of  40 | elapsed:    0.2s finished


In [None]:
# With Linear SVR, an R2 of 0.5859 was obtained. This is also inferior when compared to the simple linear
# regression model

In [100]:
# NN Regression with GridSearchCV

from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import GridSearchCV
alphas = [0.001, 0.01, 0.1, 1, 10, 100]
param_grid = {'alpha': alphas}
grid_searchLinMLPR = GridSearchCV(MLPRegressor(solver = 'sgd', hidden_layer_sizes=(5, 2), random_state=123, max_iter=1000), param_grid, verbose=3, cv=5)
grid_searchLinMLPR.fit(X_train, y_train)
print(grid_searchLinMLPR.best_params_, grid_searchLinMLPR.best_score_, grid_searchLinMLPR.score(X_test, y_test))

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] alpha=0.001 .....................................................


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.5s remaining:    0.0s


[CV] ............ alpha=0.001, score=0.7926508559309641, total=   0.5s
[CV] alpha=0.001 .....................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.1s remaining:    0.0s


[CV] ............ alpha=0.001, score=0.8043880368720266, total=   0.5s
[CV] alpha=0.001 .....................................................
[CV] ....... alpha=0.001, score=-1.8939385549199272e-05, total=   0.1s
[CV] alpha=0.001 .....................................................




[CV] ............ alpha=0.001, score=0.7890736867089219, total=   0.5s
[CV] alpha=0.001 .....................................................




[CV] ............ alpha=0.001, score=0.8105966099114699, total=   0.6s
[CV] alpha=0.01 ......................................................




[CV] .............. alpha=0.01, score=0.792553208798376, total=   0.5s
[CV] alpha=0.01 ......................................................




[CV] ............. alpha=0.01, score=0.8031881046211282, total=   0.6s
[CV] alpha=0.01 ......................................................
[CV] ........ alpha=0.01, score=-1.8939385422411803e-05, total=   0.1s
[CV] alpha=0.01 ......................................................




[CV] ............. alpha=0.01, score=0.7881629986652874, total=   0.5s
[CV] alpha=0.01 ......................................................




[CV] ............. alpha=0.01, score=0.8105158938505082, total=   0.6s
[CV] alpha=0.1 .......................................................




[CV] ................. alpha=0.1, score=0.7944863820247, total=   0.6s
[CV] alpha=0.1 .......................................................




[CV] .............. alpha=0.1, score=0.8027313618427558, total=   0.6s
[CV] alpha=0.1 .......................................................
[CV] ......... alpha=0.1, score=-1.8939384154315064e-05, total=   0.1s
[CV] alpha=0.1 .......................................................




[CV] .............. alpha=0.1, score=0.7889150982081323, total=   0.9s
[CV] alpha=0.1 .......................................................




[CV] .............. alpha=0.1, score=0.8103518285521429, total=   0.6s
[CV] alpha=1 .........................................................




[CV] ................. alpha=1, score=0.792965275037149, total=   0.6s
[CV] alpha=1 .........................................................




[CV] ................ alpha=1, score=0.8252274240145532, total=   0.6s
[CV] alpha=1 .........................................................
[CV] ........... alpha=1, score=-1.8939371480453104e-05, total=   0.1s
[CV] alpha=1 .........................................................




[CV] ................. alpha=1, score=0.786852715948025, total=   0.5s
[CV] alpha=1 .........................................................




[CV] ................ alpha=1, score=0.8533651152409284, total=   0.6s
[CV] alpha=10 ........................................................




[CV] ................ alpha=10, score=0.790263885540833, total=   0.6s
[CV] alpha=10 ........................................................




[CV] ............... alpha=10, score=0.7955646259220059, total=   0.5s
[CV] alpha=10 ........................................................
[CV] .......... alpha=10, score=-0.00012166949989578235, total=   0.4s
[CV] alpha=10 ........................................................




[CV] ............... alpha=10, score=0.7584916095975011, total=   0.5s
[CV] alpha=10 ........................................................




[CV] ............... alpha=10, score=0.8445757800006415, total=   0.6s
[CV] alpha=100 .......................................................
[CV] ........... alpha=100, score=-0.032932206223698524, total=   0.1s
[CV] alpha=100 .......................................................
[CV] ........... alpha=100, score=-0.029638805538744876, total=   0.1s
[CV] alpha=100 .......................................................
[CV] ......... alpha=100, score=-1.8938084153319323e-05, total=   0.1s
[CV] alpha=100 .......................................................
[CV] .......... alpha=100, score=-0.0030171035190704742, total=   0.1s
[CV] alpha=100 .......................................................
[CV] .......... alpha=100, score=-1.860161555367057e-05, total=   0.1s


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   15.2s finished


{'alpha': 1} 0.6511085814589562 0.7766721371719463




In [None]:
# With MLP Regressor, an R2 of 0.7767 was obtained. This is a clear improvement from the previous models!