## Data Analytics I
Create a Linear Regression Model using Python/R to predict home prices using Boston Housing
Dataset (https://www.kaggle.com/c/boston-housing). The Boston Housing dataset contains
information about various houses in Boston through different parameters. There are 506 samples
and 14 feature variables in this dataset.
The objective is to predict the value of prices of the house using the given features.

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [21]:
df = pd.read_csv('Boston.csv')
df.drop(columns = ['Unnamed: 0'],inplace=True)
df

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0


In [3]:
df.head(5)

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [23]:
df.isnull().sum()

crim       0
zn         0
indus      0
chas       0
nox        0
rm         0
age        0
dis        0
rad        0
tax        0
ptratio    0
black      0
lstat      0
medv       0
dtype: int64

In [24]:
from sklearn.preprocessing import MinMaxScaler

In [25]:
scaler = MinMaxScaler()

In [26]:
df = scaler.fit_transform(df)

In [27]:
df = pd.DataFrame(df)

In [28]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]

In [29]:
x_train,x_test,y_train,y_test = train_test_split(X,y,test_size=0.33)

In [30]:
x_train.shape

(339, 13)

In [31]:
x_test.shape

(167, 13)

In [32]:
y_train.shape

(339,)

In [33]:
y_test.shape

(167,)

In [34]:
lr = LinearRegression()

In [35]:
reg = lr.fit(x_train,y_train)

In [36]:
reg.score(x_test,y_test)

0.7606687012241788

In [37]:
features = [0.003587,0.0,0.785557,0.0,0.49177,0.549914,0.987642,0.062099,0.130435,0.477099,0.914894,1.0,0.376932]
print(reg.predict(np.array([features]))[0])

0.3157411591403494


In [38]:
predictions = reg.predict(x_test)
print("predictions on X_test: \n\n{} ".format(predictions))


predictions on X_test: 

[0.35006847 0.36546956 0.17943906 0.08290057 0.32891565 0.44039142
 0.71083001 0.41759146 0.3066836  0.43912251 0.56765117 0.14705485
 0.26384515 0.28963181 0.41307575 0.38662133 0.16782638 0.3450703
 0.27258253 0.43717274 0.21567215 0.65703396 0.19955256 0.27573205
 0.32531344 0.61935293 0.24527941 0.10893289 0.61608674 0.44579188
 0.51106596 0.34198906 0.4026931  0.39535102 0.40915559 0.53028465
 0.46751144 0.32284402 0.14343297 0.34192561 0.30569041 0.4033444
 0.4998352  0.44761084 0.17318515 0.28015028 0.30327303 0.36971823
 0.24693412 0.35973212 0.03883776 0.39479019 0.27525508 0.40468424
 0.32040862 0.40197538 0.38501131 0.68223704 0.28499792 0.31332559
 0.38031002 0.56127005 0.4914801  0.3252072  0.4041842  0.60641033
 0.5706731  0.46224522 0.49691186 0.35509512 0.3726199  0.1649191
 0.34586744 0.35070815 0.43868721 0.27637024 0.28600501 0.61712577
 0.5134238  0.02646634 0.35322284 0.43394341 0.29738211 0.46811499
 0.24895373 0.72082229 0.58994809 0.4297

In [None]:
sns.discplot()