In [1]:
import pandas as pd
import numpy as np
import matplotlib as mat
import matplotlib.pyplot as plt
import matplotlib.animation as animation
import statsmodels.api as stats
import sklearn as sklearn
from sklearn import preprocessing

In [2]:
# Report the version of every core library so the run can be reproduced later.
for _label, _module in (
    ("pandas version:", pd),
    ("numpy version:", np),
    ("matplotlib version:", mat),
    ("statsmodels version:", stats),
    ("sklearn version:", sklearn),
):
    print(_label, _module.__version__)

pandas version: 1.0.5
numpy version: 1.18.5
matplotlib version: 3.3.0
statsmodels version: 0.11.1
sklearn version: 0.23.1


In [3]:
# Fetch the Boston housing data bundled with scikit-learn.
# NOTE: load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell needs an older release such as the 0.23.1 reported above.
from sklearn import datasets

boston = datasets.load_boston(return_X_y=False)  # Bunch (dict-like) container

# Show which fields the returned container exposes.
print(boston.keys())

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])


In [4]:
# Build a DataFrame holding the feature columns only
dataframe = pd.DataFrame(boston.data, columns=boston.feature_names)
print("Shape of the dataframe", dataframe.shape)

# Append the regression target as an extra column
dataframe = dataframe.assign(Target=boston.target)
print("Shape of the dataframe", dataframe.shape)
print()

# Give the target column a more descriptive name
dataframe = dataframe.rename(columns={'Target': 'Price'})
print(dataframe.columns)

Shape of the dataframe (506, 13)
Shape of the dataframe (506, 14)

Index(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD', 'TAX',
       'PTRATIO', 'B', 'LSTAT', 'Price'],
      dtype='object')


In [5]:
# Pairwise Pearson correlation between every pair of columns (features and Price)
correlation = dataframe.corr(method='pearson')
print(correlation)

             CRIM        ZN     INDUS      CHAS       NOX        RM       AGE  \
CRIM     1.000000 -0.200469  0.406583 -0.055892  0.420972 -0.219247  0.352734   
ZN      -0.200469  1.000000 -0.533828 -0.042697 -0.516604  0.311991 -0.569537   
INDUS    0.406583 -0.533828  1.000000  0.062938  0.763651 -0.391676  0.644779   
CHAS    -0.055892 -0.042697  0.062938  1.000000  0.091203  0.091251  0.086518   
NOX      0.420972 -0.516604  0.763651  0.091203  1.000000 -0.302188  0.731470   
RM      -0.219247  0.311991 -0.391676  0.091251 -0.302188  1.000000 -0.240265   
AGE      0.352734 -0.569537  0.644779  0.086518  0.731470 -0.240265  1.000000   
DIS     -0.379670  0.664408 -0.708027 -0.099176 -0.769230  0.205246 -0.747881   
RAD      0.625505 -0.311948  0.595129 -0.007368  0.611441 -0.209847  0.456022   
TAX      0.582764 -0.314563  0.720760 -0.035587  0.668023 -0.292048  0.506456   
PTRATIO  0.289946 -0.391679  0.383248 -0.121515  0.188933 -0.355501  0.261515   
B       -0.385064  0.175520 

In [6]:
# Show the human-readable description that ships with the dataset
print(boston['DESCR'])

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [7]:
# Rank every column by the magnitude of its correlation with 'Price' (strongest first)
corr_values = (
    correlation.loc[:, 'Price']
    .abs()
    .sort_values(ascending=False)
)
print(corr_values)
# 'LSTAT' and 'RM' are the two features most strongly correlated with 'Price'.
# 'TAX' is additionally used as a third predictor for 'Price'.
#   RM       average number of rooms per dwelling
#   LSTAT    % lower status of the population
#   TAX      full-value property-tax rate per $10,000

Price      1.000000
LSTAT      0.737663
RM         0.695360
PTRATIO    0.507787
INDUS      0.483725
TAX        0.468536
NOX        0.427321
CRIM       0.388305
RAD        0.381626
AGE        0.376955
ZN         0.360445
B          0.333461
DIS        0.249929
CHAS       0.175260
Name: Price, dtype: float64


In [8]:
# Perform multivariate linear regression using scikit-learn
from sklearn.linear_model import LinearRegression

# Response variable: the house price
ys = dataframe['Price']
# Design matrix: the three chosen predictors stacked side by side as columns
xs = np.column_stack([dataframe[name] for name in ('RM', 'LSTAT', 'TAX')])

print("shape for ys: {} & shape for xs: {}".format(ys.shape, xs.shape))

shape for ys: (506,) & shape for xs: (506, 3)


In [9]:
# Standardise predictors and response (centre each column, scale to unit variance)
xs, ys = preprocessing.scale(xs), preprocessing.scale(ys)

In [10]:
# Instantiate the estimator and fit it on the standardised data in one step
# (fit returns the fitted estimator itself)
lr = LinearRegression().fit(xs, ys)

# In-sample predictions: the model is evaluated on the same data it was fitted on
pred = lr.predict(xs)

In [11]:
# Pull the fitted parameters out of the model.
# Theta_0..Theta_2 are the slopes for the three predictor columns of xs, in column order;
# the intercept is kept separately.
intercept = lr.intercept_
Theta_0, Theta_1, Theta_2 = lr.coef_

print('Intercept : {}'.format(round(intercept, 3)))
for _template, _theta in (
    ('Theta_0 : {}', Theta_0),
    ('Theta_1 : {}', Theta_1),
    ('Theta_2 : {}', Theta_2),
):
    print(_template.format(round(_theta, 4)))

# Coefficient of determination, computed on the same data used for fitting
r2_sk = lr.score(xs, ys)
print('R square from sci-kit learn: {}'.format(round(r2_sk, 4)))

Intercept : 0.0
Theta_0 : 0.3972
Theta_1 : -0.429
Theta_2 : -0.1191
R square from sci-kit learn: 0.6485
