In [None]:
pip install scikit-learn==1.1.3

In [2]:
#in built dataset from sklearn is used
import pandas as pd
import numpy as np
import sklearn
import matplotlib.pyplot as plt
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [None]:
# Loading pre-defined Boston Dataset
boston_dataset = datasets.load_boston()
boston_dataset # it returns a dictionary
# print(boston_dataset.DESCR)

In [None]:
boston_dataset.DESCR

In [5]:
#Load the data and divide into X and Y varaiable
boston_pd = pd.DataFrame(boston_dataset.data)
boston_pd.columns = boston_dataset.feature_names
boston_pd_target = np.asarray(boston_dataset.target)
boston_pd['House Price'] = pd.Series(boston_pd_target)

# input
X = boston_pd.iloc[:, :-1]

#output
Y = boston_pd.iloc[:,-1]

In [None]:
boston_pd

In [7]:
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.35)
# (x_train.shape, y_train.shape,x_test.shape, y_test.shape)

In [None]:
x_test

In [None]:
# Apply Linear Regression Model
lreg = LinearRegression()
lreg.fit(x_train, y_train)

In [None]:
# Generate Prediction on test set
lreg_y_pred = lreg.predict(x_test)
lreg_y_pred # shows predicted prices of houses from test data

In [None]:
# calculating Mean Squared Error (mse)
mean_squared_error = np.mean((lreg_y_pred - y_test)**2)
print("Mean squared Error on test set : ", mean_squared_error) # high error

Mean squared Error on test set :  27.752813304033154


In [None]:
# Putting together the coefficient and their corrsponding variable names
lreg_coefficient = pd.DataFrame()
lreg_coefficient["Columns"] = x_train.columns
lreg_coefficient['Coefficient Estimate'] = pd.Series(lreg.coef_)
print(lreg_coefficient) # y=-0.11X1 + 0.02X2 - 0.03X3 + ......

# model is giving too much importance to NOX feature as it is highly negative. See description of NOX - poisonous gas or pollution. If more pollution House price will be lower

    Columns  Coefficient Estimate
0      CRIM             -0.075031
1        ZN              0.036428
2     INDUS              0.020271
3      CHAS              3.580639
4       NOX            -12.882000
5        RM              4.676558
6       AGE              0.001515
7       DIS             -1.180234
8       RAD              0.270171
9       TAX             -0.013033
10  PTRATIO             -0.872204
11        B              0.012096
12    LSTAT             -0.532878


In [None]:
plt.scatter(boston_pd['NOX'], boston_pd['House Price'])
# not so high negative correlation. upto 0.7 NOX prices are still high. So our model should not give much importance to NOX

In [None]:
import seaborn as sns
sns.barplot(x='Coefficient Estimate' , y='Columns', data=lreg_coefficient)

In [None]:
# import ridge regression from sklearn library
from sklearn.linear_model import Ridge

# Train the model
ridgeR = Ridge(alpha = 2) # alpha is a hyper parameter which needs to be experimented, Default is 1. Range is 1 to infinity
ridgeR.fit(x_train, y_train)
y_pred = ridgeR.predict(x_test)

In [None]:
# calculate mean square error
mean_squared_error_ridge = np.mean((y_pred - y_test)**2)
print(mean_squared_error_ridge)

28.540087712871134


In [None]:
# get ridge coefficient and print them
ridge_coefficient = pd.DataFrame()
ridge_coefficient["Columns"]= x_train.columns
ridge_coefficient['Linear Coeff.Estimate'] = pd.Series(lreg.coef_)
ridge_coefficient['Ridge Coeff.Estimate'] = pd.Series(ridgeR.coef_)
print(ridge_coefficient)

####  import Lasso regression from sklearn library



In [None]:
from sklearn.linear_model import Lasso
# Train the model
lasso = Lasso(alpha = 0.05)
lasso.fit(x_train, y_train)
y_pred1 = lasso.predict(x_test)

In [None]:
# Calculate Mean Squared Error
mean_squared_error = np.mean((y_pred1 - y_test)**2)
print("Mean squared error on test set", mean_squared_error)

Mean squared error on test set 29.067825155009604


In [None]:
lasso_coeff = pd.DataFrame()
lasso_coeff["Columns"] = x_train.columns
lasso_coeff['Linear Coeff.Estimate'] = pd.Series(lreg.coef_)
lasso_coeff['Coefficient Estimate'] = pd.Series(lasso.coef_)
print(lasso_coeff)

    Columns  Linear Coeff.Estimate  Coefficient Estimate
0      CRIM              -0.075031             -0.063960
1        ZN               0.036428              0.037777
2     INDUS               0.020271             -0.032412
3      CHAS               3.580639              2.974122
4       NOX             -12.882000             -0.000000
5        RM               4.676558              4.674272
6       AGE               0.001515             -0.008224
7       DIS              -1.180234             -0.988615
8       RAD               0.270171              0.235684
9       TAX              -0.013033             -0.013959
10  PTRATIO              -0.872204             -0.735124
11        B               0.012096              0.013196
12    LSTAT              -0.532878             -0.555025


In [None]:
# import model
from sklearn.linear_model import ElasticNet
# Train the model
e_net = ElasticNet(alpha = .01,l1_ratio=.3) #l1_ratio should be in between 0 and 1. then it will be a combination of L1 and L2.
e_net.fit(x_train, y_train)

In [None]:
# calculate the prediction and mean square error
y_pred_elastic = e_net.predict(x_test)
mean_squared_error = np.mean((y_pred_elastic - y_test)**2)
print("Mean Squared Error on test set", mean_squared_error)

Mean Squared Error on test set 28.587375729718882


In [None]:
e_net_coeff = pd.DataFrame()
e_net_coeff["Columns"] = x_train.columns
e_net_coeff['Coefficient Estimate'] = pd.Series(e_net.coef_)
e_net_coeff

Unnamed: 0,Columns,Coefficient Estimate
0,CRIM,-0.068929
1,ZN,0.038076
2,INDUS,-0.022403
3,CHAS,3.291635
4,NOX,-3.585979
5,RM,4.64586
6,AGE,-0.006137
7,DIS,-1.063971
8,RAD,0.247887
9,TAX,-0.013698
