In [647]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import PolynomialFeatures, StandardScaler, MinMaxScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import r2_score
# Suppress warnings
import warnings
warnings.filterwarnings(action = 'ignore')

In [648]:
# Load the dataset from a CSV file expected in the working directory.
# Source file: https://drive.google.com/file/d/1tX62d7cA8Vb_5zOEuhwjE5vLZ5aq7RjG/view?usp=drive_link
df = pd.read_csv(filepath_or_buffer="model_selection_data.csv")

In [649]:
# Preview the first rows to sanity-check column names and values.
df.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,Target
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33,36.2


In [650]:
# The Boston Housing Dataset
# The Boston Housing Dataset is derived from information collected by the U.S. Census Service concerning housing in the area of Boston, MA.
# The following describes the dataset columns:
# CRIM - per capita crime rate by town
# ZN - proportion of residential land zoned for lots over 25,000 sq.ft.
# INDUS - proportion of non-retail business acres per town.
# CHAS - Charles River dummy variable (1 if tract bounds river; 0 otherwise)
# NOX - nitric oxides concentration (parts per 10 million)
# RM - average number of rooms per dwelling
# AGE - proportion of owner-occupied units built prior to 1940
# DIS - weighted distances to five Boston employment centres
# RAD - index of accessibility to radial highways
# TAX - full-value property-tax rate per $10,000
# PTRATIO - pupil-teacher ratio by town
# B - 1000(Bk - 0.63)^2 where Bk is the proportion of blacks by town
# LSTAT - % lower status of the population
# MEDV - Median value of owner-occupied homes in $1000's (this CSV stores it in the `Target` column)

In [651]:
# Fraction of missing entries per column (0.0 means the column is complete).
df.isna().mean()

CRIM       0.0
ZN         0.0
INDUS      0.0
CHAS       0.0
NOX        0.0
RM         0.0
AGE        0.0
DIS        0.0
RAD        0.0
TAX        0.0
PTRATIO    0.0
B          0.0
LSTAT      0.0
Target     0.0
dtype: float64

In [652]:
# (rows, columns) of the loaded frame.
df.shape

(506, 14)

In [653]:
# Separate the response column from the predictors.
y = df["Target"]
X = df.drop(columns=["Target"])

In [654]:
# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [655]:
# Report the dimensions of each split, one per line: features first, then targets.
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape, sep="\n")

(404, 13)
(102, 13)
(404,)
(102,)


# base model

In [656]:
# Base model
# data --> StandardScaler --> PolynomialFeatures --> LinearRegression

In [657]:
# Baseline: standardise the features, expand them to degree-2 polynomial
# terms, then fit an unregularised linear regression.
liner_reg_pipeline = make_pipeline(
    StandardScaler(),
    PolynomialFeatures(degree=2),
    LinearRegression(),
)
liner_reg_pipeline.fit(X_train, y_train)

In [658]:
# Baseline predictions for the training split and the held-out split, in that order.
y_pred_liner_reg_train, y_pred_liner_reg_test = (
    liner_reg_pipeline.predict(features) for features in (X_train, X_test)
)


In [659]:
# Coefficient of determination of the baseline on each split.
r2_liner_reg_train = r2_score(y_true=y_train, y_pred=y_pred_liner_reg_train)
r2_liner_reg_test = r2_score(y_true=y_test, y_pred=y_pred_liner_reg_test)

In [660]:
# Baseline scores, rounded to two decimals, one item per line.
print(
    "Training R-sq:",
    f"Linear Reg:{r2_liner_reg_train:.2f}",
    "Testing R-sq:",
    f"Linear Reg:{r2_liner_reg_test:.2f}",
    sep="\n",
)

Training R-sq:
Linear Reg:0.94
Testing R-sq:
Linear Reg:0.81


#Sign of overfitting: there is a 13-point gap in R-sq between train (0.94) and test (0.81)
#Using a regularisation technique
Ridge Regression

In [661]:
# Ridge (L2) variant of the baseline pipeline; alpha is the penalty parameter.
ridge_reg_pipeline = make_pipeline(
    StandardScaler(),
    PolynomialFeatures(degree=2),
    Ridge(alpha=84),
)
ridge_reg_pipeline.fit(X_train, y_train)


In [662]:
# Ridge predictions for the training split and the held-out split, in that order.
y_pred_ridge_reg_train, y_pred_ridge_reg_test = (
    ridge_reg_pipeline.predict(features) for features in (X_train, X_test)
)


In [663]:
# Coefficient of determination of the Ridge pipeline on each split.
r2_ridge_reg_train = r2_score(y_true=y_train, y_pred=y_pred_ridge_reg_train)
r2_ridge_reg_test = r2_score(y_true=y_test, y_pred=y_pred_ridge_reg_test)

In [664]:
# Ridge scores, rounded to two decimals, one item per line.
print(
    "Training R-sq:",
    f"Ridge Reg:{r2_ridge_reg_train:.2f}",
    "Testing R-sq:",
    f"Ridge Reg:{r2_ridge_reg_test:.2f}",
    sep="\n",
)

Training R-sq:
Ridge Reg:0.89
Testing R-sq:
Ridge Reg:0.82


In [665]:
# Train and test scores should both be high, with a minimal gap between them
# alpha=84 -> train R-sq = 0.89, test R-sq = 0.82

In [666]:
# Print the baseline (unregularised) scores again.
print(
    "Training R-sq:",
    f"Linear Reg:{r2_liner_reg_train:.2f}",
    "Testing R-sq:",
    f"Linear Reg:{r2_liner_reg_test:.2f}",
    sep="\n",
)

Training R-sq:
Linear Reg:0.94
Testing R-sq:
Linear Reg:0.81


In [667]:
# Ridge with min-max scaling instead of standardisation; alpha is the penalty parameter.
# This rebinds `ridge_reg_pipeline`, replacing the StandardScaler variant fitted earlier.
ridge_reg_pipeline = make_pipeline(
    MinMaxScaler(),
    PolynomialFeatures(degree=2),
    Ridge(alpha=0.02),
)
ridge_reg_pipeline.fit(X_train, y_train)


In [668]:
# Predictions from the currently bound Ridge pipeline: training split, then held-out split.
y_pred_ridge_reg_train, y_pred_ridge_reg_test = (
    ridge_reg_pipeline.predict(features) for features in (X_train, X_test)
)


In [669]:
# Coefficient of determination of the Ridge predictions on each split.
r2_ridge_reg_train = r2_score(y_true=y_train, y_pred=y_pred_ridge_reg_train)
r2_ridge_reg_test = r2_score(y_true=y_test, y_pred=y_pred_ridge_reg_test)

In [670]:
# Ridge scores, rounded to two decimals, one item per line.
print(
    "Training R-sq:",
    f"Ridge Reg:{r2_ridge_reg_train:.2f}",
    "Testing R-sq:",
    f"Ridge Reg:{r2_ridge_reg_test:.2f}",
    sep="\n",
)

Training R-sq:
Ridge Reg:0.93
Testing R-sq:
Ridge Reg:0.84


# Lasso regularisation 

In [671]:
# Lasso (L1) variant of the baseline pipeline; alpha is the penalty parameter.
lasso_reg_pipeline = make_pipeline(
    StandardScaler(),
    PolynomialFeatures(degree=2),
    Lasso(alpha=0.45),
)
lasso_reg_pipeline.fit(X_train, y_train)

In [672]:
# Lasso predictions for the training split and the held-out split, in that order.
y_pred_lasso_reg_train, y_pred_lasso_reg_test = (
    lasso_reg_pipeline.predict(features) for features in (X_train, X_test)
)

In [673]:
# Coefficient of determination of the Lasso pipeline on each split.
r2_lasso_reg_train = r2_score(y_true=y_train, y_pred=y_pred_lasso_reg_train)
r2_lasso_reg_test = r2_score(y_true=y_test, y_pred=y_pred_lasso_reg_test)

In [674]:
# Lasso scores, rounded to two decimals, one item per line.
# Fix: the test score was printed under the label "Ridge Reg" (copy-paste
# from the Ridge cell); both lines now carry the Lasso label.
print(
    "Training R-sq:",
    f"Lasso Reg:{r2_lasso_reg_train:.2f}",
    "Testing R-sq:",
    f"Lasso Reg:{r2_lasso_reg_test:.2f}",
    sep="\n",
)

Training R-sq:
Lasso Reg:0.83
Testing R-sq:
Ridge Reg:0.78


In [675]:
# Lasso with min-max scaling instead of standardisation; alpha is the penalty parameter.
# This rebinds `lasso_reg_pipeline`, replacing the StandardScaler variant fitted earlier.
lasso_reg_pipeline = make_pipeline(
    MinMaxScaler(),
    PolynomialFeatures(degree=2),
    Lasso(alpha=0.021),
)
lasso_reg_pipeline.fit(X_train, y_train)

In [676]:
# Predictions from the currently bound Lasso pipeline: training split, then held-out split.
y_pred_lasso_reg_train, y_pred_lasso_reg_test = (
    lasso_reg_pipeline.predict(features) for features in (X_train, X_test)
)

In [677]:
# Coefficient of determination of the Lasso predictions on each split.
r2_lasso_reg_train = r2_score(y_true=y_train, y_pred=y_pred_lasso_reg_train)
r2_lasso_reg_test = r2_score(y_true=y_test, y_pred=y_pred_lasso_reg_test)

In [678]:
# Lasso scores, rounded to two decimals, one item per line.
# Fix: the test score was printed under the label "Ridge Reg" (copy-paste
# from the Ridge cell); both lines now carry the Lasso label.
print(
    "Training R-sq:",
    f"Lasso Reg:{r2_lasso_reg_train:.2f}",
    "Testing R-sq:",
    f"Lasso Reg:{r2_lasso_reg_test:.2f}",
    sep="\n",
)

Training R-sq:
Lasso Reg:0.85
Testing R-sq:
Ridge Reg:0.82


In [684]:
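# Best result across all iterations (scores are R-sq x 100)
# alpha=84    train=89 test=82 (Ridge, StandardScaler) -- rejected
# alpha=0.02  train=93 test=84 (Ridge, MinMaxScaler)   -- first choice
# alpha=0.45  train=83 test=78 (Lasso, StandardScaler) -- rejected
# alpha=0.021 train=85 test=82 (Lasso, MinMaxScaler)   -- 2nd choice -- may be overfit
## Outcomes
## MinMaxScaler works better than StandardScaler
# NOTE: the alpha values appear to have been tuned by comparing test-set R-sq, so the test
# scores above are optimistic; confirm the choice with cross-validation on the training data.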
