In [34]:
from itertools import combinations

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.datasets import load_diabetes
from sklearn.linear_model import Lasso, LinearRegression, Ridge
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

In [35]:
# Load the Boston housing dataset from a CSV file in the working directory.

data = pd.read_csv("BostonHousing.csv")

In [36]:
# Preview the first rows to check the columns that were loaded.
data.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,medv
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [37]:
# Rename the target column "medv" to "price".
# The result is rebound instead of using inplace=True, so the frame that was
# loaded is not mutated behind the reader's back and the call stays chainable.
data = data.rename(columns={"medv": "price"})

In [38]:
# Preview again to confirm the target column is now called "price".
data.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,b,lstat,price
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.9,5.33,36.2


In [39]:
# Separate the response ("price") from the predictors (every other column).

y = data["price"]
x = data.drop("price", axis=1)

In [40]:
# Hold out 20% of the rows for testing; random_state=0 fixes the shuffle so
# the split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=0,
)

In [41]:
# Number of rows in the training split.
len(x_train)

404

In [42]:
# Number of rows in the test split.
len(x_test)

102

In [43]:
# Baseline: ordinary least-squares regression fitted on the training split.

lr = LinearRegression()
lr.fit(x_train, y_train)

In [44]:
# Predict on the held-out rows and report the baseline R2.
y_pred = lr.predict(x_test)
r2_model = r2_score(y_test, y_pred)
print("R2 Score is = ", r2_model)

R2 Score is =  0.5892223849182507


# Ridge and Lasso

In [45]:
# Apply Ridge (L2-penalised linear regression) with alpha=1.0 and score it on
# the test split. Ridge is imported in the notebook's top import cell.

ridge_model = Ridge(alpha=1.0)
ridge_model.fit(x_train, y_train)

y_pred_ridge = ridge_model.predict(x_test)
r2_model_ridge = r2_score(y_test, y_pred_ridge)
print("R2 score of Ridge model is =", r2_model_ridge)

R2 score of Ridge model is = 0.5796111714164924


In [46]:
# Repeat the Ridge fit with a stronger penalty (alpha=5) for comparison.
# Note: this rebinds ridge_model, y_pred_ridge and r2_model_ridge, replacing
# the alpha=1.0 results from the previous cell.
ridge_model = Ridge(alpha=5)
ridge_model.fit(x_train, y_train)

y_pred_ridge = ridge_model.predict(x_test)
r2_model_ridge = r2_score(y_test, y_pred_ridge)
print("R2 score of Ridge model is =", r2_model_ridge)

R2 score of Ridge model is = 0.5679816088812537


In [47]:
# Apply Lasso (L1-penalised linear regression) with alpha=1.0 and score it on
# the test split. Lasso is imported in the notebook's top import cell.

lasso_model = Lasso(alpha=1.0)
lasso_model.fit(x_train, y_train)

y_pred_lasso = lasso_model.predict(x_test)
r2_model_lasso = r2_score(y_test, y_pred_lasso)
print("R2 score of Lasso model is =", r2_model_lasso)

R2 score of Lasso model is = 0.48789271561192604


In [48]:
# Positions of the features whose fitted Lasso coefficient is exactly zero.
bad_features = np.flatnonzero(lasso_model.coef_ == 0)
print("Features with bad slope results are ", list(x.columns[bad_features]))

Features with bad slope results are  ['indus', 'chas', 'nox']


In [49]:
# Drop the zero-coefficient columns from both splits.
x_train_filtered = x_train.drop(columns=x_train.columns[bad_features])
x_test_filtered = x_test.drop(columns=x_test.columns[bad_features])

# Plain linear regression on the reduced feature set.
model_filtered_lr = LinearRegression()
model_filtered_lr.fit(x_train_filtered, y_train)
y_pred_Linear = model_filtered_lr.predict(x_test_filtered)
r2_model_linear = r2_score(y_test, y_pred_Linear)
print("R2_Score of Linear Model after removing bad features is = ", r2_model_linear)

# Lasso on the same reduced feature set, this time with alpha=0.1.
model_filtered_lasso = Lasso(alpha=0.1)
model_filtered_lasso.fit(x_train_filtered, y_train)
y_pred_Lasso = model_filtered_lasso.predict(x_test_filtered)
r2_model_Lasso = r2_score(y_test, y_pred_Lasso)
print("R2_Score of Lasso Model after removing bad features is = ", r2_model_Lasso)

R2_Score of Linear Model after removing bad features is =  0.5558354547335738
R2_Score of Lasso Model after removing bad features is =  0.5542304065052205


In [50]:
# Positions of the Lasso coefficients whose magnitude is below 0.05,
# i.e. strictly inside the open interval (-0.05, 0.05).

small_coefficients = np.flatnonzero(np.abs(lasso_model.coef_) < 0.05)
print(small_coefficients)

[ 2  3  4  6  9 11]


In [51]:
# Translate the small-coefficient positions into the column names of x.
print("Features with very small coefficients " , list(x.columns[small_coefficients]))

Features with very small coefficients  ['indus', 'chas', 'nox', 'age', 'tax', 'b']


In [52]:
# Remove the small-coefficient columns from both the training and test splits.

x_train_new = x_train.drop(columns=x_train.columns[small_coefficients])
x_test_new = x_test.drop(columns=x_test.columns[small_coefficients])


In [53]:
# Refit linear regression and Lasso (alpha=0.1) on the splits with the
# small-coefficient features removed, then score both on the test split.

model_filtered_lr_new = LinearRegression()
model_filtered_lr_new.fit(x_train_new, y_train)

model_filtered_lasso_new = Lasso(alpha=0.1)
model_filtered_lasso_new.fit(x_train_new, y_train)

# The messages name the small-coefficient filter so this output cannot be
# mistaken for the earlier zero-coefficient ("bad features") experiment.
y_pred_Linear_new = model_filtered_lr_new.predict(x_test_new)
r2_model_linear_new = r2_score(y_test, y_pred_Linear_new)
print("R2_Score of Linear Model after removing small-coefficient features is = ", r2_model_linear_new)

y_pred_Lasso_new = model_filtered_lasso_new.predict(x_test_new)
r2_model_Lasso_new = r2_score(y_test, y_pred_Lasso_new)
print("R2_Score of Lasso Model after removing small-coefficient features is = ", r2_model_Lasso_new)

R2_Score of Linear Model after removing bad features is =  0.5173619013105011
R2_Score of Lasso Model after removing bad features is =  0.5163287074198907


In [59]:
# Exhaustive search over the small-coefficient columns: for every non-empty
# subset, drop those columns, refit a plain linear regression and keep the
# subset that gives the highest R2 on the test split.
# `combinations` is imported in the notebook's top import cell.
#
# NOTE(review): subsets are ranked on the same test split that is used for
# reporting, so the winning R2 is optimistically biased; a separate validation
# split or cross-validation would give an honest estimate.

# R2 can be negative, so start below any attainable score. Starting at 0 would
# leave best_column_combination as None whenever every candidate scored <= 0.
best_r2_score = float("-inf")
best_column_combination = None

# Iterate through all possible combinations, from single columns up to all of them.
for i in range(1, len(small_coefficients) + 1):
    for subset in combinations(small_coefficients, i):
        # Build train/test frames with this subset of columns dropped.
        x_train_subset = x_train.drop(x_train.columns[list(subset)], axis=1)
        x_test_subset = x_test.drop(x_test.columns[list(subset)], axis=1)

        # Fit linear regression on the remaining columns.
        model_subset = LinearRegression()
        model_subset.fit(x_train_subset, y_train)

        # R2 of this candidate on the test split.
        r2_subset = model_subset.score(x_test_subset, y_test)

        # Keep the best R2 score and the column combination that produced it.
        if r2_subset > best_r2_score:
            best_r2_score = r2_subset
            best_column_combination = subset

print("Best R2 score is = ", best_r2_score)
print("Best column combination is = ", best_column_combination)

Best R2 score is =  0.5896890888383506
Best column combination is =  (6,)
