In [3]:
## basic tutorial on L1 and L2 Regularization
## tutorial url:
## https://www.geeksforgeeks.org/ml-implementing-l1-and-l2-regularization-using-sklearn/

In [4]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import train_test_split, cross_val_score
from statistics import mean

In [14]:
data = pd.read_csv('../Datasets/kc_house_data.csv')
 
# Dropping the numerically non-sensical variables
dropColumns = ['id', 'date', 'zipcode']
data = data.drop(dropColumns, axis = 1)

## Liam: drop rows with NAN due to regression error
data.dropna(how='any', inplace=True)
 
# Separating the dependent and independent variables
y = data['price']
X = data.drop('price', axis = 1)
 
data.head(2)

Unnamed: 0,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,grade,sqft_above,sqft_basement,yr_built,yr_renovated,lat,long,sqft_living15,sqft_lot15
0,221900.0,3,1.0,1180,5650,1.0,0,0,3,7,1180.0,0,1955,0,47.5112,-122.257,1340,5650
1,538000.0,3,2.25,2570,7242,2.0,0,0,3,7,2170.0,400,1951,1991,47.721,-122.319,1690,7639


In [21]:
# Dividing the data into training and testing set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=3)

print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(16208, 17) (5403, 17) (16208,) (5403,)


#### Baseline Regression

In [22]:
# Building and fitting the Linear Regression model
linearModel = LinearRegression()
linearModel.fit(X_train, y_train)
 
# Evaluating the Linear Regression model
print(linearModel.score(X_test, y_test))

0.6883812736206611


#### L2 - Ridge Regression

In [25]:

# List to maintain the different cross-validation scores
cross_val_scores_ridge = []
 
# List to maintain the different values of alpha
alpha = []
 
# Loop to compute the different values of cross-validation scores
for i in range(1, 9):
    ridgeModel = Ridge(alpha = i * 0.25)
    ridgeModel.fit(X_train, y_train)
    scores = cross_val_score(ridgeModel, X, y, cv = 10)
    avg_cross_val_score = mean(scores)*100
    cross_val_scores_ridge.append(avg_cross_val_score)
    alpha.append(i * 0.25)
 
# Loop to print the different values of cross-validation scores
for i in range(0, len(alpha)):
    print(str(alpha[i])+' : '+str(cross_val_scores_ridge[i]))

0.25 : 69.0901583767101
0.5 : 69.09033468066406
0.75 : 69.09049125083847
1.0 : 69.09062828999278
1.25 : 69.09074599856244
1.5 : 69.09084457469034
1.75 : 69.09092421425655
2.0 : 69.09098511090883


In [26]:
# Building and fitting the Ridge Regression model
ridgeModelChosen = Ridge(alpha = 2)
ridgeModelChosen.fit(X_train, y_train)
 
# Evaluating the Ridge Regression model
print(ridgeModelChosen.score(X_test, y_test))

0.6883500110608307


In [32]:
ridge_coef = pd.DataFrame()
ridge_coef["Columns"]= X_train.columns
ridge_coef['Coefficient Estimate'] = pd.Series(ridgeModelChosen.coef_)
print(ridge_coef)

          Columns  Coefficient Estimate
0        bedrooms         -36412.521548
1       bathrooms          44864.208899
2     sqft_living            111.201886
3        sqft_lot              0.102812
4          floors          -1742.047264
5      waterfront         571426.408566
6            view          49647.515271
7       condition          31762.015081
8           grade          99190.013815
9      sqft_above             72.413377
10  sqft_basement             38.788584
11       yr_built          -2517.942037
12   yr_renovated             24.638399
13            lat         557598.912230
14           long        -108817.697362
15  sqft_living15             22.779026
16     sqft_lot15             -0.422640


#### L1 - Lasso Regression

In [27]:
# List to maintain the cross-validation scores
cross_val_scores_lasso = []
 
# List to maintain the different values of Lambda
Lambda = []
 
# Loop to compute the cross-validation scores
for i in range(1, 9):
    lassoModel = Lasso(alpha = i * 0.25, tol = 0.0925)
    lassoModel.fit(X_train, y_train)
    scores = cross_val_score(lassoModel, X, y, cv = 10)
    avg_cross_val_score = mean(scores)*100
    cross_val_scores_lasso.append(avg_cross_val_score)
    Lambda.append(i * 0.25)
 
# Loop to print the different values of cross-validation scores
for i in range(0, len(alpha)):
    print(str(alpha[i])+' : '+str(cross_val_scores_lasso[i]))

0.25 : 69.0899676246163
0.5 : 69.08997309354582
0.75 : 69.0899785338932
1.0 : 69.08998394768226
1.25 : 69.08998932707652
1.5 : 69.08999467945569
1.75 : 69.0900000188295
2.0 : 69.09000531192424


In [28]:
# Building and fitting the Lasso Regression Model
lassoModelChosen = Lasso(alpha = 2, tol = 0.0925)
lassoModelChosen.fit(X_train, y_train)
 
# Evaluating the Lasso Regression model
print(lassoModelChosen.score(X_test, y_test))

0.6883810155406243


In [33]:
lasso_coef = pd.DataFrame()
lasso_coef["Columns"]= X_train.columns
lasso_coef['Coefficient Estimate'] = pd.Series(lassoModelChosen.coef_)
print(lasso_coef)

          Columns  Coefficient Estimate
0        bedrooms         -36337.339987
1       bathrooms          44877.678782
2     sqft_living            286.353339
3        sqft_lot              0.104182
4          floors          -1916.331156
5      waterfront         582355.535937
6            view          49194.401085
7       condition          31834.744090
8           grade          99093.860030
9      sqft_above           -102.730098
10  sqft_basement           -136.523114
11       yr_built          -2511.459379
12   yr_renovated             24.584910
13            lat         561507.708739
14           long        -109479.106722
15  sqft_living15             22.823253
16     sqft_lot15             -0.421660
