# Overfitting in linear / Ridge / Lasso / ElasticNet regression


In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from numpy import genfromtxt
from sklearn.metrics import r2_score

### Dataset with a large number of feature variables - 104!

In [2]:
# Load the housing table: every column except the last is a feature,
# the last column is the regression target.
# NOTE(review): read_csv treats the first row as a header by default -
# confirm the CSV really has a header row, otherwise one sample is lost.
dataset = pd.read_csv('boston_housing.csv')
feature_columns, target_column = dataset.iloc[:, :-1], dataset.iloc[:, -1]
X, y = feature_columns.values, target_column.values

# Report the array shapes, one per line.
print(X.shape, y.shape, sep='\n')

(505, 104)
(505,)


In [3]:
# Hold out 25% of the rows for testing; random_state=0 fixes the shuffle.
split_arrays = train_test_split(X, y, test_size=0.25, random_state=0)
X_train, X_test, y_train, y_test = split_arrays

## Checking overfitting on a Linear Regression model

### First we fit the model on the training dataset

In [4]:
# Unregularized least-squares baseline, fitted on the training split only.
ols_estimator = LinearRegression()
lr = ols_estimator.fit(X_train, y_train)  # fit() returns the estimator itself

### Then we evaluate the fitted model using the training dataset and test dataset
### Evaluation on test dataset is significantly worse than on training dataset
### Classic indication of overfitting 

In [5]:
# Predict on both splits with the fitted linear model ...
y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)

# ... then compare R^2 on seen vs. unseen data.
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)
print('R^2 on training dataset : ', train_r2)
print('R^2 on test dataset : ', test_r2)

R^2 on training dataset :  0.9353402315113886
R^2 on test dataset :  0.84287087704203


#### Standard linear regression generalizes worse to the test dataset because of overfitting. 
#### Ordinary least squares minimizes only the training error, with no penalty on the size of the coefficients, so with 104 feature variables the coefficients can become very large and end up fitting noise in the training data. 

In [6]:
# Show the weight the linear model learned for each feature.
lr_coefficients = lr.coef_
print(lr_coefficients)

[-6.56000579e+02  1.82845512e+00 -5.92857666e+01  1.00201749e+01
 -2.53983517e+01  6.47104630e+01  4.73270639e+01 -3.88158553e+01
  4.12557369e+01  2.50660785e+01 -6.62975377e+00  1.91927729e+01
  2.33429828e+01  2.07014695e+01  2.41764970e+03  1.44806330e+03
  9.70954290e+01 -6.06765721e+01  6.22991206e+01  6.00731344e+01
  1.08168075e+02  9.44648254e+02 -1.97696260e+03  5.73260992e+02
  9.17325284e-01  5.98900993e+01 -4.89503909e+00 -1.66244529e+01
  2.45990765e+00 -5.56656273e+01  1.19101421e+01 -1.91814930e+00
 -2.12154187e+01 -2.53092085e+01  4.37918484e+01 -8.14296976e+00
  1.23883990e+01 -2.32651036e+01  3.08203488e+01 -3.41834532e+00
 -8.19474516e+00  4.47099319e+01  7.06819308e+00  1.97265795e+00
  1.73647824e+01  1.01707112e+00 -6.99336097e+00  1.61714772e+01
 -1.30004585e+01  1.00201749e+01 -2.43804133e+01 -3.34558107e+01
  5.47222956e+00  1.96658675e+01 -1.52626564e+01  3.68338940e+01
 -1.49900353e+01  3.64322407e+00 -4.95778327e+00  3.81139426e+00
  4.02029523e+01 -2.10483

## Checking overfitting on a Ridge Regression model

### Again, we first fit the model on the training dataset


In [7]:
from sklearn.linear_model import Ridge

# Ridge adds an L2 penalty to least squares; alpha sets its strength.
ridge_estimator = Ridge(alpha=0.7)
ridge = ridge_estimator.fit(X_train, y_train)  # fit() returns the estimator itself

### Then we evaluate the fitted model using the training dataset and test dataset
### Evaluation on test dataset is nearly the same as that on training dataset
### This could be an optimal model fit for the dataset 

In [8]:
# Predict on both splits with the fitted ridge model ...
y_train_pred = ridge.predict(X_train)
y_test_pred = ridge.predict(X_test)

# ... then compare R^2 on seen vs. unseen data.
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)
print('R^2 on training dataset : ', train_r2)
print('R^2 on test dataset : ', test_r2)


R^2 on training dataset :  0.8711041719414669
R^2 on test dataset :  0.8537431288976209


#### Ridge regression generalizes better because it adds an L2 penalty (scaled by `alpha`) to the cost function, which shrinks the coefficients toward zero and makes the model less sensitive to noise in the training data. 

In [9]:
# Show the weight the ridge model learned for each feature.
ridge_coefficients = ridge.coef_
print(ridge_coefficients)

[-1.63163728e+00 -2.01357526e+00 -9.31241432e-01  4.70491189e-01
  6.13803123e-01  9.24673064e+00  1.32661845e+00 -5.89940351e+00
  6.12801980e+00 -5.18288062e-01 -1.93801736e+00  1.65574330e+00
 -4.18613434e+00  8.49937034e-01  7.77475363e-03 -1.01490325e+00
  9.79343319e-01 -1.23998120e+00 -1.79557595e+00 -1.33340573e+00
 -1.88092745e-01 -2.16664461e+00 -1.80464253e+00 -1.56801113e+00
 -2.06058804e+00 -3.14974976e-01  2.71007136e+00 -2.05272772e+00
  1.85022711e-01 -3.12763249e-01  4.25457983e+00 -1.22807420e+00
 -8.06748932e-02  2.84577736e-01  1.25283968e+00  1.14542188e+00
 -1.91867976e+00 -1.52941703e+00  3.95825389e+00  1.49120568e+00
  2.10246654e+00 -3.76239367e+00  3.25157502e+00 -4.57288694e+00
  1.32909263e+00  3.25873498e+00 -3.82935635e+00  5.10514782e-01
 -4.57592649e+00  4.70491189e-01 -5.82378759e+00 -3.58162522e+00
  2.66297661e+00 -1.37757804e+00  1.87556317e+00  2.89532122e+00
  2.61912179e-01  2.35327862e+00 -3.49188312e+00 -1.56462779e+00
 -3.61639326e+00 -2.73135

## Checking overfitting on a Lasso Regression model

In [10]:
from sklearn.linear_model import Lasso

# Lasso adds an L1 penalty to least squares; alpha=1.0 sets its strength.
lasso_estimator = Lasso(alpha=1.0)
lasso = lasso_estimator.fit(X_train, y_train)  # fit() returns the estimator itself

### Evaluation on test dataset and training dataset is awful !
### This is a classic indication of underfitting - model is not a good fit for the dataset 

In [11]:
# Predict on both splits with the fitted Lasso model ...
y_train_pred = lasso.predict(X_train)
y_test_pred = lasso.predict(X_test)

# ... then compare R^2 on seen vs. unseen data.
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)
print('R^2 on training dataset : ', train_r2)
print('R^2 on test dataset : ', test_r2)

R^2 on training dataset :  0.2399356285166997
R^2 on test dataset :  0.22726292823054306


#### Lasso performs badly because of underfitting. 
#### The model doesn't work well because most of the coefficients have become exactly zero. 

In [12]:
# Show the weight the Lasso model learned for each feature.
lasso_coefficients = lasso.coef_
print(lasso_coefficients)

[-0.          0.         -0.          0.         -0.          0.
 -0.          0.         -0.         -0.         -0.          0.
 -5.239652   -0.          0.         -0.          0.         -0.
 -0.         -0.         -0.         -0.         -0.         -0.
 -0.         -0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.         -0.          0.         -0.         -0.
 -0.         -0.         -0.         -0.         -0.         -0.
 -0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.         -0.
 -0.         -0.         -0.         -0.         -0.         -0.
 -0.         -0.          0.          0.          0.         -0.
 -0.         -0.          0.         -0.         -0.         -0.
 -0.         -0.         -2.94821365 -0.         -0.          0.
 -0.         -0.         -0.          0.         -0.         -0.34924127
 -0.         -0. 

In [13]:
# Count the coefficients that are not exactly zero.
n_features_used = (lasso.coef_ != 0).sum()
print(f"Number of features actually used : {n_features_used}")

Number of features actually used : 3


## Manual tuning of hyperparameter alpha to reduce underfitting in Lasso
#### Different hyperparameter values create different models which might be a better fit for the dataset

[scikit-learn documentation for Lasso regression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html)



In [14]:
# Refit Lasso with a much smaller alpha (0.01) and tol set to 1e-2.
# NOTE(review): the loose tol presumably avoids a convergence warning - confirm.
lasso_tuned = Lasso(alpha=0.01, tol=1e-2)
lasso = lasso_tuned.fit(X_train, y_train)  # fit() returns the estimator itself

#### Evaluation on test dataset is nearly the same as that on training dataset
#### Hyperparameter tuning has resulted in a better model fit for the Lasso regression model

In [15]:
# Predict on both splits with the re-fitted Lasso model ...
y_train_pred = lasso.predict(X_train)
y_test_pred = lasso.predict(X_test)

# ... then compare R^2 on seen vs. unseen data.
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)
print('R^2 on training dataset : ', train_r2)
print('R^2 on test dataset : ', test_r2)

R^2 on training dataset :  0.873030657197334
R^2 on test dataset :  0.8536025481616855


#### This time the coefficients of the more significant feature columns are non-zero, so those features are included in the model 

In [16]:
# Show the weight the re-fitted Lasso model learned for each feature.
lasso_coefficients = lasso.coef_
print(lasso_coefficients)

[ -0.          -0.          -0.           0.          -0.
   0.           0.          -7.15687746  12.22619531   0.
  -0.           0.          -0.          -0.          -0.
  -0.           0.          -0.          -0.          -0.
  -0.          -9.1779753   -0.          -0.          -0.
  -0.           1.98743341  -0.           0.          -0.
   0.          -0.           0.          -0.           0.
   0.          -0.          -0.           1.1019535    0.
   0.          -0.           2.61245355  -4.89671657   0.
   2.62370235  -0.16967295   0.          -3.18012303   0.
  -2.51401802  -1.61594818   2.2930954   -0.           1.9735909
   0.           0.           1.13318507  -0.          -2.79370496
  -2.2969382   -0.          -0.          -0.          -0.
  -6.76431995  -0.          -0.          31.90763633  -0.
   0.         -14.7441085  -10.89514275  -8.84532558   7.76660855
 -14.7993476    0.          -2.83200332   7.33316484   0.
  -0.          -0.         -11.4865197    0.     

In [17]:
# Count the coefficients that are not exactly zero.
n_features_used = (lasso.coef_ != 0).sum()
print(f"Number of features actually used : {n_features_used}")

Number of features actually used : 32


### Checking overfitting on an ElasticNet Regression model

In [18]:
from sklearn.linear_model import ElasticNet

# ElasticNet mixes L1 and L2 penalties; l1_ratio=0.01 puts almost all of the weight on L2.
elastic_net_estimator = ElasticNet(alpha=0.01, l1_ratio=0.01, tol=1e-2)
elastic_net = elastic_net_estimator.fit(X_train, y_train)  # fit() returns the estimator itself

In [19]:
# Predict on both splits with the fitted ElasticNet model ...
y_train_pred = elastic_net.predict(X_train)
y_test_pred = elastic_net.predict(X_test)

# ... then compare R^2 on seen vs. unseen data.
train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)
print('R^2 on training dataset : ', train_r2)
print('R^2 on test dataset : ', test_r2)

R^2 on training dataset :  0.8068701710913826
R^2 on test dataset :  0.8048163676558451
