In [26]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from numpy import genfromtxt
from sklearn.metrics import r2_score

### Dataset with a large number of feature variables (104)

In [27]:
# Download the CSV and parse it into a 2-D float array.
data_url = 'https://raw.githubusercontent.com/m-mehdi/tutorials/main/boston_housing.csv'
dataset = genfromtxt(data_url, delimiter=',')

# Every column except the last is a feature; the last column is the regression target.
X, y = dataset[:, :-1], dataset[:, -1]
print(X.shape)
print(y.shape)

(506, 104)
(506,)


In [28]:
# Hold out 25% of the rows for testing; the fixed random_state keeps the split reproducible.
split = train_test_split(X, y, test_size=0.25, random_state=0)
X_train, X_test, y_train, y_test = split

### Checking overfitting on a Linear Regression model

In [29]:
# Ordinary least-squares baseline with no regularization.
lr = LinearRegression()
lr = lr.fit(X_train, y_train)  # fit() returns the fitted estimator

### Evaluation on test dataset is significantly worse than on training dataset
### Classic indication of overfitting 

In [30]:
# Predict on both splits, then report R^2 for each so the train/test gap is visible.
y_train_pred, y_test_pred = lr.predict(X_train), lr.predict(X_test)

train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print('R^2 on training dataset : ', train_r2)
print('R^2 on test dataset : ', test_r2)

R^2 on training dataset :  0.9520519609032733
R^2 on test dataset :  0.6074721959665701


#### Standard linear regression performs badly on the test set because of overfitting. 
#### With 104 features and no penalty on coefficient size, the least-squares cost function is minimized on the training data by assigning very large coefficients (see below), which fits noise and generalizes poorly. 

In [31]:
# Show the fitted coefficients of the unregularized model (one value per feature column).
print(lr.coef_)

[-4.12710947e+02 -5.22432068e+01 -1.31898815e+02 -1.20041365e+01
 -1.55107129e+01  2.87163342e+01  5.47040992e+01 -4.95346659e+01
  2.65823927e+01  3.70620316e+01 -1.18281674e+01 -1.80581965e+01
 -1.95246830e+01  1.22025403e+01  2.98078144e+03  1.50084257e+03
  1.14187325e+02 -1.69700520e+01  4.09613691e+01 -2.42636646e+01
  5.76157466e+01  1.27812142e+03 -2.23986944e+03  2.22825472e+02
 -2.18201083e+00  4.29960320e+01 -1.33981515e+01 -1.93893485e+01
 -2.57541277e+00 -8.10130128e+01  9.66019367e+00  4.91423718e+00
 -8.12114800e-01 -7.64694179e+00  3.37837099e+01 -1.14464390e+01
  6.85083979e+01 -1.73753604e+01  4.28128204e+01  1.13988209e+00
 -7.72696840e-01  5.68255921e+01  1.42875996e+01  5.39551110e+01
 -3.21709644e+01  1.92709675e+01 -1.38852338e+01  6.06343266e+01
 -1.23153942e+01 -1.20041365e+01 -1.77243899e+01 -3.39868183e+01
  7.08999816e+00 -9.22538241e+00  1.71980268e+01 -1.27718431e+01
 -1.19727581e+01  5.73871915e+01 -1.75331865e+01  4.10103194e+00
  2.93666477e+01 -1.76611

### Checking overfitting on a Ridge Regression model

In [32]:
from sklearn.linear_model import Ridge

# L2-regularized least squares; alpha controls the strength of the penalty.
ridge = Ridge(alpha=0.7)
ridge = ridge.fit(X_train, y_train)

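### Evaluation on the test dataset is still worse than on the training dataset, but the gap is much smaller than for plain linear regression
### Ridge trades a little training accuracy for clearly better generalization, so it is a better fit for this dataset 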

In [33]:
# Predict on both splits, then report R^2 for each so the train/test gap is visible.
y_train_pred, y_test_pred = ridge.predict(X_train), ridge.predict(X_test)

train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print('R^2 on training dataset : ', train_r2)
print('R^2 on test dataset : ', test_r2)


R^2 on training dataset :  0.8957405768115358
R^2 on test dataset :  0.7614807948118889


#### Ridge regression generalizes better because its L2 penalty adds the squared size of the coefficients to the cost function, which shrinks them toward zero (compare the magnitudes below with those of plain linear regression) and reduces overfitting. 

In [34]:
# Show the fitted ridge coefficients (one value per feature column).
print(ridge.coef_)

[-1.50621924e+00 -2.11678911e+00 -1.69158947e+00 -1.39618680e-01
  1.60062726e-01  9.31427114e+00  4.76956483e-01 -5.56235430e+00
  4.72632620e+00 -8.23255769e-01 -1.37993788e+00  1.18515649e+00
 -3.69120088e+00  9.14612863e-01  3.54636199e-03 -8.76814786e-01
  9.85262089e-01 -1.58410643e+00 -1.52248147e+00 -1.54938713e+00
 -3.16457443e-02 -1.93071784e+00 -1.59122380e+00 -1.46511186e+00
 -1.61431014e+00 -6.30795079e-01  2.79754845e+00 -2.42159726e+00
  4.65327122e-01 -4.96611258e-01  5.72262805e+00 -2.04878131e+00
  3.58355553e-02  8.33936325e-01 -3.93835799e-01  3.77348305e-01
 -1.72478428e+00 -3.39883756e+00  3.86694415e+00  9.33171449e-01
  2.11290564e+00 -4.14653457e+00  2.84785566e+00 -3.83353343e+00
  2.13808310e+00  3.57326283e+00 -2.66422497e+00 -2.64111830e-01
 -3.92026709e+00 -1.39618680e-01 -6.31004004e+00 -3.85599897e+00
  3.41709098e+00 -1.02194018e+00  3.29935139e+00  2.56981274e+00
  1.33978535e+00  2.38957144e+00 -3.00921598e+00 -1.12884164e+00
 -3.22795350e+00 -2.51279

### Checking overfitting on a Lasso Regression model

In [35]:
from sklearn.linear_model import Lasso

# L1-regularized regression; start with a penalty strength of 1.0.
lasso_alpha = 1.0
lasso = Lasso(alpha=lasso_alpha).fit(X_train, y_train)

### Evaluation on both the training and test datasets is poor
### This is a classic indication of underfitting - the model is too constrained to fit the dataset 

In [36]:
# Predict on both splits, then report R^2 for each; low scores on both indicate underfitting.
y_train_pred, y_test_pred = lasso.predict(X_train), lasso.predict(X_test)

train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print('R^2 on training dataset : ', train_r2)
print('R^2 on test dataset : ', test_r2)

R^2 on training dataset :  0.29323768991114596
R^2 on test dataset :  0.20937503255272272


#### Lasso performs badly because of underfitting. 
#### With alpha=1.0 the L1 penalty is so strong that all but 4 of the 104 coefficients have been driven to exactly zero, leaving too few features to model the target. 

In [37]:
# Show the fitted lasso coefficients; entries printed as 0. / -0. are features the model dropped.
print(lasso.coef_)

[-0.          0.         -0.          0.         -0.          0.
 -0.          0.         -0.         -0.         -0.          0.
 -5.3529079  -0.          0.         -0.          0.         -0.
 -0.         -0.         -0.         -0.         -0.         -0.
 -0.         -0.          0.          0.          0.          0.
  0.          0.          0.          0.          0.          0.
  0.          0.         -0.          0.         -0.         -0.
 -0.         -0.         -0.         -0.         -0.         -0.
 -0.          0.          0.          0.          0.          0.
  0.          0.          0.          0.         -0.         -0.
 -0.         -0.         -0.         -0.         -0.         -0.
 -0.         -0.          0.          0.          0.         -0.
 -0.         -0.          0.         -0.         -0.          0.
 -0.         -1.05063037 -3.3104274  -0.         -0.          0.
 -0.         -0.         -0.          0.         -0.         -0.41386744
 -0.         -0. 

In [38]:
# Count the coefficients lasso left non-zero, i.e. the features the model still uses.
n_features_used = (lasso.coef_ != 0).sum()
print(f"Number of features actually used : {n_features_used}")

Number of features actually used : 4


### Tuning of hyperparameter alpha to reduce underfitting in Lasso
#### Different hyperparameter values create different models which might be a better fit for the dataset

In [39]:
# Refit lasso with a much weaker penalty (alpha=0.01 instead of 1.0); rebinds `lasso`.
# NOTE(review): tol=1e-2 is looser than scikit-learn's documented default (1e-4),
# presumably to avoid a ConvergenceWarning - consider raising max_iter instead; confirm.
lasso = Lasso(alpha=0.01, tol=1e-2).fit(X_train, y_train)

In [40]:
# Predict on both splits with the retuned lasso, then report R^2 for each.
y_train_pred, y_test_pred = lasso.predict(X_train), lasso.predict(X_test)

train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print('R^2 on training dataset : ', train_r2)
print('R^2 on test dataset : ', test_r2)

R^2 on training dataset :  0.8955681364442507
R^2 on test dataset :  0.7660240830347568


#### This time, with the smaller alpha, more coefficients stay non-zero, so the more significant feature columns are included in the model 

In [41]:
# Show the coefficients of the retuned lasso; zero entries are features the model dropped.
print(lasso.coef_)

[-0.00000000e+00 -0.00000000e+00 -0.00000000e+00  0.00000000e+00
 -0.00000000e+00  0.00000000e+00 -0.00000000e+00 -8.80667134e+00
  1.24181345e+01  0.00000000e+00  0.00000000e+00  0.00000000e+00
 -2.90779264e+00 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00
  0.00000000e+00 -0.00000000e+00 -0.00000000e+00 -0.00000000e+00
 -0.00000000e+00 -8.54659728e+00 -0.00000000e+00 -0.00000000e+00
 -0.00000000e+00 -0.00000000e+00  2.33256129e+00 -0.00000000e+00
  0.00000000e+00 -0.00000000e+00  0.00000000e+00 -1.65371847e-02
  0.00000000e+00 -0.00000000e+00 -0.00000000e+00  0.00000000e+00
 -0.00000000e+00 -0.00000000e+00  2.60668517e-02  0.00000000e+00
  0.00000000e+00 -0.00000000e+00  4.47615503e-02 -1.80975950e+00
  0.00000000e+00  3.25973213e+00 -0.00000000e+00 -0.00000000e+00
 -0.00000000e+00  0.00000000e+00 -4.46033185e+00 -2.00949936e+00
  3.99444538e+00 -0.00000000e+00  4.31197312e+00  0.00000000e+00
  0.00000000e+00  8.85575461e-02 -0.00000000e+00 -7.69700989e-01
 -4.28194566e+00 -0.00000

In [42]:
# Boolean mask of the coefficients that survived the L1 penalty; its sum is the feature count.
used_mask = lasso.coef_ != 0
print(f"Number of features actually used : {used_mask.sum()}")

Number of features actually used : 33


### Checking overfitting on an ElasticNet Regression model

In [43]:
from sklearn.linear_model import ElasticNet

# Combined L1/L2 penalty; l1_ratio=0.01 puts almost all of the weight on the L2 term.
elastic_net = ElasticNet(alpha=0.01, l1_ratio=0.01, tol=1e-2)
elastic_net = elastic_net.fit(X_train, y_train)

In [44]:
# Predict on both splits with the elastic net, then report R^2 for each.
y_train_pred, y_test_pred = elastic_net.predict(X_train), elastic_net.predict(X_test)

train_r2 = r2_score(y_train, y_train_pred)
test_r2 = r2_score(y_test, y_test_pred)

print('R^2 on training dataset : ', train_r2)
print('R^2 on test dataset : ', test_r2)

R^2 on training dataset :  0.8354121139215439
R^2 on test dataset :  0.6975711194094201
