## Regularization (Shrinkage Models)



In [1]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import statsmodels.formula.api as sm
from sklearn.linear_model import LinearRegression, Lasso, Ridge, LassoCV, BayesianRidge
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split

# Star import is kept last so any names it defines keep the same precedence as before.
from dmba import *

In [2]:
# Print the installed scikit-learn version so the recorded outputs can be tied to it.
import sklearn 
print(sklearn.__version__)

1.3.0


In [4]:
# Read ToyotaCorolla.csv and keep only the first 1000 rows.
df = pd.read_csv('ToyotaCorolla.csv', sep=',').iloc[:1000]

# Columns used as model inputs, and the column being predicted.
predictors = [
    'Age_08_04', 'KM', 'Fuel_Type', 'HP', 'Met_Color',
    'Automatic', 'CC', 'Doors', 'Quarterly_Tax', 'Weight',
]
outcome = 'Price'

# Dummy-encode the predictor frame; drop_first=True omits the first level of
# each encoded column so the dummies are not perfectly collinear.
X = pd.get_dummies(df[predictors], drop_first=True)
y = df[outcome]

# Hold out 40% of the rows as a test set; random_state fixes the shuffle so
# the split is the same on every run.
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=1
)

#### Linear Regression Results for reference: 

                      Mean Error (ME) : 103.6803
       Root Mean Squared Error (RMSE) : 1312.8523
            Mean Absolute Error (MAE) : 1017.5972
          Mean Percentage Error (MPE) : -0.2633
    Mean Absolute Percentage Error (MAPE) : 9.0111


### Lasso

In [5]:
# Fit a lasso (L1-penalised) regression with alpha = 1 on the training split.
lasso = Lasso(alpha=1).fit(x_train, y_train)

# Score the fitted model on the held-out test split.
lasso_test_pred = lasso.predict(x_test)
regressionSummary(y_test, lasso_test_pred)


Regression statistics

                      Mean Error (ME) : 104.7566
       Root Mean Squared Error (RMSE) : 1312.7798
            Mean Absolute Error (MAE) : 1017.4997
          Mean Percentage Error (MPE) : -0.2578
Mean Absolute Percentage Error (MAPE) : 9.0084


In [5]:
# Print only the coefficients that lasso left non-zero (a zero coefficient
# means the predictor was dropped). In the output recorded below, all 11
# coefficients are non-zero, so alpha = 1 did not remove any predictor.
print (lasso.coef_[lasso.coef_!=0])

[-1.40732235e+02 -1.78979828e-02  3.57524404e+01  7.70490211e+01
  3.91123192e+02  2.11528407e-02 -4.93513869e+01  1.34456885e+01
  1.31123238e+01  9.20546538e+02  2.16820642e+03]


### Ridge

In [6]:
# Fit a ridge (L2-penalised) regression with alpha = 1 on the training split.
ridge = Ridge(alpha=1).fit(x_train, y_train)

# Score the fitted model on the held-out test split.
ridge_test_pred = ridge.predict(x_test)
regressionSummary(y_test, ridge_test_pred)


Regression statistics

                      Mean Error (ME) : 107.2419
       Root Mean Squared Error (RMSE) : 1314.2547
            Mean Absolute Error (MAE) : 1018.3981
          Mean Percentage Error (MPE) : -0.2404
Mean Absolute Percentage Error (MAPE) : 9.0167


### Statsmodels

In [6]:
# Put the training predictors and the training outcome side by side in one
# frame (joined on the row index) so a formula-based model can refer to
# every column by name.
data = x_train.join(y_train)
# Preview the first rows of the combined frame.
data.head()

Unnamed: 0,Age_08_04,KM,HP,Met_Color,Automatic,CC,Doors,Quarterly_Tax,Weight,Fuel_Type_Diesel,Fuel_Type_Petrol,Price
371,35,18000,110,1,0,1600,5,85,1075,0,1,13995
45,23,84000,90,0,0,2000,5,234,1270,1,0,19000
560,54,39291,110,1,0,1600,3,69,1040,0,1,10950
748,61,81170,110,1,0,1600,4,69,1040,0,1,8950
419,55,94122,86,1,0,1300,3,69,1015,0,1,8950


In [7]:
# Build the model formula: Price on the left, every training column as an
# additive term on the right.
formula = 'Price ~ {}'.format(' + '.join(x_train.columns))

# Fit ordinary least squares on the combined training frame.
ols_spec = sm.ols(formula=formula, data=data)
sm_cars = ols_spec.fit()

# Last expression of the cell, so the notebook renders the summary tables.
sm_cars.summary()

0,1,2,3
Dep. Variable:,Price,R-squared:,0.856
Model:,OLS,Adj. R-squared:,0.854
Method:,Least Squares,F-statistic:,319.0
Date:,"Tue, 06 Feb 2024",Prob (F-statistic):,1.73e-239
Time:,19:12:43,Log-Likelihood:,-5198.1
No. Observations:,600,AIC:,10420.0
Df Residuals:,588,BIC:,10470.0
Df Model:,11,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-1319.3544,1728.427,-0.763,0.446,-4713.997,2075.288
Age_08_04,-140.7488,5.142,-27.374,0.000,-150.847,-130.650
KM,-0.0178,0.002,-7.286,0.000,-0.023,-0.013
HP,36.1034,5.321,6.785,0.000,25.653,46.554
Met_Color,84.2818,127.005,0.664,0.507,-165.158,333.721
Automatic,416.7820,259.794,1.604,0.109,-93.454,927.018
CC,0.0177,0.099,0.179,0.858,-0.177,0.213
Doors,-50.6579,65.187,-0.777,0.437,-178.686,77.371
Quarterly_Tax,13.6253,2.518,5.411,0.000,8.680,18.571

0,1,2,3
Omnibus:,62.422,Durbin-Watson:,1.899
Prob(Omnibus):,0.0,Jarque-Bera (JB):,366.046
Skew:,0.186,Prob(JB):,3.27e-80
Kurtosis:,6.808,Cond. No.,2200000.0


For regularization with statsmodels, use `OLS.fit_regularized` and set the argument `L1_wt=0` for ridge regression or `L1_wt=1` for lasso.

In [9]:
# Test-set metrics for the fitted lasso model.
# mean_squared_error and r2_score are imported in the import cell at the top
# of the notebook.

# Predict once and reuse the result for both metrics.
lasso_test_pred = lasso.predict(x_test)

# The mean squared error
print('Mean squared error: %.2f'
      % mean_squared_error(y_test, lasso_test_pred))
# The coefficient of determination: 1 is perfect prediction
print('(R^2)Coefficient of determination: %.2f'
      % r2_score(y_test, lasso_test_pred))

Mean squared error: 1723390.75
(R^2)Coefficient of determination: 0.88
