# IMPORT LIBRARIES

In [125]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import  OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
from sklearn import datasets
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.model_selection import cross_val_score, GridSearchCV
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) # to avoid deprecation warnings

import matplotlib.pyplot as plt
import plotly.figure_factory as ff
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

# IMPORT DATASET

In [126]:
dataset = pd.read_csv('Walmart_Store_sales.csv')


# BASIC STATS 

In [127]:
# Basic stats
print("Number of rows : {}".format(dataset.shape[0]))
print()

print("Number of columns : {}".format(dataset.shape[1]))
print()

print("Display of dataset: ")
display(dataset.head())
print()

print("Basics statistics: ")
data_desc = dataset.describe(include='all')
display(data_desc)
print()

print("Percentage of missing values: ")
display(100*dataset.isnull().sum()/dataset.shape[0])

Number of rows : 150

Number of columns : 8

Display of dataset: 


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,6.0,18-02-2011,1572117.54,,59.61,3.045,214.777523,6.858
1,13.0,25-03-2011,1807545.43,0.0,42.38,3.435,128.616064,7.47
2,17.0,27-07-2012,,0.0,,,130.719581,5.936
3,11.0,,1244390.03,0.0,84.57,,214.556497,7.346
4,6.0,28-05-2010,1644470.66,0.0,78.89,2.759,212.412888,7.092



Basics statistics: 


Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
count,150.0,132,136.0,138.0,132.0,136.0,138.0,135.0
unique,,85,,,,,,
top,,19-10-2012,,,,,,
freq,,4,,,,,,
mean,9.866667,,1249536.0,0.07971,61.398106,3.320853,179.898509,7.59843
std,6.231191,,647463.0,0.271831,18.378901,0.478149,40.274956,1.577173
min,1.0,,268929.0,0.0,18.79,2.514,126.111903,5.143
25%,4.0,,605075.7,0.0,45.5875,2.85225,131.970831,6.5975
50%,9.0,,1261424.0,0.0,62.985,3.451,197.908893,7.47
75%,15.75,,1806386.0,0.0,76.345,3.70625,214.934616,8.15



Percentage of missing values: 


Store            0.000000
Date            12.000000
Weekly_Sales     9.333333
Holiday_Flag     8.000000
Temperature     12.000000
Fuel_Price       9.333333
CPI              8.000000
Unemployment    10.000000
dtype: float64

# DATASET CLEANING 

In [128]:
# The target variable (Weekly_Sales) contains some missing values. 
# We will have to just drop the lines in the dataset for which the value in this column is missing.

dataset = dataset.dropna(subset=['Weekly_Sales'])

print("Number of rows : {}".format(dataset.shape[0]))
print()
print("Percentage of missing values after cleaning : ")
display(100*dataset.isnull().sum()/dataset.shape[0])

Number of rows : 136

Percentage of missing values after cleaning : 


Store            0.000000
Date            13.235294
Weekly_Sales     0.000000
Holiday_Flag     8.088235
Temperature     11.029412
Fuel_Price       8.823529
CPI              8.088235
Unemployment    10.294118
dtype: float64

# CORRELATIONS 

In [129]:
# Let's analyse the relationship of each variable with the target (Weekly_Sales): 

fig = px.scatter_matrix(dataset)
fig.update_layout(
        title = go.layout.Title(text = "Bivariate analysis", x = 0.5), height=1500, width=1500)
fig.show()

In [130]:
# Let's plot the correlation matrix :

corr_matrix = dataset.corr(numeric_only=True).round(2)

fig=ff.create_annotated_heatmap(corr_matrix.values,
x= corr_matrix.columns.tolist(),
y=corr_matrix.index.tolist())

fig.show()

# DATASET CLEANING 

In [131]:
# The Date column cannot be included as it is in the model. 
# We will create new columns that contain the following numeric features :
# year, month, day, day of week

# Convert the column 'Date' to datetime fromat first :
dataset['Date'] = pd.to_datetime(dataset['Date'], format='%d-%m-%Y')

# Delete the missing values in Date column :
dataset = dataset.dropna(subset=['Date'])

# Create new columns :
dataset['Year'] = dataset['Date'].dt.year.astype(int) # Because our colum Date containts float values
dataset['Month'] = dataset['Date'].dt.month.astype(int)
dataset['Day'] = dataset['Date'].dt.day.astype(int)
dataset['Day_of_week'] = dataset['Date'].dt.dayofweek.astype(int)  # 0 for monday, 1 for tuesday, etc.

print("Number of rows : {}".format(dataset.shape[0]))
print()
print("Number of columns : {}".format(dataset.shape[1]))
print()
dataset.head()

Number of rows : 118

Number of columns : 12



Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Year,Month,Day,Day_of_week
0,6.0,2011-02-18,1572117.54,,59.61,3.045,214.777523,6.858,2011,2,18,4
1,13.0,2011-03-25,1807545.43,0.0,42.38,3.435,128.616064,7.47,2011,3,25,4
4,6.0,2010-05-28,1644470.66,0.0,78.89,2.759,212.412888,7.092,2010,5,28,4
5,4.0,2010-05-28,1857533.7,0.0,,2.756,126.160226,7.896,2010,5,28,4
6,15.0,2011-06-03,695396.19,0.0,69.8,4.069,134.855161,7.658,2011,6,3,4


In [132]:
# Drop lines containing invalid values or outliers
# In this project, will be considered as outliers all the numeric features that don't fall within the range : [𝑋¯−3𝜎,𝑋¯+3𝜎] . This concerns the columns : Temperature, Fuel_price, CPI and Unemployment


low =  dataset['Temperature'].mean() - 3 *  dataset['Temperature'].std()
high =  dataset['Temperature'].mean() + 3 *  dataset['Temperature'].std()
print('The number of outliers in {} is {}'.format('Temperature', dataset.loc[(dataset['Temperature'] <= low) | (dataset['Temperature'] >= high)].shape[0]))


low =  dataset['Fuel_Price'].mean() - 3 *  dataset['Fuel_Price'].std()
high =  dataset['Fuel_Price'].mean() + 3 *  dataset['Fuel_Price'].std()
print('The number of outliers in {} is {}'.format('Fuel_Price', dataset.loc[(dataset['Fuel_Price'] <= low) | (dataset['Fuel_Price'] >= high)].shape[0]))


low =  dataset['CPI'].mean() - 3 *  dataset['CPI'].std()
high =  dataset['CPI'].mean() + 3 *  dataset['CPI'].std()
print('The number of outliers in {} is {}'.format('CPI', dataset.loc[(dataset['CPI'] <= low) | (dataset['CPI'] >= high)].shape[0]))


low =  dataset['Unemployment'].mean() - 3 *  dataset['Unemployment'].std()
high =  dataset['Unemployment'].mean() + 3 *  dataset['Unemployment'].std()
print('The number of outliers in {} is {}'.format('Unemployment', dataset.loc[(dataset['Unemployment'] <= low) | (dataset['Unemployment'] >= high)].shape[0]))



The number of outliers in Temperature is 0
The number of outliers in Fuel_Price is 0
The number of outliers in CPI is 0
The number of outliers in Unemployment is 5


In [133]:
print("Droping the outliers ...")
low =  dataset['Unemployment'].mean() - 3 *  dataset['Unemployment'].std()
high =  dataset['Unemployment'].mean() + 3 *  dataset['Unemployment'].std()
dataset=dataset.drop(dataset.loc[(dataset['Unemployment'] <= low) | (dataset['Unemployment'] >= high)].index)
print('The number of outliers in {} is {}'.format('Unemployment', dataset.loc[(dataset['Unemployment'] <= low) | (dataset['Unemployment'] >= high)].shape[0]))

Droping the outliers ...
The number of outliers in Unemployment is 0


In [134]:
# Basic stats again :
print("Number of rows : {}".format(dataset.shape[0]))
print()
print("Number of columns : {}".format(dataset.shape[1]))
print()
print("Percentage of missing values after cleaning : ")
display(100*dataset.isnull().sum()/dataset.shape[0])
dataset.head()

Number of rows : 113

Number of columns : 12

Percentage of missing values after cleaning : 


Store           0.000000
Date            0.000000
Weekly_Sales    0.000000
Holiday_Flag    7.964602
Temperature     8.849558
Fuel_Price      9.734513
CPI             7.964602
Unemployment    9.734513
Year            0.000000
Month           0.000000
Day             0.000000
Day_of_week     0.000000
dtype: float64

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Year,Month,Day,Day_of_week
0,6.0,2011-02-18,1572117.54,,59.61,3.045,214.777523,6.858,2011,2,18,4
1,13.0,2011-03-25,1807545.43,0.0,42.38,3.435,128.616064,7.47,2011,3,25,4
4,6.0,2010-05-28,1644470.66,0.0,78.89,2.759,212.412888,7.092,2010,5,28,4
5,4.0,2010-05-28,1857533.7,0.0,,2.756,126.160226,7.896,2010,5,28,4
6,15.0,2011-06-03,695396.19,0.0,69.8,4.069,134.855161,7.658,2011,6,3,4


# SEPARATING X AND Y 

In [135]:
# Target variable (Y) that we will try to predict, to separate from the others : Weekly_Sales
# Explanatory variables (X) : 
# Categorical variables : Store, Holiday_Flag
# Numerical variables : Temperature, Fuel_Price, CPI, Unemployment, Year, Month, Day, DayOfWeek

print("Separating labels from features...")

features_list = ["Store","Holiday_Flag","Temperature","Fuel_Price","CPI","Unemployment","Year","Month","Day","Day_of_week"]
target_variable = "Weekly_Sales"

X = dataset.loc[:,features_list]
Y = dataset.loc[:,target_variable]

print("...Done.")
print()

print('Y : ')
print(Y.head())
print()
print('X :')
X.head()

Separating labels from features...
...Done.

Y : 
0    1572117.54
1    1807545.43
4    1644470.66
5    1857533.70
6     695396.19
Name: Weekly_Sales, dtype: float64

X :


Unnamed: 0,Store,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Year,Month,Day,Day_of_week
0,6.0,,59.61,3.045,214.777523,6.858,2011,2,18,4
1,13.0,0.0,42.38,3.435,128.616064,7.47,2011,3,25,4
4,6.0,0.0,78.89,2.759,212.412888,7.092,2010,5,28,4
5,4.0,0.0,,2.756,126.160226,7.896,2010,5,28,4
6,15.0,0.0,69.8,4.069,134.855161,7.658,2011,6,3,4


# DIVIDING INTO TRAIN AND TEST SET

In [136]:
# Divide dataset Train set & Test set 

print("Dividing into train and test sets...")
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
print("...Done.")
print()

Dividing into train and test sets...
...Done.



# PREPROCESSING

In [137]:
# Encoding categorical features and standardizing numeric features

numeric_features = ["Temperature","Fuel_Price","CPI","Unemployment","Year","Month","Day","Day_of_week"]
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')), # missing values will be replaced by columns' mean
    ('scaler', StandardScaler())
])


categorical_features = ["Store","Holiday_Flag"]
categorical_transformer = Pipeline(
    steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder(drop='first')) # first column will be dropped to avoid creating correlations between features
    ])

preprocessor = ColumnTransformer(
    transformers=[
        ('cat', categorical_transformer, categorical_features),    
        ('num', numeric_transformer, numeric_features)
        ]
    )

In [138]:
print("Preprocessing X_train...")
print(X_train.head())
print()

X_train = preprocessor.fit_transform(X_train)
print("...Done!")
print(X_train[0:5,:]) # X_train is now a numpy array
print()

Preprocessing X_train...
     Store  Holiday_Flag  Temperature  Fuel_Price         CPI  Unemployment  \
52     9.0           0.0        85.02       2.653  214.896576           NaN   
11    18.0           0.0        52.02       2.878  132.763355         9.331   
105   19.0           0.0        72.83       2.932  132.598387         8.099   
110   20.0           1.0        28.85       3.179  204.643227         7.484   
75    20.0           0.0        75.17       2.808  204.567546         7.856   

     Year  Month  Day  Day_of_week  
52   2010      6   25            4  
11   2010     10   15            4  
105  2010      7   30            4  
110  2010     12   31            4  
75   2010      6   25            4  

...Done!
[[ 0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  1.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0

In [139]:
print("Preprocessing X_test...")
print(X_test.head())
print()
X_test = preprocessor.transform(X_test)
print("...Done!")
print(X_test[0:5,:]) 
print()

Preprocessing X_test...
     Store  Holiday_Flag  Temperature  Fuel_Price         CPI  Unemployment  \
141    5.0           0.0        62.37         NaN  212.560411         6.768   
13     1.0           0.0        64.74       3.734  221.211813         7.348   
29     6.0           0.0          NaN       3.523  217.270654         6.925   
10     8.0           0.0        82.92       3.554  219.070197         6.425   
144    3.0           0.0        73.44       3.594  226.968844         6.034   

     Year  Month  Day  Day_of_week  
141  2010     11   12            4  
13   2012      3   16            4  
29   2011      8   26            4  
10   2011      8   19            4  
144  2012     10   19            4  

...Done!
[[ 0.00000000e+00  0.00000000e+00  0.00000000e+00  1.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.00000000e+00  0.00000000e+00  0.00000000e+00  0.00000000e+00
   0.

# TRAINING MODEL

In [140]:
# Train model
print("Train model...")
regressor = LinearRegression()
regressor.fit(X_train, Y_train)
print("...Done.")

Train model...
...Done.


# PREDICTIONS 

In [141]:
# Predictions on training set
print("Predictions on training set...")
Y_train_pred = regressor.predict(X_train)
print("...Done.")
print(Y_train_pred)
print()

Predictions on training set...
...Done.
[ 355585.09124367 1159355.29938787 1403098.60982156 2066897.86334844
 1925751.39058183 1993378.09109697 2170861.80479594 1583913.14514438
  608309.47251647 2435430.82890442 1425012.44373189 2050457.39423482
 1963010.32321674 1988442.98996716 1292993.22892673 1895015.95229027
  577940.55754069 1356732.09545341 1370325.28797105  922168.7875471
 2118430.29683601  330062.81192987 1976468.41972023  363264.01495491
 1650427.27893564 1847044.77913999 2037621.67177058 2047708.28745338
 2074840.9320166   683658.94625479  606434.1042874  1151700.92166614
  370649.67457154  403311.09748353 1538480.70206245 1983747.60606973
 2427756.87804595  429794.14134814 1620329.82185334 1560398.34734786
  507856.88236816  317368.86338655  403695.24855286 1491505.6134957
  756685.67597289  411578.63715934 1994109.78023765 2162634.00847892
 1757242.51        454895.21759604  944538.20767007 1480509.80572267
 2169009.66795581  138690.0555809  1399893.28404557  951527.42848

In [142]:
# Predictions on test set
print("Predictions on test set...")
Y_test_pred = regressor.predict(X_test)
print("...Done.")
print(Y_test_pred)
print()

Predictions on test set...
...Done.
[ 378501.12025534 1439255.57249924 1610241.58807788  882223.27111356
  466933.48815387 1108363.56016822 2062248.08061379 2316549.59407455
 2054918.82705112 1557403.30045123 1029671.02504803 2045617.7019931
 1120297.6495339   607686.4755896   466043.15125443   79356.42838598
  612205.50606513  168683.03779802 1810919.52958697  487763.60993983
 1924420.17546454  472644.73646539 2069806.24173904]



# CALCULATING R2

In [143]:
# Print R^2 scores
print("R2 score on training set : ", r2_score(Y_train, Y_train_pred))
print("R2 score on test set : ", r2_score(Y_test, Y_test_pred))

R2 score on training set :  0.9727237843593463
R2 score on test set :  0.9396190073986596


# COEFFICIENTS 

In [144]:
## Check coefficients :

print("coefficients are: ", regressor.coef_) 
print("Constant is: ", regressor.intercept_)

coefficients are:  [  355806.60652067 -1206228.15591785   279407.69552419 -1365747.32325047
    32640.44161767 -1000065.89835311  -714341.47889698 -1172390.63116077
   264167.76347575   161876.19715782   163524.64533844   546824.36642173
 -1029657.52583133 -1154682.20563374 -1097046.09419353  -601983.40020905
  -274981.85244811   402160.56031354   -52738.11298168   -31513.61547433
   -41295.66218524   -95520.49388857   -71936.21160876     4282.20729477
    74668.67929179   -37977.28662316        0.        ]
Constant is:  1659094.164800709


In [145]:
column_names = []
for name, pipeline, features_list in preprocessor.transformers_: # loop over pipelines
    if name == 'num': # if pipeline is for numeric variables
        features = features_list # just get the names of columns to which it has been applied
    else: # if pipeline is for categorical variables
        features = pipeline.named_steps['encoder'].get_feature_names_out() # get output columns names from OneHotEncoder
    column_names.extend(features) # concatenate features names
        
print("Names of columns corresponding to each coefficient: ", column_names)

Names of columns corresponding to each coefficient:  ['x0_2.0', 'x0_3.0', 'x0_4.0', 'x0_5.0', 'x0_6.0', 'x0_7.0', 'x0_8.0', 'x0_9.0', 'x0_10.0', 'x0_11.0', 'x0_13.0', 'x0_14.0', 'x0_15.0', 'x0_16.0', 'x0_17.0', 'x0_18.0', 'x0_19.0', 'x0_20.0', 'x1_1.0', 'Temperature', 'Fuel_Price', 'CPI', 'Unemployment', 'Year', 'Month', 'Day', 'Day_of_week']


In [146]:
# Create a pandas DataFrame
coefs = pd.DataFrame(index = column_names, data = regressor.coef_.transpose(), columns=["coefficients"])
coefs

Unnamed: 0,coefficients
x0_2.0,355806.6
x0_3.0,-1206228.0
x0_4.0,279407.7
x0_5.0,-1365747.0
x0_6.0,32640.44
x0_7.0,-1000066.0
x0_8.0,-714341.5
x0_9.0,-1172391.0
x0_10.0,264167.8
x0_11.0,161876.2


# FEATURE IMPORTANCE 

In [147]:
# Compute abs() and sort values
feature_importance = abs(coefs).sort_values(by = 'coefficients', ascending=False)
feature_importance

Unnamed: 0,coefficients
x0_5.0,1365747.0
x0_3.0,1206228.0
x0_9.0,1172391.0
x0_16.0,1154682.0
x0_17.0,1097046.0
x0_15.0,1029658.0
x0_7.0,1000066.0
x0_8.0,714341.5
x0_18.0,601983.4
x0_14.0,546824.4


In [148]:
# Plot coefficients
fig = px.bar(feature_importance, orientation = 'h')
fig.update_layout(showlegend = False, 
                  margin = {'l': 120} # to avoid cropping of column names
                 )
fig.show()



# RIDGE REGULARIZATION

In [149]:
# Manually 
# Applying a ridge model with alpha = 0.01
ridge1 = Ridge(alpha=0.01)
print(ridge1)
ridge1.fit(X_train,Y_train)
print("R2 score on training set : ", ridge1.score(X_train, Y_train))
print("R2 score on test set : ", ridge1.score(X_test, Y_test))

Ridge(alpha=0.01)
R2 score on training set :  0.972673902637044
R2 score on test set :  0.9429141955509573


In [150]:
# Applying a ridge model with alpha = 0.1
ridge2= Ridge(alpha=0.1)
print(ridge2)
ridge2.fit(X_train,Y_train)
print("R2 score on training set : ", ridge2.score(X_train, Y_train))
print("R2 score on test set : ", ridge2.score(X_test, Y_test))



Ridge(alpha=0.1)
R2 score on training set :  0.9706524603355959
R2 score on test set :  0.9519054727576921


In [151]:
# Applying a ridge model with alpha = 1
ridge3= Ridge(alpha=1)
print(ridge3)
ridge3.fit(X_train,Y_train)
print("R2 score on training set : ", ridge3.score(X_train, Y_train))
print("R2 score on test set : ", ridge3.score(X_test, Y_test))

Ridge(alpha=1)
R2 score on training set :  0.9277770952086051
R2 score on test set :  0.9150396969494594


In [152]:
# Storing the coefficients into a dataframe 
data_dict = {
    'Feature': preprocessor.get_feature_names_out(),
    'Ridge1': ridge1.coef_,
    'Ridge2': ridge2.coef_,
    'Ridge3': ridge3.coef_
            }

coefficients_ridge = pd.DataFrame(data = data_dict)
coefficients_ridge.head()

Unnamed: 0,Feature,Ridge1,Ridge2,Ridge3
0,cat__Store_2.0,369415.9,435975.9,520426.504735
1,cat__Store_3.0,-1190100.0,-1097188.0,-801334.004848
2,cat__Store_4.0,331298.8,526241.2,599421.824375
3,cat__Store_5.0,-1347696.0,-1225449.0,-768208.43808
4,cat__Store_6.0,46615.43,128657.8,295520.683738


In [153]:
# Plot different Ridge coefficients 
fig = px.line(coefficients_ridge, x = 'Feature', y = ['Ridge1', 'Ridge2', 'Ridge3'])
fig.show()

In [154]:
# Perform grid search
print("Grid search...")
regressor = Ridge()
# Grid of values to be tested
params = {
    'alpha': [0.01, 0.1, 0.5 , 0.9, 1, 3, 5, 10] 
}
best_ridge = GridSearchCV(regressor, param_grid = params, cv = 3) # cv : the number of folds to be used for CV
best_ridge.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", best_ridge.best_params_)
print("Best R2 score : ", best_ridge.best_score_)

Grid search...
...Done.
Best hyperparameters :  {'alpha': 0.01}
Best R2 score :  0.9058186861784415


In [155]:
# Print R^2 scores
print("R2 score on training set : ", best_ridge.score(X_train, Y_train))
print("R2 score on test set : ", best_ridge.score(X_test, Y_test))


R2 score on training set :  0.972673902637044
R2 score on test set :  0.9429141955509573


We can notice a decrease in $R^2$ when the penalization parameter $\alpha$ increases.

# LASSO REGULARIZATION

In [156]:
# Manually :
# Applying a lasso regularization with alpha = 10
lasso1 = Lasso(alpha=10)
print(lasso1)
lasso1.fit(X_train,Y_train)
print("R2 score on training set : ", lasso1.score(X_train, Y_train))
print("R2 score on test set : ", lasso1.score(X_test, Y_test))

Lasso(alpha=10)
R2 score on training set :  0.9727235351337935
R2 score on test set :  0.9398598998775397


In [157]:
# Applying a lasso regularization with alpha = 100
lasso2 = Lasso(alpha = 100)
print(lasso2)
lasso2.fit(X_train, Y_train)
# Print R^2 scores
print("R2 score on training set : ", lasso2.score(X_train, Y_train))
print("R2 score on test set : ", lasso2.score(X_test, Y_test))

Lasso(alpha=100)
R2 score on training set :  0.9727002762184318
R2 score on test set :  0.941815208575437


In [158]:
# Applying a lasso regularization with alpha = 1000
lasso3 = Lasso(alpha = 1000)
print(lasso3)
lasso3.fit(X_train, Y_train)
# Print R^2 scores
print("R2 score on training set : ", lasso3.score(X_train, Y_train))
print("R2 score on test set : ", lasso3.score(X_test, Y_test))

Lasso(alpha=1000)
R2 score on training set :  0.9707438366848931
R2 score on test set :  0.9560337267925487


In [159]:
# storing the coefficients into a dataframe 
data_dict = {
    'Feature': preprocessor.get_feature_names_out(),
    'Lasso1': lasso1.coef_,
    'Lasso2': lasso2.coef_,
    'Lasso3': lasso3.coef_
            }

coefficients_lasso = pd.DataFrame(data = data_dict)
coefficients_lasso.head()

Unnamed: 0,Feature,Lasso1,Lasso2,Lasso3
0,cat__Store_2.0,356388.5,361247.8,397734.3
1,cat__Store_3.0,-1205468.0,-1198966.0,-1140873.0
2,cat__Store_4.0,281939.4,302548.1,494909.7
3,cat__Store_5.0,-1365017.0,-1358613.0,-1295720.0
4,cat__Store_6.0,32928.06,35300.73,55899.44


In [160]:
# Plot coefficients
fig = px.line(coefficients_lasso, x = 'Feature', y = ['Lasso1', 'Lasso2', 'Lasso3'])
fig.show()

In [161]:
# Perform grid search for Lasso
print("Grid search...")
regressor = Lasso()
# Grid of values to be tested
params = {
    'alpha': [1, 3, 5, 10,30, 50, 80,100, 1000]
}
best_lasso = GridSearchCV(regressor, param_grid = params, cv = 3) # cv : the number of folds to be used for CV
best_lasso.fit(X_train, Y_train)
print("...Done.")
print("Best hyperparameters : ", best_lasso.best_params_)
print("Best R2 score : ", best_lasso.best_score_)

Grid search...
...Done.
Best hyperparameters :  {'alpha': 1000}
Best R2 score :  0.9055302047202248



Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 5.669e+10, tolerance: 2.796e+09


Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 1.239e+10, tolerance: 2.796e+09


Objective did not converge. You might want to increase the number of iterations, check the scale of the features or consider increasing regularisation. Duality gap: 5.462e+09, tolerance: 2.796e+09



In [162]:
# Print R^2 scores
print("LASSO / R2 score on training set : ", best_lasso.score(X_train, Y_train))
print("LASSO / R2 score on test set : ", best_lasso.score(X_test, Y_test))

LASSO / R2 score on training set :  0.9707438366848931
LASSO / R2 score on test set :  0.9560337267925487


# COMPARING RIDGE AND LASSO

In [163]:
# Comparing R2 :
print("RIDGE / R2 score on training set : ", best_ridge.score(X_train, Y_train))
print("RIDGE / R2 score on test set : ", best_ridge.score(X_test, Y_test))
print()
print("LASSO / R2 score on training set : ", best_lasso.score(X_train, Y_train))
print("LASSO / R2 score on test set : ", best_lasso.score(X_test, Y_test))

RIDGE / R2 score on training set :  0.972673902637044
RIDGE / R2 score on test set :  0.9429141955509573

LASSO / R2 score on training set :  0.9707438366848931
LASSO / R2 score on test set :  0.9560337267925487


In [164]:
data_dict = {
    'Feature': preprocessor.get_feature_names_out(),
    'Best_Ridge': best_ridge.best_estimator_.coef_,
    'Best_Lasso': best_lasso.best_estimator_.coef_
            }

coefficients = pd.DataFrame(data = data_dict)
coefficients

Unnamed: 0,Feature,Best_Ridge,Best_Lasso
0,cat__Store_2.0,369415.9,397734.3
1,cat__Store_3.0,-1190100.0,-1140873.0
2,cat__Store_4.0,331298.8,494909.7
3,cat__Store_5.0,-1347696.0,-1295720.0
4,cat__Store_6.0,46615.43,55899.44
5,cat__Store_7.0,-981743.1,-915460.9
6,cat__Store_8.0,-699330.8,-656008.0
7,cat__Store_9.0,-1155149.0,-1096740.0
8,cat__Store_10.0,307773.6,423701.7
9,cat__Store_11.0,166950.4,52642.09


In [165]:

# Plot best ridge and best lasso 
fig = px.line(coefficients, x = 'Feature', y = ['Best_Ridge', 'Best_Lasso'])
fig.show()

After applying Ridge and Lasso Regularizations, I was able to reduce the overfitting a little bit.
We notice that there isn't a big difference between the results of the Ridge and Lasso regularizations. 