Importons les librairies nécessaires.

In [None]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.model_selection import cross_val_score, GridSearchCV
from sklearn.metrics import r2_score

import plotly.express as px
import plotly.figure_factory as ff
import plotly.graph_objects as go

In [77]:
df = pd.read_csv("../src/Walmart_Store_sales.csv")

target = "Weekly_Sales"

## Part 1 : EDA and data preprocessing

Quelques statistiques de base : shape, pourcentages de valeurs manquantes, etc...

In [50]:
df.shape

(150, 8)

In [78]:
df.head()

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
0,6.0,18-02-2011,1572117.54,,59.61,3.045,214.777523,6.858
1,13.0,25-03-2011,1807545.43,0.0,42.38,3.435,128.616064,7.47
2,17.0,27-07-2012,,0.0,,,130.719581,5.936
3,11.0,,1244390.03,0.0,84.57,,214.556497,7.346
4,6.0,28-05-2010,1644470.66,0.0,78.89,2.759,212.412888,7.092


In [79]:
df.describe(include="all")

Unnamed: 0,Store,Date,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
count,150.0,132,136.0,138.0,132.0,136.0,138.0,135.0
unique,,85,,,,,,
top,,19-10-2012,,,,,,
freq,,4,,,,,,
mean,9.866667,,1249536.0,0.07971,61.398106,3.320853,179.898509,7.59843
std,6.231191,,647463.0,0.271831,18.378901,0.478149,40.274956,1.577173
min,1.0,,268929.0,0.0,18.79,2.514,126.111903,5.143
25%,4.0,,605075.7,0.0,45.5875,2.85225,131.970831,6.5975
50%,9.0,,1261424.0,0.0,62.985,3.451,197.908893,7.47
75%,15.75,,1806386.0,0.0,76.345,3.70625,214.934616,8.15


In [80]:
(df.isna().sum() / df.shape[0]).apply(lambda x: f"{round(x * 100)} %")

Store            0 %
Date            12 %
Weekly_Sales     9 %
Holiday_Flag     8 %
Temperature     12 %
Fuel_Price       9 %
CPI              8 %
Unemployment    10 %
dtype: object

Certaines valeurs sont manquantes ou dans un format non adéquat. Corrigeons cela !
Nous pourrions modifier les valeurs manquantes tout-de-suite comme mentionné dans le code commenté ci-dessous, mais nous préfererons plus tard l'Imputation.

In [52]:
# df["Holiday_Flag"] = df["Holiday_Flag"].fillna(0)  # because mean near 0
# for column in ["Temperature", "Fuel_Price", "CPI", "Unemployment"]:
#     df[column] = df[column].fillna(df[column].mean())

In [81]:
# Removing lines with missing values in the "target" column
df.dropna(subset=[target], inplace=True)

# Conforming the Date column
df.dropna(subset=["Date"], inplace=True)  # impossible to fill values
df["Date"] = pd.to_datetime(df["Date"], format="%d-%m-%Y")
df["Year"] = df["Date"].dt.year
df["Month"] = df["Date"].dt.month
df["Day"] = df["Date"].dt.day
df["DayOfWeek"] = df["Date"].dt.day_of_week

df = df.drop("Date", axis=1)

In [82]:
def remove_outliers(df, columns: list[str]) -> pd.DataFrame:
    for column in columns:
        mean = df[column].mean()
        std = df[column].std()
        lower_bound = mean - 3 * std
        upper_bound = mean + 3 * std
        
        df = df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]
    
    return df

df = remove_outliers(df, ["Temperature", "Fuel_Price", "CPI", "Unemployment"])

In [83]:
df.describe()

Unnamed: 0,Store,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Year,Month,Day,DayOfWeek
count,80.0,80.0,71.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0,80.0
mean,9.575,1221522.0,0.084507,61.12775,3.2907,181.077638,7.301775,2010.8875,6.3625,16.125,4.0
std,6.143382,679927.0,0.280126,17.4476,0.491223,38.847021,0.955392,0.826672,3.028321,8.521566,0.0
min,1.0,268929.0,0.0,18.79,2.548,126.1392,5.143,2010.0,1.0,1.0,4.0
25%,4.0,529510.7,0.0,45.5875,2.804,132.610242,6.52075,2010.0,4.0,10.0,4.0
50%,8.0,1260826.0,0.0,61.45,3.3905,197.500965,7.3455,2011.0,6.0,16.5,4.0
75%,15.0,1817517.0,0.0,75.4775,3.68975,214.809008,8.09,2012.0,8.25,23.25,4.0
max,20.0,2771397.0,1.0,91.65,4.17,226.968844,9.342,2012.0,12.0,31.0,4.0


In [84]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 80 entries, 0 to 149
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Store         80 non-null     float64
 1   Weekly_Sales  80 non-null     float64
 2   Holiday_Flag  71 non-null     float64
 3   Temperature   80 non-null     float64
 4   Fuel_Price    80 non-null     float64
 5   CPI           80 non-null     float64
 6   Unemployment  80 non-null     float64
 7   Year          80 non-null     int32  
 8   Month         80 non-null     int32  
 9   Day           80 non-null     int32  
 10  DayOfWeek     80 non-null     int32  
dtypes: float64(7), int32(4)
memory usage: 6.2 KB


In [85]:
df.head()

Unnamed: 0,Store,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment,Year,Month,Day,DayOfWeek
0,6.0,1572117.54,,59.61,3.045,214.777523,6.858,2011,2,18,4
1,13.0,1807545.43,0.0,42.38,3.435,128.616064,7.47,2011,3,25,4
4,6.0,1644470.66,0.0,78.89,2.759,212.412888,7.092,2010,5,28,4
6,15.0,695396.19,0.0,69.8,4.069,134.855161,7.658,2011,6,3,4
7,20.0,2203523.2,0.0,39.93,3.617,213.023622,6.961,2012,2,3,4


Concentrons nous maintenant sur les corrélations.

In [86]:
corr_matrix = df.select_dtypes(exclude="object").corr()

ff.create_annotated_heatmap(corr_matrix.round(2).values, 
                            x=corr_matrix.columns.tolist(),
                            y=corr_matrix.index.tolist())

In [87]:
fig = px.scatter_matrix(df, height=900, width = 900)
fig.show()

Nous remarquons que `Fuel_Price` augmente en fonction du temps tandis que le taux d'`Unemployment` diminue.

In [None]:
for column in df.columns:
    fig = px.histogram(df[column])
    fig.show()

## Pipeline

In [114]:
X = df.drop(target, axis=1)
Y = df[target]

In [89]:
numeric_features = ["Temperature", "Fuel_Price", "CPI", "Unemployment", "Year", "Month", "Day", "DayOfWeek"]
categorical_features = ["Store", "Holiday_Flag"]

In [115]:
X_train, X_test, Y_train, Y_test = train_test_split(X,
                                                    Y,
                                                    test_size=0.1,
                                                    random_state=0)

In [116]:
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean")),
    ("scaler", StandardScaler())
])

categorical_transformer = Pipeline(
    steps=[
    ("encoder", OneHotEncoder(drop="first"))
    ])

# Use ColumnTransformer to make a preprocessor object that describes all the treatments to be done
preprocessor = ColumnTransformer(
    transformers=[
        ("num", numeric_transformer, numeric_features),
        ("cat", categorical_transformer, categorical_features)
    ])

preprocessor

## Part 2 : Baseline model (linear regression)

In [117]:
X_train = preprocessor.fit_transform(X_train)
X_test = preprocessor.transform(X_test)

In [118]:
model = LinearRegression()
model.fit(X_train, Y_train)

In [120]:
Y_train_pred = model.predict(X_train)
Y_test_pred = model.predict(X_test)

### Performances

In [147]:
print("R2 score on train set :", r2_score(Y_train, Y_train_pred))
print("R2 score on test set :", r2_score(Y_test, Y_test_pred))

R2 score on train set : 0.9828526404016
R2 score on test set : 0.9696208315589367


In [148]:
def adjust_r2(train, pred):
    r2 = r2_score(train, pred)
    n = X.shape[0]
    p = X.shape[1]
    return 1-((1-r2)*(n-1))/(n-p-1)

print("Adjusted R2 score on train set :", adjust_r2(Y_train, Y_train_pred))
print("Adjusted R2 score on test set :", adjust_r2(Y_test, Y_test_pred))

Adjusted R2 score on train set : 0.9803675158221218
Adjusted R2 score on test set : 0.965218053524


In [124]:
analyze_results = pd.DataFrame({
        "coefficients":model.coef_,
        "columns":preprocessor.get_feature_names_out()
    })
analyze_results["coefficients"] = analyze_results["coefficients"].abs()

px.bar(analyze_results.sort_values(by="coefficients", ascending=False), x="columns", y="coefficients")

In [138]:
fig = go.Figure()
fig.add_trace(go.Scatter(x=X_test, y=Y_test))
fig.add_trace(go.Scatter(x=X_test, y=Y_test_pred))
fig.show()

## Part 3 : Fight overfitting

In [149]:
regressor = Ridge()
scores = cross_val_score(regressor, X_train, Y_train, cv=3)
print("Cross-validated R2-score", scores.mean())
print("Standard deviation", scores.std())

Cross-validated R2-score 0.7080601916924096
Standard deviation 0.06709639458776724


In [145]:
params = {
    "alpha": [0.0, 0.1, 0.5, 1.0]
}
gridsearch = GridSearchCV(regressor, param_grid = params, cv = 3)
gridsearch.fit(X_train, Y_train)
print("Best hyperparameters : ", gridsearch.best_params_)
print("Best R2 score : ", gridsearch.best_score_)

Best hyperparameters :  {'alpha': 0.0}
Best R2 score :  0.9304622441659406


## Baseline vs Ridge

In [146]:
print("Linear Regression")
print("R2 score on train set :", r2_score(Y_train, Y_train_pred))
print("R2 score on test set :", r2_score(Y_test, Y_test_pred))

print("Ridge Regression")
print("R2 score on training set : ", gridsearch.score(X_train, Y_train))
print("R2 score on test set : ", gridsearch.score(X_test, Y_test))

Linear Regression
R2 score on train set : 0.9828526404016
R2 score on test set : 0.9696208315589367
Ridge Regression
R2 score on training set :  0.9828526404016
R2 score on test set :  0.9696208315589367


Le modèle est plus performant quand on n'applique pas de régularisation.