In [49]:
# Import necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import pearsonr
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Load the dataset
# Assume the dataset is in a CSV file named 'sales_data.csv'
data = pd.read_csv('../data/raw/Walmart_Store_sales.csv')

# Display the first few rows of the dataset
data.head()

# Check for missing values
data.isnull().sum()

# Exploratory Data Analysis (EDA)
# Summary statistics
data.describe()

Unnamed: 0,Store,Weekly_Sales,Holiday_Flag,Temperature,Fuel_Price,CPI,Unemployment
count,6435.0,6435.0,6435.0,6435.0,6435.0,6435.0,6435.0
mean,23.0,1046965.0,0.06993,60.663782,3.358607,171.578394,7.999151
std,12.988182,564366.6,0.255049,18.444933,0.45902,39.356712,1.875885
min,1.0,209986.2,0.0,-2.06,2.472,126.064,3.879
25%,12.0,553350.1,0.0,47.46,2.933,131.735,6.891
50%,23.0,960746.0,0.0,62.67,3.445,182.616521,7.874
75%,34.0,1420159.0,0.0,74.94,3.735,212.743293,8.622
max,45.0,3818686.0,1.0,100.14,4.468,227.232807,14.313


Check relationship between Weekly_Sales and Holiday_Flag

Check relationship between Weekly_Sales and Temperature

Check relationship between Weekly_Sales and Fuel_Price

Check relationship between Weekly_Sales and CPI 

In [50]:
# Standardize the variable
df = data[['Weekly_Sales','CPI','Unemployment']]
ss = StandardScaler()
df_scaled = pd.DataFrame(ss.fit_transform(df),columns = df.columns)
df

Unnamed: 0,Weekly_Sales,CPI,Unemployment
0,1643690.90,211.096358,8.106
1,1641957.44,211.242170,8.106
2,1611968.17,211.289143,8.106
3,1409727.59,211.319643,8.106
4,1554806.68,211.350143,8.106
...,...,...,...
6430,713173.95,192.013558,8.684
6431,733455.07,192.170412,8.667
6432,734464.36,192.327265,8.667
6433,718125.53,192.330854,8.667


In [51]:
# Define the X and Y 
Y = df['Weekly_Sales']

# Create the model matrix
X = sm.add_constant(df['CPI'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=5)

#Build the model
model4 = sm.OLS(y_train, X_train)
results4 = model4.fit()
results4.summary()

0,1,2,3
Dep. Variable:,Weekly_Sales,R-squared:,0.004
Model:,OLS,Adj. R-squared:,0.004
Method:,Least Squares,F-statistic:,19.18
Date:,"Wed, 19 Jun 2024",Prob (F-statistic):,1.22e-05
Time:,15:45:13,Log-Likelihood:,-66019.0
No. Observations:,4504,AIC:,132000.0
Df Residuals:,4502,BIC:,132100.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.207e+06,3.75e+04,32.217,0.000,1.13e+06,1.28e+06
CPI,-930.9141,212.588,-4.379,0.000,-1347.692,-514.137

0,1,2,3
Omnibus:,284.606,Durbin-Watson:,2.032
Prob(Omnibus):,0.0,Jarque-Bera (JB):,340.843
Skew:,0.673,Prob(JB):,9.7e-75
Kurtosis:,3.056,Cond. No.,788.0


In [52]:
# Make predictions on the test set
y_pred = model4.predict(X_test)

# Calculate the mean squared error and R^2 score
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

ValueError: shapes (4504,2) and (1931,2) not aligned: 2 (dim 1) != 1931 (dim 0)

Check relationship between Weekly_Sales and Unemployment 

In [None]:
# Define the X and Y 
Y = df['Weekly_Sales']

# Create the model matrix
X = sm.add_constant(df['Unemployment'])

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.3, random_state=5)

# Build the model
model5 = sm.OLS(y_train, X_train)
result5 = model5.fit()
result5.summary()

0,1,2,3
Dep. Variable:,Weekly_Sales,R-squared:,0.013
Model:,OLS,Adj. R-squared:,0.013
Method:,Least Squares,F-statistic:,59.91
Date:,"Wed, 19 Jun 2024",Prob (F-statistic):,1.22e-14
Time:,15:11:49,Log-Likelihood:,-65999.0
No. Observations:,4504,AIC:,132000.0
Df Residuals:,4502,BIC:,132000.0
Df Model:,1,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.319e+06,3.62e+04,36.465,0.000,1.25e+06,1.39e+06
Unemployment,-3.413e+04,4410.069,-7.740,0.000,-4.28e+04,-2.55e+04

0,1,2,3
Omnibus:,248.086,Durbin-Watson:,2.029
Prob(Omnibus):,0.0,Jarque-Bera (JB):,290.52
Skew:,0.622,Prob(JB):,8.210000000000001e-64
Kurtosis:,3.006,Cond. No.,36.1


In [None]:
# Make predictions on the test set
y_pred = model5.predict(X_test)

# Calculate the mean squared error and R^2 score
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

ValueError: shapes (4504,2) and (1931,2) not aligned: 2 (dim 1) != 1931 (dim 0)