In [5]:
import os

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
import joblib
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from statsmodels.tsa.stattools import adfuller, acf, pacf

In [6]:
# Load the dataset.
# The CSV location can be overridden through the SALES_DATA_PATH environment
# variable so the notebook is runnable on machines other than the one it was
# authored on; when the variable is unset the original local path is used.
data_path = os.environ.get(
    'SALES_DATA_PATH',
    r'E:\Kifiya\Week 4\Challenge wk\customer-traffic-analysis-rossmann\data\synthetic_sales_data.csv',
)
sales_data = pd.read_csv(data_path)

# Display the first few rows of the dataset
print(sales_data.head())

         Date  Store   Store_Type  Store_Status  Promo  Promo2  \
0  2021-01-01      1  supermarket             1      0       0   
1  2021-01-02      1     pharmacy             1      0       0   
2  2021-01-03      1  supermarket             1      1       0   
3  2021-01-04      1  supermarket             0      1       0   
4  2021-01-05      1     pharmacy             1      1       0   

   School_Holiday  Customers  Sales  
0               0        124      0  
1               0         87      0  
2               0         74    778  
3               0          0      0  
4               0         76   1002  


In [7]:
# Check for missing values (per-column count of nulls)
print(sales_data.isnull().sum())

# Zero-fill only the numeric columns. A blanket fillna(0) would also write 0
# into the text columns (Date, Store_Type): a missing date would later be
# parsed as a bogus timestamp and a missing store type would turn into its own
# dummy category. Nulls in non-numeric columns are left as-is so they stay
# visible to later checks.
numeric_cols = sales_data.select_dtypes(include='number').columns
sales_data[numeric_cols] = sales_data[numeric_cols].fillna(0)

Date              0
Store             0
Store_Type        0
Store_Status      0
Promo             0
Promo2            0
School_Holiday    0
Customers         0
Sales             0
dtype: int64


In [8]:
# Parse the Date column into pandas timestamps
sales_data['Date'] = pd.to_datetime(sales_data['Date'])

# Derive calendar features from the parsed dates
date_parts = sales_data['Date'].dt
sales_data['Weekday'] = date_parts.weekday
# pandas numbers weekdays from Monday=0, so 5 and 6 are Saturday and Sunday
sales_data['Weekend'] = sales_data['Weekday'].ge(5).astype(int)
sales_data['Day'] = date_parts.day
sales_data['Month'] = date_parts.month
sales_data['Year'] = date_parts.year

# One-hot encode the categorical columns, dropping the first level of each
categorical_cols = ['Store_Type', 'Store_Status']
sales_data = pd.get_dummies(sales_data, columns=categorical_cols, drop_first=True)

# Preview the engineered frame
engineered_preview = sales_data.head()
print(engineered_preview)

        Date  Store  Promo  Promo2  School_Holiday  Customers  Sales  Weekday  \
0 2021-01-01      1      0       0               0        124      0        4   
1 2021-01-02      1      0       0               0         87      0        5   
2 2021-01-03      1      1       0               0         74    778        6   
3 2021-01-04      1      1       0               0          0      0        0   
4 2021-01-05      1      1       0               0         76   1002        1   

   Weekend  Day  Month  Year  Store_Type_pharmacy  Store_Type_supermarket  \
0        0    1      1  2021                False                    True   
1        1    2      1  2021                 True                   False   
2        1    3      1  2021                False                    True   
3        0    4      1  2021                False                    True   
4        0    5      1  2021                 True                   False   

   Store_Status_1  
0            True  
1         