In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import holidays
import calendar
from sklearn.preprocessing import StandardScaler
from pmdarima import auto_arima
from statsmodels.tsa.statespace.sarimax import SARIMAX
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# ---------------------------
# Output Directory
# ---------------------------
# Figures are saved under this directory; create it up front so that saving
# never fails on a missing path. exist_ok=True makes re-runs harmless.
os.makedirs("results/daily3", exist_ok=True)

# ---------------------------
# Load Daily Data
# ---------------------------
# The CSV is expected to have columns: Day, Chargers, Chargers achteraan, Grid Organi lbc, Solar
df = pd.read_csv('api_data/aggregated_daily_measurements.csv')

# Index the frame by the 'Day' column, parsed as datetimes
df.set_index("Day", inplace=True)
df.index = pd.to_datetime(df.index)

# Row-based operations such as shift() and rolling() work on row position,
# not on the date itself, so the rows must be in chronological order.
# The CSV ordering is not guaranteed, hence the explicit sort. A stable
# algorithm keeps the original relative order of any duplicated dates.
df.sort_index(inplace=True, kind="mergesort")

# Use only the "Grid Organi lbc" column for total consumption
df['Total_consumption'] = df['Grid Organi lbc']

# Drop the source columns that are not used any further
df = df.drop(['Chargers', 'Chargers achteraan', 'Solar', 'Grid Organi lbc'], axis=1)

# The data is already at daily resolution, so no resampling is applied;
# work on a copy to leave the loaded frame untouched.
df_daily = df.copy()

print("Dataset Information (Daily Data):")
print(f"Time range: {df_daily.index.min()} to {df_daily.index.max()}")
print(f"Total observations: {len(df_daily)}")
print(f"Missing values: {df_daily['Total_consumption'].isna().sum()}")

# ---------------------------
# Transform Target: Log Consumption
# ---------------------------
# The logarithm needs strictly positive input. Every value is offset by
# |min| + 1 before the log is taken; this offset is applied unconditionally,
# so the smallest shifted value is at least 1. shift_val is kept at module
# level so the same offset is available for undoing the transform.
min_consumption = df_daily["Total_consumption"].min()
shift_val = abs(min_consumption) + 1
shifted_consumption = df_daily["Total_consumption"] + shift_val
df_daily["log_consumption"] = np.log(shifted_consumption)

# ---------------------------
# Feature Engineering for Daily Data
# ---------------------------
be_holidays = holidays.BE()  # Belgian holiday calendar

# Calendar position of every row, taken from the datetime index
timestamps = df_daily.index
df_daily['day_of_week'] = timestamps.dayofweek
df_daily['month'] = timestamps.month

# Binary flags: weekend (day_of_week of 5 or 6) and Belgian holiday
df_daily['is_weekend'] = (df_daily['day_of_week'] >= 5).astype('int64')
df_daily['is_festive'] = [1 if day in be_holidays else 0 for day in timestamps]

# Season flags: 1 when the month belongs to the listed months, else 0
season_months = {
    'is_summer': [6, 7, 8],
    'is_winter': [12, 1, 2],
}
for season_flag, months_in_season in season_months.items():
    df_daily[season_flag] = timestamps.month.isin(months_in_season).astype(int)

# Encode the day of week on a circle (period 7) so that the last and the
# first weekday end up adjacent instead of six units apart.
weekday_angle = 2 * np.pi * df_daily['day_of_week'] / 7
df_daily['day_of_week_sin'] = np.sin(weekday_angle)
df_daily['day_of_week_cos'] = np.cos(weekday_angle)

# ---------------------------
# Lag and Rolling Features (no look-ahead)
# ---------------------------
# Every feature in this section is built exclusively from consumption values
# of EARLIER rows, so nothing derived from the current day's consumption
# (the quantity being forecast) leaks into its own predictors.
#
# shift(n) moves values down by n rows. NOTE(review): this equals "n days
# ago" only when the rows are consecutive calendar days without gaps --
# confirm against the input data.
#
# Lags created, in this column order:
#   1d   - previous day
#   7d   - same weekday one week earlier
#   30d  - approximately one month earlier
#   365d - one year earlier
#   14d  - two weeks earlier (biweekly patterns)
#   21d  - three weeks earlier (extended cycles)
for lag_days in (1, 7, 30, 365, 14, 21):
    df_daily[f'consumption_lag_{lag_days}d'] = df_daily['Total_consumption'].shift(lag_days)

# Rolling statistics over the three days BEFORE the current one.
# Calling rolling() directly on Total_consumption would include the current
# day's value in the window, i.e. the target itself would be part of its own
# feature. Shifting by one row first removes that leakage.
previous_days_consumption = df_daily['Total_consumption'].shift(1)
df_daily['rolling_avg_3d'] = previous_days_consumption.rolling(window=3).mean()  # 3-day moving average
df_daily['rolling_std_3d'] = previous_days_consumption.rolling(window=3).std()   # 3-day rolling standard deviation

# The shifts and rolling windows leave NaN in the leading rows (the 365-row
# shift blanks the first 365); drop every row that contains any NaN.
df_daily.dropna(inplace=True)

# ---------------------------
# Visualization: Correlation Heatmap (Daily Data)
# ---------------------------
# Pairwise correlation between all numeric columns of the daily frame
numerical_features = df_daily.select_dtypes(include=[np.number]).columns
correlation = df_daily[numerical_features].corr()

# The matrix is symmetric: hide the upper triangle (diagonal included) so
# that each pair is shown only once.
mask = np.triu(np.ones_like(correlation, dtype=bool))

heatmap_fig, heatmap_ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    correlation,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    mask=mask,
    vmin=-1,
    vmax=1,
    ax=heatmap_ax,
)
heatmap_ax.set_title("Daily Feature Correlation Heatmap", fontsize=16)
heatmap_fig.tight_layout()
heatmap_fig.savefig("results/daily3/correlation_heatmap.png")
plt.close(heatmap_fig)

# Line plot of the daily Total Consumption series (original scale),
# saved to the results directory.
series_fig, series_ax = plt.subplots(figsize=(15, 6))
series_ax.plot(df_daily.index, df_daily['Total_consumption'], color='blue', alpha=0.6)
series_ax.set_title('Daily Power Consumption Over Time', fontsize=14)
series_ax.set_xlabel('Date', fontsize=12)
series_ax.set_ylabel('Power Consumption (kWh)', fontsize=12)
series_ax.grid(True, alpha=0.3)
# Tilt the date labels so they do not overlap
plt.setp(series_ax.get_xticklabels(), rotation=45)
series_fig.tight_layout()
series_fig.savefig("results/daily3/time_series_plot.png")
plt.close(series_fig)

# Summary statistics of the consumption column
consumption_summary = df_daily['Total_consumption'].describe()
print("\nBasic Statistics (Total Consumption):")
print(consumption_summary)

# ---------------------------
# Prepare Data for Modeling (Daily)
# ---------------------------
# Name of the target column: the log-transformed consumption
target = "log_consumption"
# Consumption on its original (untransformed) scale
y_orig = df_daily["Total_consumption"]

# Exogenous feature columns, grouped by kind. The concatenation order
# below defines the column order of the exogenous feature list.
calendar_features = [
    "day_of_week_sin",
    "day_of_week_cos",
    "is_weekend",
    "is_festive",
    "is_summer",
    "is_winter",
]
lag_features = [
    "consumption_lag_1d",
    "consumption_lag_7d",
    "consumption_lag_30d",
    "consumption_lag_365d",
    "consumption_lag_14d",
    "consumption_lag_21d",
]
rolling_features = [
    "rolling_avg_3d",
    "rolling_std_3d",
]
exog_features = calendar_features + lag_features + rolling_features

# Notebook display of the prepared frame
df_daily

# Recorded cell output: FileNotFoundError: [Errno 2] No such file or directory: 'api_data/aggregated_daily_measurements.csv'