In [8]:
import sys
import pandas as pd
import numpy as np
from pathlib import Path

# Make the repository root (the parent of the current working directory)
# importable so the `src` package used below can be found.
# The membership check keeps this cell idempotent: re-running it does not
# pile up duplicate entries in sys.path.
PROJECT_ROOT = Path("..").resolve()
if str(PROJECT_ROOT) not in sys.path:
    sys.path.append(str(PROJECT_ROOT))


In [9]:
from src.data_loader import load_demand, load_weather
from src.features import add_hdd, add_lag_features
from src.models import baseline_predict, train_linear_regression, train_random_forest
from src.metrics import mae


In [10]:
# --- Load the two raw daily series -----------------------------------------
demand = load_demand("../data/raw/uk_gas_demand_daily.csv")
weather = load_weather("../data/raw/uk_temperature_daily.csv")

# --- Join and engineer features --------------------------------------------
# Inner join: only dates present in BOTH sources are kept.
merged = demand.merge(weather, on="date", how="inner")
df = add_lag_features(add_hdd(merged))

# Drop every row that still contains a missing value in any column, then take
# an independent copy so later edits cannot touch `df`.
# NOTE(review): the lag/rolling features presumably leave NaNs in the first
# rows — confirm against src.features.
df_model = df.dropna().copy()

# --- Design matrix and target ----------------------------------------------
target = "demand_gwh"
features = ["hdd", "demand_lag_1", "demand_lag_7", "demand_roll_7"]
X = df_model[features]
y = df_model[target]


In [11]:
# Baseline: score the reference predictor on the modelling frame.
# MAE is computed over every row of `df_model` (same rows as `y`).
baseline_pred = baseline_predict(df_model)
mae_baseline = mae(y, baseline_pred)
mae_baseline


140.19242163436712

In [12]:
# Linear Regression: fit on the full feature matrix, then score.
# NOTE(review): the model is fitted and scored on the same rows (X, y), so
# this is an in-sample MAE. Evaluate on a held-out, later period before
# treating it as forecast accuracy.
lr = train_linear_regression(X, y)
lr_pred = lr.predict(X)
mae_lr = mae(y, lr_pred)
mae_lr


129.48417560393793

In [13]:
# RandomForest: fit on the full feature matrix, then score.
# NOTE(review): as with the linear model, this MAE is in-sample (trained and
# scored on the same X, y). Tree ensembles can fit their training rows very
# closely, so the figure is likely optimistic — confirm with a chronological
# holdout before comparing against the baseline.
rf = train_random_forest(X, y)
rf_pred = rf.predict(X)
mae_rf = mae(y, rf_pred)
mae_rf


86.85361164236727

In [14]:
# Label the fitted forest's importances with their feature names and show
# them from most to least important.
importances = pd.Series(rf.feature_importances_, index=features)
importances.sort_values(ascending=False)


demand_roll_7    0.655762
demand_lag_1     0.299305
hdd              0.036305
demand_lag_7     0.008628
dtype: float64