# Exercise: Multi-Step Time Series Forecasting for Building Energy Consumption

## Objective:
- Work with endogenous and exogenous variables
- Use real-world-like features (e.g., temperature, occupancy)
- Highlight inputs vs. outputs in multistep context
- Compare model performance across different parameter settings (input lags, forecast horizon)

### Import required packages/libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import MinMaxScaler

### Step 1: Load the dataset

In [None]:
def load_dataset(filename):
    """Read an Excel workbook located relative to the current directory.

    Args:
        filename: Name of the Excel file; it is prefixed with "./" before
            being handed to pandas.

    Returns:
        The DataFrame produced by ``pd.read_excel``.
    """
    # resolve the name against the working directory, then load it in one go
    return pd.read_excel(f"./{filename}")

### Step 2: Create supervised format for multistep forecasting

In [None]:
def create_multistep_data(df, feature_cols, input_lags=24, forecast_horizon=6,
                          target_col='energy'):
    """Build a supervised (X, y) dataset for multi-step forecasting.

    Each sample pairs the previous ``input_lags`` rows of ``feature_cols``
    (flattened row by row into one vector) with the next
    ``forecast_horizon`` values of ``target_col``.

    Args:
        df: DataFrame whose rows are consecutive time steps.
        feature_cols: Column name, or list of column names, used as inputs.
        input_lags: Number of past rows included in every input vector.
        forecast_horizon: Number of future target values per sample.
        target_col: Column to forecast; defaults to 'energy'.

    Returns:
        Tuple ``(X, y)`` of NumPy arrays with shapes
        ``(n_samples, input_lags * n_features)`` and
        ``(n_samples, forecast_horizon)``. When ``df`` is too short for a
        single window, ``n_samples`` is 0 but the second dimension is kept.

    Raises:
        ValueError: If ``input_lags`` or ``forecast_horizon`` is below 1.
    """
    if input_lags < 1 or forecast_horizon < 1:
        raise ValueError("input_lags and forecast_horizon must both be >= 1")

    # accept a single column name as well as a list of names
    if isinstance(feature_cols, str):
        feature_cols = [feature_cols]

    # convert once instead of re-slicing the DataFrame on every iteration
    features = df[feature_cols].to_numpy()
    target = df[target_col].to_numpy()

    # the last valid forecast start is len(df) - forecast_horizon, hence the
    # "+ 1": without it the final row would never be used as a target
    starts = range(input_lags, len(df) - forecast_horizon + 1)

    # flatten the past 'input_lags' rows of all features into a 1D input vector
    X = [features[i - input_lags:i].ravel() for i in starts]
    # next 'forecast_horizon' values of the target as output
    y = [target[i:i + forecast_horizon] for i in starts]

    if not X:
        # keep the arrays 2D so downstream shape-based code still works
        return (np.empty((0, input_lags * features.shape[1])),
                np.empty((0, forecast_horizon)))

    return np.array(X), np.array(y)

### Step 3: Train-test split

In [None]:
def split_data(X, y, train_ratio=0.8):
    """Split samples into leading (train) and trailing (test) portions.

    The order of the samples is preserved; nothing is shuffled.

    Args:
        X: Sliceable collection of inputs.
        y: Sliceable collection of targets, sliced at the same position as X.
        train_ratio: Fraction of ``len(X)`` assigned to the training part.

    Returns:
        Tuple ``(X_train, X_test, y_train, y_test)``.
    """
    # position of the first test sample (truncated toward zero)
    cut = int(train_ratio * len(X))

    X_train, X_test = X[:cut], X[cut:]
    y_train, y_test = y[:cut], y[cut:]
    return X_train, X_test, y_train, y_test

### Step 4: Train multi-output model

In [None]:
def train_model(X_train, y_train):
    """Fit a multi-output regressor on the training windows.

    Args:
        X_train: Training inputs, passed straight to ``fit``.
        y_train: Training targets (one sequence per sample), passed straight
            to ``fit``.

    Returns:
        The fitted ``MultiOutputRegressor`` wrapping a ``Ridge`` estimator.
    """
    # the base learner is Ridge (a regularised linear model); the wrapper
    # lets it be fitted against multi-column targets
    base_estimator = Ridge()
    model = MultiOutputRegressor(base_estimator)

    # learn the mapping from input vectors to target sequences
    model.fit(X_train, y_train)
    return model



### Step 5: Evaluate Model

In [None]:
def evaluate_model(model, X_test, y_test):
    """Predict on the test inputs and print the mean squared error.

    Args:
        model: Object exposing ``predict``.
        X_test: Test inputs handed to ``model.predict``.
        y_test: True target sequences compared against the predictions.

    Returns:
        Tuple ``(y_test, y_pred)``: the unchanged targets and the predictions.
    """
    # forecast one sequence per test sample
    predictions = model.predict(X_test)

    # single error score over all samples, as computed by mean_squared_error
    mse = mean_squared_error(y_test, predictions)
    print(f"Multi-step Forecast MSE: {mse:.2f}")

    return y_test, predictions

### Step 6: Visualization

In [None]:
def plot_forecast(y_test, y_pred, forecast_horizon, sample_idx=10):
    """Plot actual versus predicted values for one test sample.

    Args:
        y_test: Indexable collection of true target sequences.
        y_pred: Indexable collection of predicted sequences.
        forecast_horizon: Horizon length; used only in the plot title.
        sample_idx: Index of the sample to draw from both collections.
    """
    plt.figure(figsize=(10, 4))  # set figure size

    # draw both curves through the same code path: (source, marker, legend label)
    curves = (
        (y_test, 'o', 'Actual'),
        (y_pred, 'x', 'Predicted'),
    )
    for source, marker, label in curves:
        values = source[sample_idx]
        steps = range(len(values))
        plt.plot(steps, values, marker=marker, label=label)

    plt.title(f"{forecast_horizon}-Step Ahead Forecast (Energy Consumption)")
    plt.xlabel("Hours Ahead")
    plt.ylabel("kWh")
    plt.legend()
    plt.grid(True)
    plt.show()

# Execution

In [None]:
input_lags = 3 # number of past time steps fed to the model per sample (not the number of features)
forecast_horizon = 6 # number of future time steps predicted per sample
feature_cols = ['energy', 'temperature', 'occupancy'] # input columns: the target's own history plus the exogenous variables
# load_dataset prefixes the name with "./", so the file is read from the working directory
building_df = load_dataset('sim_building_data.xlsx')

In [None]:
# preview the first few rows to check the loaded columns
building_df.head()

In [None]:
# turn the time series into (input window, target sequence) pairs
X, y = create_multistep_data(building_df, feature_cols, input_lags=input_lags, forecast_horizon=forecast_horizon)

# order-preserving split with the default train_ratio of 0.8 (no shuffling)
X_train, X_test, y_train, y_test = split_data(X, y)

# fit the multi-output model on the training windows
model = train_model(X_train, y_train)

# prints the test MSE; y_test comes back unchanged alongside the predictions
y_test, y_pred = evaluate_model(model, X_test, y_test)

# plots a single test sample (plot_forecast defaults to sample_idx=10)
plot_forecast(y_test, y_pred, forecast_horizon=forecast_horizon)

# Further Tasks
### 1. Remove exogenous variables (temp, occupancy) and observe changes
### 2. Try different forecast horizons (e.g., 3, 12)
### 3. Compare performance using MAE
### 4. Normalize features to see if performance improves
### 5. Try different models such as LinearRegression, RandomForest
### BONUS: Iterative Forecasting