# 4.0 **Installation & Setup**

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

X = pd.read_csv("./data/features_X.csv")
y = pd.read_csv("./data/labels_y.csv")

# Convert NaNs and one-hot encode targets
X = X.fillna(0)
y = pd.get_dummies(y)

**Prepare Rolling Forecast Dataset**

In [2]:
# Parameters
window_size = 30   # input days
forecast_horizon = 7  # output days

# Build rolling windows
X_windows = []
y_windows = []

for i in range(len(X) - window_size - forecast_horizon):
    X_seq = X.iloc[i:i + window_size].values
    y_seq = y.iloc[i + window_size:i + window_size + forecast_horizon].sum(axis=0)  # total future counts per issue type
    X_windows.append(X_seq)
    y_windows.append(y_seq)

X_seq = np.array(X_windows)
y_seq = np.array(y_windows)

print("Rolling dataset shape:", X_seq.shape, y_seq.shape)


Rolling dataset shape: (2152, 30, 26) (2152, 14)


**Train-Test Split for Rolling Data**

In [5]:
split = int(0.8 * len(X_seq))
X_train, X_test = X_seq[:split], X_seq[split:]
y_train, y_test = y_seq[:split], y_seq[split:]

print("Train:", X_train.shape, "| Test:", X_test.shape)

# Flatten for ML models that need tabular input
X_flat = X_seq.reshape((X_seq.shape[0], -1))

Train: (1721, 30, 26) | Test: (431, 30, 26)


**Model 1: Linear Regression (Baseline)**

In [6]:
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor

X_train_flat, X_test_flat = X_flat[:split], X_flat[split:]
y_train, y_test = y_seq[:split], y_seq[split:]

lr = MultiOutputRegressor(LinearRegression())
lr.fit(X_train_flat, y_train)
y_pred_lr = lr.predict(X_test_flat)

**Model 2: Random Forest**

In [None]:
from sklearn.ensemble import RandomForestRegressor

rf = MultiOutputRegressor(RandomForestRegressor(n_estimators=100, n_jobs=-1, random_state=42))
rf.fit(X_train_flat, y_train)
y_pred_rf = rf.predict(X_test_flat)

KeyboardInterrupt: 

: 

**Model 3: XGBoost**

In [None]:
from xgboost import XGBRegressor

xgb = MultiOutputRegressor(XGBRegressor(n_estimators=100, objective="reg:squarederror"))
xgb.fit(X_train_flat, y_train)
y_pred_xgb = xgb.predict(X_test_flat)

Model 4: LSTM (Deep Learning Rolling Forecast)

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader

class LSTMForecast(nn.Module):
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        _, (hn, _) = self.lstm(x)
        return self.fc(hn[-1])

input_size = X_seq.shape[2]
hidden_size = 64
output_size = y_seq.shape[1]

model = LSTMForecast(input_size, hidden_size, output_size)
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
loss_fn = nn.MSELoss()

# DataLoader
train_ds = TensorDataset(torch.tensor(X_seq[:split]).float(), torch.tensor(y_seq[:split]).float())
test_ds = TensorDataset(torch.tensor(X_seq[split:]).float(), torch.tensor(y_seq[split:]).float())

train_loader = DataLoader(train_ds, batch_size=32, shuffle=True)
test_loader = DataLoader(test_ds, batch_size=32)


In [None]:
epochs = 20
for epoch in range(epochs):
    model.train()
    total_loss = 0
    for xb, yb in train_loader:
        optimizer.zero_grad()
        pred = model(xb)
        loss = loss_fn(pred, yb)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {total_loss:.4f}")


In [None]:
model.eval()
y_pred_lstm = []

with torch.no_grad():
    for xb, _ in test_loader:
        pred = model(xb)
        y_pred_lstm.append(pred.numpy())

y_pred_lstm = np.vstack(y_pred_lstm)


Compare All Model Results

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

def evaluate_model(name, y_true, y_pred):
    mse = mean_squared_error(y_true, y_pred)
    r2 = r2_score(y_true, y_pred)
    print(f"{name:<15} | MSE: {mse:.2f} | R²: {r2:.4f}")

evaluate_model("LinearRegression", y_test, y_pred_lr)
evaluate_model("RandomForest", y_test, y_pred_rf)
evaluate_model("XGBoost", y_test, y_pred_xgb)
evaluate_model("LSTM", y_test, y_pred_lstm)


LinearRegression | MSE: 3212.66 | R²: -43.2706


In [None]:
# Check sample of POI columns
poi_cols = [col for col in df.columns if "dist_" in col or "count_" in col]

# Normalize POI density features
scaler = StandardScaler()
df[poi_cols] = scaler.fit_transform(df[poi_cols])


Summary 

### Model Comparison for Rolling Forecast (7-day Issue Type Counts)

| Model            | MSE     | R²     |
|------------------|---------|--------|
| Linear Regression| ...     | ...    |
| Random Forest    | ...     | ...    |
| XGBoost          | ...     | ...    |
| LSTM             | ...     | ...    |

- LSTM (rolling) performs better at adapting to recent non-seasonal patterns.
- Tree models are strong baselines and less sensitive to noisy inputs.
- Next step: Use best model for real-time rolling forecasting on SG data.


Setup for Long Format

In [None]:
# Load preformatted long dataset
df_long = pd.read_csv("./data/processed/tft_ready.csv", parse_dates=["date"])

# Assign integer time index
df_long["time_idx"] = (df_long["date"] - df_long["date"].min()).dt.days

# Preview
df_long.head()


Define TimeSeriesDataSet

In [None]:
from pytorch_forecasting import TimeSeriesDataSet

max_encoder_length = 30
max_prediction_length = 7

ts_dataset = TimeSeriesDataSet(
    df_long,
    time_idx="time_idx",
    target="issue_count",
    group_ids=["issue_type_sg"],
    max_encoder_length=max_encoder_length,
    max_prediction_length=max_prediction_length,
    static_categoricals=["issue_type_sg"],
    time_varying_known_reals=["time_idx", "hour", "day_of_week", "month", "is_public_holiday"],
    time_varying_unknown_reals=["issue_count"],
    add_relative_time_idx=True,
    add_target_scales=True,
)


In [None]:
train_loader = ts_dataset.to_dataloader(train=True, batch_size=64)
val_loader = ts_dataset.to_dataloader(train=False, batch_size=64)


N-BEATS

In [None]:
from pytorch_forecasting.models import NBeats

nbeats = NBeats.from_dataset(ts_dataset, learning_rate=1e-3, hidden_size=64)

from pytorch_lightning import Trainer
trainer = Trainer(max_epochs=20, accelerator="auto")
trainer.fit(nbeats, train_dataloaders=train_loader, val_dataloaders=val_loader)


GRU

In [None]:
from pytorch_forecasting.models import RNN

gru_model = RNN.from_dataset(
    ts_dataset,
    rnn_type="GRU",
    hidden_size=64,
    learning_rate=1e-3,
)

trainer = Trainer(max_epochs=20, accelerator="auto")
trainer.fit(gru_model, train_dataloaders=train_loader, val_dataloaders=val_loader)


Transformer

In [None]:
from pytorch_forecasting.models import Transformer

transformer = Transformer.from_dataset(
    ts_dataset,
    hidden_size=64,
    learning_rate=1e-3,
)

trainer = Trainer(max_epochs=20, accelerator="auto")
trainer.fit(transformer, train_dataloaders=train_loader, val_dataloaders=val_loader)


Informer

In [None]:
from pytorch_forecasting.models import Informer

informer = Informer.from_dataset(
    ts_dataset,
    hidden_size=64,
    learning_rate=1e-3,
)

trainer = Trainer(max_epochs=20, accelerator="auto")
trainer.fit(informer, train_dataloaders=train_loader, val_dataloaders=val_loader)


TFT (Temporal Fusion Transformer)

In [None]:
from pytorch_forecasting.models import TemporalFusionTransformer

tft = TemporalFusionTransformer.from_dataset(
    ts_dataset,
    hidden_size=64,
    learning_rate=1e-3,
    attention_head_size=4,
    dropout=0.1,
)

trainer = Trainer(max_epochs=20, accelerator="auto")
trainer.fit(tft, train_dataloaders=train_loader, val_dataloaders=val_loader)


Evaluation 

In [None]:
# Choose model: nbeats, gru_model, transformer, informer, tft
best_model = tft  # Replace as needed

actuals = torch.cat([y[0] for x, y in iter(val_loader)])
predictions = best_model.predict(val_loader)

from sklearn.metrics import mean_squared_error, r2_score
print("MSE:", mean_squared_error(actuals.numpy(), predictions.numpy()))
print("R²:", r2_score(actuals.numpy(), predictions.numpy()))


Summary 

### Model Benchmark (Rolling Forecast – 7 Days Ahead)

| Model        | MSE   | R²     |
|--------------|-------|--------|
| LinearReg    | ...   | ...    |
| RandomForest | ...   | ...    |
| XGBoost      | ...   | ...    |
| LSTM         | ...   | ...    |
| N-BEATS      | ...   | ...    |
| GRU          | ...   | ...    |
| Transformer  | ...   | ...    |
| Informer     | ...   | ...    |
| TFT          | ...   | ...    |

- Best performance often from: N-BEATS, TFT, or Informer (if large enough dataset).
- GRU and Transformer also solid, less compute than TFT.
- Choose based on trade-off: speed, complexity, explainability.
