# DCRNN Traffic Prediction - Simple Training

**Just run all cells in order. That's it.**

Expected time: **10-15 minutes** on GPU

## Step 1: Setup

In [None]:
# Clone repository
!rm -rf Spatio-Temporal-Traffic-Flow-Prediction
!git clone https://github.com/vaish725/Spatio-Temporal-Traffic-Flow-Prediction.git
%cd Spatio-Temporal-Traffic-Flow-Prediction
!git pull origin main

In [None]:
# Install dependencies
!pip install -q torch-geometric tqdm matplotlib scipy

In [None]:
# Check GPU
# Report whether CUDA is usable and, if so, which device index 0 is.
import torch

has_gpu = torch.cuda.is_available()
print(f"GPU available: {has_gpu}")
if not has_gpu:
    print("⚠️ No GPU - Training will be slow!")
else:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## Step 2: Get Data

**First time only**: Run the cells below to download and preprocess the data (takes ~5 min).

**Already have data?** Skip to Step 3

In [None]:
# Check if data exists
# Tells the reader whether the preprocessed archive is already on disk.
# This cell only reports; it does not skip or run anything itself.
import os

if os.path.exists('data/pems_bay_processed.npz'):
    print("✅ Data already exists! Skip to Step 3")
else:
    print("❌ Need to download and preprocess data")
    # Three cells follow before Step 3: download, preprocess, adjacency matrix + save.
    print("   Run the next 3 cells")

In [None]:
# Download PEMS-BAY dataset (82MB)
!mkdir -p data
!wget -q -O data/PEMS-BAY.csv "https://zenodo.org/record/5724362/files/PEMS-BAY.csv"
print(f"Downloaded: {os.path.getsize('data/PEMS-BAY.csv')/1e6:.1f} MB")

In [None]:
# Preprocess data
# Pipeline: CSV -> fill missing readings -> z-score -> sliding input/target windows
# -> chronological train/val/test split. The names defined here (num_nodes, mean, std,
# X_train ... y_test) are used by the adjacency/save cell that follows.
import pandas as pd
import numpy as np
from tqdm import tqdm

print("Loading and preprocessing data...")

# Load CSV. The first column is dropped (presumably the timestamp index - confirm against
# the file); every remaining column is treated as one sensor's speed series.
df = pd.read_csv('data/PEMS-BAY.csv')
speed_data = df.drop(columns=[df.columns[0]]).values.astype(np.float32)
print(f"Shape: {speed_data.shape} (timesteps x sensors)")

# Handle missing values: linearly interpolate NaNs per sensor along the time axis.
# np.interp holds the nearest valid reading for gaps at the very start or end.
for i in range(speed_data.shape[1]):
    mask = np.isnan(speed_data[:, i])
    if mask.all():
        # np.interp would fail with an opaque "sample points is empty" error; say which column.
        raise ValueError(f"Sensor column {i} has no valid readings to interpolate from")
    if mask.any():
        speed_data[mask, i] = np.interp(
            np.flatnonzero(mask),
            np.flatnonzero(~mask),
            speed_data[~mask, i]
        )

# Window and split sizes are computed before normalising so that the statistics
# can be restricted to the training portion of the series.
T_in, T_out = 12, 12
num_samples = speed_data.shape[0] - T_in - T_out + 1
num_nodes = speed_data.shape[1]

train_split = int(0.7 * num_samples)
val_split = int(0.8 * num_samples)
if train_split < 1:
    raise ValueError(
        f"Not enough timesteps ({speed_data.shape[0]}) to build training windows "
        f"of {T_in} inputs + {T_out} targets"
    )

# Normalize
# Use only the timesteps covered by training windows (sample i spans rows i .. i+T_in+T_out-1),
# so validation/test values do not leak into the mean/std the model is trained with.
train_rows = speed_data[:train_split + T_in + T_out - 1]
mean = train_rows.mean()
std = train_rows.std()
speed_data_norm = (speed_data - mean) / std
print(f"Mean: {mean:.2f} mph, Std: {std:.2f} mph")

# Create sequences: sample i pairs T_in consecutive steps with the T_out steps that follow.
# The trailing axis of size 1 is the single feature (speed).
X = np.zeros((num_samples, T_in, num_nodes, 1), dtype=np.float32)
y = np.zeros((num_samples, T_out, num_nodes, 1), dtype=np.float32)

for i in tqdm(range(num_samples), desc="Creating sequences"):
    X[i, :, :, 0] = speed_data_norm[i:i+T_in, :]
    y[i, :, :, 0] = speed_data_norm[i+T_in:i+T_in+T_out, :]

# Split data chronologically (no shuffling): first 70% train, next 10% val, last 20% test.
X_train, y_train = X[:train_split], y[:train_split]
X_val, y_val = X[train_split:val_split], y[train_split:val_split]
X_test, y_test = X[val_split:], y[val_split:]

print(f"Train: {len(X_train)}, Val: {len(X_val)}, Test: {len(X_test)}")

In [None]:
# Create adjacency matrix
# Builds a graph over the sensors, derives the forward/backward transition matrices,
# and writes everything to one archive. Relies on names from the preprocessing cell
# (np, num_nodes, X_train ... y_test, mean, std) and on `os` being imported earlier.
from scipy.spatial.distance import cdist

print("Creating adjacency matrix...")
np.random.seed(42)  # fixed seed so the simulated layout (and thus the graph) is reproducible

# Simulate sensor positions: these are synthetic coordinates, not real sensor locations.
# Sensors are spaced evenly along one axis with Gaussian jitter on the other.
positions = np.linspace(0, 100, num_nodes).reshape(-1, 1)
positions = np.hstack([positions, np.random.randn(num_nodes, 1) * 5])

# Gaussian kernel on pairwise distance; weights below 0.1 are dropped to sparsify the graph,
# and every node keeps a self-loop of weight 1.
distances = cdist(positions, positions, metric='euclidean')
sigma = np.std(distances) * 0.1
adj_matrix = np.exp(-distances**2 / (sigma**2))
adj_matrix[adj_matrix < 0.1] = 0
np.fill_diagonal(adj_matrix, 1.0)

# Transition matrices: row-normalised adjacency (forward) and the transpose of the
# column-normalised adjacency (backward). The 1e-8 guards against division by zero.
row_sum = adj_matrix.sum(axis=1, keepdims=True) + 1e-8
P_fwd = (adj_matrix / row_sum).astype(np.float32)

col_sum = adj_matrix.sum(axis=0, keepdims=True) + 1e-8
P_bwd = (adj_matrix / col_sum).T.astype(np.float32)

# Undirected edge count: non-zero entries minus the self-loops, halved because the matrix
# is symmetric. Integer division keeps this an int (the old `int(...) / 2` printed e.g. "1234.0").
num_edges = int(((adj_matrix > 0).sum() - num_nodes) // 2)
print(f"Nodes: {num_nodes}, Edges: {num_edges}")

# Save everything
np.savez_compressed(
    'data/pems_bay_processed.npz',
    X_train=X_train, y_train=y_train,
    X_val=X_val, y_val=y_val,
    X_test=X_test, y_test=y_test,
    P_fwd=P_fwd, P_bwd=P_bwd,
    mean=mean, std=std,
    adj_matrix=adj_matrix
)

print(f"\n✅ Data saved: {os.path.getsize('data/pems_bay_processed.npz')/1e6:.1f} MB")

## Step 3: Train Model

**If the processed data already exists, this is the only cell you need to run after Step 1 (setup).**

In [None]:
# Run the repository's training script in a subprocess (path is relative to the repo root).
# NOTE(review): presumably reads data/pems_bay_processed.npz and writes checkpoints_colab/
# (the results cell loads checkpoints_colab/history.json) - confirm against the script.
!python3 scripts/train_colab_safe.py

## Step 4: Check Results

In [None]:
# Load and display training history
# Reads the per-epoch history from checkpoints_colab/history.json, prints a short summary
# against a fixed baseline, and plots validation MAE per epoch.
import json
import matplotlib.pyplot as plt

with open('checkpoints_colab/history.json', 'r') as history_file:
    history = json.load(history_file)

epochs, val_mae = history['epoch'], history['val_mae']
best_mae = min(val_mae)   # lowest validation MAE over all recorded epochs
baseline_mae = 7.997      # reference value the improvement percentage is measured against

print("Training Results")
print("="*50)
print(f"Best MAE: {best_mae:.3f} mph")
print(f"Baseline MAE: 7.997 mph")
print(f"Improvement: {(baseline_mae - best_mae) / baseline_mae * 100:.1f}%")
print(f"DCRNN Paper (SOTA): 1.38 mph")

# Plot
fig, ax = plt.subplots(figsize=(10, 4))
ax.plot(epochs, val_mae, marker='o', color='green', linewidth=2)
ax.axhline(best_mae, color='red', linestyle='--', alpha=0.5, label=f'Best: {best_mae:.3f}')
ax.set_xlabel('Epoch')
ax.set_ylabel('Validation MAE (mph)')
ax.set_title('Training Progress')
ax.legend()
ax.grid(True, alpha=0.3)
plt.show()

# Verdict: a best validation MAE under 5.0 mph counts as success.
verdict = (
    "\n✅ SUCCESS! Model is learning patterns!"
    if best_mae < 5.0
    else "\n⚠️ MAE still high. Try training longer or with more data."
)
print(verdict)

## Done!

Your model is saved in `checkpoints_colab/best_model.pt`

**To evaluate on the test set, run this in a new notebook cell:**
```python
!python3 scripts/evaluate.py --checkpoint checkpoints_colab/best_model.pt --hidden_dim 64 --num_layers 2
```