# Black Hole Evolution Dataset Preparation

This notebook extracts and formats time-series data from TNG100 to enable an LSTM to predict supermassive black hole evolution.


### 1. Environment Setup
---
Import necessary libraries and configure global settings for reproducibility.


In [1]:
import requests
import numpy as np
import torch
import random

# Seed every RNG this notebook draws from, not just the stdlib one:
#   - `random`  : Python-level sampling
#   - NumPy     : any np.random use
#   - PyTorch   : weight initialisation, dropout, DataLoader shuffling
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

print(f"NumPy version: {np.__version__}")
print(f"PyTorch version: {torch.__version__}")


NumPy version: 1.24.3
PyTorch version: 2.0.1+cpu


### 2. Load and Filter TNG100 Subhalo Catalog  
---
Locate the TNG100 simulation directory and load the subhalo catalog from snapshot 33. We then extract all subhalos hosting supermassive black holes (SMBHs).


#### 2.1 Load Dataset
---
This cell loads the preprocessed black hole evolution dataset from the data directory and confirms its structure. It also sets the simulation base path for future data access.

In [2]:
import illustris_python as il
import pandas as pd

# Root directory of the TNG100-1 simulation output (not read in this cell;
# defined here so later cells can reach the raw simulation data)
basePath = "/home/tnguser/sims.TNG/TNG100-1"

# Read the precompiled black hole sample into a DataFrame
csv_path = "/home/tnguser/cosmic-evolution-ml/black_hole_evolution/data/black_hole_evolution_tng100.csv"
df = pd.read_csv(filepath_or_buffer=csv_path)

# Quick structural check: row/column counts and the column names
print(f"Dataset loaded with shape: {df.shape}")
print("Columns:", list(df.columns))


Dataset loaded with shape: (37500, 8)
Columns: ['subhalo_id', 'snapshot', 'bh_mass', 'bh_acc', 'stellar_mass', 'sfr', 'halo_mass', 'vel_disp']


#### 2.2 Save Processed Data as NumPy Arrays
---
This section converts the cleaned long-format CSV dataset into NumPy arrays for efficient model training and stores them alongside the CSV in the data directory.

In [6]:
import numpy as np
import pandas as pd
from pathlib import Path

# Paths
DATA_DIR = Path("../data")
CSV_PATH = DATA_DIR / "black_hole_evolution_tng100.csv"  # long-format dataset

# Load
df = pd.read_csv(CSV_PATH)

# Numeric columns and sanitization:
# anything that cannot be parsed as a number, and any +/-inf, becomes NaN
NUM_COLS = ["bh_mass","bh_acc","stellar_mass","sfr","halo_mass","vel_disp"]
df[NUM_COLS] = df[NUM_COLS].apply(pd.to_numeric, errors="coerce")
df[NUM_COLS] = df[NUM_COLS].replace([np.inf, -np.inf], np.nan)

# NaN imputation: per-snapshot median, then global median, final zero-fill safety
for col in NUM_COLS:
    med_by_snap = df.groupby("snapshot")[col].transform("median")
    df[col] = df[col].fillna(med_by_snap)
    # Covers snapshots whose values for this column were all NaN
    df[col] = df[col].fillna(df[col].median())
# Only reached for a column that is NaN everywhere (its global median is NaN too)
df[NUM_COLS] = df[NUM_COLS].fillna(0.0)

# Save arrays + normalization stats
ids        = df["subhalo_id"].to_numpy()
snapshots  = df["snapshot"].to_numpy().astype(int)
features   = df[NUM_COLS].to_numpy(dtype=np.float32)

# Accumulate the statistics in float64: NumPy reduces float32 input in float32
# by default, which its documentation flags as potentially inaccurate.
# The results are cast back to float32 so the saved files keep the same dtype
# as `features`.
feat_mean = features.mean(axis=0, dtype=np.float64).astype(np.float32)
# The 1e-8 offset keeps a constant column from producing a zero divisor
feat_std  = (features.std(axis=0, dtype=np.float64) + 1e-8).astype(np.float32)

np.save(DATA_DIR / "ids.npy", ids)
np.save(DATA_DIR / "snapshots.npy", snapshots)
np.save(DATA_DIR / "features.npy", features)
np.save(DATA_DIR / "feat_mean.npy", feat_mean)
np.save(DATA_DIR / "feat_std.npy",  feat_std)

print(f"[OK] Cleaned and saved arrays to: {DATA_DIR}")


[OK] Cleaned and saved arrays to: ../data


### 3. Data Loading
---
This section prepares the processed dataset for model training by defining a PyTorch-compatible `Dataset` and `DataLoader`. The goal is to efficiently feed the model sequential input–output pairs representing black hole and galaxy properties across snapshots.

#### 3.1 Dataset and DataLoader Setup
---
We load the processed `.npy` files generated in Section 2, organize them into sequences of `(initial_conditions, final_conditions)`, and configure a `DataLoader` to support batching, shuffling, and efficient GPU training.

In [8]:
# 3.1 — Dataset and DataLoader Setup (normalized, NaN-safe)

import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, random_split
from pathlib import Path

DATA_DIR = Path("../data")

class BlackHoleEvolutionDataset(Dataset):
    """
    Samples are (initial_conditions at t, final_conditions at t+1).

    Reads ``ids.npy``, ``snapshots.npy`` and ``features.npy`` from ``data_dir``
    (plus ``feat_mean.npy`` / ``feat_std.npy`` when ``normalize`` is true),
    groups the rows by subhalo id, orders each group by snapshot, and builds
    one sample per window of ``sequence_length`` consecutive rows: the first
    row of the window is the input, the last row is the target.

    Note: "consecutive" means adjacent rows after sorting by snapshot; gaps in
    the snapshot numbers are not checked.

    Args:
        data_dir: Directory containing the ``.npy`` files listed above.
        sequence_length: Window length in rows. The default of 2 yields
            (t, t+1) pairs; 1 yields (t, t) pairs.
        normalize: If true, standardise features with the saved mean/std.

    Raises:
        ValueError: If ``sequence_length`` is smaller than 1, or if the three
            arrays do not have the same number of rows.
    """
    def __init__(self, data_dir: Path, sequence_length: int = 2, normalize: bool = True):
        # A window of length 0 would index seq[i - 1] and silently wrap around.
        if sequence_length < 1:
            raise ValueError(f"sequence_length must be >= 1, got {sequence_length}")

        self.ids        = np.load(data_dir / "ids.npy")
        self.snapshots  = np.load(data_dir / "snapshots.npy")
        self.features   = np.load(data_dir / "features.npy")  # [N_rows, F]
        self.sequence_length = sequence_length

        if not (len(self.ids) == len(self.snapshots) == len(self.features)):
            raise ValueError(
                "ids, snapshots and features must have the same number of rows, got "
                f"{len(self.ids)}, {len(self.snapshots)}, {len(self.features)}"
            )

        # Replace lingering NaNs just in case (column mean of the non-NaN entries)
        nan_mask = np.isnan(self.features)
        if nan_mask.any():
            col_means = np.nanmean(self.features, axis=0)
            self.features[nan_mask] = np.take(col_means, np.where(nan_mask)[1])

        # Normalize using saved stats
        if normalize:
            mean = np.load(data_dir / "feat_mean.npy")
            std  = np.load(data_dir / "feat_std.npy")
            self.features = (self.features - mean) / std
            self.features = np.nan_to_num(self.features, nan=0.0, posinf=0.0, neginf=0.0)

        # Group by subhalo and sort by snapshot.
        # One lexsort (primary key: id, secondary key: snapshot) puts every
        # subhalo's rows in a contiguous, time-ordered run, so each group is a
        # plain slice instead of a full-length boolean mask per subhalo.
        order = np.lexsort((self.snapshots, self.ids))
        sorted_ids = self.ids[order]
        sorted_features = self.features[order]
        unique_ids, starts = np.unique(sorted_ids, return_index=True)
        stops = np.append(starts[1:], len(sorted_ids))
        self.subhalo_sequences = {
            sid: sorted_features[lo:hi]
            for sid, lo, hi in zip(unique_ids, starts, stops)
        }

        # Build (t -> t+1) pairs
        self.samples = []
        L = self.sequence_length
        for seq in self.subhalo_sequences.values():
            # Sequences shorter than L give an empty range and contribute nothing
            for i in range(len(seq) - L + 1):
                x0 = seq[i]          # t
                x1 = seq[i + L - 1]  # t+1 for L=2
                self.samples.append((x0, x1))

    def __len__(self):
        """Number of (input, target) pairs."""
        return len(self.samples)

    def __getitem__(self, idx):
        """Return pair ``idx`` as two float32 tensors with NaNs replaced by 0."""
        x, y = self.samples[idx]
        x = torch.from_numpy(np.asarray(x, dtype=np.float32))
        y = torch.from_numpy(np.asarray(y, dtype=np.float32))
        x = torch.nan_to_num(x, nan=0.0)
        y = torch.nan_to_num(y, nan=0.0)
        return x, y

# Build the normalized (t -> t+1) dataset from the saved arrays
dataset = BlackHoleEvolutionDataset(DATA_DIR, sequence_length=2, normalize=True)

# 80 / 20 train-validation split; the remainder goes to validation so the two
# sizes always add up to len(dataset)
n_samples  = len(dataset)
train_size = int(0.8 * n_samples)
val_size   = n_samples - train_size
split_rng  = torch.Generator().manual_seed(42)  # fixed seed -> same split every run
train_ds, val_ds = random_split(dataset, [train_size, val_size], generator=split_rng)

# Only the training loader reshuffles each epoch
batch_size = 64
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_loader   = DataLoader(val_ds,   batch_size=batch_size, shuffle=False)

print(f"[OK] Train samples: {len(train_ds)} | Val samples: {len(val_ds)}")

# Pull one batch to confirm the tensor shapes
xb, yb = next(iter(train_loader))
print("Batch shapes:", xb.shape, yb.shape)  # [B, F], [B, F]


[OK] Train samples: 28000 | Val samples: 7000
Batch shapes: torch.Size([64, 6]) torch.Size([64, 6])


#### 3.2 Model Architecture Definition
---
We define a neural network model to learn the mapping from `initial_conditions` to `final_conditions`. The architecture consists of fully connected layers with nonlinear activations, allowing the model to capture complex relationships in the astrophysical data.


In [9]:
# 3.2 — Model Architecture Definition (multi-output full state)

import torch.nn as nn
import torch

class BlackHoleEvolutionModel(nn.Module):
    """
    Fully connected network: two hidden Linear -> ReLU -> Dropout stages
    followed by a linear output layer.

    Args:
        input_dim: Size of each input feature vector.
        hidden_dim: Width of both hidden layers.
        output_dim: Size of the prediction; when None it equals ``input_dim``.
        p_drop: Dropout probability used after each hidden activation.
    """
    def __init__(self, input_dim, hidden_dim=128, output_dim=None, p_drop=0.1):
        super().__init__()
        # Default: predict the full feature vector
        out_features = input_dim if output_dim is None else output_dim
        layers = [
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(p_drop),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Dropout(p_drop),
            nn.Linear(hidden_dim, out_features),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the prediction."""
        prediction = self.net(x)
        return prediction

# Feature count F = size of the second axis of the sample batch `xb`
# (xb must already be in scope from the DataLoader cell)
F = xb.size(1)
model = BlackHoleEvolutionModel(F, hidden_dim=128, output_dim=F, p_drop=0.1)
print(model)


BlackHoleEvolutionModel(
  (net): Sequential(
    (0): Linear(in_features=6, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.1, inplace=False)
    (3): Linear(in_features=128, out_features=128, bias=True)
    (4): ReLU()
    (5): Dropout(p=0.1, inplace=False)
    (6): Linear(in_features=128, out_features=6, bias=True)
  )
)


#### 3.3 Loss Function & Optimizer Setup
---
We configure the loss function to measure prediction accuracy and the optimizer to update model weights. Mean Squared Error (MSE) is used since we are predicting continuous astrophysical quantities, and Adam is chosen for its adaptive learning rate capabilities.


In [10]:
import torch.optim as optim

# Mean squared error between predicted and target feature vectors
criterion = nn.MSELoss()

# Adam over all model parameters
adam_settings = {"lr": 1e-4, "weight_decay": 1e-4}
optimizer = optim.Adam(model.parameters(), **adam_settings)

print("Loss/optimizer ready.")

Loss/optimizer ready.


#### 3.4 Training Loop
---
We iterate over the dataset for multiple epochs, performing forward passes, computing the loss, backpropagating gradients, and updating model parameters. Progress is printed each epoch to monitor convergence.


In [None]:
from math import isfinite

# Use the GPU when one is available, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

num_epochs = 20
max_grad_norm = 1.0  # gradient-norm ceiling applied before every optimizer step

for epoch in range(1, num_epochs + 1):
    # ---- Training pass ----
    model.train()
    train_loss_sum, n_train = 0.0, 0
    for xb, yb in train_loader:
        xb = torch.nan_to_num(xb, nan=0.0).to(device)
        yb = torch.nan_to_num(yb, nan=0.0).to(device)

        optimizer.zero_grad()
        preds = model(xb)
        loss = criterion(preds, yb)
        batch_loss = loss.item()

        # A non-finite loss skips the update and is left out of the running mean
        if isfinite(batch_loss):
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
            optimizer.step()

            # Weight by batch size so the epoch figure is a per-sample mean
            train_loss_sum += batch_loss * xb.size(0)
            n_train += xb.size(0)

    # max(..., 1) avoids dividing by zero when every batch was skipped
    train_loss = train_loss_sum / max(n_train, 1)

    # ---- Validation pass (no gradients, dropout disabled) ----
    model.eval()
    val_loss_sum, n_val = 0.0, 0
    with torch.no_grad():
        for xb, yb in val_loader:
            xb = torch.nan_to_num(xb, nan=0.0).to(device)
            yb = torch.nan_to_num(yb, nan=0.0).to(device)
            preds = model(xb)
            vloss = criterion(preds, yb)
            batch_vloss = vloss.item()
            if not isfinite(batch_vloss):
                continue
            val_loss_sum += batch_vloss * xb.size(0)
            n_val += xb.size(0)

    val_loss = val_loss_sum / max(n_val, 1)
    print(f"Epoch {epoch:02d} | train {train_loss:.6f} | val {val_loss:.6f}")


Epoch 01 | train 0.448956 | val 0.374274
Epoch 02 | train 0.238087 | val 0.334526
Epoch 03 | train 0.228949 | val 0.335228
Epoch 04 | train 0.226160 | val 0.326805
Epoch 05 | train 0.222004 | val 0.328571


#### 3.5 Validation Metrics per Feature
---
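Compute the per-feature RMSE and MAE on the validation set, after de-normalizing both predictions and targets back to their original units.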

In [None]:
# 3.5 — Validation Metrics per Feature (de-normalized to original units)

import numpy as np
import pathlib
import torch

# Feature names, in the column order of the saved feature array
FEATURES = ["bh_mass","bh_acc","stellar_mass","sfr","halo_mass","vel_disp"]

# Normalization statistics written by the preprocessing cell
DATA_DIR = pathlib.Path("../data")
mean = np.load(DATA_DIR / "feat_mean.npy")
std  = np.load(DATA_DIR / "feat_std.npy")

# Gather predictions and targets for the whole validation set
model.eval()
all_pred, all_true = [], []
with torch.no_grad():
    for xb, yb in val_loader:
        xb = torch.nan_to_num(xb, nan=0.0).to(device)
        yb = torch.nan_to_num(yb, nan=0.0).to(device)
        pred = model(xb)
        all_pred.append(pred.cpu().numpy())
        all_true.append(yb.cpu().numpy())

P = np.concatenate(all_pred, axis=0)
T = np.concatenate(all_true, axis=0)

# Undo the standardisation: x_real = x_norm * std + mean
P_real = P * std + mean
T_real = T * std + mean

# Per-feature errors (reduced over the sample axis)
residual = P_real - T_real
rmse = np.sqrt(np.mean(residual ** 2, axis=0))
mae  = np.mean(np.abs(residual), axis=0)

print("Per-feature RMSE / MAE (original units):")
for idx, feat_name in enumerate(FEATURES):
    print(f"- {feat_name}: RMSE={rmse[idx]:.4e} | MAE={mae[idx]:.4e}")
