In [1]:
import requests
import numpy as np
import torch
import random
# Seed every random number generator this notebook imports, not only the
# stdlib one, so that any later NumPy (legacy global API) or PyTorch draw is
# repeatable after Restart Kernel -> Run All.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
# Record library versions next to the results for provenance.
print(f"NumPy: {np.__version__} | PyTorch: {torch.__version__}")


NumPy: 1.24.3 | PyTorch: 2.0.1+cpu


In [2]:
import illustris_python as il

# Location of the simulation output that illustris_python reads from.
basePath = "/home/tnguser/sims.TNG/TNG100-1/output"  # <- same as before

# Inclusive snapshot range used for the coverage requirement.
SNAP_START, SNAP_END = 18, 33
required_snaps = set(range(SNAP_START, SNAP_END+1))   # 18..33 (16 snaps)

# Load the group catalog at the final snapshot and pull out the two fields
# needed to decide which subhalos host a black hole.
catalog_fields = ['SubhaloBHMass', 'SubhaloMassType']
subhalos = il.groupcat.loadSubhalos(basePath, SNAP_END, fields=catalog_fields)
bh_mass = subhalos['SubhaloBHMass']
stellar_mass = subhalos['SubhaloMassType'][:,4]

# A subhalo counts as a black-hole host when its catalog BH mass is positive;
# bh_list holds the positional indices of those subhalos as plain Python ints.
bh_mask = bh_mass > 0
bh_list = np.flatnonzero(bh_mask).tolist()
print("Total subhalos with black holes:", len(bh_list))


Total subhalos with black holes: 29415


In [3]:
# Build one time-ordered history per black-hole host by walking its main
# progenitor branch (onlyMPB=True) back from snapshot 33, keeping only tracks
# that overlap at least 90% of `required_snaps`.
#
# NOTE(review): the window below stops at snapshot 32 while `required_snaps`
# runs to 33, so at most 15 of the 16 required snapshots can ever match.
# Behaviour is left unchanged here -- confirm whether snapshot 33 is being
# held out on purpose or whether the upper bound should be SNAP_END.
TREE_FIELDS = ['SubhaloID','SnapNum','SubhaloBHMass','SubhaloBHMdot',
               'SubhaloMassType','SubhaloSFR','SubhaloVelDisp']
HISTORY_FIRST_SNAP, HISTORY_LAST_SNAP = 18, 32
MIN_COVERED_SNAPS = int(0.9 * len(required_snaps))


def _extract_history(tree):
    """Return the windowed, snapshot-sorted history for one tree, or None.

    `tree` is the mapping returned by `il.sublink.loadTree` for TREE_FIELDS.
    Rows whose SnapNum lies in [HISTORY_FIRST_SNAP, HISTORY_LAST_SNAP] are
    selected and ordered by ascending SnapNum. None is returned when fewer
    than MIN_COVERED_SNAPS of `required_snaps` are present in that window.
    """
    snap = tree['SnapNum']
    in_window = np.flatnonzero((snap <= HISTORY_LAST_SNAP) & (snap >= HISTORY_FIRST_SNAP))
    if len(set(snap[in_window]) & required_snaps) < MIN_COVERED_SNAPS:
        return None

    # One row index, reused for every field: window rows in ascending SnapNum.
    order = in_window[np.argsort(snap[in_window])]
    mass_by_type = tree['SubhaloMassType'][order]
    return {
        "snap_nums":       snap[order].tolist(),
        "bh_mass":         tree['SubhaloBHMass'][order].tolist(),
        "bh_accretion":    tree['SubhaloBHMdot'][order].tolist(),
        "stellar_mass":    mass_by_type[:, 4].tolist(),
        # Sum over every mass-type column of the subhalo.
        "halo_mass":       mass_by_type.sum(axis=1).tolist(),
        "sfr":             tree['SubhaloSFR'][order].tolist(),
        "vel_dispersion":  tree['SubhaloVelDisp'][order].tolist(),
    }


full_histories = {}
failed_loads = {}   # sub_id -> short description of why no history was built
for count, sub_id in enumerate(bh_list, start=1):
    try:
        tree = il.sublink.loadTree(
            basePath, 33, sub_id,
            fields=TREE_FIELDS,
            onlyMPB=True
        )
        if tree is None:
            # loadTree can hand back None instead of raising (presumably when
            # the subhalo has no tree entry -- verify against illustris_python);
            # record it explicitly rather than via a swallowed TypeError.
            failed_loads[sub_id] = "loadTree returned None"
        else:
            history = _extract_history(tree)
            if history is not None:
                full_histories[sub_id] = history
    except Exception as exc:
        # Still best-effort: one bad tree must not abort the whole scan, but
        # keep the reason so failures are visible afterwards.
        failed_loads[sub_id] = repr(exc)
    if count % 5000 == 0:
        print(f"Checked {count}/{len(bh_list)} subhalos...")

print("BHs with ≥90% snapshot coverage:", len(full_histories))
if failed_loads:
    first_id, first_reason = next(iter(failed_loads.items()))
    print(f"Skipped {len(failed_loads)} subhalos whose tree could not be used "
          f"(e.g. {first_id}: {first_reason})")


Checked 5000/29415 subhalos...
Checked 10000/29415 subhalos...
Checked 15000/29415 subhalos...
Checked 20000/29415 subhalos...
Checked 25000/29415 subhalos...
BHs with ≥90% snapshot coverage: 29232


In [4]:
import random
# Draw the working sample from a dedicated, locally seeded generator so that
# re-running this cell always yields the same IDs, instead of depending on how
# far the module-level `random` state has advanced. random.Random(42) starts
# from the same state as random.seed(42), so on a fresh kernel this matches the
# previous draw as long as nothing consumed the global generator in between.
N_SAMPLE = 2500
if not full_histories:
    raise ValueError("full_histories is empty - nothing to sample")
sample_rng = random.Random(42)
sampled_ids = sample_rng.sample(
    list(full_histories.keys()),
    min(N_SAMPLE, len(full_histories)),   # never ask for more than exist
)
print("Sampled:", len(sampled_ids))

# Spot-check one randomly chosen track from the sample.
check_id = sample_rng.choice(sampled_ids)
print("Inspect ID:", check_id)
print("Snapshots:", full_histories[check_id]["snap_nums"][:10], "...")
print("BH Mass seq (first 5):", full_histories[check_id]["bh_mass"][:5])


Sampled: 2500
Inspect ID: 163461
Snapshots: [18, 19, 20, 21, 22, 23, 24, 25, 26, 27] ...
BH Mass seq (first 5): [9.446660260437056e-05, 9.979942115023732e-05, 0.00010838297748705372, 0.00012950033124070615, 0.00015543214976787567]


In [5]:
import pandas as pd
from pathlib import Path

# Flatten the sampled histories into long format: one row per
# (subhalo, snapshot) pair, with every value cast to a plain int/float.
_TRACK_KEYS = ("snap_nums", "bh_mass", "bh_accretion",
               "stellar_mass", "halo_mass", "sfr", "vel_dispersion")

rows = [
    {
        "subhalo_id": int(sid),
        "snapshot":   int(snap),
        "bh_mass":    float(mass),
        "bh_acc":     float(mdot),
        "stellar_mass": float(mstar),
        "sfr":        float(sfr),
        "halo_mass":  float(mhalo),
        "vel_disp":   float(vdisp),
    }
    for sid in sampled_ids
    for snap, mass, mdot, mstar, mhalo, sfr, vdisp in zip(
        *(full_histories[sid][key] for key in _TRACK_KEYS)
    )
]

# Order rows by track and then by time, with a fresh 0..N-1 index.
tidy = pd.DataFrame(rows).sort_values(["subhalo_id","snapshot"], ignore_index=True)

# Persist the table, creating the output directory if it is missing.
OUT_CSV = "../data/black_hole_evolution_tng100.csv"
Path(OUT_CSV).parent.mkdir(parents=True, exist_ok=True)
tidy.to_csv(OUT_CSV, index=False)

tidy.shape, tidy.head()


((37412, 8),
    subhalo_id  snapshot  bh_mass  bh_acc  stellar_mass       sfr  halo_mass  \
 0          13        18      0.0     0.0      0.006245  0.421397   0.802091   
 1          13        19      0.0     0.0      0.016149  0.643598   8.043409   
 2          13        20      0.0     0.0      0.023937  1.254688  11.051660   
 3          13        21      0.0     0.0      0.030710  1.306005  13.369424   
 4          13        22      0.0     0.0      0.067673  5.782820  17.118586   
 
      vel_disp  
 0   53.604897  
 1   75.514915  
 2   86.856369  
 3   93.939201  
 4  112.977501  )

In [6]:
# Columns that get standardized, in the order they are written to the stats file.
FEATURE_ORDER = ["bh_mass","bh_acc","stellar_mass","sfr","halo_mass","vel_disp"]

# Split by subhalo ID (not by row) so every track lands in exactly one
# partition: 70% train, 15% validation, remainder test.
ids = np.array(sorted(tidy["subhalo_id"].unique()))
rng = np.random.default_rng(1337)
rng.shuffle(ids)
n = len(ids)
n_tr = int(0.70*n)
n_va = int(0.15*n)
tr, va, te = np.split(ids, [n_tr, n_tr + n_va])

train, val, test = (
    tidy[tidy.subhalo_id.isin(id_block)].copy() for id_block in (tr, va, te)
)

# Standardization statistics come from the training partition only; the tiny
# epsilon on the std guards against division by zero for a constant column.
stats = {
    col: {"mean": float(train[col].mean()), "std": float(train[col].std() + 1e-12)}
    for col in FEATURE_ORDER
}
for frame in (train, val, test):
    for col, col_stats in stats.items():
        frame[col] = (frame[col] - col_stats["mean"]) / col_stats["std"]

# Write the three partitions and the statistics used to scale them.
outp = Path("../data/processed")
outp.mkdir(parents=True, exist_ok=True)
for file_name, frame in {
    "train.parquet": train,
    "val.parquet": val,
    "test.parquet": test,
}.items():
    frame.to_parquet(outp/file_name, index=False)
pd.DataFrame(stats).T.to_csv(outp/"standardization_stats.csv")

{"n_tracks": n, "train": len(tr), "val": len(va), "test": len(te), "csv": OUT_CSV}


{'n_tracks': 2500,
 'train': 1750,
 'val': 375,
 'test': 375,
 'csv': '../data/black_hole_evolution_tng100.csv'}