In [2]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
STEP 1: Filter subhalos by availability of OBSERVED galaxy inclination in the catalog.

- Reads:  ../data/COS_GASS_TNG.txt   (relative to the notebook's working directory)
- Keeps only rows where 'inc' is numeric (skips '-')
- Explodes SubhaloID1..4 -> unique SubhaloID list to run
- Writes (into the same directory as the catalog):
    ../data/sids_with_inc.txt          (one SID per line)
    ../data/sids_with_inc_detailed.csv (one row per unique SubhaloID with its COS_ID, Galaxy, inc)
    ../data/sids_skipped_by_inc.csv    (galaxies with '-' in inc)
"""

import os
import pandas as pd
import numpy as np

# Input catalog; all artifacts are written into the directory that holds it.
CATALOG_CSV = r"../data/COS_GASS_TNG.txt"
REPO_DATA = os.path.dirname(CATALOG_CSV)

# Output artifacts: plain SID list, per-SID detail table, and the skipped galaxies.
OUT_TXT, OUT_DETAIL, OUT_SKIPPED = (
    os.path.join(REPO_DATA, filename)
    for filename in (
        "sids_with_inc.txt",
        "sids_with_inc_detailed.csv",
        "sids_skipped_by_inc.csv",
    )
)

def _collect_subhalo_rows(df_keep, sid_cols):
    """Flatten the SubhaloID columns into one record per (galaxy, SubhaloID).

    Records are produced galaxy by galaxy and, within a galaxy, in the order of
    `sid_cols`. Missing (NaN) SIDs are skipped; values that cannot be converted
    to int are skipped with a printed warning.
    """
    records = []
    for _, r in df_keep.iterrows():
        base = dict(COS_ID=r["COS_ID"], Galaxy=r["Galaxy"], inc=float(r["inc"]))
        for c in sid_cols:
            sid = r.get(c)
            if pd.isna(sid):
                continue
            try:
                records.append({**base, "SubhaloID": int(sid)})
            except (TypeError, ValueError, OverflowError) as exc:
                # Best effort: keep going, but do not hide the bad catalog entry.
                print(f"[STEP1][WARN] Ignoring non-integer {c}={sid!r} "
                      f"for COS_ID={base['COS_ID']}: {exc}")
    return records


def main():
    """Select galaxies that have a numeric 'inc' and write the SubhaloID lists.

    Reads CATALOG_CSV, splits its rows by whether 'inc' parses as a number, and
    expands SubhaloID1..4 of the kept rows into a de-duplicated, sorted table.
    Writes OUT_DETAIL (that table), OUT_TXT (one SubhaloID per line) and
    OUT_SKIPPED (the rows without a numeric 'inc').

    Raises:
        FileNotFoundError: if CATALOG_CSV does not exist.
        KeyError: if the catalog lacks the 'COS_ID', 'Galaxy' or 'inc' column.
    """
    print("[STEP1] Reading catalog:", CATALOG_CSV)
    if not os.path.isfile(CATALOG_CSV):
        raise FileNotFoundError(f"Catalog not found: {CATALOG_CSV}")

    # Read; treat '-' as NaN for *all* columns to be safe, then force numeric on inc
    df = pd.read_csv(CATALOG_CSV, na_values=["-"])
    original_rows = len(df)
    print(f"[STEP1] Catalog rows loaded: {original_rows}")

    # Fail early with a readable message instead of an opaque KeyError later on.
    missing = [c for c in ("COS_ID", "Galaxy", "inc") if c not in df.columns]
    if missing:
        raise KeyError(f"Catalog {CATALOG_CSV} is missing required column(s): {missing}")

    # Split by availability of numeric inc
    df["inc"] = pd.to_numeric(df["inc"], errors="coerce")
    has_inc = df["inc"].notna()
    df_keep = df.loc[has_inc].copy()
    df_skip = df.loc[~has_inc].copy()

    print(f"[STEP1] Galaxies with numeric 'inc': {len(df_keep)}")
    print(f"[STEP1] Galaxies skipped (inc == '-'): {len(df_skip)}")

    # Explode SubhaloID1..4 to one SID per row; absent columns count as "no SID"
    sid_cols = ["SubhaloID1", "SubhaloID2", "SubhaloID3", "SubhaloID4"]
    for c in sid_cols:
        if c not in df_keep.columns:
            df_keep[c] = np.nan

    # Build long list of (COS_ID, Galaxy, inc, SID).
    # Explicit columns keep the frame well-formed when there are no records:
    # without them an empty frame has no 'SubhaloID' column and the sort and
    # column lookups below raise KeyError before the warning can be printed.
    rows = _collect_subhalo_rows(df_keep, sid_cols)
    detail_cols = ["COS_ID", "Galaxy", "inc", "SubhaloID"]
    dfl = (
        pd.DataFrame(rows, columns=detail_cols)
        .drop_duplicates(subset=["SubhaloID"])
        .sort_values("SubhaloID")
    )
    print(f"[STEP1] Unique SubhaloID to run: {len(dfl)}")

    # Save artifacts (REPO_DATA is '' when the catalog sits in the CWD;
    # os.makedirs('') would raise, so only create a real directory name)
    if REPO_DATA:
        os.makedirs(REPO_DATA, exist_ok=True)
    dfl.to_csv(OUT_DETAIL, index=False)
    with open(OUT_TXT, "w") as f:
        for sid in dfl["SubhaloID"].tolist():
            f.write(f"{sid}\n")
    df_skip.to_csv(OUT_SKIPPED, index=False)

    # Quick debugging summary
    print("\n[STEP1] Outputs written:")
    print("  • SIDs list:         ", OUT_TXT)
    print("  • Detailed include:  ", OUT_DETAIL)
    print("  • Skipped (inc='-'): ", OUT_SKIPPED)
    if len(dfl) == 0:
        print("[STEP1][WARN] No subhalos selected. Did your catalog have numeric 'inc' values?")

# Run STEP 1 when this code is executed directly (as a script or as a
# notebook cell); importing it as a module does not trigger the run.
if __name__ == "__main__":
    main()

[STEP1] Reading catalog: ../data/COS_GASS_TNG.txt
[STEP1] Catalog rows loaded: 18
[STEP1] Galaxies with numeric 'inc': 13
[STEP1] Galaxies skipped (inc == '-'): 5
[STEP1] Unique SubhaloID to run: 52

[STEP1] Outputs written:
  • SIDs list:          ../data/sids_with_inc.txt
  • Detailed include:   ../data/sids_with_inc_detailed.csv
  • Skipped (inc='-'):  ../data/sids_skipped_by_inc.csv


In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
STEP 2: Iterate SIDs and call the unmodified single-subhalo script for each.

PREREQS:
- Run step1_select_sids_with_inc.py first; it writes sids_with_inc.txt
- Make sure your LOS generator (the orientations/LOS endpoints code) has
  already produced the per-SID rays CSVs in each subhalo directory:
    sub_<SID>/rays_and_recipes_sid<SID>_snap99_L3Rvir/rays_sid<SID>.csv
    sub_<SID>/rays_and_recipes_sid<SID>_snap99_L4Rvir/rays_sid<SID>.csv

REUSES (unchanged):
- single_subhalo_rays_spectra.py  (YOUR original single-SID code;
  do NOT edit it; this runner imports it as a module and sets globals)
"""

import os
import sys
import glob
import traceback
import importlib.util
from contextlib import contextmanager

# ========= USER PATHS (match your environment) =========
# The repo data dir where step1 wrote the SIDs file:
REPO_DATA = r"../data"
SID_LIST  = os.path.join(REPO_DATA, "sids_with_inc.txt")

# The parent dir that contains sub_<SID>/ folders + cutouts:
TNG_SUBHALOS_ROOT = r"../../../ASU Dropbox/Tanmay Singh/COS_GASS/TNG_Subhalos"

# Path to your UNCHANGED single-subhalo script.
# `__file__` is not defined when this code runs as a notebook cell, so fall
# back to the current working directory there. The path is made absolute so it
# stays valid regardless of later directory changes.
try:
    _WORKER_DIR = os.path.dirname(os.path.abspath(__file__))
except NameError:
    _WORKER_DIR = os.getcwd()
WORKER_SCRIPT_PATH = os.path.join(_WORKER_DIR, "single_subhalo_rays_spectra.py")

# Snapshot (your single-SID script also defaults to 99)
SNAP = 99

# Which runs we *prefer* to attempt. We’ll auto-skip missing CSVs per run.
PREFERRED_RUN_LABELS = ["L3Rvir", "L4Rvir"]

# =======================================================

@contextmanager
def pushd(new_dir):
    """Context manager: make `new_dir` the working directory for the `with` body.

    The directory that was current on entry is restored on exit, including when
    the body raises.
    """
    origin = os.getcwd()
    os.chdir(new_dir)
    try:
        yield
    finally:
        # Always return to where we started.
        os.chdir(origin)

def debug_print_paths(sid, cutout_h5, sub_dir):
    """Print the cutout file and output directory that will be used for `sid`.

    Emits three tagged lines followed by one blank line; returns None.
    """
    tag = f"[STEP2][SID={sid}]"
    report = (
        f"{tag} CUTOUT_H5:   {cutout_h5}",
        f"{tag} OUTPUT_BASE: {sub_dir}",
        f"{tag} CWD for worker calls will be this OUTPUT_BASE.\n",
    )
    print("\n".join(report))

def find_cutout_h5_for_sid(sid: int) -> str:
    """Locate an HDF5 cutout inside TNG_SUBHALOS_ROOT/sub_<sid>/.

    Files matching 'cutout*sub<sid>*.hdf5' are preferred; if there are none,
    any '*.hdf5' file in the directory is accepted. Among the matches of the
    first pattern that hits, the lexicographically smallest path is returned.

    Returns None (despite the `str` annotation) when the subhalo directory
    does not exist or contains no matching file.
    """
    sid_int = int(sid)
    sub_dir = os.path.join(TNG_SUBHALOS_ROOT, f"sub_{sid_int}")
    if os.path.isdir(sub_dir):
        # Specific pattern first, generic fallback second.
        for pattern in (f"cutout*sub{sid_int}*.hdf5", "*.hdf5"):
            matches = glob.glob(os.path.join(sub_dir, pattern))
            if matches:
                return min(matches)
    return None

def rays_csv_exists(sub_dir: str, sid: int, run_label: str) -> bool:
    """Return True if the rays CSV for (`sid`, `run_label`) is present in `sub_dir`.

    The file is looked up at
    rays_and_recipes_sid<sid>_snap<SNAP>_<run_label>/rays_sid<sid>.csv
    relative to `sub_dir`, using the module-level SNAP.
    """
    candidate = os.path.join(
        sub_dir,
        f"rays_and_recipes_sid{sid}_snap{SNAP}_{run_label}/rays_sid{sid}.csv",
    )
    return os.path.isfile(candidate)

def load_worker_module():
    """Import the single-subhalo worker script from WORKER_SCRIPT_PATH as a module.

    The module is loaded under the name 'single_subhalo_worker' and registered
    in sys.modules before it is executed, as the importlib recipe for importing
    a source file recommends; code in the worker that looks its own module up
    by name needs that entry. The entry is removed again if execution fails.

    Returns:
        The executed module object.

    Raises:
        FileNotFoundError: if WORKER_SCRIPT_PATH is not an existing file.
        ImportError: if no import spec/loader can be built for the file.
    """
    if not os.path.isfile(WORKER_SCRIPT_PATH):
        raise FileNotFoundError(
            f"Cannot find worker script at {WORKER_SCRIPT_PATH}\n"
            f"Please save your unmodified single-subhalo code there "
            f"(filename: single_subhalo_rays_spectra.py)."
        )
    module_name = "single_subhalo_worker"
    spec = importlib.util.spec_from_file_location(module_name, WORKER_SCRIPT_PATH)
    if spec is None or spec.loader is None:
        # Previously this surfaced as an AttributeError on None.
        raise ImportError(f"Could not build an import spec for {WORKER_SCRIPT_PATH}")
    mod = importlib.util.module_from_spec(spec)
    sys.modules[module_name] = mod
    try:
        spec.loader.exec_module(mod)
    except BaseException:
        # Do not leave a half-initialised module behind.
        sys.modules.pop(module_name, None)
        raise
    return mod

def read_sid_list(path: str):
    """Read a text file with one SubhaloID per line.

    Blank lines are ignored; lines that are not integers are reported with a
    warning and skipped. Prints how many unique SIDs were loaded.

    Returns:
        Sorted list of the unique integer SIDs.

    Raises:
        FileNotFoundError: if `path` is not an existing file.
    """
    if not os.path.isfile(path):
        raise FileNotFoundError(f"Missing SIDs list: {path}\nRun step1_select_sids_with_inc.py first.")

    unique_sids = set()
    with open(path, "r") as handle:
        for raw in handle:
            token = raw.strip()
            if token:
                try:
                    unique_sids.add(int(token))
                except ValueError:
                    print("[STEP2][WARN] Non-integer SID in list, skipping:", token)

    ordered = sorted(unique_sids)
    print(f"[STEP2] Loaded {len(ordered)} unique SIDs from:", path)
    return ordered

def main():
    """Run the single-subhalo worker once for every SID listed in SID_LIST.

    For each SID this locates sub_<SID>/ under TNG_SUBHALOS_ROOT, finds its HDF5
    cutout, keeps the PREFERRED_RUN_LABELS whose rays CSV exists, sets the
    worker module's globals and calls worker.main() with the subhalo directory
    as the working directory.

    A SID that is skipped (missing directory, cutout or rays CSV) or whose
    worker call raises is counted as a failure; the batch always continues with
    the next SID. A summary is printed at the end.

    Raises:
        FileNotFoundError: if the worker script or the SID list is missing.
    """
    print("[STEP2] Loading worker module (your single-SID script, unmodified):")
    worker = load_worker_module()
    print("[STEP2] Worker module loaded OK:", WORKER_SCRIPT_PATH)
    print("        (We will set its globals per SID and call main())\n")

    sids = read_sid_list(SID_LIST)
    total = len(sids)
    successes = 0
    failures  = 0

    for idx, sid in enumerate(sids, 1):
        print("="*78)
        print(f"[STEP2] [{idx}/{total}] SID={sid}")

        # TNG_SUBHALOS_ROOT may be relative. The worker is called after a chdir
        # into sub_dir, so every path handed to it must be absolute; a relative
        # one would be resolved against the new working directory.
        sub_dir = os.path.abspath(os.path.join(TNG_SUBHALOS_ROOT, f"sub_{sid}"))
        if not os.path.isdir(sub_dir):
            print(f"[STEP2][ERROR] Subhalo directory missing: {sub_dir}  -> SKIP")
            failures += 1
            continue

        cutout_h5 = find_cutout_h5_for_sid(sid)
        if not cutout_h5 or not os.path.isfile(cutout_h5):
            print(f"[STEP2][ERROR] Could not find HDF5 cutout for sid={sid} under {sub_dir}  -> SKIP")
            failures += 1
            continue
        cutout_h5 = os.path.abspath(cutout_h5)

        # Determine which runs we can actually do (rays CSV present)
        available_runs = []
        for run_label in PREFERRED_RUN_LABELS:
            ok = rays_csv_exists(sub_dir, sid, run_label)
            print(f"[STEP2][SID={sid}] Check rays CSV for {run_label}: {'FOUND' if ok else 'MISSING'}")
            if ok:
                available_runs.append(run_label)

        if not available_runs:
            print(f"[STEP2][WARN] No rays CSV found for SID={sid} (looked for: {PREFERRED_RUN_LABELS})  -> SKIP")
            failures += 1
            continue

        debug_print_paths(sid, cutout_h5, sub_dir)

        # Set the worker's globals for THIS SID
        worker.CUTOUT_H5   = cutout_h5
        worker.SID         = int(sid)
        worker.SNAP        = int(SNAP)         # keep consistent
        worker.OUTPUT_BASE = sub_dir
        worker.RUN_LABELS  = available_runs    # only process runs that have rays CSV
        # Optional: Filter by mode? (None = both)
        # worker.FILTER_MODE = None

        # Call worker.main() with CWD switched to the subhalo dir
        try:
            with pushd(sub_dir):
                print(f"[STEP2][SID={sid}] CWD now: {os.getcwd()}")
                print(f"[STEP2][SID={sid}] Calling worker.main() for runs: {available_runs}")
                worker.main()
            print(f"[STEP2][SID={sid}] DONE.")
            successes += 1
        except Exception as e:
            # One failing subhalo must not abort the whole batch.
            print(f"[STEP2][ERROR] SID={sid} failed: {e}")
            traceback.print_exc()
            failures += 1

    print("\n" + "="*78)
    print(f"[STEP2] Finished. Successes={successes}  Failures={failures}  Total={total}")
    if failures > 0:
        print("[STEP2][NOTE] See error traces above; failures do not stop the whole batch.")

# Entry point: run the batch; any error escaping main() is reported with its
# traceback and turned into exit status 2.
if __name__ == "__main__":
    try:
        main()
    except Exception as fatal_exc:
        print("[STEP2][FATAL]", fatal_exc)
        traceback.print_exc()
        sys.exit(2)