# Match SIMLIB WFD entries to TNS

This notebook matches SIMLIB WFD events to TNS transients using redshift, discovery time, and sky position. It then writes:

- A match table (`WFD_to_TNS_match_table.csv`)
- A list of unmatched TNS entries within the date window (`TNS_unmatched.csv`)
- An annotated SIMLIB with match comments injected after each `NOBS/REDSHIFT` line (`SIMLIB_MATCHED_ANNOTATED.SIMLIB`)

The matching is **one-to-one** on TNS objects (each TNS entry is used at most once), and the matching is greedy over SIMLIB entries ordered by first observation time.


## Imports and utilities

We define:
- A simple UTC datetime to MJD converter.
- An angular separation function using the great-circle formula.


In [None]:
import re
import math
import argparse
import datetime as dt
from bisect import bisect_left, bisect_right

import numpy as np
import pandas as pd


In [None]:
# -----------------------------
# time conversion: datetime -> MJD
# MJD(1970-01-01 00:00:00 UTC) = 40587.0
# (Ignores leap seconds; for 30-day tolerance this is fine.)
# -----------------------------

def datetime_to_mjd(d: dt.datetime) -> float:
    """Convert a datetime to a Modified Julian Date (MJD).

    Naive datetimes are interpreted as UTC. Timezone-aware datetimes are
    converted to UTC first, so they no longer raise ``TypeError`` when
    compared against the (naive) Unix epoch.

    Leap seconds are ignored: every day is treated as exactly 86400 s.

    Parameters
    ----------
    d : datetime.datetime
        Instant to convert (naive-as-UTC, or timezone-aware).

    Returns
    -------
    float
        The MJD, using MJD(1970-01-01 00:00:00 UTC) = 40587.0.
    """
    if d.tzinfo is not None and d.utcoffset() is not None:
        # Normalise aware inputs to naive UTC so the subtraction below is legal.
        d = d.astimezone(dt.timezone.utc).replace(tzinfo=None)
    unix_epoch = dt.datetime(1970, 1, 1)
    return 40587.0 + (d - unix_epoch).total_seconds() / 86400.0


def ang_sep_deg(ra1, dec1, ra2, dec2) -> float:
    """Return the on-sky angle between two positions, in degrees.

    All four coordinates are given in degrees. The separation is computed
    with the spherical law of cosines; the cosine is clamped to [-1, 1]
    before ``acos`` so rounding error cannot push it out of domain.
    """
    lon_a, lat_a, lon_b, lat_b = (
        math.radians(value) for value in (ra1, dec1, ra2, dec2)
    )
    polar_term = math.sin(lat_a) * math.sin(lat_b)
    equatorial_term = math.cos(lat_a) * math.cos(lat_b) * math.cos(lon_a - lon_b)
    cos_sep = polar_term + equatorial_term
    clamped = max(-1.0, min(1.0, cos_sep))
    return math.degrees(math.acos(clamped))


## SIMLIB parsing

We parse the SIMLIB into per-LIBID records and keep the original lines so we can inject match comments later.


In [None]:
# -----------------------------
# SIMLIB parsing
# -----------------------------

def parse_simlib(simlib_path: str):
    """Parse a SIMLIB file into one record per LIBID block.

    Parameters
    ----------
    simlib_path : str
        Path of the SIMLIB text file to read.

    Returns
    -------
    records : list[dict]
        One dict per block closed by ``END_LIBID``, with keys ``LIBID``,
        ``RA``, ``DEC``, ``REDSHIFT``, ``first_mjd`` (smallest MJD found on
        the block's ``S:`` lines) and ``line_nobs`` (index into ``lines`` of
        the ``NOBS ... REDSHIFT`` line). Fields that were not found stay
        ``None``.
    lines : list[str]
        The raw file lines (newlines kept), for later rewriting/annotation.
    """
    # Explicit float syntax. The previous character class "[0-9.+-Ee]" held
    # the accidental range "+-E", which also matched ',', ':', ';', etc. and
    # made float() raise on values followed by punctuation.
    float_pat = r"[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?"
    libid_re = re.compile(r"LIBID:\s*([0-9]+)")
    radec_re = re.compile(rf"RA:\s*({float_pat})\s+DEC:\s*({float_pat})")
    redshift_re = re.compile(rf"REDSHIFT:\s*({float_pat})")

    with open(simlib_path, "r") as f:
        lines = f.readlines()

    records = []
    current = None
    in_block = False

    for i, line in enumerate(lines):
        if line.startswith("LIBID:"):
            m = libid_re.search(line)
            current = {
                "LIBID": int(m.group(1)) if m else None,
                "RA": None,
                "DEC": None,
                "REDSHIFT": None,
                "first_mjd": None,
                "line_nobs": None,
            }
            in_block = True

        elif not in_block:
            # Everything outside a LIBID ... END_LIBID block is ignored.
            continue

        elif line.strip().startswith("RA:"):
            m = radec_re.search(line)
            if m:
                current["RA"] = float(m.group(1))
                current["DEC"] = float(m.group(2))

        elif "NOBS:" in line and "REDSHIFT:" in line:
            m = redshift_re.search(line)
            if m:
                current["REDSHIFT"] = float(m.group(1))
            # Remember where this line sits so annotations can follow it.
            current["line_nobs"] = i

        elif line.startswith("S:"):
            parts = line.split()
            if len(parts) >= 2:
                try:
                    mjd = float(parts[1])
                except ValueError:
                    # Best effort: skip observation rows with a malformed MJD.
                    continue
                # Take the minimum so unsorted observation rows still work.
                if current["first_mjd"] is None or mjd < current["first_mjd"]:
                    current["first_mjd"] = mjd

        elif line.strip().startswith("END_LIBID"):
            records.append(current)
            current = None
            in_block = False

    return records, lines


## Matching logic

We do a 1-to-1 greedy match of SIMLIB entries to TNS objects, constrained by:

- Redshift tolerance (`z_tol`)
- Discovery time tolerance in days (`t_tol`)
- Angular separation tolerance (`ang_tol`, degrees)

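The best candidate within tolerances is selected by minimizing a normalized quadratic score, in which each offset is scaled by its tolerance:

$$\text{score} = \left(\frac{\Delta z}{z_\mathrm{tol}}\right)^2 + \left(\frac{\Delta t}{t_\mathrm{tol}}\right)^2 + \left(\frac{\theta}{\theta_\mathrm{tol}}\right)^2$$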


In [None]:
# -----------------------------
# Matching (WFD/SIMLIB-driven, 1-to-1 on TNS)
# -----------------------------

def match_wfd_to_tns(records, df_tns, z_tol=0.005, t_tol=30.0, ang_tol=120.0):
    """Greedily match SIMLIB records to TNS rows, using each TNS row at most once.

    Records are processed in order of ``first_mjd``; records lacking
    ``first_mjd`` or ``REDSHIFT`` are skipped. For each record, every unused
    TNS row within ``z_tol`` in redshift, ``t_tol`` days in discovery time and
    ``ang_tol`` degrees on the sky is scored with
    ``(dz/z_tol)**2 + (dt/t_tol)**2 + (ang/ang_tol)**2`` and the lowest score
    wins. If a record has no RA/DEC, its angular separation is taken as 0.

    Parameters
    ----------
    records : list[dict]
        Dicts with keys ``LIBID``, ``REDSHIFT``, ``first_mjd`` and optionally
        ``RA`` / ``DEC`` (degrees).
    df_tns : pandas.DataFrame
        Must provide columns ``redshift``, ``discovery_mjd``, ``ra``,
        ``declination``, ``objid``, ``name_prefix`` and ``name``.
    z_tol, t_tol, ang_tol : float
        Tolerances in redshift, days and degrees.

    Returns
    -------
    match_df : pandas.DataFrame
        One row per matched pair, sorted by ``LIBID``. The columns are always
        present, even when there are no matches.
    unmatched_tns : pandas.DataFrame
        The (redshift-sorted) TNS rows that were not assigned to any record.
    """
    match_columns = [
        "LIBID",
        "wfd_z",
        "wfd_ra",
        "wfd_dec",
        "wfd_first_mjd",
        "TNS_objid",
        "TNS_name",
        "tns_z",
        "tns_ra",
        "tns_dec",
        "tns_discovery_mjd",
        "dz",
        "dt_days",
        "ang_sep_deg",
        "score",
    ]

    def _norm_sq(delta, tol):
        # A candidate can only pass a zero tolerance with a zero offset, so the
        # term is 0 there; this avoids a ZeroDivisionError for tol == 0.
        return (delta / tol) ** 2 if tol > 0 else 0.0

    # Sort TNS by redshift for fast z-window lookup
    tns = df_tns.sort_values("redshift").reset_index(drop=True)
    # Pull the columns out once instead of doing a .at lookup per candidate.
    z_arr = tns["redshift"].to_numpy()
    mjd_arr = tns["discovery_mjd"].to_numpy()
    ra_arr = tns["ra"].to_numpy()
    dec_arr = tns["declination"].to_numpy()
    used = np.zeros(len(tns), dtype=bool)

    # sort WFD by time (optional but reasonable for greedy)
    wfd_sorted = sorted(
        [r for r in records if r.get("first_mjd") is not None and r.get("REDSHIFT") is not None],
        key=lambda r: r["first_mjd"],
    )

    matches = []
    for r in wfd_sorted:
        z0 = r["REDSHIFT"]
        mjd0 = r["first_mjd"]
        has_position = r.get("RA") is not None and r.get("DEC") is not None

        # Index range of TNS rows whose redshift lies in [z0 - z_tol, z0 + z_tol].
        lo = bisect_left(z_arr, z0 - z_tol)
        hi = bisect_right(z_arr, z0 + z_tol)

        best = None
        for j in range(lo, hi):
            if used[j]:
                continue

            dt_days = abs(float(mjd_arr[j]) - mjd0)
            if dt_days > t_tol:
                continue

            if has_position:
                ang = ang_sep_deg(r["RA"], r["DEC"], float(ra_arr[j]), float(dec_arr[j]))
            else:
                ang = 0.0

            if ang > ang_tol:
                continue

            dz = abs(float(z_arr[j]) - z0)
            score = _norm_sq(dz, z_tol) + _norm_sq(dt_days, t_tol) + _norm_sq(ang, ang_tol)

            if best is None or score < best["score"]:
                best = {"j": j, "score": score, "dz": dz, "dt": dt_days, "ang": ang}

        if best is None:
            continue

        j = best["j"]
        used[j] = True  # enforce one-to-one use of TNS rows
        matches.append(
            {
                "LIBID": int(r["LIBID"]),
                "wfd_z": float(z0),
                "wfd_ra": r.get("RA"),
                "wfd_dec": r.get("DEC"),
                "wfd_first_mjd": float(mjd0),
                "TNS_objid": int(tns.at[j, "objid"]),
                "TNS_name": f"{tns.at[j,'name_prefix']}{tns.at[j,'name']}",
                "tns_z": float(z_arr[j]),
                "tns_ra": float(ra_arr[j]),
                "tns_dec": float(dec_arr[j]),
                "tns_discovery_mjd": float(mjd_arr[j]),
                "dz": float(best["dz"]),
                "dt_days": float(best["dt"]),
                "ang_sep_deg": float(best["ang"]),
                "score": float(best["score"]),
            }
        )

    # Explicit columns keep sort_values from raising KeyError on zero matches.
    match_df = pd.DataFrame(matches, columns=match_columns).sort_values("LIBID")
    unmatched_tns = tns.loc[~used].copy()
    return match_df, unmatched_tns


## Annotate SIMLIB

For matched LIBIDs, we inject a single-line comment after the `NOBS/REDSHIFT` line with the TNS match details.


In [None]:
def annotate_simlib_lines(lines, match_df):
    """Return a copy of ``lines`` with a match comment after matched headers.

    For every LIBID present in ``match_df``, a single ``# MATCH_TNS: ...``
    line is inserted directly after each line of that block containing both
    ``NOBS:`` and ``REDSHIFT:``. All original lines are kept, in order.

    Parameters
    ----------
    lines : list[str]
        Raw SIMLIB lines, newlines included.
    match_df : pandas.DataFrame
        Match table; reads columns ``LIBID``, ``TNS_objid``, ``TNS_name``,
        ``tns_z``, ``tns_discovery_mjd``, ``dt_days`` and ``ang_sep_deg``.

    Returns
    -------
    list[str]
        The annotated lines.
    """
    # map LIBID -> match row (namedtuple)
    match_map = {int(row.LIBID): row for row in match_df.itertuples(index=False)}
    libid_re = re.compile(r"LIBID:\s*([0-9]+)")

    new_lines = []
    current_libid = None

    for line in lines:
        new_lines.append(line)

        if line.startswith("LIBID:"):
            m = libid_re.search(line)
            current_libid = int(m.group(1)) if m else None
            continue

        row = match_map.get(current_libid)
        if row is None or not ("NOBS:" in line and "REDSHIFT:" in line):
            continue

        # The comment must end with an escaped newline; a raw line break
        # inside the f-string literal is a SyntaxError.
        comment = (
            f"# MATCH_TNS: objid={row.TNS_objid} name={row.TNS_name} "
            f"z_tns={row.tns_z:.5f} mjd_tns={row.tns_discovery_mjd:.5f} "
            f"dt_days={row.dt_days:.2f} ang_deg={row.ang_sep_deg:.2f}\n"
        )
        new_lines.append(comment)

    return new_lines


## I/O and end-to-end run

Set the paths and matching tolerances below, then run the workflow. The defaults match the original script version.


In [None]:
# --- Input files -------------------------------------------------------
SIMLIB_PATH = "_10y_LOWZ_REDSHIFT_LT015_FROM_SIMDATA.SIMLIB"
TNS_CSV = "../TNS_with_WFD_fake_injected.csv"

# --- Output files ------------------------------------------------------
OUT_MATCH = "WFD_to_TNS_match_table.csv"
OUT_UNMATCHED = "TNS_unmatched.csv"
OUT_ANNOTATED = "SIMLIB_MATCHED_ANNOTATED.SIMLIB"

# --- Discovery date window: start is inclusive, end is exclusive -------
START_DATE, END_DATE = "2022-10-01", "2024-01-01"

# --- Matching tolerances -----------------------------------------------
Z_TOL = 0.005          # redshift
T_TOL_DAYS = 30.0      # discovery-time offset, days
ANG_TOL_DEG = 120.0    # angular separation, degrees


In [None]:
# --- read & filter TNS

df = pd.read_csv(TNS_CSV)
# Capture the CSV's own columns once, before helper columns are added, so the
# unmatched output can be limited to them without re-reading the file.
original_cols = list(df.columns)

df["discovery_dt"] = pd.to_datetime(df["discoverydate"], errors="coerce")
# Rows lacking any quantity needed for matching cannot be used.
df = df.dropna(subset=["discovery_dt", "redshift", "ra", "declination"]).copy()

# Keep discoveries in [START_DATE, END_DATE).
start = pd.Timestamp(START_DATE + " 00:00:00")
end = pd.Timestamp(END_DATE + " 00:00:00")
df = df[(df["discovery_dt"] >= start) & (df["discovery_dt"] < end)].copy()

df["discovery_mjd"] = df["discovery_dt"].apply(lambda x: datetime_to_mjd(x.to_pydatetime()))

# --- parse simlib
records, lines = parse_simlib(SIMLIB_PATH)

# --- match
match_df, unmatched_tns = match_wfd_to_tns(
    records,
    df,
    z_tol=Z_TOL,
    t_tol=T_TOL_DAYS,
    ang_tol=ANG_TOL_DEG,
)

# --- write match table
match_df.to_csv(OUT_MATCH, index=False)

# --- write unmatched TNS CSV (keep original columns + discovery_mjd)
out_df = unmatched_tns[original_cols].copy()
out_df["discovery_mjd"] = unmatched_tns["discovery_mjd"].values
out_df.to_csv(OUT_UNMATCHED, index=False)

# --- write annotated simlib
annotated_lines = annotate_simlib_lines(lines, match_df)
with open(OUT_ANNOTATED, "w") as f:
    f.writelines(annotated_lines)

print(f"Matched pairs: {len(match_df)}")
print(f"Unmatched TNS (in date window): {len(out_df)}")
print("Wrote:")
print("  ", OUT_UNMATCHED)
print("  ", OUT_MATCH)
print("  ", OUT_ANNOTATED)
