# In [None]:
import re
from pathlib import Path

# Input text table and the path its cleaned copy is written to.
input_file = Path(r"C:\Users\rumki\globby_rrs\globby_metallicity.txt")
output_file = Path(r"C:\Users\rumki\globby_rrs\globby_metallicity_clean.txt")

# Column names, in output order; N_COLS is the field count per row.
colnames = [
    "ID",
    "[Fe/H]",
    "wt",
    "E(B-V)",
    "V_HB",
    "(m-M)V",
    "V_t",
    "M_V,t",
    "U-B",
    "B-V",
    "V-R",
    "V-I",
    "spt",
    "ellip",
]
N_COLS = len(colnames)

# Known cluster-name prefixes, stored in upper case.
prefixes = {
    "AM",
    "ARP",
    "BH",
    "DJORG",
    "E",
    "ESO",
    "IC",
    "K",
    "LILLER",
    "LYNGA",
    "M",
    "NGC",
    "PAL",
    "PYXIS",
    "RUPRECHT",
    "TERZAN",
    "UGC",
    "WHITING",
}

# Matches a token that is clearly a data value rather than part of a name:
# anything with a decimal point ("0.04", "-1.20", "5.") or an explicit sign
# ("-7", "+3").  A plain unsigned integer such as "104" does NOT match, so it
# is still treated as the number that follows a name prefix.
_DATA_VALUE = re.compile(r"[+-]?(?:\d+\.\d*|\.\d+)|[+-]\d+")


def _clean_row(line, known_prefixes, n_cols):
    """Split one data line into exactly ``n_cols`` tokens.

    The line is split on runs of whitespace.  If the first token (compared
    in upper case) is in ``known_prefixes`` and the second token is not a
    data value, the two are joined with an underscore to form a single ID
    ("NGC 104" -> "NGC_104").  A prefix followed directly by a signed or
    decimal number is left alone, so a one-word ID that is also listed as a
    prefix does not swallow its first data column.

    Rows with fewer than ``n_cols`` tokens are right-padded with "NA"; rows
    with more are truncated.

    NOTE: because the split collapses whitespace, a value missing in the
    middle of a row cannot be detected here: the remaining values move left
    and the "NA" padding always lands at the end of the row.

    Args:
        line: One non-header line of the input table.
        known_prefixes: Collection of upper-case ID prefixes.
        n_cols: Number of tokens every returned row must have.

    Returns:
        A list of exactly ``n_cols`` strings.
    """
    tokens = line.split()

    if (
        len(tokens) >= 2
        and tokens[0].upper() in known_prefixes
        and not _DATA_VALUE.fullmatch(tokens[1])
    ):
        tokens[:2] = ["_".join(tokens[:2])]

    # No whitespace clean-up of the ID is needed: split() never yields a
    # token that contains whitespace.

    missing = n_cols - len(tokens)
    if missing > 0:
        tokens.extend(["NA"] * missing)
    else:
        del tokens[n_cols:]
    return tokens


# "utf-8-sig" reads plain UTF-8 unchanged but also drops a leading byte-order
# mark, which would otherwise stay glued to the first line and stop the
# header from being recognised.
with input_file.open("r", encoding="utf-8-sig") as fin, \
        output_file.open("w", encoding="utf-8") as fout:
    for raw_line in fin:
        line = raw_line.strip()
        if not line:
            continue  # drop blank lines

        if line.startswith("ID "):
            # Header line: replace it with the canonical column names.
            fout.write(" ".join(colnames) + "\n")
            continue

        fout.write(" ".join(_clean_row(line, prefixes, N_COLS)) + "\n")