In [1]:
import os
import glob
import re
import pandas as pd
import subprocess

# Matches the "purity<number>" token embedded in a simulation filename.  The
# number is captured without a trailing dot, so a name such as
# "..._purity0.3.rep1_obs.csv" yields "0.3" rather than the unparsable "0.3.".
_PURITY_RE = re.compile(r"purity(\d*\.?\d+)")

# Columns that every input CSV must contain.
_REQUIRED_COLS = frozenset(
    {"mutation_id", "sample_id", "depth", "reads", "total_true", "minor_est"}
)


def _purity_from_filename(fname):
    """Return the purity encoded as ``purity<number>`` in *fname* as a float.

    Raises:
        ValueError: If *fname* contains no ``purity<number>`` token.
    """
    match = _PURITY_RE.search(fname)
    if match is None:
        raise ValueError(f"Filename {fname!r} missing ‘purity<number>’ token.")
    return float(match.group(1))


def _require_str_ids(df, fname):
    """Raise TypeError if any ``mutation_id`` / ``sample_id`` entry is not a str.

    At most the first ten offending index labels per column are reported.
    """
    mut_is_str = df["mutation_id"].map(lambda v: isinstance(v, str)).astype(bool)
    sam_is_str = df["sample_id"].map(lambda v: isinstance(v, str)).astype(bool)
    bad_mut_idx = df.index[~mut_is_str.to_numpy()]
    bad_sam_idx = df.index[~sam_is_str.to_numpy()]

    msg_lines = []
    if len(bad_mut_idx) > 0:
        msg_lines.append(
            f"  → mutation_id has non‐string entries at rows: {list(bad_mut_idx[:10])}"
        )
    if len(bad_sam_idx) > 0:
        msg_lines.append(
            f"  → sample_id   has non‐string entries at rows: {list(bad_sam_idx[:10])}"
        )
    if msg_lines:
        raise TypeError(
            f"After fill/astype, file {fname!r} still contains non‐string IDs:\n"
            + "\n".join(msg_lines)
        )


def build_pyclone_vi_tsv(input_dir: str, output_dir: str):
    """Convert every ``CliPP2Sim4k_*_obs.csv`` in *input_dir* to a PyClone-VI TSV.

    For each matching CSV (processed in sorted filename order):
      1. Parse the tumour purity from the ``purity<number>`` filename token.
      2. Read the CSV and verify the required columns are present
         (mutation_id, sample_id, depth, reads, total_true, minor_est).
      3. Replace missing IDs with the placeholders ``UNKNOWN_MUT`` /
         ``UNKNOWN_SAM`` and cast both ID columns to str.
      4. Build the output table:
           mutation_id    = "SNV" + mutation_id
           sample_id      = "R" + sample_id
           ref_counts     = depth - reads
           alt_counts     = reads
           normal_cn      = 2 (constant)
           major_cn       = total_true - minor_est
           minor_cn       = minor_est
           tumour_content = purity parsed from the filename
      5. Write it tab-separated to ``<output_dir>/<csv stem>_py_clone_vi.tsv``.

    Args:
        input_dir: Directory containing the input CSV files.
        output_dir: Directory for the generated TSVs; created if missing.

    Returns:
        List of the TSV paths written, in sorted input-filename order.
        Empty if no CSV matched (a warning is printed in that case).

    Raises:
        FileNotFoundError: If *input_dir* is not an existing directory.
        ValueError: If a filename lacks a ``purity<number>`` token.
        KeyError: If a CSV is missing any required column.
        TypeError: If an ID column still holds non-str values after casting.
    """
    if not os.path.isdir(input_dir):
        raise FileNotFoundError(f"Input directory not found: {input_dir!r}")
    os.makedirs(output_dir, exist_ok=True)

    # Escape the directory part so characters such as "[" or "*" in the path
    # are matched literally; only the filename part is a wildcard pattern.
    pattern = os.path.join(glob.escape(input_dir), "CliPP2Sim4k_*_obs.csv")
    # glob yields files in arbitrary, filesystem-dependent order; sort so the
    # processing order and the returned list are reproducible.
    csv_paths = sorted(glob.glob(pattern))
    if not csv_paths:
        print(f"[Warning] No files found matching pattern: {pattern!r}")
        return []

    tsv_paths = []
    for csv_path in csv_paths:
        fname = os.path.basename(csv_path)
        # e.g. "CliPP2Sim4k_purity0.3_cna0.2_depth100_K2_M2_6_obs"
        key = os.path.splitext(fname)[0]

        # Parse purity before reading so a badly named file fails fast.
        purity_value = _purity_from_filename(fname)

        df = pd.read_csv(csv_path)

        missing = _REQUIRED_COLS - set(df.columns)
        if missing:
            raise KeyError(f"File {fname!r} is missing columns: {sorted(missing)}")

        # Missing IDs get a placeholder instead of the row being dropped.
        df["mutation_id"] = df["mutation_id"].fillna("UNKNOWN_MUT").astype(str)
        df["sample_id"] = df["sample_id"].fillna("UNKNOWN_SAM").astype(str)
        _require_str_ids(df, fname)

        pyclone_vi_df = pd.DataFrame({
            "mutation_id": "SNV" + df["mutation_id"],
            "sample_id": "R" + df["sample_id"],
            "ref_counts": df["depth"] - df["reads"],
            "alt_counts": df["reads"],
            "normal_cn": 2,
            "major_cn": df["total_true"] - df["minor_est"],
            "minor_cn": df["minor_est"],
            "tumour_content": purity_value,
        })

        out_path = os.path.join(output_dir, f"{key}_py_clone_vi.tsv")
        pyclone_vi_df.to_csv(out_path, sep="\t", index=False)
        print(f"[OK] Wrote: {out_path!r} ({pyclone_vi_df.shape[0]} rows)")

        tsv_paths.append(out_path)

    return tsv_paths


def _run_pyclone_step(cmd, step, subject, hint=""):
    """Run one ``pyclone-vi`` command; return True on success, False on failure.

    A non-zero exit status is reported on stdout and turned into ``False``.
    A missing executable is re-raised as FileNotFoundError with an
    explanatory message, since no later file could succeed either.
    """
    try:
        subprocess.run(cmd, check=True)
    except FileNotFoundError as err:
        raise FileNotFoundError(
            err.errno,
            "PyClone-VI executable not found; is it installed and on PATH?",
            cmd[0],
        ) from err
    except subprocess.CalledProcessError as err:
        print(
            f"[ERROR] ‘pyclone-vi {step}’ failed for {subject!r} "
            f"(exit {err.returncode}).{hint}"
        )
        return False
    return True


def run_pyclone_on_all(tsv_list, working_dir: str,
                      num_chains: int = 40,
                      noise_model: str = "binomial",
                      num_restarts: int = 10):
    """Run PyClone-VI ``fit`` and ``write-results-file`` on every TSV.

    For each path in *tsv_list* the following commands are executed:
      1) pyclone-vi fit -i <tsv> -o <h5> -c num_chains -d noise_model -r num_restarts
      2) pyclone-vi write-results-file -i <h5> -o <results.tsv>
    Outputs are named ``<tsv stem>.h5`` and ``<tsv stem>_results.tsv`` inside
    *working_dir*.  Progress is printed; a file whose fit or write step exits
    non-zero is reported and skipped.

    Args:
        tsv_list: Paths of PyClone-VI input TSVs.
        working_dir: Directory for the ``.h5`` and results files; created if
            it does not exist.
        num_chains: Value passed to ``-c``.
            NOTE(review): PyClone-VI's CLI documents ``-c`` as the number of
            clusters, not chains — confirm the intended meaning.
        noise_model: Value passed to ``-d``.
        num_restarts: Value passed to ``-r``.

    Returns:
        List of results-TSV paths for which both steps succeeded (empty when
        *tsv_list* is empty or every file failed).

    Raises:
        FileNotFoundError: If the ``pyclone-vi`` executable cannot be launched.
    """
    if not tsv_list:
        print("[Info] No TSV files to process. Exiting.")
        return []

    # pyclone-vi writes straight into working_dir, so it must exist up front.
    os.makedirs(working_dir, exist_ok=True)

    written = []
    for tsv_path in tsv_list:
        fname = os.path.basename(tsv_path)        # e.g. "CliPP2Sim4k_…_py_clone_vi.tsv"
        base_name = fname.rsplit(".tsv", 1)[0]    # e.g. "CliPP2Sim4k_…_py_clone_vi"

        h5_out = os.path.join(working_dir, f"{base_name}.h5")
        results_tsv = os.path.join(working_dir, f"{base_name}_results.tsv")

        print(f"\n[INFO] Processing: {fname}")
        print(f"      → Fit output:    {os.path.basename(h5_out)}")
        print(f"      → Results TSV:   {os.path.basename(results_tsv)}\n")

        # --- 1) pyclone-vi fit ---
        fit_cmd = [
            "pyclone-vi", "fit",
            "-i", tsv_path,
            "-o", h5_out,
            "-c", str(num_chains),
            "-d", noise_model,
            "-r", str(num_restarts),
        ]
        if not _run_pyclone_step(fit_cmd, "fit", fname, " Skipping this file."):
            continue
        print(f"[OK] fit → {os.path.basename(h5_out)}")

        # --- 2) pyclone-vi write-results-file ---
        write_cmd = [
            "pyclone-vi", "write-results-file",
            "-i", h5_out,
            "-o", results_tsv,
        ]
        if not _run_pyclone_step(write_cmd, "write-results-file", h5_out):
            continue
        print(f"[OK] write-results-file → {os.path.basename(results_tsv)}")

        written.append(results_tsv)

    print("\n[INFO] All done.")
    return written


if __name__ == "__main__":
    # Locations of the simulated input CSVs and of everything this script
    # generates (PyClone-VI input TSVs, .h5 fits and results tables).
    INPUT_DIR = "/mnt/data/Dropbox/GitHub/Simulation/input"
    OUTPUT_DIR = "/mnt/data/Dropbox/GitHub/Simulation/py_clone_simulations"

    # Stage 1: convert each simulation CSV into a PyClone-VI input TSV.
    tsv_paths = build_pyclone_vi_tsv(INPUT_DIR, OUTPUT_DIR)

    # Stage 2: fit PyClone-VI on every TSV and export its results table,
    # writing next to the TSVs.
    run_pyclone_on_all(
        tsv_paths,
        OUTPUT_DIR,
        num_chains=40,
        noise_model="binomial",
        num_restarts=10,
    )


[Info] No TSV files to process. Exiting.
