In [None]:
import os
import glob
import pandas as pd
import re

In [17]:
# Collect one "6.ProbesRandom*.csv" file from every "Olfr*" subdirectory.
# Directory that holds the per-gene subdirectories
parent_dir = "/nemo/lab/znamenskiyp/scratch/Olfr_genes"

# Paths of the probe csv files that were found (at most one per subdirectory)
csv_files = []

# os.listdir and glob.glob return entries in arbitrary order; sorting both
# makes the chosen file and the order of csv_files reproducible between runs.
for subdir in sorted(os.listdir(parent_dir)):
    if not subdir.startswith("Olfr"):
        continue
    subpath = os.path.join(parent_dir, subdir)
    if not os.path.isdir(subpath):
        continue
    # Glob once and reuse the result instead of scanning the directory twice
    matches = sorted(glob.glob(os.path.join(subpath, "6.ProbesRandom*.csv")))
    if matches:
        csv_files.append(matches[0])


# Robust parser handling variable delimiter and extra transcript_region columns
# Output column order of the parsed table (before gene_name is appended)
_PROBE_COLUMNS = [
    "acronym",
    "target",
    "Tm",
    "startpos",
    "endpos",
    "padlock",
    "transcript_region",
]
# Number of leading fields that map one-to-one onto columns
_N_FIXED_FIELDS = 6
# Compiled once here instead of on every line of every file
_WHITESPACE_RE = re.compile(r"\s+")
_REGION_SEP_RE = re.compile(r"[\s,]+")
# Brackets become spaces (so they act as separators); quotes are deleted
_REGION_CLEANUP = str.maketrans({"[": " ", "]": " ", "'": None, '"': None})


def _split_probe_line(line):
    """Split a data line on tab, else comma (if >= 6 commas), else whitespace."""
    if "\t" in line:
        return line.split("\t")
    if line.count(",") >= 6:
        return line.split(",")
    return _WHITESPACE_RE.split(line)


def _region_tokens(extra_fields):
    """Merge the fields after the sixth into a flat list of region tokens."""
    joined = " ".join(field for field in extra_fields if field != "").strip()
    if not joined:
        return []
    cleaned = joined.translate(_REGION_CLEANUP)
    return [token for token in _REGION_SEP_RE.split(cleaned) if token]


def read_probe_csv(path):
    """Parse one probe file into a DataFrame with a ``gene_name`` column.

    The first line of the file is always skipped. Every following non-blank
    line is split on tabs if it contains one, otherwise on commas if it has
    at least six of them, otherwise on runs of whitespace. The first six
    fields fill the columns ``acronym``, ``target``, ``Tm``, ``startpos``,
    ``endpos`` and ``padlock`` (short lines are padded with empty strings).
    Any remaining fields are joined, stripped of brackets and quotes, and
    split on commas/whitespace into a list stored in ``transcript_region``.

    Rows whose ``acronym`` starts with ``">"`` are removed; the text after
    the ``">"`` of the first such row becomes ``gene_name`` for all remaining
    rows. If there is no such row, ``gene_name`` is a copy of ``acronym``.

    Args:
        path: Path of the file to read.

    Returns:
        A DataFrame with the seven parsed columns plus ``gene_name``. All
        parsed values are strings, except ``transcript_region`` which holds
        lists of strings. The ``gene_name`` column is present even when no
        data rows were found.
    """
    records = []
    with open(path, "r") as fh:
        next(fh, None)  # skip the first line
        for raw in fh:
            raw = raw.rstrip("\r\n")
            if not raw.strip():
                continue
            parts = _split_probe_line(raw)
            if len(parts) < _N_FIXED_FIELDS:
                parts += [""] * (_N_FIXED_FIELDS - len(parts))
            fixed = parts[:_N_FIXED_FIELDS]
            records.append(fixed + [_region_tokens(parts[_N_FIXED_FIELDS:])])

    df_local = pd.DataFrame(records, columns=_PROBE_COLUMNS)

    if df_local.empty:
        # Keep the schema identical to the non-empty case so callers can rely
        # on gene_name being present.
        df_local["gene_name"] = pd.Series(dtype=object)
        return df_local

    header_mask = df_local["acronym"].astype(str).str.startswith(">")
    if header_mask.any():
        # Name comes from the first '>' row; every '>' row is then dropped
        gene_name = df_local.loc[header_mask, "acronym"].iloc[0][1:]
        df_local = df_local.loc[~header_mask].copy()
        df_local["gene_name"] = gene_name
    else:
        # No '>' row in this file: fall back to the acronym itself
        df_local["gene_name"] = df_local["acronym"].astype(str).str.lstrip(">")
    return df_local


# Read every discovered file and stack the per-file tables into one frame
# (each per-file frame already carries gene_name and has its '>' rows removed).
dfs = [read_probe_csv(f) for f in csv_files]
df = (
    pd.concat(dfs, ignore_index=True)
    if dfs
    else pd.DataFrame(
        columns=[
            "acronym",
            "target",
            "Tm",
            "startpos",
            "endpos",
            "padlock",
            "transcript_region",
            "gene_name",
        ]
    )
)

# Drop rows missing required probe fields.
# read_probe_csv pads short rows with "" rather than NaN, so dropna alone
# would keep them; an empty string is therefore treated as missing as well.
required = df[["target", "padlock"]]
df = df.loc[~(required.isna() | required.eq("")).any(axis=1)].copy()


def remove_chars(s, indices):
    """Return ``s`` with the characters at the given positions removed.

    Args:
        s: The input string.
        indices: Iterable of integer positions to delete. Negative values
            count from the end of the string, as in normal indexing.
            Positions outside the string are ignored, and a position listed
            more than once is removed only once.

    Returns:
        A new string containing the characters of ``s`` whose positions are
        not in ``indices``, in their original order.
    """
    length = len(s)
    drop = set()
    for idx in indices:
        # Map valid negative positions onto their non-negative equivalent;
        # anything still out of range simply never matches below.
        if -length <= idx < 0:
            idx += length
        drop.add(idx)
    return "".join(ch for pos, ch in enumerate(s) if pos not in drop)


# Save the combined probe table to disk; the DataFrame index is not written
df.to_csv(path_or_buf="Olfr_monahan_probes.csv", index=False)