In [1]:
import pandas as pd
import random
from pathlib import Path


def replace_padlock_strings(csv_file, txt_file, output_name="", seed=None):
    """
    Replace the ``"N" * 10`` placeholder in every padlock with a per-gene barcode.

    Each distinct value of the ``gene_name`` column receives one barcode drawn
    at random, without replacement, from the barcode list, so all padlocks of a
    gene share a barcode and no two genes share one. The placeholder is replaced
    by the barcode followed by a literal ``"C"``. A ``barcode`` column and an
    ``RT primer`` column (via ``add_reverse_complement``) are added.

    Args:
        csv_file : str or Path
            CSV with at least ``gene_name`` and ``padlock`` columns.
        txt_file : str or Path
            Text file with one candidate barcode per line (presumably 10
            characters each, matching the placeholder; length is not checked).
        output_name : str, optional
            If non-empty, the result is written to ``f"{output_name}.csv"``.
        seed : int, optional
            Seed for a private random generator, making the assignment
            reproducible. When None (default) the module-level ``random``
            state is used.

    Returns:
        tuple
            ``(df, padlock_dict, padlock_strings)``: the updated DataFrame, the
            original padlocks grouped by gene name, and the barcodes that were
            left unassigned.

    Raises:
        IndexError
            If there are fewer distinct barcodes than gene names.
    """
    # Read the candidate barcodes. Blank lines and surrounding whitespace are
    # dropped so they can never end up inside a padlock; repeated entries are
    # collapsed (order preserved) because removing only the first copy of a
    # duplicate would let two genes receive the same barcode.
    with open(txt_file, "r") as txt:
        stripped_lines = (line.strip() for line in txt.read().splitlines())
        padlock_strings = list(dict.fromkeys(s for s in stripped_lines if s))

    df = pd.read_csv(csv_file)

    # Group the original padlocks by gene name, in order of first appearance.
    padlock_dict = {}
    for gene_name, padlock in zip(df["gene_name"], df["padlock"]):
        padlock_dict.setdefault(gene_name, []).append(padlock)

    # Fail before touching the frame, with the exception type random.choice
    # would raise once the pool ran dry, but with an actionable message.
    if len(padlock_dict) > len(padlock_strings):
        raise IndexError(
            f"Not enough barcodes: {len(padlock_dict)} gene names but only "
            f"{len(padlock_strings)} distinct barcodes in {txt_file}"
        )

    # Draw one barcode per gene and take it out of the pool so it stays unique.
    rng = random if seed is None else random.Random(seed)
    acronym_to_barcode = {}
    for acronym in padlock_dict:
        substituted_string = rng.choice(padlock_strings)
        acronym_to_barcode[acronym] = substituted_string
        padlock_strings.remove(substituted_string)

    df["barcode"] = df["gene_name"].map(acronym_to_barcode)
    # Row-wise substitution on the two columns directly; unlike
    # df.apply(axis=1) this also behaves on an empty frame.
    df["padlock"] = [
        padlock.replace("N" * 10, barcode + "C")
        for padlock, barcode in zip(df["padlock"], df["barcode"])
    ]
    add_reverse_complement(df)

    # Write the updated DataFrame to a new .csv file
    if output_name:
        df.to_csv(f"{output_name}.csv", index=False)

    return df, padlock_dict, padlock_strings


# Watson-Crick partner of each base, with case preserved. Any other symbol
# (e.g. "N") is deliberately absent and passes through unchanged.
_COMPLEMENT = {
    "A": "T", "T": "A", "C": "G", "G": "C",
    "a": "t", "t": "a", "c": "g", "g": "c",
}


def reverse_complement(dna_sequence):
    """
    Return the reverse complement of a DNA sequence.

    The sequence is read back to front and A<->T, C<->G are swapped. Lower-case
    bases are complemented to lower-case bases; characters that are not one of
    A/C/G/T (in either case) are kept as they are.

    Args:
        dna_sequence : str
            Sequence to convert.

    Returns:
        str
            The reverse-complemented sequence, same length as the input.
    """
    return "".join(_COMPLEMENT.get(base, base) for base in dna_sequence[::-1])


def add_reverse_complement(df, primer_length=20):
    """
    Add an "RT primer" column derived from the "padlock" column.

    For every row the first ``primer_length`` characters of the padlock are
    taken and passed through ``reverse_complement``.

    Args:
        df : pandas.DataFrame
            Table with a string "padlock" column. It is modified in place.
        primer_length : int, default 20
            Number of leading padlock characters used for the primer.

    Returns:
        pandas.DataFrame
            The same ``df`` object, with the "RT primer" column set.
    """
    df["RT primer"] = df["padlock"].str[:primer_length].apply(reverse_complement)
    return df


def unscramble_padlock_file(
    scrambled_csv: str | Path,
    group_col: str = "gene_name",
    output_csv: str | Path | None = None,
) -> pd.DataFrame:
    """
    Restore the row-to-padlock relationship in a barcoded file whose rows are
    not ordered by group.

    The values of the "padlock", "barcode" and "RT primer" columns (whichever
    are present) are treated as having been stored group by group, with groups
    in order of first appearance in ``group_col``, rather than row by row.
    Each value is moved back to the row it belongs to, and "RT primer" is then
    recomputed from the restored padlocks.

    Args:
        scrambled_csv : str or Path
            Path to the scrambled CSV.
        group_col : str, default "gene_name"
            Column that was used to group padlocks.
        output_csv : str or Path, optional
            If supplied, the fixed CSV is written here.

    Returns:
        pandas.DataFrame
            The unscrambled table.
    """
    df = pd.read_csv(scrambled_csv)

    # Row positions of every group. Dicts keep insertion order, so iterating
    # this mapping visits the groups in the order they were first met.
    positions: dict = {}
    for idx, key in enumerate(df[group_col]):
        positions.setdefault(key, []).append(idx)

    # target_rows[k] is the row that the k-th stored value belongs to. It is
    # the same for every column, so it is computed once.
    target_rows = [idx for rows in positions.values() for idx in rows]

    scrambled_cols = [c for c in ("padlock", "barcode", "RT primer") if c in df.columns]

    for col in scrambled_cols:
        fixed = [""] * len(df)
        for row_idx, val in zip(target_rows, df[col].tolist()):
            fixed[row_idx] = val
        df[col] = fixed

    # Rebuild the primers from the restored padlocks with the shared helper so
    # the formula lives in one place.
    if "RT primer" in df.columns:
        add_reverse_complement(df)

    if output_csv:
        df.to_csv(output_csv, index=False)

    return df

In [5]:
# Directory holding both the probe wishlist and the barcode list.
PADLOCK_DESIGN_DIR = Path(
    "/nemo/lab/znamenskiyp/home/users/becalia/code/multi_padlock_design"
)

# Barcodes are drawn with the `random` module; seeding it here makes a
# Restart & Run All reproduce the same gene-to-barcode assignment.
BARCODE_SEED = 42
random.seed(BARCODE_SEED)

padlock_df, padlock_dict, barcode_strings = replace_padlock_strings(
    PADLOCK_DESIGN_DIR / "caroline_ecm_wishlist_probes.csv",
    PADLOCK_DESIGN_DIR / "GII_compatible_10bp_barcodes.txt",
    output_name="csm_panel_barcoded",
)