In [4]:
import pandas as pd
from pathlib import Path

def write_fasta_from_padlocks(padlock_file_path: str, output_dir: str):
    """
    This function reads a CSV file containing padlocks, genes and their target sequences,
    and writes a .fasta file for each unique gene.

    Args:
        padlock_file_path (str): The path to the CSV file containing padlocks, genes and their target sequences.
        output_dir (str): The directory where the .fasta files will be written.
    """
    # Load and format file with padlocks, genes and their target sequences
    padlock_file = Path(padlock_file_path)
    padlocks = pd.read_csv(padlock_file, header=0)
    #padlocks = padlocks.drop(labels="Unnamed: 0", axis="columns")
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)
    
    # Iterate through each gene
    for gene in padlocks.gene_name.unique():
        tmp = padlocks.loc[padlocks['gene_name'] == gene, ["target", "gene_name", "padlock_name"]]
        print(f"Writing .fasta for {gene} padlocks", flush=True)
        output_path = Path(output_dir) / f"{gene}_query.fasta"
        with open(output_path, 'w') as output_file:
            # Write a .fasta file with the padlock name as the identifier for each target
            for padlock in tmp.padlock_name:
                identifier = ">" + padlock + "\n"
                sequence = tmp.loc[tmp["padlock_name"] == padlock, "target"].to_string(index=False) + "\n"
                output_file.write(identifier)
                output_file.write(sequence)
    
    # Write the list of fasta file paths to a .txt file
    fasta_paths = list(output_dir.glob("*.fasta"))
    with open(output_dir / "fasta_list.txt", 'w') as path_file:
        for path in fasta_paths:
            path_file.write(str(path) + "\n")

In [9]:
output_dir = Path("/nemo/lab/znamenskiyp/scratch/tim_genes_queries")    
# Write the list of fasta file paths to a .txt file
fasta_paths = list(output_dir.glob("*.fasta"))
with open(output_dir / "fasta_list.txt", 'w') as path_file:
    for path in fasta_paths:
        path_file.write(str(path) + "\n")

In [8]:
write_fasta_from_padlocks("/nemo/lab/znamenskiyp/scratch/tim_genes/tim_genes_barcoded.csv", "/nemo/lab/znamenskiyp/scratch/tim_genes_queries")

Writing .fasta for Grin1 padlocks


Writing .fasta for Grin2b padlocks
Writing .fasta for Grin2a padlocks
Writing .fasta for Grin2a padlocks
Writing .fasta for Grin2c padlocks
Writing .fasta for Grin2d padlocks
Writing .fasta for Grin3a padlocks
Writing .fasta for Grin3b padlocks
Writing .fasta for Cbln4 padlocks
Writing .fasta for Cdh13 padlocks
Writing .fasta for Npnt padlocks
