In [1]:
from pathlib import Path
from datetime import datetime

# --- Centralized project paths ---
# Every location is expressed relative to ROOT, which is ".." because the
# notebook is started from inside the notebooks directory.
ROOT = Path("..")

# Top-level project folders.
DATA = ROOT.joinpath("data")
LOGS = ROOT.joinpath("logs")
SCRIPTS = ROOT.joinpath("scripts")
RESULTS = ROOT.joinpath("results")

# Sub-folders of the results directory.
ALIGN_DIR = RESULTS.joinpath("align")
TREE_DIR = RESULTS.joinpath("trees")
FIGURES = RESULTS.joinpath("figures")

In [2]:
import pandas as pd
from pathlib import Path
from Bio import Entrez

# Contact e-mail that Biopython's Entrez client sends along with its requests.
Entrez.email = "oakley@ucsb.edu"

# Column of the input table that lists the accessions; adjust if needed.
accession_col = "rhodopsin_accessions"

# Input CSV and the FASTA file that the downloaded sequences are written to.
csv_path = DATA.joinpath("hasegawa24", "Supporting_Data_3.csv")
output_fasta = DATA.joinpath("hasegawa24", "rhodopsins_from_accessions.fasta")

df = pd.read_csv(csv_path)

def parse_accession(entry):
    """Split one accession entry into ``(accession, clade)``.

    Surrounding whitespace and quote characters are removed first. An entry
    such as ``"BAC88139.1 (XLR)"`` yields ``("BAC88139.1", "XLR")``; an entry
    that does not contain both ``(`` and ``)`` is returned as the accession
    with an empty clade.
    """
    cleaned = entry.strip().strip('"').strip("'")

    # Without a full pair of parentheses there is no clade annotation.
    if "(" not in cleaned or ")" not in cleaned:
        return cleaned, ""

    accession, _, remainder = cleaned.partition("(")
    # Only the text between the first "(" and the next "(" (if any) is
    # considered, and of that only the part before the first ")".
    first_group = remainder.split("(", 1)[0]
    label = first_group.partition(")")[0]
    return accession.strip(), label.strip()

# Split every non-null cell on commas, trim whitespace and quote characters
# from each piece, then flatten the lists to one piece per row.
all_entries = (
    df[accession_col]
    .dropna()
    .map(lambda cell: [piece.strip().strip('"').strip("'") for piece in str(cell).split(",")])
    .explode()
    .dropna()
)

# Empty pieces are skipped; every other piece becomes an (accession, clade) pair.
parsed = list(map(parse_accession, filter(None, all_entries)))

# Download each accession from the Entrez "protein" database and append it to
# the output FASTA. The download is best-effort: an accession that cannot be
# fetched is reported and skipped so one bad ID does not abort the whole run.
with open(output_fasta, "w") as out_handle:
    for acc, clade in parsed:
        try:
            handle = Entrez.efetch(db="protein", id=acc, rettype="fasta", retmode="text")
            try:
                seq = handle.read()
            finally:
                # Always release the connection, even if reading fails.
                handle.close()
        except Exception as err:
            # Include the cause so that network/server problems are not
            # mistaken for accessions that do not exist. Only the fetch is
            # guarded; errors while writing the output file still propagate.
            print(f"Did not find {acc} ({type(err).__name__}: {err})")
            continue

        # Anything that is empty or does not start with a FASTA header is
        # treated as "not found".
        if not (seq.strip() and seq.startswith(">")):
            print(f"Did not find {acc}")
            continue

        # Add clade to FASTA header if present
        lines = seq.splitlines()
        if clade and lines:
            lines[0] = f">{clade} {lines[0][1:]}"
            seq = "\n".join(lines)
        out_handle.write(seq + "\n")

print(f"Sequences written to {output_fasta}")

Did not find SRR6869043_N0001541_6
Did not find SRR6869043_N0010062_2
Did not find SRR6869040_N0001326_5
Did not find SRR6869040_N0001714_12
Sequences written to ../data/hasegawa24/rhodopsins_from_accessions.fasta
