In [1]:
import jupyter_black

# Load the jupyter_black extension for this kernel session
# (presumably auto-formats code cells with Black — confirm against the jupyter-black docs).
jupyter_black.load()

In [2]:
from pathlib import Path
from pyfaidx import Fasta
import pandas as pd

# --- existing dataset the new sequences are added to ---
# fasta_path: every record is copied as-is into out_fasta.
# fasta_full_path: NOTE(review) not referenced by any cell visible here — confirm it is still needed.
# csv_path: loaded as the base table; must provide "species" and "mature_seq" columns.
fasta_path = "../data/protspace_after_dan/3FTx_isoform_yes.fasta"
fasta_full_path = "../data/protspace_after_dan/3FTx_full.fasta"
csv_path = "../data/protspace_after_dan/3FTx.csv"

# --- additional sequences ---
# bmul_fasta_path + missed_fasta_path: merged (duplicate sequences dropped) into new_fasta_path.
# new_fasta_path: written by the merge cell, later read to look up the full sequence per header.
# mature_fasta_path: read only; records whose sequence is not yet in the CSV become new entries.
bmul_fasta_path = "../data/bmul_manual_ivan/Bungarus_genomic.fasta"
missed_fasta_path = "../data/bmul_manual_ivan/missed_cobra.fasta"
new_fasta_path = "../data/bmul_manual_ivan/all_new.fasta"
mature_fasta_path = "../data/bmul_manual_ivan/new_mature.fasta"

# --- outputs: combined FASTA and CSV (existing records + new entries) ---
out_dir = Path("../data/protspace_after_ivan_manual_bmul")
out_fasta = out_dir / "3FTx.fasta"
out_csv = out_dir / "3FTx.csv"

In [96]:
# Merge the two input FASTA files into `new_fasta_path`, keeping each distinct
# sequence once under the first header it was seen with.
#
# Deduplication is done in memory so the output file is written exactly once.
# Writing a merged file, indexing it with pyfaidx and then overwriting it would
# leave a `.fai` index describing the old contents next to the new file.
seqs = dict()  # sequence -> headers of all records carrying that sequence
for path in [bmul_fasta_path, missed_fasta_path]:
    with Fasta(path) as records:  # close the underlying file handle when done
        for header, seq in records.items():
            seqs.setdefault(str(seq), []).append(header)

# A header may only be written once, otherwise the merged file cannot be
# indexed later (pyfaidx rejects duplicate record names by default).
seen_headers = set()
for headers in seqs.values():
    if headers[0] in seen_headers:
        raise ValueError(
            f"Header {headers[0]!r} is used for more than one distinct sequence"
        )
    seen_headers.add(headers[0])

# Drop any index left over from an earlier run so it is rebuilt from the new file.
stale_index = Path(f"{new_fasta_path}.fai")
if stale_index.exists():
    stale_index.unlink()

with open(new_fasta_path, "w") as handle:
    for seq, headers in seqs.items():
        handle.write(f">{headers[0]}\n")
        handle.write(f"{seq}\n")

In [3]:
# --- create new sequences ---
# A mature sequence counts as new when the CSV has no row with the same
# species and the same mature sequence.
df = pd.read_csv(csv_path)
new_seqs = dict()  # uid -> mature sequence
new_origin = dict()  # uid -> (source header, species); avoids re-parsing the uid
with Fasta(mature_fasta_path) as mature_records:
    for header, seq in mature_records.items():
        # Headers starting with "ScVE01q" are assigned to Naja naja,
        # every other header to Bungarus multicinctus.
        if header.startswith("ScVE01q"):
            species = "Naja naja"
        else:
            species = "Bungarus multicinctus"
        seq = str(seq)
        already_known = (df["species"] == species) & (df["mature_seq"] == seq)
        if not already_known.any():
            uid = f"None|{header}|{species.replace(' ', '_')}"
            new_seqs[uid] = seq
            new_origin[uid] = (header, species)

# -- get full seq
# Keys are the merged-FASTA headers with ";" and "=" replaced by "_".
full_seq_map = {}
with Fasta(new_fasta_path) as full_records:
    for header, seq in full_records.items():
        header = header.replace(";", "_").replace("=", "_")
        full_seq_map[header] = str(seq)

# Resolve every full sequence before touching the output files, so a missing
# header cannot leave a half-written FASTA behind.
missing = [header for header, _ in new_origin.values() if header not in full_seq_map]
if missing:
    raise KeyError(f"No full sequence in {new_fasta_path} for: {missing}")

new_rows = [
    {
        "identifier": uid,
        "mature_seq": seq,
        "cysteine_group": "new",
        "full_seq": full_seq_map[new_origin[uid][0]],
        "species": new_origin[uid][1],
    }
    for uid, seq in new_seqs.items()
]

# --- save FASTA & CSV with new seqs
out_dir.mkdir(parents=True, exist_ok=True)  # the output directory may not exist yet
with open(out_fasta, "w") as handle:
    # existing records first, then the new ones
    with Fasta(fasta_path) as existing_records:
        for header, seq in existing_records.items():
            handle.write(f">{header}\n")
            handle.write(f"{seq}\n")
    for uid, seq in new_seqs.items():
        handle.write(f">{uid}\n")
        handle.write(f"{seq}\n")

# Append all new rows in one concat instead of growing the frame row by row.
if new_rows:
    df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)
df.to_csv(out_csv, index=False)

In [105]:
# Inspect the header -> full sequence mapping built in the previous cell.
# NOTE(review): this displays the entire dict; a short preview would keep the output small.
full_seq_map

{'B.multicinctus_HiC_scaffold6_G00174.t1': 'MKTLLLTLMVVTIVCLDLGYTRKCLNCPTRYCTTFHTCPDGQDLCFKRFYDGNQLGWRATRGCAATCPEAKPRETVECCATDKCNL',
 'B.multicinctus_HiC_scaffold7_G00718.t1': 'MKSLLLMTLVVVTIMCLDLGYTRKCLLTPLPLFYQTCPVGQNLCFKMTVKVLPFKYDVIRGCASTCPKNTANVVVVCCETDKCNQ',
 'B.multicinctus_HiC_scaffold7_G00738.t1': 'MKTLLLTLVVVTIVCLDLGYTIICRTRDTYQIPITFTNCEEGHVCYKYSTTETPNRILIHRGCAAKCPKRLRVICCSTDKCNK',
 'B.multicinctus_HiC_scaffold7_G00790.t1': 'MKTLLLTLVVVAIVCLDLGYTLTCLICPEKYCHTVHTCRNEEKICVKRFYDNKLLGWKAHRGCAVTCPETKPDETVVCCSTDKCNK',
 'B.multicinctus_HiC_scaffold7_G00795.t1': 'MKVSLATWLIFAASVDLVFSLRCYTCSEPMDVSYCVAVTHCPANTTSCKTTVHSVDSGFPFFGNITVSKSCSKNCVPSEPDTIGDNHPNYCCYTDLCNVGAGQAPTAEFSALSFTIILALSLLWLQG',
 'B.multicinctus_HiC_scaffold7_G00800.t1': 'MPAGMKAPLAILLAACLCVDGVFSLVCWSCENVESNWGCWRTQICPDGFNYCATTYTGAGIGEYSAQSISKGCVSTCPSVGVDIGIAAVSIHCCSSFLCNISGANSIQINHLVLALAMLASFFYLFGSRL',
 'B.multicinctus_HiC_scaffold7_G00801.t1': 'MLSDAGAAAASSSSSGPFSRHRPLTMKPLFAALLAGLLCMQTAMSSLICISCDKVDHNSKCYDLKVCDEDNDRYCYT