In [1]:
import jupyter_black

# Activate the jupyter_black extension for this kernel (per its documentation,
# it reformats code cells with black when they are run).
jupyter_black.load()

In [2]:
from pathlib import Path
from pyfaidx import Fasta
import pandas as pd

# --- inputs: previous data set (sequences + annotation table) ---
fasta_path = "../data/protspace_after_dan/3FTx_isoform_yes.fasta"
fasta_full_path = "../data/protspace_after_dan/3FTx_full.fasta"
csv_path = "../data/protspace_after_dan/3FTx.csv"

# --- inputs: manually curated FASTA files ---
bmul_fasta_path = "../data/bmul_manual_ivan/Bungarus_genomic.fasta"
missed_fasta_path = "../data/bmul_manual_ivan/missed_cobra.fasta"
mature_fasta_path = "../data/bmul_manual_ivan/new_mature.fasta"
# written by this notebook: merged, de-duplicated copy of the two files above
new_fasta_path = "../data/bmul_manual_ivan/all_new.fasta"

# --- outputs ---
out_dir = Path("../data/protspace_after_ivan_manual_bmul")
out_fasta = out_dir.joinpath("3FTx.fasta")
out_csv = out_dir.joinpath("3FTx.csv")

In [3]:
# merge seqs together and remove duplicates
# The records of both input files are collected in memory and new_fasta_path is
# written exactly once. Previously the merged file was written, read back
# through pyfaidx (which creates a .fai index next to it) and then overwritten,
# leaving an index on disk that described the earlier file content.
seqs = dict()  # sequence -> headers of all records with that sequence (input order)
for path in [bmul_fasta_path, missed_fasta_path]:
    for header, seq in Fasta(path).items():
        seqs.setdefault(str(seq), []).append(header)

# save without duplicates: one record per unique sequence, named after the
# first header that carried it
with open(new_fasta_path, "w") as handle:
    for seq, headers in seqs.items():
        handle.write(f">{headers[0]}\n")
        handle.write(f"{seq}\n")

In [4]:
# --- create new sequences ---
# Every mature sequence that is not yet listed for its species in the table is
# added as a new entry to both the output FASTA and the output CSV.
df = pd.read_csv(csv_path)

# -- get full seq
# Sanitised header (";" and "=" replaced by "_") -> full-length sequence.
# Built before anything is written so that a mature header without a matching
# full sequence raises KeyError before any output file is touched.
full_seq_map = {}
for header, seq in Fasta(new_fasta_path).items():
    full_seq_map[header.replace(";", "_").replace("=", "_")] = str(seq)

# (species, mature_seq) combinations that already exist in the table
known_pairs = set(zip(df["species"], df["mature_seq"]))

new_seqs = dict()  # uid -> mature sequence
new_rows = []  # one CSV record per new sequence
for header, seq in Fasta(mature_fasta_path).items():
    # headers starting with "ScVE01q" are labelled Naja naja, all others
    # Bungarus multicinctus
    if header.startswith("ScVE01q"):
        species = "Naja naja"
    else:
        species = "Bungarus multicinctus"
    seq = str(seq)
    if (species, seq) in known_pairs:
        continue
    uid = f"None|{header}|{species.replace(' ', '_')}"
    new_seqs[uid] = seq
    # header and species are used directly instead of being parsed back out of
    # the uid, so a "|" inside a header cannot break the record
    new_rows.append(
        {
            "identifier": uid,
            "mature_seq": seq,
            "cysteine_group": "new",
            "full_seq": full_seq_map[header],
            "species": species,
        }
    )

# --- save FASTA & CSV with new seqs
out_dir.mkdir(parents=True, exist_ok=True)  # output folder may not exist yet
with open(out_fasta, "w") as handle:
    for header, seq in Fasta(fasta_path).items():
        handle.write(f">{header}\n")
        handle.write(f"{seq}\n")
    for uid, seq in new_seqs.items():
        handle.write(f">{uid}\n")
        handle.write(f"{seq}\n")

# append all new rows in a single concat instead of once per row
if new_rows:
    df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)
df["seq_start"] = df["full_seq"].str[:3]
df.to_csv(out_csv, index=False)

In [11]:
# Show the merged table; the rows appended above have cysteine_group == "new".
df

Unnamed: 0,identifier,evolutionary_order,major_group,cysteine_group,taxon_numerical,taxon_of_interest,family,genus,species,taxon_id,...,Long chain cysteine 1 (pos 89),Long chain cysteine 2 (pos 93),Basal,Dimeric,Derived,Short-chain,Long-chain,Non-standard,pred_membran,seq_start
0,None|Xetr_1|Xenopus_tropicalis,1.0,Ly-6,Ly-6,1.0,Anura,Pipidae,Xenopus,Xenopus tropicalis,8364.0,...,G,-,True,False,False,False,False,False,Membrane,MAA
1,None|Xetr_2|Xenopus_tropicalis,1.0,Ly-6,Ly-6,1.0,Anura,Pipidae,Xenopus,Xenopus tropicalis,8364.0,...,A,-,True,False,False,False,False,False,Soluble,MQC
2,None|Xetr_3|Xenopus_tropicalis,1.0,Ly-6,Ly-6,1.0,Anura,Pipidae,Xenopus,Xenopus tropicalis,8364.0,...,-,-,False,False,False,False,False,True,Membrane,
3,None|Xetr_5|Xenopus_tropicalis,1.0,Ly-6,Ly-6,1.0,Anura,Pipidae,Xenopus,Xenopus tropicalis,8364.0,...,A,-,True,False,False,False,False,False,Membrane,MAD
4,None|Xetr_6|Xenopus_tropicalis,1.0,Ly-6,Ly-6,1.0,Anura,Pipidae,Xenopus,Xenopus tropicalis,8364.0,...,S,F,True,False,False,False,False,False,Membrane,MSP
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1423,None|B.multicinctus_HiC_scaffold7_G00801.t1|Bu...,,,new,,,,,Bungarus multicinctus,,...,,,,,,,,,,MLS
1424,None|denovo_2_scaffold7|Bungarus_multicinctus,,,new,,,,,Bungarus multicinctus,,...,,,,,,,,,,MKT
1425,None|denovo_scaffold35|Bungarus_multicinctus,,,new,,,,,Bungarus multicinctus,,...,,,,,,,,,,MKT
1426,None|ScVE01q_1072_HRSCAF_1231_-_32|Naja_naja,,,new,,,,,Naja naja,,...,,,,,,,,,,MKT


In [10]:
# FASTA-formatted full-length sequences to look up in the table
new_data = """>B.multicinctus_HiC_scaffold9_G00014.t1
MKTLLLTLVVVTIICLDLGYTEMCNMCVRPYPFMSSCCPEGQDRCYKSYWVNENGKQEAYHGKYPVILERGCVTACTGPGSGSIYNLYTCCPTNRCGSSSTSG
>B.multicinctus_HiC_scaffold27_G00002.t1
MKTLLLTLVVVTIVCLDLGYTIVCHTTATSPISAVTCPPGENLCYRKMWCDAFCSSRGKVVELGCAATCPSKKPYEEVTCCSTDKCNPHPKQRPG
>B.multicinctus_HiC_scaffold35_G00003.t1
MKTLLLTLVVVTIVCLDLGYTIVCHTTATSPISAVTCPPGENLCYRKMWCDAFCSSRGKVVELGCAATCPSKKPYEEVTCCSTDKCNPHPKQRPG
>B.multicinctus_HiC_scaffold52_G00001.t1
MKTLLLTLVVVTIVCLDLGYTRTCLISPSSPPQTCPKGEDICIVKARCDEWCLSRGPLIERGCAATCPEFRSNYRSLLCCTTDNCNH
>B.multicinctus_HiC_scaffold56_G00001.t1
MKTLLLTLVVVTIVCLDLGYTRTCLISPSSPPQTCPKGEDICIVKARCDEWCLSRGPLIERGCAATCPEFRSNYRSLLCCTTDNCNH
>B.multicinctus_HiC_scaffold72_G00001.t1
MKTLLLTLVVVTIVCLDLGYTIVCHTTATSPISAVTCPPGENLCYRKMWCDAFCSSRGKVVELGCAATCPSKKPYEEVTCCSTDKCNPHPKQRPG
>B.multicinctus_HiC_scaffold72_G00002.t1
MKTLLLTLVVVTIVCLDLGYTRTCLISPSSPPQTCPKGEDICIVKARCDEWCLSRGPLIERGCAATCPEFRSNYRSLLCCTTDNCNH
>B.multicinctus_HiC_scaffold142_G00001.t1
MKTLLLTLVVVTIVCLDLGYTIVCHTTATSPISAVTCPPGENLCYRKMWCDVFCSSRGKVVELGCAATCPSKKPYEEVTCCSTDKCNPHPKQRPG
>B.multicinctus_HiC_scaffold142_G00002.t1
MKTLLLTLVVVTIVCLDLGYTIVCHTTATSPISAVTCPPGENLCYRKMWCDAFCSSRGKVVELGCAATCPSKKPYEEVTCCSTDKCNPHPKQRPG"""

# For every pasted sequence, print the identifier(s) of the table rows whose
# full_seq is exactly that sequence.
for line in new_data.split("\n"):
    if line.startswith(">"):
        continue  # header lines carry no sequence
    matches = df.loc[df["full_seq"] == line]
    # Select the single "identifier" column: the former list selection
    # ["identifier", ""] raised KeyError because of the empty column name.
    print(matches["identifier"])

547    SP|Q9PW19|Bungarus_multicinctus
Name: identifier, dtype: object
542    SP|P60615|Bungarus_multicinctus
Name: identifier, dtype: object
542    SP|P60615|Bungarus_multicinctus
Name: identifier, dtype: object
534    None|Bmul_scaffold27_G00003_t1|Bungarus_multic...
Name: identifier, dtype: object
534    None|Bmul_scaffold27_G00003_t1|Bungarus_multic...
Name: identifier, dtype: object
542    SP|P60615|Bungarus_multicinctus
Name: identifier, dtype: object
534    None|Bmul_scaffold27_G00003_t1|Bungarus_multic...
Name: identifier, dtype: object
543    SP|P60616|Bungarus_multicinctus
Name: identifier, dtype: object
542    SP|P60615|Bungarus_multicinctus
Name: identifier, dtype: object


## Add membrane predictions

In [7]:
# Folder with the PPIHP files read below: one identifier per line in ids.txt and
# one prediction per line in la_mem_pred.txt.
ppihp_dir = Path("../data/protspace_after_ivan_manual_bmul/PPIHP")
ids_path = ppihp_dir.joinpath("ids.txt")
mem_pred_path = ppihp_dir.joinpath("la_mem_pred.txt")

In [14]:
# Attach the predictions to the table: line i of mem_pred_path is stored for the
# identifier found on line i of ids_path, then the CSV is written again.
# NOTE(review): zip() stops at the shorter file, so a line-count mismatch between
# the two files goes unnoticed — confirm they are always aligned.
# NOTE(review): the table shown earlier already has a "pred_membran" column;
# this writes to a separate "membran_prediction" column — confirm that is intended.
relabel = {"Soluble": "Secreted"}  # secreted not soluble
with open(ids_path, "r") as handle_uid, open(mem_pred_path, "r") as handle_pred:
    for uid, mem_pred in zip(handle_uid, handle_pred):
        # drop the first character of the stripped line (presumably a FASTA ">")
        uid = uid.strip()[1:]
        mem_pred = mem_pred.strip()
        mem_pred = relabel.get(mem_pred, mem_pred)
        is_target = df["identifier"] == uid
        df.loc[is_target, "membran_prediction"] = mem_pred
df.to_csv(out_csv, index=False)