In [2]:
import jupyter_black

jupyter_black.load()

In [3]:
%load_ext autoreload
%autoreload 2

In [72]:
import importlib
from pathlib import Path
import pandas as pd

import dset_3FTx
import uniprot_helper
import ncbi_helper

# importlib.reload(dset_3FTx)
# importlib.reload(uniprot_helper)

# pd.io.clipboards.to_clipboard(df.to_markdown(), excel=False)

# --- PATHS ---
base = Path("../data")
out_dir = base / "protspace"
raw = base / "raw"
helpers = base / "helpers"

csv_in = raw / "Ivan_3FTx.csv"
fasta_in = raw / "3and6_new-2.fasta"
genomic_fasta = raw / "Translation of 156 sequences.fasta"
zhang_fasta = raw / "BungarusMulticinctus.fasta"
zhang_sp6_fasta = raw / "zhang_sp6_mature_seq.fasta"
zhang_annotation = raw / "zhang_annotation.xlsx"
ritu_csv = raw / "drysdalia.csv"
french_excel = raw / "french_data.xls"
uniprot_uids_files = [raw / "dashev_uniprot.txt", raw / "snake_3FTx_sp.txt"]

ncbi_dir = base / "ncbi_entries"
uniprot_dir = base / "uniprot_entries"
blast_dir = base / "blast_out"
# nuc_dir = base / "gi_number"
taxon_mapper_file = helpers / "taxon_mapper.csv"
# gi2accid_file = helpers / "gi2accid.json"

fasta_out = out_dir / "3FTx.fasta"
csv_out = out_dir / "3FTx.csv"

# --- MAIN ---
uniprot_collector = uniprot_helper.UniProtDataGatherer(uniprot_dir=uniprot_dir)
ncbi_collector = ncbi_helper.NcbiDataGatherer(ncbi_dir=ncbi_dir)

df_original = dset_3FTx.OriginalDset(
    csv_path=csv_in, fasta_path=fasta_in, genomic_fasta_path=genomic_fasta
).df
df_zhang = dset_3FTx.ZhangDset(
    fasta_path=zhang_fasta, mature_fasta_path=zhang_sp6_fasta
).df
df_ritu = dset_3FTx.RituDset(csv_path=ritu_csv).df
df_french = dset_3FTx.FrenchDset(excel_path=french_excel).df
df_uniprot = dset_3FTx.parse_uniprot_ids_file(uniprot_uids_files=uniprot_uids_files)
df = pd.concat(
    [df_original, df_french, df_zhang, df_ritu, df_uniprot], ignore_index=True
)
df = dset_3FTx.map_ids2uniprot(df=df)
# 28 of original mature_seq have missing ends or no UniProt entry
df = dset_3FTx.get_uniprot_metadata(df=df)
df = dset_3FTx.get_ncbi_metadata(df=df)
df = df.dropna(subset="species")
df = dset_3FTx.add_taxon_id(df=df, taxon_mapper_file=taxon_mapper_file)
# TODO: run BLASTp to find UniProt entries
#       ignore entries that do already have an acession number
# df = dset_3FTx.run_blast(
#     df=df, blast_dir=blast_dir, uniprot_collector=uniprot_collector
# )
# df = dset_3FTx.get_uniprot_metadata(df=df)
df = dset_3FTx.remove_low_quality_entries(df=df)
df = dset_3FTx.manual_curation(df=df)
dset_3FTx.save_data(df=df, csv_file=csv_out, fasta_file=fasta_out)

Original - 954 entries: 623 UniProt IDs; 1 RefSeq IDs; 47 GenBank IDs identified.
- 127 full sequences information added by genomic supported alignment


TypeError: ZhangDset.__init__() missing 1 required positional argument: 'annotation_excel_path'

In [61]:
df_annot = pd.read_excel(zhang_annotation, header=1)
df_annot = df_annot.rename(
    columns={
        "Transcript ID": "fasta_id",
        "Toxin family": "major_group",
        "Subfamily": "zhang_subfamily",
        "Toxin type": "zhang_toxinType",
        "Conformation": "zhang_conformation",
    }
)
df_annot = df_annot[df_annot["major_group"] == "3FTX"]
df_annot["fasta_id"] = (
    df_annot["fasta_id"]
    .str.replace(pat=r"B.multicinctus_HiC", repl="Bmul", regex=True)
    .str.replace(".", "_", regex=True)
)
cols2keep = [
    "fasta_id",
    "major_group",
    "zhang_subfamily",
    "zhang_toxinType",
    "zhang_conformation",
]
df_annot = df_annot[cols2keep]
df_annot.loc[
    df_annot["zhang_subfamily"].str.startswith("Uncharacterized"), "zhang_subfamily"
] = None
df_annot = df_annot.replace("unknown", None)

In [71]:
df_annot.head(2)

Unnamed: 0,fasta_id,major_group,zhang_subfamily,zhang_toxinType,zhang_conformation
0,Bmul_scaffold7_G00718_t1,3FTX,Aminergic,cardiotoxin,short chain
1,Bmul_scaffold142_G00001_t1,3FTX,IIalpha,neurotoxin,long chain


In [70]:
df_zhang.join(df_annot.set_index("fasta_id"), on="fasta_id").shape, df_zhang.shape

((970, 12), (970, 8))

In [36]:
"A0A7T7DMY7" in df["uniprot_id"].to_list()

True

In [4]:
df["data_origin"].value_counts()

original          954
paper_zhang       555
paper_ritu         27
snake_3FTx_sp      25
dashev_uniprot     23
Name: data_origin, dtype: int64

DB|ID(uniprot, ncbi, genomic_name, original_fasta)|species name

In [22]:

# df.loc[
#     pd.isna(a) & (df["data_origin"] == "original"), :  # "data_origin"
# ]  # .value_counts()

In [28]:
identifier = (
    df["uniprot_id"]
    .fillna(df["genbank_id"])
    .fillna(df["refseq_id"])
    .fillna(df["genomic_id"])
    .fillna(df["fasta_id"])
)

col_id_vals = df["db"].astype(str) + "|" + identifier + "|" + df["species"]
df.insert(loc=0, column="id", value=col_id_vals)

In [29]:
df.to_csv("../data/protspace/3FTx.csv", index=False)