In [1]:
import jupyter_black

# Activate the jupyter_black extension for this notebook session
# (it reformats code cells with Black when they are executed).
jupyter_black.load()

In [2]:
# Enable IPython's autoreload extension; mode 2 re-imports all modules before
# each cell execution, so edits to the local helper modules are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [3]:
from pyfaidx import Fasta
from pathlib import Path
import re

import uniprot_helper
import ncbi_helper

# Data locations, all derived from one root that is relative to the notebook's
# working directory.
base = Path("../data")
raw = base.joinpath("raw")
file = raw.joinpath("BungarusMulticinctus.fasta")  # input FASTA
ncbi_dir = base.joinpath("ncbi_entries")
uniprot_dir = base.joinpath("uniprot_entries")

In [40]:
# Quick probe: does a header start with one uppercase letter followed by three
# lowercase letters (the shape of the abbreviations used as `taxon_mapper` keys)?
re.match("^[A-Z][a-z]{3}", "Cvir_PDHV02000188_1_35190_37247_mka")

<re.Match object; span=(0, 4), match='Cvir'>

In [256]:
# Open the input FASTA file with pyfaidx.
fasta = Fasta(file)

# Data gatherers from the project helper modules, each given its entry directory.
uniprot_collector = uniprot_helper.UniProtDataGatherer(uniprot_dir=uniprot_dir)
ncbi_collector = ncbi_helper.NcbiDataGatherer(ncbi_dir=ncbi_dir)

# Taxon abbreviation found in the FASTA headers -> species name.
taxon_mapper = dict(
    Bmul="Bungarus multicinctus",
    Cvir="Crotalus viridis",
    Dacu="Deinagkistrodon acutus",
    Hcur="Hydrophis curtus",
    Nnaj="Naja naja",
    # Pbiv="Python bivittatus",
    # Pgut="Pantherophis guttatus",
    # Tele="Thamnophis elegans",
)

# --- patterns ---
# Each pattern is applied to a FASTA header with `re.search`; group 1 holds the ID.
# UniProt accession, delimited by "_"/start and "."/"_"/end:
# https://www.uniprot.org/help/accession_numbers
pattern_uniprot = r"(?:_|^)([OPQ][0-9][A-Z0-9]{3}[0-9]|[A-NR-Z][0-9]([A-Z][A-Z0-9]{2}[0-9]){1,2})(?:\.|_|$)"
# RefSeq accession:
# https://www.ncbi.nlm.nih.gov/books/NBK21091/table/ch18.T.refseq_accession_numbers_and_mole/?report=objectonly
pattern_refseq = r"([CMNPRXW]{2}_\d{6,9})"
# GenBank accession (release notes, section 3.4.6 "ACCESSION Format"):
# https://www.ncbi.nlm.nih.gov/genbank/release/current/
pattern_genbank = r"([A-Z]{1,4}\d{5,8})"

# IDs that the GenBank pattern picks up but for which the API returns no entry;
# headers carrying them are not treated as GenBank hits.
exclusion_lst = [
    "PDHV02000066",
    "PDHV02000188",
    "SOZL01001066",
    "SS00042983",
    "SS00017057",
]

# --- map NCBI entries to UniProt
# Gather the RefSeq and GenBank IDs found in the FASTA headers and hand them to
# the NCBI collector in two batches so it can map them to UniProt accessions.
ncbi_ids, refseq_ids = [], []
for header, _ in fasta.items():
    # A header that already carries a UniProt accession needs no mapping.
    if re.search(pattern_uniprot, header):
        continue

    # RefSeq takes precedence over GenBank.
    refseq_match = re.search(pattern_refseq, header)
    if refseq_match:
        refseq_ids.append(refseq_match.group(1))
        continue

    # GenBank hit, unless it is a scaffold header or an excluded ID.
    genbank_match = re.search(pattern_genbank, header)
    if genbank_match is None or "scaffold" in header:
        continue
    if genbank_match.group(1) in exclusion_lst:
        continue
    ncbi_ids.append(genbank_match.group(0))

ncbi_collector.map_uniprot_acc_ids(
    ncbi_ids=refseq_ids, from_dbs=["RefSeq_Nucleotide", "RefSeq_Protein"]
)
ncbi_collector.map_uniprot_acc_ids(ncbi_ids=ncbi_ids)

# --- create entries
# Build one record per FASTA header. The ID embedded in the header decides
# where the record's data comes from:
#   1. UniProt accession  -> UniProt entry
#   2. RefSeq/GenBank ID  -> UniProt entry mapped from the NCBI ID, falling back
#                            to the NCBI record when no usable UniProt data exists
#   3. anything else      -> the FASTA sequence itself (BLASTp hits); the species
#                            is taken from the taxon abbreviation in the header
entries = []
for header, seq in fasta.items():
    # Remove "-" characters (presumably alignment gaps) from the FASTA sequence.
    seq = str(seq).replace("-", "")

    # Reset all per-record fields. `db` in particular is only assigned in the
    # NCBI branch; without this reset the other branches raise NameError on a
    # fresh kernel or silently reuse the `db` of an earlier record.
    acc_id = db = species = taxon_id = full_seq = mature_peptide = None

    uniprot_match = re.search(pattern_uniprot, header)
    refseq_match = re.search(pattern_refseq, header)
    genbank_match = re.search(pattern_genbank, header)
    # A GenBank hit only counts if it is not excluded and not a scaffold header.
    genbank_usable = (
        (genbank_match is not None)
        and (genbank_match[1] not in exclusion_lst)
        and ("scaffold" not in header)
    )

    # match in UniProt
    if uniprot_match:
        acc_id = uniprot_match[1]
        # NOTE(review): no source-database label is available on this path, so
        # `db` stays None — confirm whether it should be derived from `data`.
        data = uniprot_collector.get_entry(acc_id=acc_id)
        species, taxon_id = uniprot_collector.parse_taxon(rec=data)
        full_seq, mature_peptide = uniprot_collector.parse_seq(rec=data)
    # match in NCBI nuccore or protein DB
    elif refseq_match or genbank_usable:
        # RefSeq ID takes precedence over the GenBank ID.
        ncbi_id = refseq_match[1] if refseq_match else genbank_match[1]
        acc_id, db = ncbi_collector.get_uniprot_acc_id(gb_id=ncbi_id)
        if acc_id is not None:
            data = uniprot_collector.get_entry(acc_id=acc_id)
            species, taxon_id = uniprot_collector.parse_taxon(rec=data)
            full_seq, mature_peptide = uniprot_collector.parse_seq(rec=data)
        # No UniProt mapping, or the UniProt entry gave no sequence: use NCBI.
        if acc_id is None or full_seq is None:
            ncbi_rec = ncbi_collector.get_record(ncbi_id)
            species, taxon_id = ncbi_collector.parse_taxon(rec=ncbi_rec)
            full_seq = ncbi_collector.parse_seq(rec=ncbi_rec)
            mature_peptide = None
    else:
        # BLASTp sequences
        # Take the first header token that is a known taxon abbreviation
        # (header order keeps the choice deterministic).
        taxa_abbs = [part for part in header.split("_") if part in taxon_mapper]
        if not taxa_abbs:
            raise ValueError(
                f"No known taxon abbreviation in FASTA header {header!r};"
                " extend `taxon_mapper`."
            )
        taxa_abb = taxa_abbs[0]
        species = taxon_mapper[taxa_abb]
        full_seq = seq

    entry = dict(
        fasta_id=header,
        acc_id=acc_id,
        db=db,
        full_seq=full_seq,
        seq=mature_peptide,
        species=species,
        taxon_id=taxon_id,
        data_origin="paper_zhang",
    )
    entries.append(entry)

In [257]:
import pandas as pd

# Tabulate the collected entries and discard rows that have neither a mature
# peptide (`seq`) nor a full sequence (`full_seq`).
df = pd.DataFrame(entries).dropna(how="all", subset=["seq", "full_seq"])
df

Unnamed: 0,fasta_id,acc_id,db,full_seq,seq,species,taxon_id,data_origin
2,Cvir_PDHV02000066_1_88429_90401,,,MKTLLLTLMVVAFMYLDSGYTLRCRSCIGLCCDDVKNCAEGQKYCY...,,Crotalus viridis,,paper_zhang
3,Cvir_PDHV02000188_1_35190_37247_mka,,,MKALLFALLLLAFVCEDPVTSLECYVLSDWKIKCFRGEKYCYNVKF...,,Crotalus viridis,,paper_zhang
4,Dacu_scaffold1201_gene_1_mRNA_1_mka,,,MKALLFALLLVAFMCKDPVMSLQCYTCGEFFCNVKLPCSKEEKFCY...,,Deinagkistrodon acutus,,paper_zhang
5,Dacu_scaffold1201_gene_1_mRNA_2_mka,,,MKALLFALLLVAFMCKDPVTSLTCYTCGGSYCEWKVRCMKEEKLCY...,,Deinagkistrodon acutus,,paper_zhang
6,Dacu_scaffold1201_gene_1_mRNA_3_mka,,,MKALLFALLVVAFTCEDTVAGLECLRCYGACKEEICLEDNPVCYTL...,,Deinagkistrodon acutus,,paper_zhang
...,...,...,...,...,...,...,...,...
990,VERAN_GAHJ01000009.1_37_252,R4FKG3,TR,MKTLLLTLVVVTIVCLDLGDSLICYVGYNIPQICPTGEVVCFTKTW...,LICYVGYNIPQICPTGEVVCFTKTWCDAHCGERGKRVELGCAATCP...,Vermicella annulata,1295044,paper_zhang
991,VERAN_GAHJ01000013.1_37_249,R4FIZ2,TR,MKTLLLTLVVVTIVCLDLGYTMTCYNQQSSEDQTTTTCPGGVSSCY...,MTCYNQQSSEDQTTTTCPGGVSSCYKKTWRDHRGTIIERGCGCPDV...,Vermicella annulata,1295044,paper_zhang
992,VIPTR_OL439441.1_1_264,,,MKTLLLILGVVAFVYLDSGYSLTCASCPSVKCMVTPNVQCTEGSNQ...,,Vipera transcaucasiana,235552,paper_zhang
993,WALAE_EU196556.1_1_246,C1IC48,SP,MKTLLLTLVVVTIVCLDLGHTLLCHNQQSSTSPTTTCCSGGESKCY...,LLCHNQQSSTSPTTTCCSGGESKCYKKRWPTHRGTITERGCGCPTV...,Walterinnesia aegyptia,64182,paper_zhang


In [245]:
# Shape of the subset of rows where both `seq` and `full_seq` are missing.
df.loc[df[["seq", "full_seq"]].isna().all(axis=1)].shape

(334, 8)

In [None]:
import pandas as pd
import dset_3FTx

# Locations of the helper files and of the BLAST output directory.
base = Path("../data")
helpers, blast_dir = base.joinpath("helpers"), base.joinpath("blast_out")
taxon_mapper_file = helpers.joinpath("taxon_mapper.json")

# Rebuild the frame from all entries (no rows dropped), then pass it through
# the two dset_3FTx steps: add_taxon_id first, run_blast on its result.
df = dset_3FTx.run_blast(
    dset_3FTx.add_taxon_id(
        df=pd.DataFrame(entries), taxon_mapper_file=taxon_mapper_file
    ),
    blast_dir=blast_dir,
)

In [8]:
# Number of collected entries whose `acc_id` is missing.
pd.DataFrame(entries)["acc_id"].isna().sum()

435

In [9]:
# (rows, columns) of the current `df`.
df.shape

(705, 7)

In [18]:
# For every non-missing `acc_id`, count its comma-separated parts and tally how
# often each count occurs (i.e. check whether any row holds several accessions).
df["acc_id"].dropna().str.split(",").str.len().value_counts()

1    577
Name: acc_id, dtype: int64

In [19]:
# Frequency of each `db` label, with missing values counted as well.
df["db"].value_counts(dropna=False)

NaN     285
TR      236
None    113
SP       71
Name: db, dtype: int64

In [65]:
import idmapping

# Submit the IDs in `missing` to the ID-mapping helper once per source database,
# always targeting UniProtKB.
# NOTE(review): `mapper` is not defined in any cell visible here; on a fresh
# kernel this line raises NameError unless it is defined elsewhere — confirm.
missing = list(mapper.keys())
for from_db in ["EMBL-GenBank-DDBJ", "EMBL-GenBank-DDBJ_CDS"]:
    # get mapped IDs
    job_id = idmapping.submit_id_mapping(
        from_db=from_db, to_db="UniProtKB", ids=missing
    )
    # NOTE(review): `results` is only assigned when the readiness check is
    # truthy, it is overwritten on every iteration, and nothing below uses it.
    if idmapping.check_id_mapping_results_ready(job_id):
        link = idmapping.get_id_mapping_results_link(job_id)
        results = idmapping.get_id_mapping_results_search(link)
    # add mapped ID
    # NOTE(review): the disabled code below refers to names that do not appear
    # in the visible cells (`gi2seqdb`, `df_ritu`). While it stays disabled,
    # `missing` is never narrowed, so both databases are queried with the same
    # full ID list.
#     seqdb2gi = {gb: gi for gi, gb in gi2seqdb.items()}
#     for result in results["results"]:
#         gi_nr = seqdb2gi[result["from"]]
#         acc_id = result["to"]["primaryAccession"]
#         df_ritu.loc[df_ritu["gi_number"] == gi_nr, ["acc_id"]] = acc_id
#     # search missing ones in next DB
#     missing = results["failedIds"].copy()
# print(
#     "No UniProt AccID found for"
#     f" {len(results['failedIds'])} sequence(s)"
# )

Fetched: 275 / 275
Fetched: 53 / 53
