In [155]:
import re
from time import sleep

import requests
import taxoniq

BLAST_URL = "https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi"
FORMAT_TYPE = "JSON2"


def initiate_blast_request(sequences: list[str]) -> tuple[str, int]:
    fasta_data = ""

    for i, seq in enumerate(sequences):
        fasta_data += f">{i}\n{seq}\n"

    print(fasta_data)

    params = {
        "CMD": "Put",
        "PROGRAM": "QuickBLASTP",
        "DATABASE": "nr",
        "QUERY": fasta_data,
        "HITLIST_SIZE": 1,
    }

    response = requests.post(BLAST_URL, data=params)
    if response.status_code != 200:
        raise Exception(f"BLAST submission failed: {response.text}")

    print(response.text)

    request_id = re.search(r"RID = (\S+)", response.text).group(1)
    wait_time_s = int(re.search(r"RTOE = (\d+)", response.text).group(1))

    return request_id, wait_time_s

In [156]:
def wait_until_ready(request_id: str, wait_time_s: int):
    result_params = {
        "CMD": "Get",
        "RID": request_id,
    }

    response = requests.get(BLAST_URL, params=result_params)
    status = re.findall("Status=(.*)", response.text)[0]

    print(status)

    match status:
        case "READY":
            return
        case "WAITING":
            sleep(wait_time_s)
            wait_until_ready(request_id, 5)
        case _:
            raise Exception(f"BLAST submission failed: {status}")

In [157]:
def fetch_results(request_id: str) -> dict:
    result_params = {
        "CMD": "Get",
        "RID": request_id,
        "FORMAT_TYPE": "JSON2_S",
        "ALIGNMENTS": 1,
        "DESCRIPTIONS": 1,
    }

    response = requests.get(BLAST_URL, params=result_params)

    result = response.json()
    return result

In [158]:
def extract_taxon_ids(blast_json: dict) -> list[str | None]:
    taxon_ids = []

    for result in a["BlastOutput2"]:
        try:
            taxon_id = result["report"]["results"]["search"]["hits"][0]["description"][
                0
            ]["taxid"]
        except:
            taxon_id = None

        taxon_ids.append(taxon_id)

    return taxon_ids

In [159]:
def look_up_taxon_id(taxon_id: str) -> tuple[str, list[str]]:
    taxon = taxoniq.Taxon(taxon_id)

    return taxon.scientific_name, [t.scientific_name for t in taxon.ranked_lineage]

In [160]:
sequences = [
    "MAYPLQMGLQDATSPIMEELLHFHDHTLMIVFLISSLVLYIISLMLTTKLTHTSTMDAQEVETVWTILPAIILILIALP",
    "MAHPSQLGFQDAASPIMEELLHFHDHTLMAVFLISTLVLYIITIMMTTKLTNTNLMDAQEIEMVWTIMPAISLIMIALP",
]

request_id, wait_time_s = initiate_blast_request(sequences)

>0
MAYPLQMGLQDATSPIMEELLHFHDHTLMIVFLISSLVLYIISLMLTTKLTHTSTMDAQEVETVWTILPAIILILIALP
>1
MAHPSQLGFQDAASPIMEELLHFHDHTLMAVFLISTLVLYIITIMMTTKLTNTNLMDAQEIEMVWTIMPAISLIMIALP

ï»¿<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">
<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">
<head>
<meta http-equiv="X-UA-Compatible" content="IE=edge,chrome=1"/>
<meta name="jig" content="ncbitoggler ncbiautocomplete"/>
<meta name="ncbitoggler" content="animation:'none'"/>

<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />
<META NAME="keywords" CONTENT="blast2seq,Align two sequences using BLAST (bl2seq)"/>
<meta name="referrer" content="origin-when-cross-origin" />
<meta name="ncbi_app" content="blast" />
<meta name="ncbi_pdid" content="blastsearch" />

<meta name="ncbi_db" content="" />
<meta name="ncbi_program" content="blastn" />
<meta name="ncbi_algorithm" content="" />

<meta name="ncbi_stat" cont

AttributeError: 'NoneType' object has no attribute 'group'

In [161]:
request_id, wait_time_s

('5US85XJH016', 25)

In [162]:
wait_until_ready(request_id, 0)

WAITING
WAITING
WAITING
WAITING
WAITING
WAITING


KeyboardInterrupt: 

In [125]:
blast_json = fetch_results(request_id)

In [126]:
taxon_ids = extract_taxon_ids(blast_json)

In [127]:
taxon_ids

[231225, 231225]

In [129]:
look_up_taxon_id(231225)

('Pseudacris streckeri',
 ['Pseudacris streckeri',
  'Pseudacris',
  'Hylidae',
  'Anura',
  'Amphibia',
  'Chordata',
  'Metazoa',
  'Eukaryota'])

In [140]:
wait_time_s

25

In [143]:
result_params = {
    "CMD": "Get",
    "RID": request_id,
}

response = requests.get(BLAST_URL, params=result_params)

In [144]:
response.text

'<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.0 Transitional//EN" "http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd">\n<html xmlns="http://www.w3.org/1999/xhtml" xml:lang="en" lang="en">\n<head>\n<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />\n<meta name="jig" content="ncbitoggler"/>\n<meta name="ncbitoggler" content="animation:\'none\'"/>\n<title>NCBI Blast:</title>\n<script type="text/javascript" src="/core/jig/1.15.2/js/jig.min.js             "></script>\n<script type="text/javascript">    jQuery.getScript("/core/alerts/alerts.js", function() {\n        galert([\'div#header\', \'body > *:nth-child(1)\'])\n    });</script>\n<meta http-equiv="Pragma" content="no-cache">\n<link rel="stylesheet" type="text/css" href="css/uswds.min.css" media="screen" />\n<link rel="stylesheet"  type="text/css" href="https://www.ncbi.nlm.nih.gov/style-guide/static/nwds/css/nwds.css"/>\n\n<link rel="stylesheet" href="css/headerNew.css?v=1"/>\n<link rel="stylesheet" href="https:

In [146]:
fasta_data = ""

for i, seq in enumerate(sequences):
    fasta_data += f">{i}\n{seq}\n"
print(fasta_data)

>0
MAYPLQMGLQDATSPIMEELLHFHDHTLMIVFLISSLVLYIISLMLTTKLTHTSTMDAQEVETVWTILPAIILILIALP
>1
MAHPSQLGFQDAASPIMEELLHFHDHTLMAVFLISTLVLYIITIMMTTKLTNTNLMDAQEIEMVWTIMPAISLIMIALP

