In [81]:
import re
import requests
from time import sleep
import taxoniq

BLAST_URL = "https://blast.ncbi.nlm.nih.gov/blast/Blast.cgi"
FORMAT_TYPE = "JSON2"


def initiate_blast_request(sequences: list[str]) -> tuple[str, int]:
    fasta_data = ""

    for i, seq in enumerate(sequences):
        fasta_data += f">{i}\n{seq}\n"

    params = {
        "CMD": "Put",
        "PROGRAM": "blastn&MEGABLAST=on",
        "DATABASE": "core_nt",
        "QUERY": fasta_data,
        "HITLIST_SIZE": 1,
    }

    response = requests.post(BLAST_URL, data=params)
    if response.status_code != 200:
        raise Exception(f"BLAST submission failed: {response.text}")

    request_id = re.search(r"RID = (\S+)", response.text).group(1)
    wait_time_s = int(re.search(r"RTOE = (\d+)", response.text).group(1))

    return request_id, wait_time_s

In [54]:
def wait_until_ready(request_id: str, wait_time_s: int):
    result_params = {
        "CMD": "Get",
        "RID": request_id,
    }

    response = requests.get(BLAST_URL, params=result_params)
    status = re.findall("Status=(.*)", response.text)[0]

    match status:
        case "READY":
            return
        case "WAITING":
            sleep(wait_time_s)
            wait_until_ready(request_id, 5)
        case _:
            raise Exception(f"BLAST submission failed: {status}")

In [55]:
def fetch_results(request_id: str) -> dict:
    result_params = {
        "CMD": "Get",
        "RID": request_id,
        "FORMAT_TYPE": "JSON2_S",
        "ALIGNMENTS": 1,
        "DESCRIPTIONS": 1,
    }

    response = requests.get(BLAST_URL, params=result_params)

    result = response.json()
    return result

In [75]:
def extract_taxon_ids(blast_json: dict) -> list[str | None]:
    taxon_ids = []

    for result in a["BlastOutput2"]:
        try:
            taxon_id = result["report"]["results"]["search"]["hits"][0]["description"][
                0
            ]["taxid"]
        except:
            taxon_id = None

        taxon_ids.append(taxon_id)

    return taxon_ids

In [85]:
def look_up_taxon_id(taxon_id: str) -> tuple[str, list[str]]:
    taxon = taxoniq.Taxon(taxon_id)

    return taxon.scientific_name, [t.scientific_name for t in taxon.ranked_lineage]

In [51]:
sequences = [
    "TCTTGCGGCACTGAAGATTCCCGAGCACTATAAAGGCCTGATATGGAGAGGAATCCAAGAGCTGAACAAAAGCCACGACTATGGAGCCCAACAGCTGATCCGATCCAGCAGTAACGCTTCCACCATTTCCATAGGTGGCTCTGGTGAGCTCCAGCGCCAGCGCGTCATGGAGGCCGTACATTTTCGAGTGCGACACACCATCACCATTCCCAACCGGAGTGGAGCCGATGACTGGGCAGACTTTGGGTTCGATTTACCGGACTGCAAGGCGCGAAAACAGTCTATAAAAGAAGAGTTTGCCGATTCTGATATTAACTAATATGTTGAGAATTGGGAGGAACATTGACCAAAATTACACTCAACCTGATGAAGCCAAAAGTGAATAATTC-----GCAAGCCCGGGTGAGAGTTCTACCAGCGATGGCTTTGCATTGGTCTTACCTGGTGGGTTGCACAGTAGGGTGTTCCAATGTGTCATTGGTGTCAACTCTTCATAACTATATTA",
    "TCTTGCGGCACTGAAGATTCCCGAGCACTATAAAGGCCTGATATGGAGAGGAATCCAAGAGCTGAACAAAAGCCACGACTATGGAGCCCAACAGCTGATCCGATCCAGCAGTAACGCTTCCACCATTTCCATAGGTGGCTCTGGTGAGCTCCAGCGCCAGCGCGTCATGGAGGCCGTACATTTTCGAGTGCGACACACCATCACCATTCCCAACCGGAGTGGAGCCGATGACTGGGCAGACTTTGGGTTCGATTTACCGGACTGCAAGGCGCGAAAACAGTCTATAAAAGAAGAGTTTGCCGAGTCTGATATTAACTAAAATGTTGAGAATTGGGAGGAACATTGACCAAAATTACACTCAACCTGATGAAACCAAAAGTGAATAATTC-----GCAAGCCCGGGTGAGAGTTCTACCAGCGATGGCTTTGCATTGGTCTTACCTGGTGGGTTGCACAGTAGGGTGTTCCAATGTGTCATTGGTGTCAACTCTTCATAACTATATTATTGTCAAACTTTTAATG",
]

request_id, wait_time_s = initiate_blast_request(sequences)

In [52]:
wait_until_ready(request_id, wait_time_s)

In [76]:
blast_json = fetch_results(request_id)

In [77]:
taxon_ids = extract_taxon_ids(blast_json)

In [78]:
taxon_ids

[231225, 231225]

In [87]:
look_up_taxon_id(23125)

KeyError: '23125'