# Goal

* Create a general supervisor-worker agent that can use Entrez tools to explore an Entrez record
* Handle large numbers of Entrez records more gracefully

# Init

In [3]:
# import 
import os
import time
import json
import xmltodict
from enum import Enum
from pprint import pprint
from typing import Annotated, List, Dict, Tuple, Optional, Union, Any
import xml.etree.ElementTree as ET
from pydantic import BaseModel, Field
from langchain_core.tools import tool
from Bio import Entrez
import pandas as pd
import threading
from dotenv import load_dotenv

In [4]:
# setup
# Load variables from a local .env file (if present) into the process environment.
load_dotenv()
# Allow pandas to show up to 1000 characters per column value.
pd.set_option('display.max_colwidth', 1000)
# Force debug mode on for this session; esearch reads DEBUG_MODE to cap how many result pages it keeps.
os.environ["DEBUG_MODE"] = "TRUE"

In [5]:
# checks
# Report whether the DEBUG_MODE environment flag is currently "TRUE".
if os.getenv("DEBUG_MODE") == "TRUE":
    print("DEBUG_MODE is enabled.")

DEBUG_MODE is enabled.


In [6]:
# set up Entrez
# Contact address attached to Entrez requests.
Entrez.email = "nick.youngblut@arcinstitute.org"
# The API key comes from the environment; os.getenv yields None when NCBI_API_KEY is unset.
Entrez.api_key = os.getenv("NCBI_API_KEY")
# Set to 10 for API key users
# NOTE(review): request_limiter is not referenced by any tool in this part of the notebook;
# the tools pace themselves with time.sleep instead -- confirm whether it is still needed.
request_limiter = threading.Semaphore(10)

# Tools

In [7]:
def batch_ids(ids, batch_size):
    """
    Lazily split a sequence of IDs into consecutive chunks.

    Args:
        ids: Sliceable sequence of IDs (e.g., a list of strings).
        batch_size: Maximum number of IDs per chunk.
    Yields:
        Slices of `ids`, each holding at most `batch_size` items; the last
        chunk may be shorter.
    """
    total = len(ids)
    yield from (ids[start:start + batch_size] for start in range(0, total, batch_size))

In [8]:
def truncate_values(record, max_length):
    """
    Shorten over-long <Item> text values inside an XML document.

    Args:
        record: XML document as a string.
        max_length: Maximum number of characters kept from an <Item> text value.
    Returns:
        The re-serialized XML string; every descendant <Item> whose text was longer
        than `max_length` is cut to that length and suffixed with "...[truncated]".
    """
    tree_root = ET.fromstring(record)
    # Only descendant <Item> elements are inspected; other tags are left as-is.
    for node in tree_root.iterfind(".//Item"):
        text = node.text
        if not text or len(text) <= max_length:
            continue
        node.text = text[:max_length] + "...[truncated]"
    # Serialize the (possibly modified) tree back to text.
    return ET.tostring(tree_root, encoding="unicode")

#### esearch

In [9]:
@tool
def esearch(
    esearch_query: Annotated[str, "Entrez query string."],
    database: Annotated[str, "Database name (e.g., sra, gds, or pubmed)"],
    )-> Annotated[List[str], "Entrez IDs of database records"]:
    """
    Run an Entrez search query and return the Entrez IDs of the results.
    Example query for single cell RNA-seq:
        `("single cell"[Title] OR "single-cell"[Title] OR "scRNA-seq"[Title])`
    Example query for an ENA accession number (database = sra):
        `ERX13336121`
    Example query for a GEO accession number (database = gds):
        `GSE51372`
    """
    # The docstring above doubles as the tool description shown to the LLM, so it is left untouched.
    #
    # Returns a list with one stringified esearch result per page of `retmax` hits
    # (the RetMax/RetStart keys are removed). If nothing could be retrieved, a plain
    # message string is returned instead of a list.

    # In debug mode only the first pages are collected; None means "no cap".
    # Always binding the name avoids a NameError (previously swallowed by the broad
    # except below) that stopped paging after the first page when debug mode was off.
    max_records = 2 if os.getenv("DEBUG_MODE") == "TRUE" else None

    # query
    records = []
    last_error = None
    retstart = 0
    retmax = 50
    while True:
        search_handle = None
        try:
            search_handle = Entrez.esearch(
                db=database,
                term=esearch_query,
                retstart=retstart,
                retmax=retmax
            )
            search_results = Entrez.read(search_handle)
            # delete unneeded keys
            for key in ("RetMax", "RetStart"):
                if key in search_results:
                    del search_results[key]
            # add to records
            records.append(str(search_results))
            # advance to the next page and pause between requests
            retstart += retmax
            time.sleep(0.33)
            if max_records is not None and len(records) >= max_records:
                break
            if retstart >= int(search_results['Count']):
                break
        except Exception as e:
            print(f"Error searching {database} with query: {esearch_query}: {str(e)}")
            last_error = e
            break
        finally:
            # Close the handle on every path, including failures while parsing.
            if search_handle is not None:
                search_handle.close()

    # return records
    if len(records) == 0:
        if last_error is not None:
            # Surface the failure to the caller instead of reporting it as "no records".
            return f"Error searching {database} with query: {esearch_query}: {last_error}"
        return f"No records found for query: {esearch_query}"
    if max_records is not None:
        records = records[:max_records]  # debug
    return records

# esearch.invoke({"esearch_query" : "GSE51372", "database" : "sra"})
# esearch.invoke({"esearch_query" : "GSE121737", "database" : "gds"})

#### efetch

In [10]:
@tool
def efetch(
    entrez_ids: Annotated[List[str], "List of Entrez IDs"],
    database: Annotated[str, "Database name (e.g., sra, gds, or pubmed)"],
) -> Annotated[str, "eFetch results in XML format"]:
    """
    Run an Entrez efetch query on Entrez IDs to obtain metadata for the records.
    Useful for obtaining metadata for specific records.
    """
    # The docstring above doubles as the tool description shown to the LLM, so it is left untouched.
    #
    # IDs are fetched in batches; each batch response is truncated (long <Item> values),
    # converted from XML to JSON text, and the per-batch results are joined with newlines.
    # Batches that fail are reported via print and skipped.
    batch_size = 200  # Maximum number of IDs per request as per NCBI guidelines
    records = []

    for id_batch in batch_ids(entrez_ids, batch_size):
        time.sleep(0.34)  # Respect the rate limit of 3 requests per second
        id_str = ",".join(id_batch)
        # Reset per batch so the finally clause never touches a handle from a previous batch.
        handle = None
        try:
            # Fetch the records for the current batch of IDs
            handle = Entrez.efetch(db=database, id=id_str, retmode="xml")
            batch_record = handle.read()
        except Entrez.Parser.ValidationError:
            print(f"Failed to fetch record for IDs: {id_str}")
            continue  # Skip this batch and proceed to the next
        except Exception as e:
            print(f"An error occurred: {e}")
            continue
        finally:
            # Close exactly once, and only if the request actually produced a handle.
            if handle is not None:
                handle.close()

        # Decode the record if necessary
        if isinstance(batch_record, bytes):
            try:
                batch_record = batch_record.decode("utf-8")
            except Exception as e:
                print(f"Decoding error: {e}")
                continue

        try:
            # Truncate long values in the record
            batch_record = truncate_values(batch_record, max_length=1000)
            # convert XML to JSON
            batch_record = json.dumps(xmltodict.parse(batch_record), indent=2)
        except Exception as e:
            # Deliberate best effort: a payload that is not well-formed XML would otherwise
            # abort the whole call, so the text obtained so far is kept unconverted.
            print(f"Could not parse efetch response as XML for IDs: {id_str}: {e}")

        # Check for errors in the response
        if "Error occurred: cannot get document summary" in batch_record:
            print(f"Failed to fetch record for IDs: {id_str}. Try a different database.")
            continue

        records.append(batch_record)

    # Combine all records into a single string
    return "\n".join(records)

# records = efetch.invoke({ "entrez_ids" : ["35966237"], "database" : "sra"})
# pprint(records)
# records = efetch.invoke({"entrez_ids" : ["200254051"], "database" : "gds"})
# pprint(records)

#### esummary

In [11]:
@tool
def esummary(
    entrez_ids: Annotated[List[str], "List of Entrez IDs"],
    database: Annotated[str, "Database name (e.g., sra, gds, or pubmed)"],
) -> Annotated[str, "eSummary results in XML format"]:
    """
    Run an Entrez esummary query on Entrez IDs to obtain summary information for the records.
    Useful for obtaining summary information for specific records.
    """
    # The docstring above doubles as the tool description shown to the LLM, so it is left untouched.
    #
    # IDs are summarized in batches; each batch response is truncated, converted from XML
    # to JSON text, and the per-batch results are joined with newlines. Batches that fail
    # or whose response mentions an error are reported via print and skipped.
    batch_size = 200  # Maximum number of IDs per request as per NCBI guidelines
    max_string_length = 500  # Maximum length of a string in the record
    records = []

    for id_batch in batch_ids(entrez_ids, batch_size):
        time.sleep(0.34)  # Respect NCBI's rate limits (no more than 3 requests per second)
        id_str = ",".join(id_batch)

        # Reset per batch so the finally clause never touches a handle from a previous batch.
        handle = None
        try:
            # Fetch summary record for the current batch
            handle = Entrez.esummary(db=database, id=id_str, retmode="xml")
            batch_record = handle.read()
        except Entrez.Parser.ValidationError:
            print(f"Failed to fetch summary for IDs: {id_str}. Check if the IDs exist.")
            continue
        except Exception as e:
            print(f"An error occurred: {e}")
            continue
        finally:
            # Close exactly once, and only if the request actually produced a handle.
            if handle is not None:
                handle.close()

        # Decode the record if necessary
        if isinstance(batch_record, bytes):
            try:
                batch_record = batch_record.decode("utf-8")
            except Exception as e:
                print(f"Decoding error: {e}")
                continue

        # Truncate long values in the record (uses the limit declared above)
        batch_record = truncate_values(batch_record, max_length=max_string_length)

        # convert XML to JSON
        batch_record = json.dumps(xmltodict.parse(batch_record), indent=2)

        # Check for errors in the response.
        # NOTE(review): this is a plain substring match, so a batch whose content merely
        # contains the word "error" is dropped as well -- confirm that is acceptable.
        if "ERROR" in batch_record.upper() or "INVALID_ID" in batch_record.upper():
            print(f"Failed to fetch summary for IDs: {id_str}. Try a different database or verify the IDs.")
            continue

        # Append the batch record to the list of records
        records.append(batch_record)

    # Combine all batch records into a single string
    return "\n".join(records)

# esummary.invoke({"entrez_ids" : ["35966237"], "database" : "sra"})
# esummary.invoke({"entrez_ids" : ["200121737"], "database" : "sra"})
# esummary.invoke({"entrez_ids" : ["6697288"], "database" : "sra"})
# IDs = ["200148729", "100024676", "100020301", "100018573", "305022831", "305022830", "305022829", "305022828", "305022827", "305022826"]
# esummary.invoke({"entrez_ids" : IDs, "database" : "gds"})

#### elink

In [12]:
@tool
def elink(
    entrez_ids: Annotated[List[str], "List of Entrez IDs"],
    source_db: Annotated[str, "Source database (e.g., 'sra')"],
    target_db: Annotated[str, "Target database (e.g., 'bioproject', 'biosample', 'pubmed')"],
) -> Annotated[str, "eLink results in XML format"]:
    """
    Find related entries between Entrez databases, particularly useful for finding
    BioProject, BioSample, or publication records related to SRA entries.
    """
    # The docstring above doubles as the tool description shown to the LLM, so it is left untouched.
    #
    # IDs are linked in batches. Each batch contributes either its response (XML converted
    # to JSON text) or a human-readable failure message; entries are joined with newlines.
    batch_size = 200  # Maximum number of IDs per request as per NCBI guidelines
    records = []

    for id_batch in batch_ids(entrez_ids, batch_size):
        time.sleep(0.34)  # Respect NCBI's rate limits (no more than 3 requests per second)
        id_str = ",".join(id_batch)

        # Reset per batch so the finally clause never touches a handle from a previous batch.
        handle = None
        try:
            handle = Entrez.elink(
                id=id_str,
                dbfrom=source_db,
                db=target_db,
                retmode="xml"
            )
            batch_record = handle.read()
        except Entrez.Parser.ValidationError:
            # Record the failure directly: the message is not XML and must not reach the parser.
            records.append(f"Failed to find links for IDs: {id_str}")
            continue
        except Exception as e:
            records.append(f"An error occurred: {e}")
            continue
        finally:
            # Close exactly once, and only if the request actually produced a handle.
            if handle is not None:
                handle.close()

        # Decode the record if necessary
        if isinstance(batch_record, bytes):
            try:
                batch_record = batch_record.decode("utf-8")
            except Exception as e:
                print(f"Decoding error: {e}")
                continue

        try:
            # Truncate long values in the record
            batch_record = truncate_values(batch_record, max_length=1000)
            # convert XML to JSON
            batch_record = json.dumps(xmltodict.parse(batch_record), indent=2)
        except Exception as e:
            # A response that is not well-formed XML is reported for this batch only.
            records.append(f"Failed to parse elink response for IDs: {id_str}: {e}")
            continue

        # Check for errors in the response
        if "ERROR" in batch_record.upper():
            batch_record = f"Failed to find links for IDs: {id_str}. Verify database names ({source_db}, {target_db}) and Entrez IDs."

        # Append the batch record to the list of records
        records.append(batch_record)

    # Combine all batch records into a single string
    return "\n".join(records)

# elink.invoke({"entrez_ids" : ["35966237", "200254051"], "source_db" : "gds", "target_db" : "pubmed"})
# elink.invoke({"entrez_ids" : ['200121737', '100024679', '303444964', '303444963', '303444962'], "source_db" : "gds", "target_db" : "sra"})
# elink.invoke({"entrez_ids" : ["200148729X"], "source_db" : "gds", "target_db" : "sra"})

#### which entrez database

In [13]:
@tool
def which_entrez_databases(
    entrez_ids: Annotated[List[str], "List of Entrez IDs"],
) -> Annotated[str, "List of databases where each Entrez ID is found."]:
    """
    Check which databases an Entrez ID is found in.
    """
    # The docstring above doubles as the tool description shown to the LLM, so it is left untouched.
    #
    # Each candidate database is probed with esummary; an ID counts as "found" in a database
    # when the parsed response contains a record carrying that Id. Returns one line per input ID.
    databases = ["sra", "gds", "pubmed", "biosample", "bioproject"]
    found_in = {entrez_id: [] for entrez_id in entrez_ids}

    for db in databases:
        for id_batch in batch_ids(entrez_ids, 200):
            time.sleep(0.34)  # Respect the rate limit
            handle = None
            try:
                handle = Entrez.esummary(db=db, id=",".join(id_batch))
                records = Entrez.read(handle)
                # A single record may come back on its own rather than in a list.
                if not isinstance(records, list):
                    records = [records]
                # Extract the IDs that were successfully retrieved
                found_ids = {str(record['Id']) for record in records}
            except Exception:
                # Deliberate best effort: any failure for this database/batch is treated
                # as "not found" rather than aborting the whole lookup.
                continue
            finally:
                # Close the handle even when parsing the response fails.
                if handle is not None:
                    handle.close()

            for found_id in found_ids:
                # Skip IDs that were not requested so one unexpected ID cannot
                # prevent the remaining hits of the batch from being recorded.
                if found_id in found_in:
                    found_in[found_id].append(db)

    # Prepare the output
    output_lines = []
    for entrez_id in entrez_ids:
        if not found_in[entrez_id]:
            output_lines.append(f"Entrez ID {entrez_id} not found in any databases.")
        else:
            output_lines.append(f"Entrez ID {entrez_id} found in: {', '.join(found_in[entrez_id])}.")

    return "\n".join(output_lines)

# Example usage
# which_entrez_databases.invoke({"entrez_ids" : ['200121737', '100024679', '303444964']})

#### fastq-dump

In [14]:
import shutil
import tempfile
from subprocess import Popen, PIPE

In [15]:
def run_cmd(cmd: list) -> Tuple[int, str, str]:
    """
    Execute an external command and wait for it to finish.

    Every element of `cmd` is converted with str() before the process is started,
    so numeric arguments may be passed as-is.

    Args:
        cmd: Program name followed by its arguments.
    Returns:
        tuple: (returncode, output, error), where output and error are the captured
        stdout and stderr exactly as returned by Popen.communicate().
    """
    argv = list(map(str, cmd))
    with Popen(argv, stdout=PIPE, stderr=PIPE) as proc:
        captured_out, captured_err = proc.communicate()
    return proc.returncode, captured_out, captured_err

In [55]:
@tool
def fastq_dump(
    SRR_accessions: Annotated[List[str], "List of SRA run accessions (e.g., SRR1234567)"],
    tries: Annotated[int, "Number of attempts to run fastq-dump"]=3
) -> str:
    """
    Use fastq-dump to download the first few lines from the fastq files of the given SRR accession.
    The tool is useful for quickly checking the fastq files of an SRR accession.
    """
    # The docstring above doubles as the tool description shown to the LLM, so it is left untouched.
    #
    # Returns the concatenated contents of the FASTQ files written by fastq-dump, each
    # preceded by a "#-- File: <name> --#" header, or an explanatory message on failure.

    # check if accession is valid
    if not SRR_accessions:
        return "No SRA accessions provided. Please provide >=1 valid SRR accession number."
    incorrect_accessions = [x for x in SRR_accessions if not x.startswith("SRR")]
    if len(incorrect_accessions) > 0:
        acc_str = ", ".join(incorrect_accessions)
        return f"Invalid SRA accession numbers {acc_str}. Please provide >=1 valid SRR accession number."

    # The context manager removes the temp directory on every exit path,
    # including the early returns below.
    with tempfile.TemporaryDirectory() as temp_dir:
        # create command (run_cmd stringifies the integer argument)
        cmd = ["fastq-dump", "--outdir", temp_dir, "--split-files", "--maxSpotId", 2] + list(SRR_accessions)

        # run command; the defaults cover tries < 1, where the loop body never executes
        return_code, error = 1, b"no attempts were made (tries < 1)"
        for i in range(tries):
            return_code, _, error = run_cmd(cmd)
            if return_code == 0:
                break
            # Back off only when another attempt will follow.
            if i + 1 < tries:
                time.sleep(5 * (i + 1))
        if return_code != 0:
            return f"Error running fastq-dump: {error.decode('utf-8', errors='replace')}"

        # read in the files (sorted so the output order is deterministic)
        files = sorted(os.listdir(temp_dir))
        if len(files) == 0:
            return "No FASTQ files found."
        chunks = []
        for file_name in files:
            with open(os.path.join(temp_dir, file_name), "r") as f:
                chunks.append(f"#-- File: {file_name} --#\n")
                chunks.append(f.read() + "\n")

    return "".join(chunks)
    
# accesssions = ["SRR13112659", "SRR13112660"]
# accessions = ["SRX4967529"]
#print(fastq_dump.invoke({"SRR_accessions" : accessions}))

In [62]:
@tool
def sra_stat(
    accessions: Annotated[List[str], "List of GEO and/or SRA accessions (e.g., SRP359840, SRR1234567, or GSE12345)"],
    tries: Annotated[int, "Number of attempts to run sra-stat"]=3
    ) -> str:
    """
    Run the sra-stat CLI command (SRA Tools) on a GEO or SRA accession.
    Use this tool to get information about all sequence data associated with the accession.
    """
    # The docstring above doubles as the tool description shown to the LLM, so it is left untouched.
    #
    # Returns the sra-stat XML report converted to JSON text, or an explanatory message on failure.

    # check if accession is valid
    incorrect_accessions = [x for x in accessions if not x.startswith(("SRP", "SRX", "SRR", "GSE", "GSM"))]
    if len(incorrect_accessions) > 0:
        acc_str = ", ".join(incorrect_accessions)
        return f"Invalid GEO/SRA accession numbers {acc_str}. Please provide >=1 valid GEO and/or SRA accession."

    # run sra-stat
    cmd = ['sra-stat', '--xml', '--quick'] + list(accessions)

    # run command; the defaults cover tries < 1, where the loop body never executes
    return_code, output, error = 1, b"", b"no attempts were made (tries < 1)"
    for i in range(tries):
        return_code, output, error = run_cmd(cmd)
        if return_code == 0:
            break
        # Back off only when another attempt will follow.
        if i + 1 < tries:
            time.sleep(5 * (i + 1))
    if return_code != 0:
        # The message names the tool that actually failed (it previously said fastq-dump).
        return f"Error running sra-stat: {error.decode('utf-8', errors='replace')}"

    # Decode the record if necessary
    if isinstance(output, bytes):
        try:
            output = output.decode("utf-8")
        except Exception as e:
            return f"Decoding error: {e}"

    # Truncate long values in the record
    try:
        xml_text = truncate_values(output, max_length=1000)
    except ET.ParseError:
        # Presumably one top-level element is emitted per accession, which is not a single
        # well-formed document (TODO confirm); retry with a synthetic enclosing root.
        try:
            xml_text = truncate_values(f"<sra_stat>{output}</sra_stat>", max_length=1000)
        except ET.ParseError:
            # Still not parseable: hand back the raw report rather than raising.
            return str(output)

    # convert XML to JSON
    return str(json.dumps(xmltodict.parse(xml_text), indent=2))

# Example accessions for manually exercising sra_stat; the second assignment replaces the first.
accessions = ["SRP359840", "SRR13112659", "SRR13112660"]
accessions = ["GSE207334"]
# print(sra_stat.invoke({"accessions" : accessions}))

# Models

In [63]:
from langchain_openai import ChatOpenAI

In [64]:
# set model
# model_supervisor drives the top-level agent (and the elink worker); model_worker drives the other workers.
model_supervisor = ChatOpenAI(model="gpt-4o", temperature=0.1)
model_worker = ChatOpenAI(model="gpt-4o-mini", temperature=0)

# Agents

In [65]:
from langgraph.prebuilt import create_react_agent
from langchain_core.messages import BaseMessage, HumanMessage, AIMessage, ToolMessage

## esearch

In [66]:
# Worker agent limited to the esearch tool; runs on model_worker.
esearch_agent = create_react_agent(
    model=model_worker,
    tools=[esearch],
    # Instructions for the worker: one sentence per list entry, joined with newlines.
    state_modifier="\n".join([
        "You are an expert in bioinformatics and you are working on a project to find information about a specific dataset.",
        "Based on the task provided by your supervisor, use Entrez esearch to help complete the task.",
        "Provide a concise summary of your findings; use lists when possible; do not include helpful wording.",
    ])
)

# inputs = {"messages": [("user", "Investigate GSE121737")]}
# esearch_agent.invoke(inputs)

In [67]:
@tool
def invoke_esearch_worker(
    message: Annotated[str, "Message to the worker"],
) -> Annotated[str, "Response from the worker"]:
    """
    Invoke the esearch worker to perform a task.
    """
    # The docstring above doubles as the tool description shown to the LLM, so it is left untouched.
    result = esearch_agent.invoke({"messages": [("user", message)]})
    # Return only the text of the worker's final message, as the annotation promises.
    # Wrapping it in {"messages": [HumanMessage(...)]} made the tool output the
    # repr of that dict instead of the plain answer.
    return result["messages"][-1].content

#invoke_esearch_worker.invoke({"message" : "Investigate GSE121737"})

## esummary

In [68]:
# Worker agent with the esummary and which_entrez_databases tools; runs on model_worker.
esummary_agent = create_react_agent(
    model=model_worker,
    tools=[esummary, which_entrez_databases],
    # Instructions for the worker: one sentence per list entry, joined with newlines.
    state_modifier="\n".join([
        "You are an expert in bioinformatics and you are working on a project to find information about a specific dataset.",
        "Based on the task provided by your supervisor, use Entrez esummary to help complete the task.",
        "You can use which_entrez_databases to determine which databases to use for esummary queries.",
        "Provide a concise summary of your findings; use lists when possible; do not include helpful wording.",
    ])
)

In [69]:
@tool
def invoke_esummary_worker(
    message: Annotated[str, "Message to the worker. Be sure to provide Entrez IDs."],
) -> Annotated[str, "Response from the worker"]:
    """
    Invoke the esummary worker to perform a task.
    """
    # The docstring above doubles as the tool description shown to the LLM, so it is left untouched.
    result = esummary_agent.invoke({"messages": [("user", message)]})
    # Return only the text of the worker's final message, as the annotation promises.
    # Wrapping it in {"messages": [HumanMessage(...)]} made the tool output the
    # repr of that dict instead of the plain answer.
    return result["messages"][-1].content

# invoke_esummary_worker.invoke({"message" : "Investigate Entrez ID 35966237"})

## efetch

In [70]:
# Worker agent with the efetch and which_entrez_databases tools; runs on model_worker.
efetch_agent = create_react_agent(
    model=model_worker,
    tools=[efetch, which_entrez_databases],
    # Instructions for the worker: one sentence per list entry, joined with newlines.
    state_modifier="\n".join([
        "You are an expert in bioinformatics and you are working on a project to find information about a specific dataset.",
        "Based on the task provided by your supervisor, use Entrez efetch to help complete the task.",
        "You can use which_entrez_databases to determine which databases to use for efetch queries.",
        "Provide a concise summary of your findings; use lists when possible; do not include helpful wording.",
    ])
)

In [71]:
@tool
def invoke_efetch_worker(
    message: Annotated[str, "Message to the worker. Be sure to provide Entrez IDs."],
) -> Annotated[str, "Response from the worker"]:
    """
    Invoke the efetch worker to perform a task.
    """
    # The docstring above doubles as the tool description shown to the LLM, so it is left untouched.
    result = efetch_agent.invoke({"messages": [("user", message)]})
    # Return only the text of the worker's final message, as the annotation promises.
    # Wrapping it in {"messages": [HumanMessage(...)]} made the tool output the
    # repr of that dict instead of the plain answer.
    return result["messages"][-1].content

## elink

In [72]:
# Worker agent with the elink and which_entrez_databases tools.
# NOTE(review): unlike the other workers this one runs on model_supervisor -- confirm that is intentional.
elink_agent = create_react_agent(
    model=model_supervisor,
    tools=[elink, which_entrez_databases],
    # Instructions for the worker: one sentence per list entry, joined with newlines.
    state_modifier="\n".join([
        "You are an expert in bioinformatics and you are working on a project to find information about a specific dataset.",
        "Based on the task provided by your supervisor, use Entrez elink to help complete the task.",
        "elink is useful for finding related entries between Entrez databases.",
        "Generally, you will want to use the which_entrez_databases tool to determine which databases to use for elink queries.",
        "Note that elink results are composed of Entrez IDs and not accessions (e.g., SRA accessions).",
        "Provide a concise summary of your findings; use lists when possible; do not include helpful wording.",
    ])
)

In [73]:
@tool
def invoke_elink_worker(
    message: Annotated[str, "Message to the worker. Be sure to provide Entrez IDs."],
) -> Annotated[str, "Response from the worker"]:
    """
    Invoke the elink worker to perform a task.
    """
    # The docstring doubles as the tool description shown to the LLM; it previously named
    # the efetch worker, making this tool indistinguishable from invoke_efetch_worker.
    result = elink_agent.invoke({"messages": [("user", message)]})
    # Return only the text of the worker's final message, as the annotation promises.
    # Wrapping it in {"messages": [HumanMessage(...)]} made the tool output the
    # repr of that dict instead of the plain answer.
    return result["messages"][-1].content

## Sequence

In [79]:
# Worker agent with the sra_stat and fastq_dump tools; runs on model_worker.
fastq_agent = create_react_agent(
    model=model_worker,
    tools=[sra_stat, fastq_dump],
    # Instructions for the worker: one sentence per list entry, joined with newlines.
    state_modifier="\n".join([
        "You are an expert in bioinformatics and you are working on a project to find information about a specific dataset.",
        "Based on the task provided by your supervisor, use sra-stat and fastq-dump to help complete the task.",
        "You can investige the sequence data (fastq files) associated with GEO and/or SRA accessions.",
        "sra-stat provides information about the sequence data associated with GEO and/or SRA accessions.",
        "fastq-dump is useful for quickly checking the fastq files of SRR accessions.",
        "If you are provided with Entrez IDs instead of GEO/SRA accessions, just state that you require GEO and/or SRA accessions.",
        "Provide a concise summary of your findings; use lists when possible; do not include helpful wording.",
    ])
)

In [86]:
@tool
def invoke_fastq_worker(
    message: Annotated[str, "Message to the worker. Be sure to provide GEO and/or SRA accessions."],
) -> Annotated[str, "Response from the worker"]:
    """
    Invoke the fastq worker to perform a task.
    """
    # The docstring above doubles as the tool description shown to the LLM, so it is left untouched.
    result = fastq_agent.invoke({"messages": [("user", message)]})
    # Return only the text of the worker's final message, as the annotation promises.
    # Wrapping it in {"messages": [HumanMessage(...)]} made the tool output the
    # repr of that dict instead of the plain answer.
    return result["messages"][-1].content

# invoke_fastq_worker.invoke({"message" : "Is SRR13112659 Illumina paired-end 10X Genomics data?"})

## Supervisor

In [81]:
from langchain import PromptTemplate
def create_step_summary_chain(model: str="gpt-4o-mini", max_tokens: int=35):
    """
    Build a prompt -> LLM pipeline that writes a short plain-text summary of one workflow step.

    Args:
        model: Name of the OpenAI chat model used for summarizing.
        max_tokens: Token budget; it is written into the prompt text when the chain is
            built and is also passed to the model as its max_tokens setting.
    Returns:
        The prompt piped into the chat model; invoke it with a dict holding a "step" entry.
    """
    # Prompt lines; only "{step}" remains a template placeholder, since the token
    # budget is interpolated right here.
    instruction_lines = (
        "Concisely summarize the provided step in the langgraph workflow.",
        f"The summary must be {max_tokens} tokens or less.",
        "Do not use introductory words such as \"The workflow step involves\"",
        "Write your output as plain text instead of markdown.",
        "#-- The workflow step --#",
        "{step}",
    )
    step_prompt = PromptTemplate(
        template="\n".join(instruction_lines),
        input_variables=["step"],
    )

    # Summarizing model, capped at the same token budget
    summarizer = ChatOpenAI(model_name=model, temperature=0, max_tokens=max_tokens)

    # Compose prompt and model into a single runnable
    return step_prompt | summarizer

# Sample stream step (a tool message from invoke_esummary_worker) kept for manually trying the summary chain.
msg = {'tools': {'messages': [ToolMessage(content="{'messages': [HumanMessage(content='- **Entrez ID: 200121737**\\n  - **SRX Accessions**: Not directly available, but related SRA ID is **SRP167700**\\n  - **GSE Accession**: GSE121737\\n  - **Samples**:\\n    - GSM3444963\\n    - GSM3444962\\n    - GSM3444964\\n\\n- **Entrez ID: 100024679**\\n  - **SRX Accessions**: Not directly available\\n  - **GSE Accession**: GSE132325; GSE151535; GSE206234; GSE240796; GSE192477; GSE206238; GSE166916; GSE121737; GSE184948\\n\\n- **Entrez ID: 303444964**\\n  - **SRX Accessions**: **SRX4967529**\\n  - **GSM Accession**: GSM3444964\\n\\n- **Entrez ID: 303444963**\\n  - **SRX Accessions**: **SRX4967528**\\n  - **GSM Accession**: GSM3444963\\n\\n- **Entrez ID: 303444962**\\n  - **SRX Accessions**: **SRX4967527**\\n  - **GSM Accession**: GSM3444962', additional_kwargs={}, response_metadata={}, name='esummary worker')]}", name='invoke_esummary_worker', id='dac6ce94-900b-4a87-a6f6-e48292ab1a83', tool_call_id='call_vPNZmoRFcXpgwmzxzEFdTMHj')]}}
# Chain built with the default model and token budget; it is passed to invoke_entrez_agent in later cells.
step_summary_chain = create_step_summary_chain()
# NOTE(review): the prompt template only declares "step", so the "max_tokens" key in the example below is presumably ignored.
#step_summary_chain.invoke({"step": msg, "max_tokens": 25}).content

In [83]:
# Supervisor agent: runs on model_supervisor and reaches the Entrez/SRA tooling through the worker-invoking tools.
entrez_agent = create_react_agent(
    model=model_supervisor,
    # One tool per worker agent, plus direct access to which_entrez_databases.
    tools=[
        invoke_esearch_worker, invoke_esummary_worker, invoke_efetch_worker, invoke_elink_worker, 
        which_entrez_databases, invoke_fastq_worker
    ],
    # Supervisor instructions: role, general strategy, output style, then reference notes
    # (accession prefixes, databases, conversion paths, an example workflow).
    state_modifier="\n".join([
        "You are a helpful senior bioinformatician assisting a researcher with a task involving Entrez databases.",
        "You have a team of workers who can perform specific tasks using Entrez tools.",
        "Provide guidance to the workers to help them complete the task successfully.",
        "\n",
        "Generally, start with eSearch to find Entrez records, then use eFetch to get detailed information.",
        "Use eSummary to obtain summary information on an Entrez record.",
        "Use eLink to navigate between databases to find related records (e.g., GEO to SRA).",
        "Use the fastq worker to investigate the sequence data associated with GEO and/or SRA accessions.",
        "Note: the fastq worker calls sra-stat and fastq-dump, which both require SRA (or GEO) accessions; do not provide Entrez IDs to the fastq worker.",
        "\n",
        "Generally, you will want to specify the database(s) to search (e.g., sra, gds, or pubmed).",
        "If there are dozens of records, batch the IDs and call the worker multiple times to avoid rate limits and token count limits.",
        "Continue sending tasks to your workers until you successfully complete the task.",
        "Be very concise; provide simple lists when possible; do not include unnecessary wording such as \"If you need further assistance\".",
        "Write your output as plain text instead of markdown.",
        "\n",
        "#-- Accession notes --#",
        "SRA accesssion prefixes: SRX, SRP, SRR",
        # NOTE(review): PRJNA is listed under both ENA and BioProject prefixes -- confirm the ENA entry.
        "ENA accession prefixes: ERX, PRJNA, DRX, E-MTAB",
        "GEO accession prefixes: GSE, GSM, GPL",
        "BioProject accession prefixes: PRJNA, PRJEB, PRJDB",
        "BioSample accession prefixes: SAMN, SAME",
        "#-- Database notes --#",
        "Entrez databases: sra, gds, pubmed, biosample, bioproject",
        "#-- Accession conversion workflows --#",
        "GSE -> SRP -> SRX -> SRR",
        "GSE -> GSM -> SRS -> SRX -> SRR",
        "GSM -> SRS -> SRX -> SRR",
        "PRJNA -> SRX -> SRR",
        "SAMN -> SRX -> SRR",
        "ERP -> SRP -> SRX -> SRR",
        "#-- Example workflows --#",
        "# Task: Convert GSE123456 to SRX, SRP, or SRR accessions",
        "  1. esearch of GSE accession to obtain Entrez IDs",
        "  2. esummary of the Entrez IDs to get the SRX accessions"
    ])
)


In [None]:
def invoke_entrez_agent(
    inputs: dict,
    step_summary_chain: Any,
    config: Optional[dict] = None
):
    """
    Invoke the Entrez agent to perform a task.

    Streams the supervisor agent's steps, prints a one-line summary of each step,
    and finally prints the content of the last agent message.

    Args:
        inputs: Agent input, e.g. {"messages": [("user", "<task>")]}.
        step_summary_chain: Runnable invoked with {"step": <step>}; the `.content`
            of its result is printed as the step summary.
        config: Run configuration passed to the agent's stream call. Defaults to
            {"max_concurrency": 8, "recursion_limit": 50}.
    Returns:
        None; all results are printed.
    """
    # Build the default per call instead of sharing one mutable dict across calls.
    if config is None:
        config = {"max_concurrency" : 8, "recursion_limit": 50}

    final_step = None
    for i,step in enumerate(entrez_agent.stream(inputs, config=config)):
        final_step = step
        msg = step_summary_chain.invoke({"step": step})
        print(f"Step {i+1}: {msg.content}")
    try:
        print(final_step["agent"]["messages"][-1].content)
    except (KeyError, IndexError, TypeError, AttributeError):
        # The stream was empty or did not end on an "agent" step; say so
        # rather than silently hiding every possible error.
        print("No final agent response was produced.")

# Example task: map a GEO series accession to SRX accessions.
# NOTE(review): the output recorded beneath this cell reports spots/bases for SRR13112659,
# which answers a different question than the one asked here -- the saved output looks stale; re-run to confirm.
inputs = {"messages": [("user", "Convert GSE121737 to SRX accessions")]}
invoke_entrez_agent(inputs, step_summary_chain)

Step 1: Requesting the number of spots and bases for the sample SRR13112659 using the invoke_fastq_worker function.
Step 2: Total spots and bases are summarized, with a detailed breakdown of member contributions in terms of spots and bases.
Step 3: Total spots for SRR13112659: 7,676,492; total bases: 898,149,564. Breakdown includes four members with specific spots and bases
The number of spots and bases for SRR13112659 are as follows:

- Total Spots: 7,676,492
- Total Bases: 898,149,564

### Member Breakdown:
1. **GTCTCTCG**
   - Spots: 2,026,746
   - Bases: 237,129,282
2. **AATCTCTC**
   - Spots: 2,041,326
   - Bases: 238,835,142
3. **CGGAGGGA**
   - Spots: 1,930,369
   - Bases: 225,853,173
4. **TCAGAAAT**
   - Spots: 1,678,051
   - Bases: 196,331,967


In [None]:
# Example task: sequence statistics (spots and bases) for a single run accession.
inputs = {"messages": [("user", "What are the number of spots and bases for SRR13112659?")]}
invoke_entrez_agent(inputs, step_summary_chain)

In [148]:
# Example task: find the SRP accession(s) associated with a GEO series.
inputs = {"messages": [("user", "Obtain any available SRP accessions for GSE148729")]}
invoke_entrez_agent(inputs, step_summary_chain)

Step 1: Search for GSE148729 in the GDS database to retrieve Entrez IDs using the invoke_esearch_worker function.
Step 2: Entrez IDs for GSE148729 in the GDS database are listed, ranging from 200148729 to 304477929.
Step 3: Summarizes multiple Entrez IDs using the 'invoke_esummary_worker' function to retrieve relevant information.
Step 4: Gene expression profiling data for SARS-CoV-1/2 infections in human cell lines, detailing samples, platforms, and related resources.
Step 5: Identified SRP accession for GSE148729 as SRP256479, providing relevant information in response to the query.
The SRP accession related to GSE148729 is **SRP256479**.


In [152]:
# Example task: find publications linked to a GEO series.
inputs = {"messages": [("user", "Obtain any available publications for GSE196830")]}
invoke_entrez_agent(inputs, step_summary_chain)

Step 1: Search for GSE196830 in the GDS database to retrieve Entrez IDs using the invoke_esearch_worker function.
Step 2: Entrez IDs for GSE196830 in the GDS database were retrieved, listing 100 unique identifiers.
Step 3: Link multiple Entrez IDs from the GDS database to the PubMed database to retrieve related publications.
Failed to find links for IDs: 305902681. Verify database names (gds, pubmed) and Entrez IDs.
Failed to find links for IDs: 100028939. Verify database names (gds, pubmed) and Entrez IDs.
Step 4: Related PubMed IDs were found for one Entrez ID, while no publications were identified for four other Entrez IDs from the GDS database.
Step 5: Identified publications related to GSE196830, providing PubMed IDs for one Entrez ID and noting no publications for others.
Publications related to GSE196830:

- PubMed IDs for Entrez ID 200196830:
  - 38622708
  - 36823676
  - 35389779

No related publications were found for the other Entrez IDs.


In [155]:
# Example task: go from a PubMed ID back to the related GEO accessions.
inputs = {"messages": [("user", "Obtain any available GEO accessions for the pubmed ID 38622708")]}
invoke_entrez_agent(inputs, step_summary_chain)

Step 1: Link PubMed ID 38622708 to the GEO database to retrieve related GEO accessions.
Step 2: Retrieve related GEO accessions for PubMed ID 38622708: 200196830, 200196829, 200196735.
Step 3: Retrieve GEO accessions for specified Entrez IDs using the invoke_esummary_worker function.
Step 4: Error due to exceeding maximum string length in a request, requiring correction of the input content.
Step 5: Retrieve GEO accessions for three specified Entrez IDs using the invoke_esummary_worker function.
Step 6: Summarizes multiple GEO datasets related to single-cell eQTL mapping, detailing sample information, publication dates, and FTP links for data access.
Step 7: Identified GEO accessions related to PubMed ID 38622708, detailing titles, sample counts, and FTP links for three datasets.
The GEO accessions related to PubMed ID 38622708 are:

1. **GSE196830**
   - Title: Single-cell eQTL mapping identifies cell type specific genetic control of autoimmune disease
   - Samples: 1179 total samples