In [6]:
# import 
import os
import re
import time
from pprint import pprint 
from datetime import datetime, timedelta
from typing import Annotated, List, Dict, Tuple, Optional, Union, Any
import xml.etree.ElementTree as ET
from pydantic import BaseModel, Field
from langchain_core.tools import tool
from langchain_core.runnables import RunnableConfig
from Bio import Entrez

In [7]:
# Contact e-mail that Biopython's Entrez module attaches to the NCBI requests
# made by the tools defined below.
Entrez.email = "nick.youngblut@arcinstitute.org"

In [15]:
@tool
def esearch(
    esearch_query: Annotated[str, "Entrez query string."],
    database: Annotated[str, "Database name ('sra' or 'gds')"]="sra",
    #organism: Annotated[str, "Organism name ('human' or 'mouse')"]="human",
    )-> Annotated[str, "Database name followed by a comma-separated list of Entrez IDs."]:
    """
    Run an Entrez search query and return the IDs of the results.

    The query is restricted to records published in the last 7 days and at
    most 20 IDs are returned, formatted as "database: <db>, IDs: <id>, <id>, ...".
    """
    # date range: only records with a publication date in the last 7 days
    end_date = datetime.now()
    start_date = end_date - timedelta(days=7)
    date_range = f"{start_date.strftime('%Y/%m/%d')}:{end_date.strftime('%Y/%m/%d')}[PDAT]"
    esearch_query += f" AND {date_range}"

    max_ids = 20  # DEBUG: hard cap on the number of IDs returned

    # query: page through the results `retmax` records at a time
    ids = []
    retstart = 0
    retmax = 50
    while True:
        try:
            search_handle = Entrez.esearch(
                db=database,
                term=esearch_query,
                retstart=retstart,
                retmax=retmax
            )
            try:
                search_results = Entrez.read(search_handle)
            finally:
                # close the handle even when parsing the response fails
                search_handle.close()
            page_ids = search_results["IdList"]
            ids.extend(page_ids)
            retstart += retmax
            # delay between requests to stay within NCBI's rate limits
            time.sleep(0.5)
            if not page_ids:
                # an empty page means there is nothing left to fetch
                break
            if max_ids and len(ids) >= max_ids:
                break
            if retstart >= int(search_results['Count']):
                break
        except Exception as e:
            # best effort: report the problem and return whatever was collected
            print(f"Error searching {database} with query: {esearch_query}: {str(e)}")
            break
    # create return string
    ids = ", ".join(ids[:max_ids])
    return f"database: {database}, IDs: {ids}"

# Example: search SRA for single-cell RNA-seq records and display the
# string returned by the esearch tool.
query = '("single cell RNA sequencing" OR "single cell RNA-seq")'
IDs = esearch.invoke({"esearch_query" : query, "database" : "sra"})
IDs

'database: sra, IDs: 36004814, 36004694, 36004643, 36004624, 36004101, 36003953, 36003799, 36003617, 36003381, 35966237, 35966236, 35966235, 35966234, 35966233, 35966232, 35960653, 35960652, 35860023, 35812982, 35812981'

In [None]:
@tool
def efetch(
    database: Annotated[str, "Database name ('sra' or 'gds')"],
    dataset_id: Annotated[str, "Entrez ID"],
    )-> Annotated[str, "eFetch results in xml format"]:
    """
    Run an Entrez efetch query and return the results.
    """
    # delay before each request to stay within NCBI's rate limits
    time.sleep(0.5)

    # Fetch dataset record
    handle = Entrez.efetch(db=database, id=dataset_id, retmode="xml")
    try:
        record = handle.read()
    finally:
        # close the handle even when reading the response fails
        handle.close()

    # The handle may yield bytes or text; always hand back text.
    # Undecodable bytes are replaced instead of leaking a b'...' repr.
    if isinstance(record, (bytes, bytearray)):
        record = record.decode("utf-8", errors="replace")

    return str(record)

# Example: fetch the XML record for one of the SRA IDs listed in the search output above.
record = efetch.invoke({"database" : "sra", "dataset_id" : "35966237"})
pprint(record)

('<?xml version="1.0" encoding="UTF-8"  ?>\n'
 '<EXPERIMENT_PACKAGE_SET>\n'
 '<EXPERIMENT_PACKAGE><EXPERIMENT accession="SRX26623215" '
 'alias="220857C_LM_3"><IDENTIFIERS><PRIMARY_ID>SRX26623215</PRIMARY_ID></IDENTIFIERS><TITLE>RNA-Seq '
 'of Homo sapiens: adult female pancreas</TITLE><STUDY_REF '
 'accession="SRP543450"><IDENTIFIERS><PRIMARY_ID>SRP543450</PRIMARY_ID></IDENTIFIERS></STUDY_REF><DESIGN><DESIGN_DESCRIPTION>Single-cell '
 'RNA-seq libraries were prepared using Single Cell 3 Library Gel Bead Kit V2 '
 'following the manufactures introduction. Finally sequencing was performed on '
 'an Illumina Novaseq6000 with a sequencing depth of at least 100,000 reads '
 'per cell and pair end 150bp (PE150).</DESIGN_DESCRIPTION><SAMPLE_DESCRIPTOR '
 'accession="SRS23119663"><IDENTIFIERS><PRIMARY_ID>SRS23119663</PRIMARY_ID></IDENTIFIERS></SAMPLE_DESCRIPTOR><LIBRARY_DESCRIPTOR><LIBRARY_NAME>220857C_LM_3</LIBRARY_NAME><LIBRARY_STRATEGY>RNA-Seq</LIBRARY_STRATEGY><LIBRARY_SOURCE>TRANSCRIPTOM

In [None]:
@tool
def geo2sra(
    dataset_id: Annotated[str, "GEO accession"],
    )-> Annotated[str, "Comma-separated SRA run accessions"]:
    """
    Convert GEO accession to SRA accession.

    Returns the SRA run accessions as a comma-separated string, or
    "No SRA accessions found." when nothing is linked.
    """
    # Follow the GEO (gds) -> SRA links for this record
    handle = Entrez.elink(dbfrom="gds", db="sra", id=dataset_id)
    try:
        links = Entrez.read(handle)
    finally:
        # close the handle even when parsing the response fails
        handle.close()

    sra_ids = []
    if links and links[0].get('LinkSetDb'):
        sra_ids = [link['Id'] for link in links[0]['LinkSetDb'][0]['Link']]

    # Get SRA accessions from IDs
    sra_accessions = []
    for sra_id in sra_ids:
        handle = Entrez.esummary(db="sra", id=sra_id)
        try:
            summary = Entrez.read(handle)
        finally:
            handle.close()
        # Extract SRA accessions from summary
        for entry in summary:
            try:
                run_xml = entry["Runs"]
            except KeyError:
                continue

            # collect every run listed in the entry, not only the first one
            sra_accessions.extend(re.findall(r'Run acc="([A-Z]+\d+)"', run_xml))
        # Add delay to comply with NCBI's guidelines
        time.sleep(0.34)
    if len(sra_accessions) == 0:
        return "No SRA accessions found."

    # return accessions as string
    return ','.join(sra_accessions)

# Example: map a GEO Entrez ID to its SRA run accessions
# (an alternative ID is kept commented out below).
#geo2sra.invoke({"dataset_id" : "200254051"})
geo2sra.invoke({"dataset_id" : "200268899"})

'SRR29263068,SRR29263069'

In [None]:
@tool
def get_pubmed_article(
    pubmed_id: Annotated[str, "PubMed ID"],
    )-> Annotated[str, "PubMed article metadata in xml format"]:
    """
    Get a PubMed article.
    """
    handle = Entrez.efetch(db="pubmed", id=pubmed_id, retmode="xml")
    try:
        record = handle.read()
    finally:
        # close the handle even when reading the response fails
        handle.close()
    # The handle may yield bytes or text; always hand back text.
    # Undecodable bytes are replaced instead of leaking a b'...' repr.
    if isinstance(record, (bytes, bytearray)):
        record = record.decode("utf-8", errors="replace")
    return str(record)

# Example: retrieve the XML metadata of a single PubMed record.
get_pubmed_article.invoke({"pubmed_id" : "39492543"})


'<?xml version="1.0" ?>\n<!DOCTYPE PubmedArticleSet PUBLIC "-//NLM//DTD PubMedArticle, 1st January 2024//EN" "https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_240101.dtd">\n<PubmedArticleSet>\n<PubmedArticle><MedlineCitation Status="Publisher" Owner="NLM"><PMID Version="1">39492543</PMID><DateRevised><Year>2024</Year><Month>11</Month><Day>04</Day></DateRevised><Article PubModel="Print-Electronic"><Journal><ISSN IssnType="Electronic">1096-0007</ISSN><JournalIssue CitedMedium="Internet"><PubDate><Year>2023</Year><Month>Oct</Month><Day>28</Day></PubDate></JournalIssue><Title>Experimental eye research</Title><ISOAbbreviation>Exp Eye Res</ISOAbbreviation></Journal><ArticleTitle>Comprehensive landscape of RNA N6-methyladenosine modification in lens epithelial cells from normal and diabetic cataract.</ArticleTitle><Pagination><StartPage>109702</StartPage><MedlinePgn>109702</MedlinePgn></Pagination><ELocationID EIdType="doi" ValidYN="Y">10.1016/j.exer.2023.109702</ELocationID><ELocationID EIdTyp

In [10]:
from subprocess import Popen, PIPE

def run_cmd(cmd: str) -> tuple:
    """
    Execute a command line through the shell and collect its results.
    Args:
        cmd: Command to run
    Returns:
        tuple: (returncode, output, error), where output and error are the
        raw bytes captured from stdout and stderr
    """
    # Both streams are piped so they can be captured rather than echoed.
    with Popen(cmd, shell=True, stderr=PIPE, stdout=PIPE) as proc:
        captured_out, captured_err = proc.communicate()
    return proc.returncode, captured_out, captured_err

In [14]:
import time

@tool
def run_sra_stat(
    accession: Annotated[str, "SRA accession"],
    tries: Annotated[int, "Number of attempts"]=3
    ) -> str:
    """
    Run sra-stat on an SRA accession.

    Returns the XML written by sra-stat, or a message starting with
    "Failed to run sra-stat:" when every attempt exits with a non-zero status.
    """
    # Local import: only this function needs shlex.
    import shlex

    # The accession is tool input and the command runs through a shell,
    # so quote it to keep it a single, inert argument.
    cmd = f'sra-stat --xml --quick {shlex.quote(accession)}'
    err = ""
    for i in range(tries):
        rc,output,err = run_cmd(cmd)
        if rc == 0:
            return output.decode("utf-8")
        # sleep prior to next attempt (linear back-off); skip the pointless
        # wait once no attempts remain
        if i < tries - 1:
            sleep_time = 10 * (i + 1)
            time.sleep(sleep_time)
    # stderr arrives as bytes; decode so the message is readable text
    if isinstance(err, (bytes, bytearray)):
        err = err.decode("utf-8", errors="replace").strip()
    return f"Failed to run sra-stat: {err}"

#run_sra_stat.invoke({"accession" : "SRR13112659"})
