In [None]:
import requests
from tqdm import tqdm
import time
import xml.etree.ElementTree as ET

In [None]:
def get_total_count(query, timeout=30):
    """
    Retrieve the total number of PubMed IDs matching a query.

    Sends a single ESearch request with ``retmax=0`` so that only the hit
    count is read from the response; no IDs are downloaded.

    Args:
        query: PubMed search term, passed to ESearch as ``term``.
        timeout: Seconds to wait for the server before giving up. Passed
            straight to ``requests.get``.

    Returns:
        The value of the first ``<Count>`` element in the response, as an int.

    Raises:
        Exception: If the HTTP status is not 200, the response carries an
            ``<ERROR>`` element, no ``<Count>`` element is present, or the
            count is not an integer.
        requests.RequestException: On network failures and timeouts.
        xml.etree.ElementTree.ParseError: If the body is not well-formed XML.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "pubmed",
        "term": query,
        "retmax": 0,
        "usehistory": "y",
        "retmode": "xml"
    }

    # Without a timeout a stalled connection would block this call forever.
    response = requests.get(base_url, params=params, timeout=timeout)
    if response.status_code != 200:
        raise Exception(f"Error fetching total count: {response.status_code}")

    root = ET.fromstring(response.text)

    # Surface the server's own error text instead of a generic
    # "Count not found" message when the request was rejected.
    error_text = root.findtext(".//ERROR")
    if error_text is not None:
        raise Exception(f"Error fetching total count: {error_text}")

    count_text = root.findtext(".//Count")
    if count_text is None:
        raise Exception("Error: <Count> tag not found in the response.")

    try:
        return int(count_text)
    except ValueError as err:
        raise Exception(
            f"Error: Unable to convert count '{count_text}' to integer."
        ) from err

In [None]:
def _fetch_id_batch(base_url, params, retries=3, retry_delay=2, timeout=30):
    """Request one ESearch page; return its list of IDs, or None if every attempt failed."""
    retstart = params["retstart"]
    for _ in range(retries):
        try:
            response = requests.get(base_url, params=params, timeout=timeout)
            if response.status_code != 200:
                print(f"Error {response.status_code} fetching batch starting at {retstart}. Retrying...")
            else:
                root = ET.fromstring(response.text)
                # A 200 response can still carry an <ERROR> element; treating
                # it as success would silently yield an empty batch.
                error_text = root.findtext(".//ERROR")
                if error_text is None:
                    return [id_elem.text for id_elem in root.findall(".//IdList/Id")]
                print(f"ESearch error fetching batch starting at {retstart}: {error_text}. Retrying...")
        except (requests.RequestException, ET.ParseError) as e:
            print(f"Exception occurred: {e}. Retrying...")
        time.sleep(retry_delay)
    return None


def fetch_pubmed_ids(query, batch_size=9999, max_retries=3, retry_delay=2, timeout=30):
    """
    Fetch PubMed IDs for a query by paging through ESearch results.

    The total hit count is looked up first, then the result set is requested
    in pages of ``batch_size`` IDs. If a page cannot be fetched after
    ``max_retries`` attempts, paging stops and the IDs collected so far are
    returned (best effort; no exception is raised for a failed page).

    NOTE(review): NCBI documents that ESearch only serves the first 10,000
    PubMed records of a result set. Pages beyond that are expected to come
    back as an <ERROR> response, which is handled here as a failed batch --
    confirm against the current E-utilities documentation.

    Args:
        query: PubMed search term, passed to ESearch as ``term``.
        batch_size: Number of IDs requested per page (``retmax``). Must be > 0.
        max_retries: Attempts per page before giving up on it.
        retry_delay: Seconds to sleep after a failed attempt.
        timeout: Seconds to wait for each HTTP response.

    Returns:
        List of PubMed ID strings, in the order the server returned them.

    Raises:
        ValueError: If ``batch_size`` is not positive.
        Exception: Whatever ``get_total_count`` raises for the initial lookup.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")

    total_count = get_total_count(query)
    print(f"Total PubMed IDs found: {total_count}")

    pubmed_ids = []
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"

    # Ceiling division: a final partial page still needs its own request.
    num_batches = (total_count + batch_size - 1) // batch_size

    for batch in tqdm(range(num_batches), desc="Fetching PubMed IDs"):
        retstart = batch * batch_size
        params = {
            "db": "pubmed",
            "term": query,
            "retstart": retstart,
            "retmax": batch_size,
            "usehistory": "y",
            "retmode": "xml"
        }

        ids = _fetch_id_batch(
            base_url,
            params,
            retries=max_retries,
            retry_delay=retry_delay,
            timeout=timeout,
        )
        if ids is None:
            print(f"Failed to fetch batch starting at {retstart}. Exiting.")
            break
        pubmed_ids.extend(ids)

        # Respect NCBI's rate limits
        time.sleep(0.34)  # Approximately 3 requests per second

    if len(pubmed_ids) < total_count:
        print(f"Warning: retrieved {len(pubmed_ids)} of {total_count} PubMed IDs.")

    return pubmed_ids

In [None]:
def main():
    """
    Run the PubMed ID download for the hard-coded query.

    Fetches the IDs via ``fetch_pubmed_ids``, prints how many were retrieved,
    and returns them so the caller can keep using the list.

    Returns:
        The list of PubMed IDs returned by ``fetch_pubmed_ids``.
    """
    query = '(Neoplasms[MH] AND Humans[MH])'
    batch_size = 9999

    pubmed_ids = fetch_pubmed_ids(query, batch_size)

    print(f"Total PubMed IDs retrieved: {len(pubmed_ids)}")
    return pubmed_ids


if __name__ == "__main__":
    # Bind the result at module scope: code that runs after this reads
    # `pubmed_ids`, which would otherwise exist only as a local inside main().
    pubmed_ids = main()

In [None]:
# Write the collected IDs to disk, one ID per line.
# Requires a module-level `pubmed_ids` iterable to be defined before this runs.
with open("pubmed_ids.txt", "w") as out_file:
    out_file.writelines(f"{pmid}\n" for pmid in pubmed_ids)
print("PubMed IDs have been saved to pubmed_ids.txt")