In [10]:
import requests
import xml.etree.ElementTree as ET
import time
from tqdm import tqdm

In [13]:
def get_pubmed_ids(query, batch_size=9999, delay=0.34, api_key=None):
    """
    Retrieve all PubMed IDs for a given query in batches, with progress tracking.

    Args:
        query (str): The search query.
        batch_size (int): Number of IDs to retrieve per batch (max 100,000).
        delay (float): Delay between requests in seconds to respect rate limits.
        api_key (str, optional): NCBI API key for increased rate limits.

    Returns:
        list: A list of PubMed IDs.
    """
    base_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    
    # Step 1: Initial request to get the total count and history info
    initial_params = {
        "db": "pubmed",
        "term": query,
        "retmax": 0,           # We don't need actual IDs yet
        "usehistory": "y",     # Use history to manage large results
        "retmode": "xml"
    }
    
    if api_key:
        initial_params["api_key"] = api_key
    
    print("Sending initial request to determine total number of records...")
    response = requests.get(base_url, params=initial_params)
    if response.status_code != 200:
        raise Exception(f"Initial request failed with status code {response.status_code}")
    
    root = ET.fromstring(response.text)
    total = int(root.findtext(".//Count"))
    webenv = root.findtext(".//WebEnv")
    query_key = root.findtext(".//QueryKey")
    
    print(f"Total records found: {total}")
    
    all_ids = []
    
    # Calculate the number of batches needed
    num_batches = (total // batch_size) + (1 if total % batch_size != 0 else 0)
    print(f"Retrieving IDs in {num_batches} batches of up to {batch_size} IDs each.")
    
    # Initialize tqdm progress bar
    with tqdm(total=num_batches, desc="Processing Batches", unit="batch") as pbar:
        for batch_num in range(num_batches):
            retstart = batch_num * batch_size
            current_batch_size = min(batch_size, total - retstart)
            
            params = {
                "db": "pubmed",
                "query_key": query_key,
                "WebEnv": webenv,
                "retstart": retstart,
                "retmax": current_batch_size,
                "usehistory": "y",
                "retmode": "xml"
            }
            
            if api_key:
                params["api_key"] = api_key
            
            # Debug: Print batch info
            # print(f"Fetching batch {batch_num + 1}/{num_batches} (retstart={retstart}, retmax={current_batch_size})...")
            
            try:
                response = requests.get(base_url, params=params)
                response.raise_for_status()
            except requests.exceptions.RequestException as e:
                print(f"Request failed for batch {batch_num + 1}: {e}. Retrying after delay...")
                time.sleep(delay)
                try:
                    response = requests.get(base_url, params=params)
                    response.raise_for_status()
                except requests.exceptions.RequestException as e:
                    raise Exception(f"Failed to retrieve batch {batch_num + 1} after retry: {e}")
            
            root = ET.fromstring(response.text)
            ids = [id_elem.text for id_elem in root.findall(".//IdList/Id")]
            all_ids.extend(ids)
            
            # Update the progress bar
            pbar.update(1)
            
            # Optional: Print retrieval info (can be commented out if not needed)
            # print(f"Retrieved {len(ids)} IDs from batch {batch_num + 1}. Total IDs collected: {len(all_ids)}")
            
            # Respect NCBI rate limits
            time.sleep(delay)
    
    print(f"All IDs retrieved. Total PubMed IDs collected: {len(all_ids)}")
    return all_ids

In [None]:

# Usage
if __name__ == "__main__":
    query = '(Neoplasms[MH] AND Humans[MH])'
    pubmed_ids = get_pubmed_ids(query=query, batch_size=9999, delay=0.34)


Sending initial request to determine total number of records...
Total records found: 3731171
Retrieving IDs in 38 batches of up to 100000 IDs each.


Processing Batches: 100%|██████████| 38/38 [00:53<00:00,  1.40s/batch]

All IDs retrieved. Total PubMed IDs collected: 9999





In [None]:
    
    # Optionally, save IDs to a file
    with open("pubmed_ids.txt", "w") as f:
        for pid in pubmed_ids:
            f.write(f"{pid}\n")
    
    print("PubMed IDs have been saved to pubmed_ids.txt")
