# **DOI to PMCID**

---


In [2]:
import torch

print("CUDA Available:", torch.cuda.is_available())
print("Current Device:", torch.cuda.current_device())
print("Device Name:", torch.cuda.get_device_name(torch.cuda.current_device()))

CUDA Available: True
Current Device: 0
Device Name: NVIDIA GeForce GTX 1080 Ti


In [3]:
import time
import requests
import pandas as pd
from concurrent.futures import ThreadPoolExecutor

# Recording execution time
start = time.time()

# PMC API
API_KEY = '..'  # NCBI API key
BASE_URL = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/"

# Load DOIs from CSV (non null and unique ones only)
df = pd.read_csv('metadata.csv', dtype=str)
dois = df['DOI'].dropna().unique()

# Function to fetch PMIDs & PMCIDs for a batch of DOIs
def fetch_ids_batch(doi_batch):
    """ This function fetches PMCIDS and PMIDS using batch
        resquest (doi_batch) to minimize the request number.
        This function recalls itself when the rate limit is
        exceeded, it retries after 10 seconds on the same
        batch that failed.
        The try-catch is to avoid stopping the script when an
        exeption arises.
    """
    params = {
        'format': 'json',
        'ids': ','.join(doi_batch),  # This is to send multiple DOIs in one request
        'api_key': API_KEY
    }

    try:
        response = requests.get(BASE_URL, params=params, timeout=10)
        if response.status_code == 200:
            data = response.json()
            return [(rec.get('doi', 'Not Found'), rec.get('pmid', 'Not Found'), rec.get('pmcid', 'Not Found'))
                    for rec in data.get('records', [])]
        elif response.status_code == 429:
            print("Rate limit exceeded! Retrying after 10 seconds...")
            time.sleep(10)
            return fetch_ids_batch(doi_batch)  # Retry the same batch
    except requests.RequestException as e:
        print(f"Request failed: {e}")

    # Return errors if request fails
    return [(doi, 'Error', 'Error') for doi in doi_batch]


batch_size = 150  # Number of DOIs per request
nb_workers = 5  # Number of parallel requests
results = []

# Split DOIs into batches of 10
doi_batches = [dois[i:i + batch_size] for i in range(0, len(dois), batch_size)]

# Execute requests in parallel using ThreadPoolExecutor with 5 workers
with ThreadPoolExecutor(max_workers = nb_workers) as executor:
    results_list = list(executor.map(fetch_ids_batch, doi_batches)) # This returns a list of lists (batches) of tuples

# Flatten results (since each batch returns a list)
results = [item for sublist in results_list for item in sublist] # This is a list of tuples after flattening the batches lists

# Convert results to DataFrame and save as CSV
result_df = pd.DataFrame(results, columns=['DOI', 'PMID', 'PMCID'])
display(result_df)
result_df.to_csv('doi_to_pmcids.tsv', index=False, sep='\t')

# Record end time
end = time.time()

print(f"Completed in {round(end-start, 2)} seconds.")


Unnamed: 0,DOI,PMID,PMCID
0,10.1038/s41598-023-47358-4,37989862,PMC10663565
1,10.3389/fonc.2023.1215194,37854681,PMC10580988
2,10.1007/s12094-023-03189-3,37067729,PMC10250522
3,10.3390/cells12050810,36899946,PMC10000689
4,10.3390/genes14010013,36672755,PMC9858589
...,...,...,...
4974,10.1016/j.molcel.2009.10.026,Not Found,Not Found
4975,10.1074/jbc.m503188200,Not Found,Not Found
4976,10.1210/en.2010-1481,Not Found,Not Found
4977,10.1074/jbc.m303257200,Not Found,Not Found


Completed in 2.58 seconds.


# **Verify if DOI is in PMC**

---



In [4]:
import requests
import pandas as pd
import time
from concurrent.futures import ThreadPoolExecutor
import time

# record start time
start = time.time()
# API details
API_KEY = None  
BASE_URL = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/"

# Load DOIs from file
dois = pd.read_csv("doi_to_pmcids.tsv", sep="\t")["DOI"].dropna().tolist()
#dois = dois[:300]  # Limit to 300 DOIs for testing

# Function to check if a batch of DOIs exists in PMC
def check_doi_existence(doi_batch):
    params = {
        "format": "json",
        "ids": ",".join(doi_batch),  # Batch of DOIs
        "api_key": API_KEY
    }

    for attempt in range(3):  # Retry up to 3 times
        try:
            response = requests.get(BASE_URL, params=params, timeout=20)
            response.raise_for_status()
            return extract_results(response.json(), doi_batch)
        except requests.exceptions.Timeout:
            print(f"Timeout for batch {doi_batch}. Retrying ({attempt+1}/3)...")
            time.sleep(2 ** attempt)  # Exponential backoff (2s, 4s, 8s)
        except requests.exceptions.RequestException as e:
            print(f"Request failed for batch {doi_batch}: {e}")
            return {doi: -1 for doi in doi_batch}  # Mark all as failed

    return {doi: -1 for doi in doi_batch}  # Failure after all retries

# Extract results from API response & map to DOIs
def extract_results(api_response, doi_batch):
    found_dois = {record["doi"]: 1 for record in api_response.get("records", []) if "pmcid" in record}

    # Map input DOIs to found/not found status
    return {doi: found_dois.get(doi, 0) for doi in doi_batch}

# Process DOIs in parallel using 5 workers
def process_dois(dois, batch_size=150, workers=5):
    df_found = []
    df_not_found = []

    # Split DOIs into batches of `batch_size`
    doi_batches = [dois[i:i + batch_size] for i in range(0, len(dois), batch_size)]

    # Use ThreadPoolExecutor for parallel requests
    with ThreadPoolExecutor(max_workers=workers) as executor:
        results = list(executor.map(check_doi_existence, doi_batches))

    # Flatten results & store in DataFrames
    for batch_result in results:
        for doi, count in batch_result.items():
            if count > 0:
                df_found.append({"DOI": doi, "Found": count})
            else:
                df_not_found.append({"DOI": doi, "Found": count})

    # Convert to DataFrames & Save
    pd.DataFrame(df_found).to_csv('found_dois.csv', index=False)
    pd.DataFrame(df_not_found).to_csv('not_found_dois.csv', index=False)

    print(f"Found: {len(df_found)}, Not Found: {len(df_not_found)}")
    return df_found, df_not_found
# Run the main function
if __name__ == "__main__":
    f, nf = process_dois(dois)

end = time.time()

print("Execution time :",
      (end-start) , "s")


Found: 3270, Not Found: 1709
Execution time : 2.4146792888641357 s


# **PMCID xml files download**

---


In [5]:
import os
import shutil
import pandas as pd
import requests
import time
import concurrent.futures

# record start time
start = time.time()

TSV_FILE = "doi_to_pmcids.tsv"  #   Path to your TSV file
EXTRACT_PATH = "xml"  #   Folder to save XML files
BATCH_SIZE = 150  #   Max PMCIDs per batch
MAX_WORKERS = 15  #   Number of parallel batch requests


# Check if directory exists Ensure XML directory exists
if os.path.exists(EXTRACT_PATH):  
    shutil.rmtree(EXTRACT_PATH)
os.makedirs(EXTRACT_PATH, exist_ok=True)

#   Load PMCIDs from TSV file
def load_pmc_ids(tsv_file):
    df = pd.read_csv(tsv_file, sep="\t")
    df = df.loc[df['PMCID'] != 'Not Found']  # useless to look for the non existing pmcids
    return df["PMCID"].astype(str).tolist()  # Ensure PMCIDs are strings

#   Download XMLs in batch using NCBI API
def download_pmc_xml_batch(pmc_batch):
    max_retries = 3
    pmc_list = ",".join(pmc_batch)  # Format PMCIDs as comma-separated list
    url = f"https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id={pmc_list}&rettype=xml"

    for attempt in range(  max_retries):  # Retry logic
        try:
            response = requests.get(url, timeout=30)

            #  Handle 429 (Rate Limiting)
            if response.status_code == 429:
                print(f" Rate limited (429). Retrying in {2 ** attempt} seconds...")
                time.sleep(10 ** attempt)  # Use exponential backoff
                continue  # Retry

            #  Handle 200 (Success)
            if response.status_code == 200:
                for i, pmc_id in enumerate(pmc_batch):
                    file_path = os.path.join(EXTRACT_PATH, f"{pmc_id}.xml")
                    with open(file_path, "wb") as f:
                        f.write(response.content)  # Save response XML
                print(f" Batch download")
                return  # Exit function if successful

            #  Handle Other Errors 
            print(f" Failed batch {pmc_batch[:5]}... (Status {response.status_code})")
            break  # No retries for non-recoverable errors (e.g., 404)

        except requests.exceptions.Timeout:
            print(f" Timeout for batch {pmc_batch}. Retrying ({attempt+1}/{ max_retries})...")
            time.sleep(2 ** attempt)  # Exponential backoff

        except requests.exceptions.RequestException as e:
            print(f" Request failed for batch {pmc_batch}: {e}")
            break  # Don't retry if it's a permanent failure

    print(f"Giving up on batch of size: {len(pmc_batch)}")
    retry_with_smaller_batches(pmc_batch)


#   Split PMCIDs into batches
def split_into_batches(pmc_ids, batch_size):
    return [pmc_ids[i:i+batch_size] for i in range(0, len(pmc_ids), batch_size)]

def retry_with_smaller_batches(pmc_batch, min_batch_size=5):
    """Recursively splits a batch into smaller sub-batches and retries downloads."""
    if len(pmc_batch) <= min_batch_size:
        print(f"Even with small batches, failed to download: {pmc_batch}")
        return  # Stop trying if the batch is too small

    mid = len(pmc_batch) // 2
    sub_batch1, sub_batch2 = pmc_batch[:mid], pmc_batch[mid:]

    print(f"Splitting batch into two smaller batches: {len(sub_batch1)} and {len(sub_batch2)}")
    download_pmc_xml_batch(sub_batch1)
    download_pmc_xml_batch(sub_batch2)

#   Parallelized batch requests
def parallel_download(pmc_ids):
    pmc_batches = split_into_batches(pmc_ids, BATCH_SIZE)

    with concurrent.futures.ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        executor.map(download_pmc_xml_batch, pmc_batches)

#  Batch and parallel download
pmc_ids = load_pmc_ids(TSV_FILE)
parallel_download(pmc_ids)

end = time.time()

print("Execution time :",
      (end-start) , "s")

 Rate limited (429). Retrying in 1 seconds...
 Rate limited (429). Retrying in 1 seconds...
 Rate limited (429). Retrying in 1 seconds...
 Rate limited (429). Retrying in 1 seconds...
 Rate limited (429). Retrying in 1 seconds...
 Rate limited (429). Retrying in 1 seconds...
 Rate limited (429). Retrying in 1 seconds...
 Rate limited (429). Retrying in 1 seconds...
 Rate limited (429). Retrying in 1 seconds...
 Rate limited (429). Retrying in 1 seconds...
 Rate limited (429). Retrying in 1 seconds...
 Rate limited (429). Retrying in 1 seconds...
 Rate limited (429). Retrying in 2 seconds...
 Rate limited (429). Retrying in 2 seconds...
 Rate limited (429). Retrying in 2 seconds...
 Rate limited (429). Retrying in 2 seconds...
 Rate limited (429). Retrying in 2 seconds...
 Rate limited (429). Retrying in 2 seconds...
 Rate limited (429). Retrying in 2 seconds...
 Rate limited (429). Retrying in 2 seconds...
 Rate limited (429). Retrying in 4 seconds...
 Rate limited (429). Retrying in 4