In [1]:
import re
import time
import json
import random
import requests
import pandas as pd
from tqdm import tqdm
from itertools import islice
from multiprocessing import Pool
import xml.etree.ElementTree as ET
from concurrent.futures import ThreadPoolExecutor, as_completed

In [2]:
url = "https://www.ncbi.nlm.nih.gov/pmc/utils/idconv/v1.0/"
key = ".."
#_________the email is needed for the crossref abstract extraction__________
#mail = email adress
cross = "https://api.crossref.org/works/"

# **Extract all citations in xml file:**
1. Remove all citations that have no DOI
2. Save a df with unique DOI

In [5]:

import pandas as pd
import xml.etree.ElementTree as ET

def split_sentences(text):
    """Splits text into sentences without using nltk.punkt."""
    return re.split(r'(?<=[.!?])\s+', text)  # Splits at sentence-ending punctuation

def extract_pmc_citations(xml_file):
    """Extract citations from a given PMC XML file using ElementTree for faster parsing."""
    tree = ET.parse(xml_file)
    root = tree.getroot()
    
    # Get the citing article's DOI
    citing_doi = None
    for article_id in root.findall(".//article-id"):
        if article_id.get("pub-id-type") == "doi":
            citing_doi = article_id.text
            break

    # Extract reference list
    ref_dict = {}
    for ref in root.findall(".//ref"):
        ref_id = ref.get("id")
        title_elem = ref.find(".//article-title")
        doi_elem = ref.find(".//pub-id[@pub-id-type='doi']")
        ref_dict[ref_id] = {
            "title": title_elem.text.strip() if title_elem is not None and title_elem.text else "No title",
            "doi": doi_elem.text.strip() if doi_elem is not None and doi_elem.text else "No DOI",
        }
    
    # Extract citations from text
    data = []
    for paragraph in root.findall(".//p"):
        text = " ".join(paragraph.itertext()).strip()
        sentences = split_sentences(text)
        citation_matches = paragraph.findall(".//xref")

        for citation in citation_matches:
            citation_id = citation.get("rid")
            if citation_id in ref_dict:
                sentence_index = next((i for i, s in enumerate(sentences) if citation.text and citation.text in s), None)
                previous_sentence = sentences[sentence_index - 1] if sentence_index and sentence_index > 0 else None
                if previous_sentence:
                    previous_sentence = re.sub(r'\[.*?\]|\(.*?\)|\{.*?\}|\n.*?\n', '', previous_sentence).strip()
                    data.append([citing_doi, citation_id, previous_sentence, ref_dict[citation_id]["title"], ref_dict[citation_id]["doi"]])
    
    df = pd.DataFrame(data, columns=["Citing DOI", "Citation ID", "Citation Context", "Cited Title", "DOI"])
    df = df.drop(df[df['DOI'] == 'No DOI'].index) # Remove useless rows to save space
    return (df, df.drop_duplicates(subset=['DOI']))


start_time = time.time()
pmc_citations_df, unique_dois = extract_pmc_citations("../pmc_xml/10663565.xml")
print(f"Completed in {round(time.time() - start_time, 2)} seconds.")
#display(pmc_citations_df)
display(unique_dois)


Completed in 0.01 seconds.


Unnamed: 0,Citing DOI,Citation ID,Citation Context,Cited Title,DOI
0,10.1038/s41598-023-47358-4,CR5,"Due to increasing environmental problems, find...",Metal nanoparticles as green catalysts,10.3390/ma12213602
1,10.1038/s41598-023-47358-4,CR7,"Due to increasing environmental problems, find...",Fe(III)-salen complex supported on dendrimer f...,10.1016/j.jpcs.2020.109642
2,10.1038/s41598-023-47358-4,CR8,"For this purpose, economic and biocompatible a...",Nickel nanoparticles adorned on magnetized cel...,10.1007/s10570-022-04823-z
6,10.1038/s41598-023-47358-4,CR13,Nanotechnology is one of the newest and most e...,A comprehensive review on green synthesis of n...,10.1016/j.jclepro.2020.122880
7,10.1038/s41598-023-47358-4,CR15,Nanotechnology is one of the newest and most e...,Biosynthesis of organic nanocomposite using,10.1155/2021/4105853
8,10.1038/s41598-023-47358-4,CR16,The utilization of biologic materials such as ...,Design and synthesis of magnetic Fe,10.1039/D0RA06251K
9,10.1038/s41598-023-47358-4,CR18,The utilization of biologic materials such as ...,New Acetamidine Cu(II) Schiff base complex sup...,10.1038/s41598-022-07674-7
10,10.1038/s41598-023-47358-4,CR19,These eco-friendly condition are more efficien...,Green synthesis of nanoparticles using plant e...,10.1007/s10311-020-01074-x
12,10.1038/s41598-023-47358-4,CR21,Green methods may synthesize NPs with better s...,Bioinspired and green synthesis of nanoparticl...,10.1016/j.jscs.2021.101304
14,10.1038/s41598-023-47358-4,CR23,The reduction of metal ions using plant extrac...,Green synthesis of gold and silver nanoparticl...,10.3390/nano10091763


In [6]:
bs = 150 # batch size
par = 10 # workers
batches = [unique_dois['DOI'][i:i + bs] for i in range(0, len(unique_dois['DOI']), bs)]
print(len(batches))

1


# **Extract PMID:**
1. Save DOIs with PMID in a dico
2. Save DOIs with no PMID in a List

In [7]:
def fetch_ids_batch(doi_batch):
    """ 
    This function fetches PMIDS using batch resquest to minimize the request number. Recalls itself
    when the rate limit is exceeded, retries after 10 seconds on the same batch that failed. 
    """
    no_id = []
    pid = {}
    params = {'format': 'json','ids': ','.join(doi_batch), 'api_key': key}
    
    try:
        response = requests.get(url, params=params, timeout=10)
        if response.status_code == 200:
            data = response.json()
            for rec in data.get('records', []):
                if 'pmid' in rec:
                    pid.update({rec['doi']: rec['pmid']})
                else : 
                    no_id.append(rec['doi'])
            return (pid, no_id)
        elif response.status_code == 429:
            print("Rate limit exceeded! Retrying after 10 seconds...")
            time.sleep(10)
            return fetch_ids_batch(doi_batch)  # Retry the same batch
    except requests.RequestException as e:
        return f"Request failed: {e}", f"Request failed: {e}"

    return f"Response status code: {response.status_code}", f"Response status code: {response.status_code}"

start_time = time.time()
pmid = {} 
no_pmid = []

def process_batch(b):
    return fetch_ids_batch(b)  # Returns (pmid_dict, no_pmid_list)

with ThreadPoolExecutor(max_workers = par) as executor:
    results = list(executor.map(process_batch, batches))

# Aggregate results
for pmid_dict, no_pmid_list in results:
    pmid.update(pmid_dict) 
    no_pmid.extend(no_pmid_list) 

print(f"Completed fetching pmids in {round(time.time() - start_time, 2)} seconds.")

Completed fetching pmids in 0.32 seconds.


In [8]:
print(len(pmid) )
print(len(no_pmid))
dico = dict(islice(pmid.items(), 80))
lis = no_pmid[0:30]
#print(lis)
#lis = list(batched(pmid.items(), bs))
#print(lis)

19
21


# **Extract Abstract from PMC**
1. Using the pmids in the dict
2. Using epost for bulk queries

In [14]:
def get_pubmed_abstracts_bulk(pmid_dict):
    """
    Fetch abstracts in bulk using epost + efetch while keeping DOI keys.
    Returns: 
        - dict {DOI: Abstract}
        - DataFrame with DOI, Abstract
    """
    # Extract PMIDs
    pmid_list = list(pmid_dict.values())

    # epost to store PMIDs up to 10 000 at once
    epost_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/epost.fcgi"
    epost_params = {"db": "pubmed", "id": ",".join(pmid_list)}
    
    epost_response = requests.post(epost_url, data=epost_params, timeout=10)
    epost_response.raise_for_status()

    # Extract WebEnv & QueryKey
    root = ET.fromstring(epost_response.text)
    webenv = root.find("./WebEnv").text
    query_key = root.find("./QueryKey").text

    # efetch to retrieve abstracts
    efetch_url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    efetch_params = {
        "db": "pubmed",
        "query_key": query_key,
        "WebEnv": webenv,
        "retmode": "xml",
        "rettype": "abstract"
    }
    
    efetch_response = requests.get(efetch_url, params=efetch_params, timeout=10)
    efetch_response.raise_for_status()
    
    # Parse XML to extract abstracts
    root = ET.fromstring(efetch_response.text)
    abstracts = {}

    for article in root.findall(".//PubmedArticle"):
        pmid = article.find(".//PMID").text
        abstract_element = article.find(".//AbstractText")
        abstract_text = abstract_element.text if abstract_element is not None else "No abstract found"

        # Find corresponding DOI
        doi = next((k for k, v in pmid_dict.items() if v == pmid), None)
        if doi:
            abstracts[doi] = abstract_text

    # Convert to DataFrame
    df = pd.DataFrame(abstracts.items(), columns=["DOI", "Abstract"])
    
    return df

  # Add thousands of PMIDs here
start_time = time.time()
abstracts_df = get_pubmed_abstracts_bulk(pmid)
print(f"Completed in {round(time.time() - start_time, 2)} seconds.")

# Output as dictionary
#print(abstracts_dict)

Completed in 1.29 seconds.


In [15]:
# Output as DataFrame
display(abstracts_df)
#a = pd.DataFrame(list(ab.items()), columns=['DOI', 'Abstract'])
#display(a[a['Abstract'].str.split().str.len() > 4])
#display(a[a['Abstract'] == 'Error: HTTP 429'])
pmc_abstracts = pmc_citations_df.merge(abstracts_df, on='DOI', how='left')
display(pmc_abstracts)

Unnamed: 0,DOI,Abstract
0,10.3390/ma12213602,Nanoparticles play a significant role in vario...
1,10.1155/2021/4105853,Here presented a quick and easy synthesis of c...
2,10.1039/d0ra06251k,The Fe
3,10.1038/s41598-022-07674-7,"In this project, the new catalyst copper defin..."
4,10.3390/nano10091763,"Currently, metal nanoparticles have varied use..."
5,10.1049/iet-nbt.2018.5386,For being applied in medicine as therapeutic a...
6,10.1039/d0ra07861a,The increasing knowledge on health benefit pro...
7,10.3390/foods9020151,Mealworms (
8,10.1016/S0140-6736(21)02724-0,Antimicrobial resistance (AMR) poses a major t...
9,10.3390/ijms22137202,Silver nanoparticles (AgNPs) have been imposed...


Unnamed: 0,Citing DOI,Citation ID,Citation Context,Cited Title,DOI,Abstract
0,10.1038/s41598-023-47358-4,CR5,"Due to increasing environmental problems, find...",Metal nanoparticles as green catalysts,10.3390/ma12213602,Nanoparticles play a significant role in vario...
1,10.1038/s41598-023-47358-4,CR7,"Due to increasing environmental problems, find...",Fe(III)-salen complex supported on dendrimer f...,10.1016/j.jpcs.2020.109642,
2,10.1038/s41598-023-47358-4,CR8,"For this purpose, economic and biocompatible a...",Nickel nanoparticles adorned on magnetized cel...,10.1007/s10570-022-04823-z,
3,10.1038/s41598-023-47358-4,CR13,Nanotechnology is one of the newest and most e...,A comprehensive review on green synthesis of n...,10.1016/j.jclepro.2020.122880,
4,10.1038/s41598-023-47358-4,CR15,Nanotechnology is one of the newest and most e...,Biosynthesis of organic nanocomposite using,10.1155/2021/4105853,Here presented a quick and easy synthesis of c...
5,10.1038/s41598-023-47358-4,CR16,The utilization of biologic materials such as ...,Design and synthesis of magnetic Fe,10.1039/D0RA06251K,
6,10.1038/s41598-023-47358-4,CR18,The utilization of biologic materials such as ...,New Acetamidine Cu(II) Schiff base complex sup...,10.1038/s41598-022-07674-7,"In this project, the new catalyst copper defin..."
7,10.1038/s41598-023-47358-4,CR19,These eco-friendly condition are more efficien...,Green synthesis of nanoparticles using plant e...,10.1007/s10311-020-01074-x,
8,10.1038/s41598-023-47358-4,CR21,Green methods may synthesize NPs with better s...,Bioinspired and green synthesis of nanoparticl...,10.1016/j.jscs.2021.101304,
9,10.1038/s41598-023-47358-4,CR23,The reduction of metal ions using plant extrac...,Green synthesis of gold and silver nanoparticl...,10.3390/nano10091763,"Currently, metal nanoparticles have varied use..."


# **Extract Abstract from Crossref**
1. Using the dois in the list

In [11]:
USER_AGENT = f"YourAppName/1.0 (mailto:{mail})"
CROSSREF_URL = "https://api.crossref.org/works/"

HEADERS = {"User-Agent": USER_AGENT}
MAX_WORKERS = 5  # Number of parallel requests
RETRIES = 3  # Max retry attempts

In [12]:
failed_dois = []

def get_crossref_metadata(doi):
    """Fetch metadata from CrossRef for a single DOI."""
    url = f"{CROSSREF_URL}{doi}"
    
    try:
        response = requests.get(url, headers=HEADERS, timeout = 25)

        if response.status_code == 200:
            data = response.json().get("message", {})
            return {
                "DOI": doi,
                "Publisher": data.get("publisher", "Unknown Publisher"),
                "Journal": data.get("container-title", ["Unknown Journal"])[0],
                "Abstract": data.get("abstract", "No abstract found"),
                "Journal Article": "Yes" if data.get("type") == "journal-article" else "No",
                "Error": None,
            }

        elif response.status_code == 404:
            return {"DOI": doi, "Error": "DOI Not Found"}

        else:
            failed_dois.append(doi)  # Save DOI for retry
            return {"DOI": doi, "Error": f"HTTP {response.status_code}"}

    except requests.RequestException as e:
        failed_dois.append(doi)  # Save DOI for retry
        return {"DOI": doi, "Error": str(e)}

def fetch_metadata_parallel(doi_list):
    """Fetch metadata for a list of DOIs in parallel using ThreadPoolExecutor."""
    metadata_list = []
    
    with ThreadPoolExecutor(max_workers=5) as executor:
        futures = {executor.submit(get_crossref_metadata, doi): doi for doi in doi_list}

        for future in tqdm(as_completed(futures), total=len(doi_list), desc="Fetching Metadata"):
            metadata_list.append(future.result())

    return metadata_list

start_time = time.time()
metadata_results = fetch_metadata_parallel(lis)
metadata_df = pd.DataFrame(metadata_results)
print(f"Completed in {round(time.time() - start_time, 2)} seconds.")
display(metadata_df)

if failed_dois:
    with open("failed_dois.txt", "w") as f:
        for doi in failed_dois:
            f.write(doi + "\n")
    print(f"{len(failed_dois)} requests failed. Saved to failed_dois.txt for retry.")


Fetching Metadata: 100%|████████████████████████| 21/21 [00:01<00:00, 10.63it/s]

Completed in 1.99 seconds.





Unnamed: 0,DOI,Publisher,Journal,Abstract,Journal Article,Error
0,10.1016/j.foodres.2012.12.015,Elsevier BV,Food Research International,No abstract found,Yes,
1,10.1179/1753555714Y.0000000236,Informa UK Limited,Materials Technology,No abstract found,Yes,
2,10.1007/s10570-022-04823-z,Springer Science and Business Media LLC,Cellulose,No abstract found,Yes,
3,10.1080/14786419.2018.1508145,Informa UK Limited,Natural Product Research,No abstract found,Yes,
4,10.1088/1742-6596/2274/1/012001,IOP Publishing,Journal of Physics: Conference Series,<jats:title>Abstract</jats:title><jats:p>We ex...,Yes,
5,10.1007/s10311-020-01074-x,Springer Science and Business Media LLC,Environmental Chemistry Letters,No abstract found,Yes,
6,10.1016/j.scp.2022.100672,Elsevier BV,Sustainable Chemistry and Pharmacy,No abstract found,Yes,
7,10.1016/j.actbio.2009.02.003,Elsevier BV,Acta Biomaterialia,No abstract found,Yes,
8,10.1016/j.chemosphere.2020.128580,Elsevier BV,Chemosphere,No abstract found,Yes,
9,10.1016/j.jclepro.2020.122880,Elsevier BV,Journal of Cleaner Production,No abstract found,Yes,
