In [None]:
import requests
import xml.etree.ElementTree as ET
import time
import json

def search_pubmed(query, retmax=500, timeout=30):
    """
    Search PubMed through the NCBI E-utilities ESearch endpoint.

    Args:
        query: Search expression sent as the ``term`` parameter.
        retmax: Maximum number of ids to request.
        timeout: Seconds to wait for the server. Without a timeout,
            ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The ``idlist`` entry of the ``esearchresult`` object in the JSON
        reply (expected to be a list of PMID strings).

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If no response arrives within ``timeout`` seconds.
        KeyError: If the reply has no ``esearchresult`` / ``idlist`` entry.
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
    params = {
        "db": "pubmed",
        "term": query,
        "retmode": "json",
        "retmax": retmax
    }
    response = requests.get(url, params=params, timeout=timeout)
    response.raise_for_status()
    return response.json()["esearchresult"]["idlist"]

def fetch_summaries(pmids, timeout=30):
    """
    Fetch ESummary records for the given PMIDs in chunks of 100.

    Args:
        pmids: Sequence of PMID strings (they are joined with commas).
        timeout: Seconds to wait for each request before giving up.

    Returns:
        A dict merging the ``result`` object of every reply. Besides one
        entry per returned id it contains a ``"uids"`` list when the
        server sent one; that list covers all chunks, not only the last.
        An empty ``pmids`` yields ``{}`` without any request.

    Raises:
        requests.HTTPError: If a request returns an error status.
        requests.Timeout: If a request exceeds ``timeout`` seconds.
        KeyError: If a reply has no ``result`` entry.
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi"
    all_summaries = {}
    all_uids = []
    for i in range(0, len(pmids), 100):
        chunk = pmids[i:i + 100]
        params = {
            "db": "pubmed",
            "id": ",".join(chunk),
            "retmode": "json"
        }
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()
        result = response.json()["result"]
        # Every reply carries its own "uids" list; update() alone would
        # keep only the one from the final chunk.
        all_uids.extend(result.get("uids", []))
        all_summaries.update(result)
        time.sleep(0.34)  # NCBI recommends <3 requests/sec
    if "uids" in all_summaries:
        all_summaries["uids"] = all_uids
    return all_summaries

def fetch_abstracts(pmids, timeout=30):
    """
    Fetch abstracts in chunks of 100 through the EFetch endpoint.

    Args:
        pmids: Sequence of PMID strings (they are joined with commas).
        timeout: Seconds to wait for each request before giving up.

    Returns:
        A dict mapping the first ``PMID`` found in each ``PubmedArticle``
        element to its abstract text. Each ``AbstractText`` element becomes
        one line, prefixed with ``"<Label>: "`` when it has a ``Label``
        attribute. Articles without ``AbstractText`` map to ``""``.

    Raises:
        requests.HTTPError: If a request returns an error status.
        requests.Timeout: If a request exceeds ``timeout`` seconds.
        xml.etree.ElementTree.ParseError: If a reply is not well-formed XML.
    """
    url = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
    abstracts = {}
    for i in range(0, len(pmids), 100):
        chunk = pmids[i:i + 100]
        params = {
            "db": "pubmed",
            "id": ",".join(chunk),
            "retmode": "xml"
        }
        response = requests.get(url, params=params, timeout=timeout)
        response.raise_for_status()
        root = ET.fromstring(response.content)
        for article in root.findall(".//PubmedArticle"):
            pmid = article.findtext(".//PMID")
            sections = []
            for abstract in article.findall(".//AbstractText"):
                label = abstract.attrib.get("Label", "")
                prefix = f"{label}: " if label else ""
                # .text ends at the first child element, so inline markup
                # (<i>, <sup>, ...) would truncate the abstract; itertext()
                # also yields the text inside and after such children.
                body = "".join(abstract.itertext()).strip()
                sections.append(prefix + body)
            abstracts[pmid] = "\n".join(sections).strip()
        time.sleep(0.34)  # stay under NCBI's request-rate recommendation
    return abstracts

def collect_article_data(pmids, summaries, abstracts):
    """
    Combine summary metadata and abstracts into one record per PMID.

    Args:
        pmids: PMIDs to include, in output order.
        summaries: Mapping of PMID to a summary dict. Entries that are
            missing or not dicts (such as a ``"uids"`` list) are skipped.
        abstracts: Mapping of PMID to abstract text.

    Returns:
        A dict mapping each kept PMID to a dict with the keys ``title``,
        ``author`` (names joined by ``"; "``), ``journal``, ``year`` (the
        part of ``pubdate`` before the first space; ``"Unknown"`` when
        ``pubdate`` is absent) and ``abstract``.
    """
    result = {}
    for pmid in pmids:
        article = summaries.get(pmid)
        if not isinstance(article, dict):  # Missing, "uids", or malformed
            continue
        # Tolerate author entries without a "name" instead of raising KeyError.
        names = [a["name"] for a in article.get("authors", []) if a.get("name")]
        result[pmid] = {
            "title": article.get("title", "No title"),
            "author": "; ".join(names),
            "journal": article.get("fulljournalname", "Unknown journal"),
            "year": article.get("pubdate", "Unknown date").split(" ")[0],
            # An abstract stored as "" means none was found, so it gets the
            # same placeholder as a PMID that is absent from `abstracts`.
            "abstract": abstracts.get(pmid) or "No abstract found.",
        }
    return result

def save_to_json(data, filename="pubmed_gut_brain_500.json"):
    """
    Write ``data`` as JSON to ``filename``, replacing any existing file.

    The output is indented by four spaces and non-ASCII characters are
    written as escape sequences (``ensure_ascii=True``).
    """
    dump_options = {"indent": 4, "ensure_ascii": True}
    with open(filename, mode="w", encoding="utf-8") as out_file:
        json.dump(data, out_file, **dump_options)

# --- Main Execution ---
# Search PubMed, download summaries and abstracts for the hits, merge them
# into one record per PMID and save the result as JSON.
if __name__ == "__main__":
    query = "gut-brain axis"
    # Keep the limit and the output path in one place so the progress
    # messages cannot drift from what is actually requested and written.
    max_results = 500
    output_file = "pubmed_gut_brain_500.json"
    try:
        print(f"🔍 Searching PubMed for: '{query}' (max {max_results} results)...")
        pmids = search_pubmed(query, retmax=max_results)
        print(f"✅ Retrieved {len(pmids)} PMIDs.")
        time.sleep(0.5)  # short pause before the batch of follow-up requests

        print("📄 Fetching summaries...")
        summaries = fetch_summaries(pmids)

        print("📚 Fetching abstracts...")
        abstracts = fetch_abstracts(pmids)

        print("🧠 Compiling article data...")
        data = collect_article_data(pmids, summaries, abstracts)

        save_to_json(data, output_file)
        print(f"✅ Saved {len(data)} articles to '{output_file}'")
    except Exception as e:
        # Best-effort run: report the failure instead of raising. Names
        # assigned after the failing step stay undefined for later cells.
        print(f"❌ Error: {e}")


🔍 Searching PubMed for: 'gut-brain axis' (max 500 results)...
✅ Retrieved 500 PMIDs.
📄 Fetching summaries...
📚 Fetching abstracts...
🧠 Compiling article data...
✅ Saved 500 articles to 'pubmed_gut_brain_500.json'


In [24]:
import os
# Directory whose entries are each opened and parsed as JSON below.
DIR_DBS = "../data/raw/GutBrainIE_Full_Collection_2025/Articles/json_format"
# os.listdir() returns names in arbitrary order; sorting makes the order of
# list_pmids reproducible across runs and machines.
PATH_DBS = sorted(os.listdir(DIR_DBS))


# Gather the top-level keys of every file (presumably PMIDs; confirm
# against the dataset layout).
list_pmids = []
for PATH_DB in PATH_DBS:
    with open(os.path.join(DIR_DBS, PATH_DB), "r", encoding="utf-8") as f:
        data = json.load(f)
    list_pmids.extend(data.keys())

# Reload the saved PubMed search results; from here on `data` refers to
# them rather than to the last collection file read in the loop.
with open("pubmed_gut_brain_500.json", "r", encoding="utf-8") as f:
    data = json.load(f)