# Jupyter notebook sample

In [2]:
import os, csv, time, requests, sys
from urllib.parse import urlencode
from tqdm import tqdm

# XML parser (ET) for the PubMed responses; the other names repeat the import above
import os, csv, time, requests, xml.etree.ElementTree as ET

In [3]:
# --- OpenAlex harvest settings -------------------------------------------
# Contact address passed to the API through the "mailto" request parameter.
MAILTO = "bukhari.453@s.kyushu-u.ac.jp"
# Destination file for the harvested records.
OUT_CSV = "openalex_works.csv"

# Boolean search expression: (model-related terms) AND (medication-related terms).
QUERY_HUMAN = (
    '(transformer* OR "self-attention" OR BERT OR GPT OR "large language model" '
    'OR "retrieval-augmented generation") '
    'AND (prescrib* OR medicat* OR "drug" OR pharmac* '
    'OR "medication recommendation" OR prescription)'
)

# Inclusive publication-date window (ISO dates).
DATE_FROM = "2020-01-01"
DATE_TO = "2025-05-31"

BASE = "https://api.openalex.org/works"

# Request parameters: free-text search plus a publication-date filter.
# Only the date range is filtered here; further filters could be appended to
# the comma-separated "filter" value. "cursor": "*" starts cursor paging.
params = dict(
    search=QUERY_HUMAN,
    filter=",".join(
        [
            "from_publication_date:" + DATE_FROM,
            "to_publication_date:" + DATE_TO,
        ]
    ),
    per_page=200,
    cursor="*",
    mailto=MAILTO,
)

# Work attributes of interest. NOTE: this list is not passed to the request.
fields = [
    "id",
    "doi",
    "title",
    "publication_year",
    "publication_date",
    "authorships",
    "host_venue",
    "type",
    "open_access",
    "cited_by_count",
    "abstract_inverted_index",
    "primary_topic",
]

def _rebuild_abstract(inverted_index):
    """Rebuild plain abstract text from a ``{word: [positions]}`` mapping.

    Returns an empty string when *inverted_index* is not a dict (for example
    when the work has no abstract).
    """
    if not isinstance(inverted_index, dict):
        return ""
    placed = sorted(
        (pos, word)
        for word, positions in inverted_index.items()
        for pos in positions
    )
    return " ".join(word for _, word in placed)


def _openalex_row(work):
    """Flatten one OpenAlex work dict into the CSV row written below."""
    doi = (work.get("doi") or "").lower().replace("https://doi.org/", "")
    oa = work.get("open_access") or {}
    # "host_venue" is deprecated in the OpenAlex Work object; the venue now
    # lives under primary_location.source. Keep the old key as a fallback so
    # cached/older payloads still populate the column.
    source = (work.get("primary_location") or {}).get("source") or {}
    venue = (
        source.get("display_name")
        or (work.get("host_venue") or {}).get("display_name")
        or ""
    )
    # `or ""` guards against a null display_name, which would break join().
    authors = "; ".join(
        (a.get("author") or {}).get("display_name") or ""
        for a in work.get("authorships") or []
    )
    return [
        "OpenAlex",
        work.get("id"),
        doi,
        work.get("title") or "",
        work.get("publication_year") or "",
        work.get("publication_date") or "",
        venue,
        work.get("type") or "",
        oa.get("is_oa"),
        oa.get("oa_status") or "",
        work.get("cited_by_count") or 0,
        _rebuild_abstract(work.get("abstract_inverted_index")),
        authors,
    ]


# Page through every matching work with cursor paging and stream rows to CSV.
total = 0
# Work on a copy so the module-level `params` keeps its initial cursor and
# this cell can be re-run without re-running the configuration cell.
query = dict(params)
with open(OUT_CSV, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerow(["source", "work_id", "doi", "title", "year", "date", "venue", "type", "is_oa", "oa_status", "cited_by",
                "abstract", "authors"])
    while True:
        r = requests.get(BASE, params=query, timeout=60)
        r.raise_for_status()
        j = r.json()
        results = j.get("results") or []
        if not results:
            break
        for x in results:
            w.writerow(_openalex_row(x))
        total += len(results)
        # A missing/empty next_cursor marks the last page.
        query["cursor"] = (j.get("meta") or {}).get("next_cursor")
        if not query["cursor"]:
            break
        time.sleep(0.1)

print(f"Saved {total} OpenAlex records to {OUT_CSV}")

Saved 19095 OpenAlex records to openalex_works.csv


In [None]:


# --- PubMed (NCBI E-utilities) harvest settings ---------------------------
# NOTE: QUERY_HUMAN, DATE_FROM, DATE_TO and OUT_CSV rebind the names used by
# the OpenAlex cell above.

# NCBI asks for a contact email and a tool identifier with each request.
EMAIL = "your.email@institution.edu"
TOOL = "your_tool_name"

# Same two term groups as the OpenAlex query, each term tagged [Title/Abstract].
QUERY_HUMAN = (
    '(transformer*[Title/Abstract] OR "self-attention"[Title/Abstract] '
    'OR BERT[Title/Abstract] OR GPT[Title/Abstract] '
    'OR "large language model"[Title/Abstract] '
    'OR "retrieval-augmented generation"[Title/Abstract]) '
    'AND (prescrib*[Title/Abstract] OR medicat*[Title/Abstract] '
    'OR drug[Title/Abstract] OR pharmac*[Title/Abstract] '
    'OR "medication recommendation"[Title/Abstract] '
    'OR prescription[Title/Abstract])'
)

# Date window in PubMed's YYYY/MM/DD format.
DATE_FROM = "2020/01/01"
DATE_TO = "2025/05/31"

OUT_CSV = "pubmed_works.csv"


def esearch(term, retmax=100000):
    """Run a PubMed ESearch for *term* and return the decoded JSON reply.

    The search is limited to the module-level DATE_FROM..DATE_TO window using
    the ``pdat`` date type, and ``usehistory="y"`` is sent so the reply carries
    the ``webenv`` / ``querykey`` pair that the caller reads afterwards.

    Args:
        term: Query string sent as the ``term`` parameter.
        retmax: Value sent as the ``retmax`` parameter.

    Returns:
        The response body parsed with ``response.json()``.

    Raises:
        Whatever ``raise_for_status()`` raises on an HTTP error status.
    """
    query = dict(
        db="pubmed",
        term=term,
        datetype="pdat",
        mindate=DATE_FROM,
        maxdate=DATE_TO,
        retmax=retmax,
        retmode="json",
        usehistory="y",
        tool=TOOL,
        email=EMAIL,
    )
    response = requests.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi",
        params=query,
        timeout=60,
    )
    response.raise_for_status()
    return response.json()


def efetch(webenv, query_key, retstart, retmax=200):
    """Fetch one page of PubMed records as XML text from a stored search.

    Args:
        webenv: ``WebEnv`` token identifying the stored search.
        query_key: Query key within that ``WebEnv``.
        retstart: Zero-based offset of the first record to return.
        retmax: Number of records requested for this page.

    Returns:
        The raw response body (``response.text``), requested with
        ``retmode="xml"``.

    Raises:
        Whatever ``raise_for_status()`` raises on an HTTP error status.
    """
    query = dict(
        db="pubmed",
        query_key=query_key,
        WebEnv=webenv,
        retstart=retstart,
        retmax=retmax,
        retmode="xml",
        tool=TOOL,
        email=EMAIL,
    )
    response = requests.get(
        "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi",
        params=query,
        timeout=60,
    )
    response.raise_for_status()
    return response.text


def get_text(elem, path):
    """Return the text of the first element under *elem* matching *path*.

    The text of nested inline elements (``<i>``, ``<sub>``, ...) is included,
    so titles containing markup are not cut off at the first child tag.

    Args:
        elem: Parent element to search from; ``None`` is tolerated.
        path: ElementTree path passed to ``find``.

    Returns:
        The concatenated text, or ``""`` when *elem* is ``None``, nothing
        matches, or the match holds no text.
    """
    if elem is None:
        return ""
    node = elem.find(path)
    if node is None:
        return ""
    return "".join(node.itertext())


def _find_all(elem, path):
    """Return ``elem.findall(path)``, or an empty list when *elem* is None."""
    return [] if elem is None else elem.findall(path)


def parse_article(article):
    """Flatten one ``<PubmedArticle>`` element into a CSV-ready dict.

    Args:
        article: The ``PubmedArticle`` element.

    Returns:
        A dict with the keys ``source``, ``pmid``, ``doi``, ``title``,
        ``year``, ``date``, ``venue``, ``type``, ``is_oa``, ``oa_status``,
        ``cited_by``, ``abstract``, ``authors`` and ``mesh``. Fields that
        cannot be found are empty strings; ``date``, ``type``, ``is_oa``,
        ``oa_status`` and ``cited_by`` are always empty for this source.
    """
    med = article.find("MedlineCitation")
    art = med.find("Article") if med is not None else None

    pmid = get_text(med, "PMID")
    title = get_text(art, "ArticleTitle")

    # Abstract: join all sections, keeping text inside inline markup.
    sections = ("".join(node.itertext()) for node in _find_all(art, "Abstract/AbstractText"))
    abstract = " ".join(text for text in sections if text)

    # Year: prefer the structured <Year>, else the first 4 chars of <MedlineDate>.
    y = get_text(art, "Journal/JournalIssue/PubDate/Year")
    if not y:
        y = get_text(art, "Journal/JournalIssue/PubDate/MedlineDate")[:4]

    journal = get_text(art, "Journal/Title")

    # DOI: first the article's ELocationID, then the ArticleIdList, which sits
    # under PubmedData (a sibling of MedlineCitation), not under MedlineCitation.
    doi = ""
    candidates = [(node, "EIdType") for node in _find_all(art, "ELocationID")]
    candidates += [
        (node, "IdType")
        for node in _find_all(article, "PubmedData/ArticleIdList/ArticleId")
    ]
    for node, attr in candidates:
        value = (node.text or "").strip().lower()
        if node.get(attr, "").lower() == "doi" and value:
            doi = value
            break

    mesh_terms = [
        get_text(heading, "DescriptorName")
        for heading in _find_all(med, "MeshHeadingList/MeshHeading")
    ]

    authors = []
    for a in _find_all(art, "AuthorList/Author"):
        name = " ".join([get_text(a, "ForeName"), get_text(a, "LastName")]).strip()
        # Group/consortium authors carry only a CollectiveName.
        name = name or get_text(a, "CollectiveName").strip()
        if name:
            authors.append(name)

    return {
        "source": "PubMed", "pmid": pmid, "doi": doi, "title": title, "year": y, "date": "", "venue": journal,
        "type": "", "is_oa": "", "oa_status": "", "cited_by": "",
        "abstract": abstract, "authors": "; ".join(authors), "mesh": "; ".join(mesh_terms)
    }


# Run
# Run: store the search on the NCBI history server, then page through the hits.
j = esearch(QUERY_HUMAN)
webenv = j["esearchresult"]["webenv"]
qk = j["esearchresult"]["querykey"]
count = int(j["esearchresult"]["count"])
print("PubMed hits:", count)

page_size = 200
saved = 0  # rows actually written; can differ from `count` (see below)
with open(OUT_CSV, "w", newline="", encoding="utf-8") as f:
    w = csv.DictWriter(f, fieldnames=["source", "pmid", "doi", "title", "year", "date", "venue", "type", "is_oa",
                                      "oa_status", "cited_by", "abstract", "authors", "mesh"])
    w.writeheader()
    for start in tqdm(range(0, count, page_size)):
        root = ET.fromstring(efetch(webenv, qk, start, page_size))
        # Only <PubmedArticle> children are parsed; any other record type in
        # the page is skipped, so `saved` may end up below `count`.
        for art in root.findall("PubmedArticle"):
            w.writerow(parse_article(art))
            saved += 1
        time.sleep(0.34)  # be polite to NCBI
# Report what was written, not the hit count announced by ESearch.
print(f"Saved {saved} PubMed records to {OUT_CSV}")
