In [13]:
import time
import os
import logging
import csv
from typing import List, Dict, Optional

# Official Clarivate Starter‑API client
import clarivate.wos_starter.client as wos
from clarivate.wos_starter.client import Configuration, ApiClient, DocumentsApi
from clarivate.wos_starter.client.rest import ApiException

import requests  # for Unpaywall + direct PDF fetch

"""
WoS Starter‑API PDF harvester (client edition)
=============================================
* Works with the 2025‑Q2 response envelope (`metadata` + `hits`).
* Paginates until it collects `MAX_RESULTS` (default = 100).
* Extracts `title` + `doi` (prefers `identifiers.doi`).
* Saves a CSV, then tries Unpaywall for OA PDFs.
* API key remains hard‑coded per user request.
"""

# -----------------------------------------------------------------------------
# 0. User‑configurable constants
# -----------------------------------------------------------------------------
API_KEY = "1f8efd1acb29e46412a32eba3f85b7b1690a1024"
API_HOST = "https://api.clarivate.com/apis/wos-starter/v1"
UNPAYWALL_EMAIL = "dn018@bucknell.edu"  # replace with your own
MAX_RESULTS = 100                        # stop after N papers
PAGE_SIZE = 50                           # WoS Starter max per page

logging.basicConfig(level=logging.INFO, format="%(levelname)s: %(message)s")

# -----------------------------------------------------------------------------
# 1. Low‑level: call Clarivate SDK and return plain dict
# -----------------------------------------------------------------------------

def wos_search(query: str, *, limit: int, page: int) -> dict:
    cfg = Configuration(host=API_HOST)
    cfg.api_key["ClarivateApiKeyAuth"] = API_KEY
    try:
        with ApiClient(cfg) as client:
            docs_api = DocumentsApi(client)
            resp = docs_api.documents_get(q=query, db="WOS", limit=limit, page=page, detail="full")
            return resp.to_dict()
    except ApiException as exc:
        logging.error("WoS API %s – %s", exc.status, exc.body)
        raise

# -----------------------------------------------------------------------------
# 2. Collect title+DOI up to MAX_RESULTS
# -----------------------------------------------------------------------------

def get_paper_list(query: str, *, max_results: int = MAX_RESULTS) -> List[Dict[str, Optional[str]]]:
    papers: List[Dict[str, Optional[str]]] = []
    page = 1
    while len(papers) < max_results:
        remaining = max_results - len(papers)
        limit = min(PAGE_SIZE, remaining)
        data = wos_search(query, limit=limit, page=page)

        # Starter‑API 1.0.0 puts records in "hits"
        recs = (
            data.get("hits")
            or data.get("records")
            or data.get("documents")
            or data.get("data")
            or []
        )
        if not recs:
            logging.warning("No records on page %d – stopping.", page)
            break

        logging.info("Page %d – %d records", page, len(recs))
        for rec in recs:
            title = rec.get("title") or "<no title>"
            doi = (
                rec.get("doi")
                or rec.get("identifiers", {}).get("doi")
                or rec.get("document_identifiers", {}).get("doi")
            )
            papers.append({"title": title, "doi": doi})
            if len(papers) >= max_results:
                break

        # pagination bookkeeping
        if len(recs) < limit:
            break  # last page
        page += 1
    return papers

# -----------------------------------------------------------------------------
# 3. Unpaywall PDF downloader
# -----------------------------------------------------------------------------

def download_pdf(doi: Optional[str], *, save_dir: str = "pdfs") -> bool:
    if not doi:
        logging.info("No DOI – skipping download.")
        return False
    try:
        upw_resp = requests.get(
            f"https://api.unpaywall.org/v2/{doi}",
            params={"email": UNPAYWALL_EMAIL},
            timeout=20,
        )
        upw_resp.raise_for_status()
        upw_data = upw_resp.json()
        loc = upw_data.get("best_oa_location") or next(iter(upw_data.get("oa_locations") or []), {})
        pdf_url = loc.get("url_for_pdf")
        if not pdf_url:
            logging.info("No OA PDF for %s", doi)
            return False
        pdf_resp = requests.get(pdf_url, timeout=30)
        pdf_resp.raise_for_status()
        os.makedirs(save_dir, exist_ok=True)
        filename = doi.replace("/", "_") + ".pdf"
        path = os.path.join(save_dir, filename)
        with open(path, "wb") as fh:
            fh.write(pdf_resp.content)
        logging.info("Saved PDF → %s", path)
        return True
    except requests.exceptions.RequestException as exc:
        logging.warning("PDF download failed for %s – %s", doi, exc)
        return False

# -----------------------------------------------------------------------------
# 4. CSV helper
# -----------------------------------------------------------------------------

def save_csv(rows: List[Dict[str, str]], *, path: str = "wos_results.csv") -> None:
    if not rows:
        logging.info("No records – skip CSV.")
        return
    with open(path, "w", newline="", encoding="utf-8") as fh:
        writer = csv.DictWriter(fh, fieldnames=["title", "doi"], extrasaction="ignore")
        writer.writeheader()
        writer.writerows(rows)
    logging.info("Wrote CSV → %s", path)

# -----------------------------------------------------------------------------
# 5. Main
# -----------------------------------------------------------------------------

def main():
    query = "TS=(biomass gasification)"
    papers = get_paper_list(query, max_results=MAX_RESULTS)
    save_csv(papers)
    for p in papers:
        logging.info("TITLE: %s", p["title"])
        download_pdf(p["doi"], save_dir="downloaded_pdfs")
        time.sleep(1.1)  # polite Unpaywall delay

if __name__ == "__main__":
    main()

2025-06-05 00:21:08,787 [INFO] Page 1 – 50 records
2025-06-05 00:21:10,268 [INFO] Page 2 – 50 records
2025-06-05 00:21:10,272 [INFO] Wrote CSV → wos_results.csv
2025-06-05 00:21:10,273 [INFO] TITLE: CATALYZED STEAM GASIFICATION OF BIOMASS
2025-06-05 00:21:11,383 [INFO] No OA PDF for 10.1016/0165-2370(79)80013-3
2025-06-05 00:21:12,485 [INFO] TITLE: FLUIDIZED-BED GASIFICATION OF SOLID-WASTES AND BIOMASS - CIL PROGRAM
2025-06-05 00:21:12,486 [INFO] No DOI – skipping download.
2025-06-05 00:21:13,588 [INFO] TITLE: DEVELOPMENT OF PROTOTYPE PACKAGED GASIFICATION AND COMBUSTION SYSTEMS FOR REFINED BIOMASS AIR BLOWN PRODUCER GAS
2025-06-05 00:21:13,589 [INFO] No DOI – skipping download.
2025-06-05 00:21:14,690 [INFO] TITLE: BIOMASS GASIFICATION AT THE FOCUS OF THE ODEILLO (FRANCE) 1 MWTH SOLAR FURNACE
2025-06-05 00:21:14,691 [INFO] No DOI – skipping download.
2025-06-05 00:21:15,792 [INFO] TITLE: FUNDAMENTAL INVESTIGATION OF THE STEAM GASIFICATION OF BIOMASS IN THE PRESENCE OF CATALYSTS
2025-