<a href="https://colab.research.google.com/github/xyshuai/openalex-api-demo/blob/main/OpenAlex_Basic_Paging_%E2%89%A4_10k.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# === OpenAlex Basic Paging (up to 10,000 records) ===
# - Uses classic page-based pagination: page=1,2,...,MAX_PAGES
# - Collects results into a single CSV
# - Designed to run in Google Colab, but also works in a normal Python environment

import os
import requests
import csv

# 1) Try to mount Google Drive (if in Colab); if it fails, fall back to a local folder
# USE_DRIVE stays False unless the mount below succeeds; the optional download
# step at the end of the script reads it to decide whether to call files.download().
USE_DRIVE = False
try:
    # If this import fails, the outer handler treats it as "not running in Colab".
    from google.colab import drive, files  # type: ignore
    try:
        drive.mount('/content/drive', force_remount=True)
        USE_DRIVE = True
        save_dir = "/content/drive/MyDrive/OpenAlex"
        print("‚úÖ Google Drive mounted. Saving to:", save_dir)
    except Exception as e:
        # Import worked but the mount itself failed: keep going and write to /content.
        print("‚ö†Ô∏è Failed to mount Google Drive. Falling back to /content. Error:", e)
        save_dir = "/content"
except Exception:
    # Not running in Colab; save to the current working directory
    save_dir = "."
    print("‚ÑπÔ∏è Not running in Colab. Saving to the current directory.")

# Create the target folder if needed and fix the full path of the output CSV.
os.makedirs(save_dir, exist_ok=True)
out_csv = os.path.join(save_dir, "openalex_basic_paging_full.csv")

# 2) OpenAlex API configuration (basic paging)
# Endpoint that every page request is sent to.
BASE = "https://api.openalex.org/works"

# Core filtering conditions
# - open_access.is_oa:true  → only open-access works
# - authorships.countries:countries/my → at least one author with country code "MY"
# - publication_year:2023-2024 → works published between 2023 and 2024
# - type: article or review
# The entries are joined with "," into a single `filter` parameter below.
FILTERS = [
    "open_access.is_oa:true",
    "authorships.countries:countries/my",
    "publication_year:2023-2024",
    "type:types/article|types/review",
]

# Paging settings:
# - per_page ≤ 200 (OpenAlex limit)
# - MAX_PAGES ≤ 50 → at most 10,000 results (200 * 50)
PER_PAGE = 200
MAX_PAGES = 50

# Query parameters shared by every request; the download loop copies this dict
# and adds the `page` number for each call.
PARAMS_BASE = {
    "filter": ",".join(FILTERS),
    "sort": "cited_by_count:desc",
    "per_page": PER_PAGE,
    "mailto": "your.name@domain.com",  # TODO: replace with your email
}

# ---------- Helper functions ----------

def flatten_authors(authorships):
    """Join the display names of all listed authors with '; '.

    Entries whose ``author`` record is missing/null or whose ``display_name``
    is empty are skipped. An empty or missing ``authorships`` yields ``""``.
    """
    display_names = (
        (entry.get("author") or {}).get("display_name")
        for entry in authorships or []
    )
    return "; ".join(name for name in display_names if name)

def flatten_authors_affiliations(authorships):
    """
    Render every named author together with their institutions, e.g.:
        'Author A (Affil1;Affil2); Author B (AffilX)'
    Authors without any named institution appear as the bare name; entries
    without an author name are dropped entirely.
    """
    if not authorships:
        return ""

    def render(entry):
        # Institutions are gathered first, then the author name is checked.
        affiliations = [
            inst.get("display_name")
            for inst in entry.get("institutions") or []
            if inst.get("display_name")
        ]
        author_name = (entry.get("author") or {}).get("display_name") or ""
        if not author_name:
            return None
        if not affiliations:
            return author_name
        return f"{author_name} ({';'.join(affiliations)})"

    rendered = (render(entry) for entry in authorships)
    return "; ".join(text for text in rendered if text is not None)

def top_concepts(concepts, n=3):
    """
    Return the top-n concepts by score as a '; '-separated string of concept names.

    Args:
        concepts: list of concept dicts (each may carry ``score`` and
            ``display_name``); ``None`` or an empty list yields ``""``.
        n: how many of the highest-scored concepts to keep.

    A missing or null ``score`` is ranked as 0 so that a JSON ``null`` cannot
    break the sort. Concepts without a ``display_name`` are omitted from the
    output after the top-n cut, so fewer than ``n`` names may be returned.
    """
    if not concepts:
        return ""
    # `or 0` also covers an explicit None, which `.get("score", 0)` would let through.
    ranked = sorted(concepts, key=lambda c: c.get("score") or 0, reverse=True)[:n]
    return "; ".join(c["display_name"] for c in ranked if c.get("display_name"))

def topic_label(t):
    """Render a topic dict as 'Name (score=0.87)', or just 'Name' when it has no score.

    A missing/empty topic yields an empty string.
    """
    if not t:
        return ""
    label = t.get("display_name", "")
    score = t.get("score")
    return label if score is None else f"{label} (score={score:.2f})"

def collect_first_author_country_codes(authorships):
    """
    Gather the country codes attached to the first authorship entry.

    Sources merged (duplicates removed, result sorted and ';'-joined):
    - authorships[0].countries[]
    - authorships[0].institutions[].country_code
    Returns "" when there are no authorships or no codes.
    """
    if not authorships:
        return ""
    lead = authorships[0]
    codes = {code for code in lead.get("countries") or [] if code}
    codes.update(
        inst.get("country_code")
        for inst in lead.get("institutions") or []
        if inst.get("country_code")
    )
    return ";".join(sorted(codes))

def collect_institution_country_codes(work):
    """
    Gather the distinct ``country_code`` values found in ``work["institutions"]``.

    Returns the codes sorted and ';'-joined, or "" when the key is absent,
    null, or carries no codes.
    """
    raw_codes = (inst.get("country_code") for inst in work.get("institutions") or [])
    distinct = {code for code in raw_codes if code}
    return ";".join(sorted(distinct))

def collect_corresponding_author_country_codes(authorships):
    """
    Gather country codes for every authorship flagged ``is_corresponding``.

    For each such entry the following are merged:
      * authorships[i].countries[]
      * authorships[i].institutions[].country_code
    The distinct codes are returned sorted and ';'-joined ("" if none).
    """
    codes = set()
    for entry in authorships or []:
        if entry.get("is_corresponding"):
            # filter(None, ...) drops empty/None codes.
            codes.update(filter(None, entry.get("countries") or []))
            codes.update(
                filter(
                    None,
                    (inst.get("country_code") for inst in entry.get("institutions") or []),
                )
            )
    return ";".join(sorted(codes))

def pick_source_info(work):
    """
    Resolve the journal/source name and ISSN-L of a work.

    Lookup order for the journal name (first non-empty wins):
    1) host_venue
    2) primary_location.source
    3) locations[].source (first entry that has a display name)

    The ISSN-L is the first non-empty value met along the same path, so it can
    come from an earlier candidate than the journal name does.
    Returns: (journal_name, issn_l), with "" for anything not found.
    """
    host = work.get("host_venue") or {}
    journal = host.get("display_name") or ""
    issn_l = host.get("issn_l") or ""
    if journal:
        return journal, issn_l

    primary_source = (work.get("primary_location") or {}).get("source") or {}
    journal = primary_source.get("display_name") or ""
    issn_l = issn_l or primary_source.get("issn_l") or ""
    if journal:
        return journal, issn_l

    for location in work.get("locations") or []:
        candidate = location.get("source") or {}
        name = candidate.get("display_name")
        if name:
            return name, issn_l or candidate.get("issn_l") or ""

    return "", issn_l

def get_corresponding_authors(authorships):
    """Return the names of all authors flagged ``is_corresponding``, joined with ';'.

    Flagged entries without an author name are skipped; no authorships yields "".
    """
    if not authorships:
        return ""
    flagged_names = [
        (entry.get("author") or {}).get("display_name")
        for entry in authorships
        if entry.get("is_corresponding")
    ]
    return ";".join(name for name in flagged_names if name)

def get_fwci_and_percentile(work):
    """
    Extract FWCI and citation-normalized percentile information.

    Reads ``work["fwci"]`` and ``work["citation_normalized_percentile"]``
    (keys ``value``, ``is_in_top_1_percent`` / ``is_in_top1_percent`` and
    ``is_in_top_10_percent`` / ``is_in_top10_percent``).

    Returns (fwci, citation_percentile, top_1pct_flag, top_10pct_flag), all strings:
    - fwci: ``str()`` of the value, or "" when it is missing/null/unsupported.
    - citation_percentile: the value with 6 decimals, or "" when not numeric.
    - flags: "Yes" if any spelling of the key is truthy, "No" if at least one
      spelling is present with a non-null falsy value, "" if none is present.
    """
    fwci = work.get("fwci", "")
    if not isinstance(fwci, (int, float, str)):
        fwci = ""

    cnp = work.get("citation_normalized_percentile") or {}
    perc = cnp.get("value")
    citation_percentile = f"{perc:.6f}" if isinstance(perc, (int, float)) else ""

    def flag_label(*keys):
        # Inspect every spelling separately: chaining the lookups with `or`
        # would turn "False, alternate key missing" into None and lose the "No".
        values = [cnp.get(key) for key in keys]
        if any(values):
            return "Yes"
        if any(value is not None for value in values):
            return "No"
        return ""

    citation_top_1pct = flag_label("is_in_top_1_percent", "is_in_top1_percent")
    citation_top_10pct = flag_label("is_in_top_10_percent", "is_in_top10_percent")

    return str(fwci), citation_percentile, citation_top_1pct, citation_top_10pct

def parse_apc_list(work):
    """
    Turn ``work["apc_list"]`` into a readable '; '-joined string such as
        '2000 USD; 1500 EUR'

    The field may be a list of entries, a single dict, or a bare
    string/number; any other value (including None) yields "".
    Entries that render to an empty string are dropped.
    """

    def render(entry):
        # Dict entries: prefer "<value> <currency>", then value alone, then
        # currency alone, and finally the dict's own string form.
        if isinstance(entry, dict):
            amount = entry.get("value")
            currency = entry.get("currency", "")
            if amount is not None:
                return f"{amount} {currency}" if currency else str(amount)
            return currency if currency else str(entry)
        if isinstance(entry, str):
            return entry.strip()
        return str(entry)

    apc = work.get("apc_list")
    if isinstance(apc, list):
        candidates = apc
    elif isinstance(apc, (dict, str, int, float)):
        candidates = [apc]
    else:
        candidates = []

    rendered = (render(candidate) for candidate in candidates)
    return "; ".join(text for text in rendered if text)

def parse_sdg_labels(work):
    """
    Extract SDG information from the 'sustainable_development_goals' field.

    Returns a string like:
        'SDG 3: Good health and well-being (0.95); SDG 4: Quality education (0.87)'

    Accepted shapes of the field:
    - list of dicts (each with ``id``, ``display_name``, ``score``): every
      dict that has a name is formatted and the labels are '; '-joined;
      non-dict items are ignored.
    - single dict: formatted the same way.
    - string: returned stripped.
    Anything else, or a missing/empty field, yields "".
    """

    def format_goal(goal):
        """Format one SDG dict as 'SDG <n>: <name> (<score>)'; "" if it has no name."""
        if not isinstance(goal, dict):
            return ""
        name = goal.get("display_name", "")
        if not name:
            return ""

        # The SDG number is the last path segment of the id URL,
        # e.g. "https://metadata.un.org/sdg/3" -> "3". Non-string ids are ignored.
        goal_id = goal.get("id", "")
        number = ""
        if goal_id and isinstance(goal_id, str):
            number = goal_id.rstrip("/").split("/")[-1]

        label = f"SDG {number}: {name}" if number else name

        # Only numeric scores can be rendered with two decimals; anything else
        # is left out instead of raising inside the format spec.
        score = goal.get("score")
        if isinstance(score, (int, float)):
            return f"{label} ({score:.2f})"
        return label

    sdg_field = work.get("sustainable_development_goals")
    if not sdg_field:
        return ""

    if isinstance(sdg_field, list):
        labels = (format_goal(goal) for goal in sdg_field)
        return "; ".join(label for label in labels if label)
    if isinstance(sdg_field, dict):
        return format_goal(sdg_field)
    if isinstance(sdg_field, str):
        return sdg_field.strip()
    return ""

# 3) Download all pages (basic paging)
# all_results accumulates the raw work dicts from every page;
# total_downloaded is a running count that always equals len(all_results).
all_results = []
total_downloaded = 0

for page in range(1, MAX_PAGES + 1):
    # Copy the shared parameters so PARAMS_BASE itself is never mutated.
    params = PARAMS_BASE.copy()
    params["page"] = page

    print(f"‚ñ∂Ô∏è Requesting page {page} ...")
    resp = requests.get(BASE, params=params, timeout=60)
    # Any non-200 answer ends the download; pages collected so far are kept.
    if resp.status_code != 200:
        print(f"‚ö†Ô∏è HTTP {resp.status_code} on page {page}: {resp.text[:200]}")
        break

    payload = resp.json()
    results = payload.get("results", [])

    # An empty page means there is nothing left to fetch.
    if not results:
        print(f"‚úÖ No more results at page {page}. Stopping.")
        break

    all_results.extend(results)
    total_downloaded += len(results)
    print(f"   Page {page}: {len(results)} records, total={total_downloaded}")

    # If this page returned fewer than PER_PAGE results, it is probably the last page
    if len(results) < PER_PAGE:
        print("‚ÑπÔ∏è Fewer results than 'per_page'; likely reached the final page.")
        break

print(f"\nüì• Finished downloading. Total records collected: {len(all_results)}")

# 4) Write to CSV
# Column names for the header row. The order must match, position by position,
# the list of values written for each work in the export loop below.
out_headers = [
    "openalex_id", "doi", "title", "year", "type", "language",
    "cited_by_count", "journal", "issn_l",
    "is_oa", "oa_status", "oa_url", "license", "version",
    "first_author", "authors_affiliations", "top3_concepts",
    "primary_topic_id", "primary_topic_name",
    "primary_topic_domain", "primary_topic_field", "primary_topic_subfield",
    "topics_top5",
    # Countries
    "first_author_country_codes",
    # Corresponding authors
    "corresponding_authors",
    "corresponding_author_country_codes",
    # APC
    "apc_list_values",
    # Citation metrics
    "fwci", "citation_percentile", "citation_top_1pct", "citation_top_10pct",
    # SDG labels
    "sdg_labels",
]

# Write the header row followed by one flattened row per downloaded work.
# Nested fields are read with `... or {}` / `... or []` so that an explicit
# JSON null is handled the same way as a missing key.
with open(out_csv, "w", newline="", encoding="utf-8") as f:
    w = csv.writer(f)
    w.writerow(out_headers)

    for wobj in all_results:
        # --- Identification and basic bibliographic fields ---
        ids = wobj.get("ids") or {}
        doi = ids.get("doi", "")
        title = wobj.get("title") or wobj.get("display_name", "")
        year = wobj.get("publication_year", "")
        wtype = wobj.get("type", "")
        lang = wobj.get("language", "")
        cited = wobj.get("cited_by_count", 0)
        journal, issn_l = pick_source_info(wobj)

        # --- Open-access information ---
        oa = wobj.get("open_access") or {}
        is_oa = "Yes" if oa.get("is_oa") else "No"
        oa_status = oa.get("oa_status", "")
        oa_url = oa.get("oa_url", "")

        boa = wobj.get("best_oa_location") or {}
        license_ = boa.get("license", "")
        version = boa.get("version", "")

        # --- Authors and concepts ---
        authorships = wobj.get("authorships") or []
        # `author` may be null in the payload; guard it like the helper functions do.
        first_author = (
            (authorships[0].get("author") or {}).get("display_name") or ""
            if authorships
            else ""
        )
        authors_aff = flatten_authors_affiliations(authorships)
        concepts = wobj.get("concepts") or []
        top3 = top_concepts(concepts)

        # --- Topics: the primary topic, plus up to five other topics by score ---
        primary = wobj.get("primary_topic") or {}
        topics_list = wobj.get("topics") or []
        primary_topic_id = primary.get("id", "")
        primary_topic_name = primary.get("display_name", "")
        primary_topic_domain = (primary.get("domain") or {}).get("display_name", "")
        primary_topic_field = (primary.get("field") or {}).get("display_name", "")
        primary_topic_subfield = (primary.get("subfield") or {}).get("display_name", "")

        others = [t for t in topics_list if (t.get("id") != primary_topic_id)]
        # `or 0` keeps the sort from failing when a topic's score is null.
        others_sorted = sorted(others, key=lambda t: t.get("score") or 0, reverse=True)[:5]
        topics_top5 = "; ".join([topic_label(t) for t in others_sorted])

        # --- Countries, corresponding authors, APC, citation metrics, SDGs ---
        first_author_cc = collect_first_author_country_codes(authorships)
        corresponding_authors = get_corresponding_authors(authorships)
        corr_author_cc = collect_corresponding_author_country_codes(authorships)
        apc_list_values = parse_apc_list(wobj)
        fwci, citation_percentile, top1, top10 = get_fwci_and_percentile(wobj)
        sdg_labels = parse_sdg_labels(wobj)

        # Values are listed in the same order as out_headers.
        w.writerow([
            wobj.get("id", ""), doi, title, year, wtype, lang,
            cited, journal, issn_l,
            is_oa, oa_status, oa_url, license_, version,
            first_author, authors_aff, top3,
            primary_topic_id, primary_topic_name,
            primary_topic_domain, primary_topic_field, primary_topic_subfield,
            topics_top5,
            first_author_cc,
            corresponding_authors,
            corr_author_cc,
            apc_list_values,
            fwci, citation_percentile, top1, top10,
            sdg_labels,
        ])

# Report how many records were exported and where the file was written.
print(f"\n‚úÖ Export completed: {len(all_results)} records written.")
print("üìÑ CSV file saved at:", out_csv)

# Optional: if not using Drive in Colab, automatically trigger a download
# Best effort only: any failure here (for example google.colab not being
# importable) is deliberately ignored, because the CSV is already on disk.
try:
    if not USE_DRIVE:
        from google.colab import files  # type: ignore
        files.download(out_csv)
except Exception:
    pass

Mounted at /content/drive
‚úÖ Google Drive mounted. Saving to: /content/drive/MyDrive/OpenAlex
‚ñ∂Ô∏è Requesting page 1 ...
   Page 1: 200 records, total=200
‚ñ∂Ô∏è Requesting page 2 ...
   Page 2: 200 records, total=400
‚ñ∂Ô∏è Requesting page 3 ...
   Page 3: 200 records, total=600
‚ñ∂Ô∏è Requesting page 4 ...
   Page 4: 200 records, total=800
‚ñ∂Ô∏è Requesting page 5 ...
   Page 5: 200 records, total=1000
‚ñ∂Ô∏è Requesting page 6 ...
   Page 6: 200 records, total=1200
‚ñ∂Ô∏è Requesting page 7 ...
   Page 7: 200 records, total=1400
‚ñ∂Ô∏è Requesting page 8 ...
   Page 8: 200 records, total=1600
‚ñ∂Ô∏è Requesting page 9 ...
   Page 9: 200 records, total=1800
‚ñ∂Ô∏è Requesting page 10 ...
   Page 10: 200 records, total=2000
‚ñ∂Ô∏è Requesting page 11 ...
   Page 11: 200 records, total=2200
‚ñ∂Ô∏è Requesting page 12 ...
   Page 12: 200 records, total=2400
‚ñ∂Ô∏è Requesting page 13 ...
   Page 13: 200 records, total=2600
‚ñ∂Ô∏è Requesting page 14 ...
   Page 14: 200 records, total=2800
‚ñ∂