In [1]:
import requests
import pandas as pd
import time
from typing import Union, List

In [9]:
# -------------------------------
# 🎯 Target Domains and Concepts
# -------------------------------
primary_domains = {
    "Biology": "C86803240",
    "Psychology": "C15744967",
    "Medicine": "C71924100",
    "Engineering": "C127413603",
    "Social Sciences": "C17744445"
}

ml_ai_concept_ids = [
    "C119857082",   # Artificial Intelligence
    "C154945302",   # Machine Learning
    "C203014093"    # Deep Learning
]

required_keywords = ["Artificial Intelligence", "Machine Learning", "Deep Learning"]

# -------------------------------
# 🧠 Decode Abstract
# -------------------------------
def decode_abstract(abstract_index):
    if not isinstance(abstract_index, dict):
        return ""
    words = sorted((idx, word) for word, indices in abstract_index.items() for idx in indices)
    return " ".join(word for _, word in words)

# -------------------------------
# ✅ Paper Filter Logic
# -------------------------------
def is_valid_paper(work, ml_ai_ids):
    if not work.get("concepts"):
        return False

    top_concept_id = work["concepts"][0]["id"].replace("https://openalex.org/", "")
    if top_concept_id in ml_ai_ids:
        return False

    abstract_text = decode_abstract(work.get("abstract_inverted_index"))
    if len(abstract_text.split()) < 200:
        return False

    return True

# -------------------------------
# 📦 Parse Work Metadata
# -------------------------------
def parse_work(work, domain_name):
    return {
        "title": work.get("title"),
        "abstract": decode_abstract(work.get("abstract_inverted_index")),
        "publication_year": work.get("publication_year"),
        "oa_pdf": work.get("primary_location", {}).get("pdf_url"),
        "publisher_url": work.get("primary_location", {}).get("landing_page_url"),
        "concepts": "; ".join([c["display_name"] for c in work.get("concepts", [])]),
        "openalex_id": work.get("id"),
        "domain": domain_name
    }

# -------------------------------
# 🚀 Fetch Data from OpenAlex
# -------------------------------
def fetch_openalex_data(domain_name, domain_id, year, ml_ai_concept_ids, per_page=100, max_pages=5):
    results = []
    for ml_ai_id in ml_ai_concept_ids:
        for page in range(1, max_pages + 1):
            filter_query = f"concepts.id:{domain_id},concepts.id:{ml_ai_id},publication_year:{year}"
            params = {
                "filter": filter_query,
                "per-page": per_page,
                "page": page
            }

            print(f"📦 Fetching page {page} for {domain_name} ({year}) with ML/AI ID {ml_ai_id}...")

            try:
                response = requests.get("https://api.openalex.org/works", params=params)
                response.raise_for_status()
                data = response.json()

                for work in data.get("results", []):
                    if is_valid_paper(work, ml_ai_concept_ids):
                        results.append(parse_work(work, domain_name))

                # ⏳ Fixed wait between requests
                time.sleep(2)

            except Exception as e:
                print(f"❌ Error on page {page}: {e}")
                continue

    return results

# -------------------------------
# 🔁 Collect Data for All Years
# -------------------------------
def collect_papers_for_years(years: Union[str, List[str]], ml_ai_ids, per_page=100, max_pages=5):
    if isinstance(years, str):
        years = [years]

    all_results = []
    for year in years:
        for domain_name, domain_id in primary_domains.items():
            batch = fetch_openalex_data(domain_name, domain_id, year, ml_ai_ids, per_page, max_pages)
            all_results.extend(batch)

    return all_results

# -------------------------------
# 💾 Final Save with Filtering
# -------------------------------
def save_results_to_csv(results, output_file):
    df = pd.DataFrame(results)
    df = df[df["concepts"].apply(lambda x: any(term.lower() in x.lower() for term in required_keywords))]

    df.to_csv(output_file, index=False)
    print(f"\n✅ Final dataset saved to '{output_file}'")
    print(f"📊 Total papers collected after filtering: {len(df)}")
    print("📊 Paper count by primary domain:")
    print(df["domain"].value_counts())

# -------------------------------
# 🎬 Main Run Pipeline
# -------------------------------
if __name__ == "__main__":
    target_years = ["2024"]
    output_filename = "openalex non-cs.csv"

    final_results = collect_papers_for_years(target_years, ml_ai_concept_ids)
    save_results_to_csv(final_results, output_filename)

📦 Fetching page 1 for Biology (2023) with ML/AI ID C119857082...
📦 Fetching page 2 for Biology (2023) with ML/AI ID C119857082...
📦 Fetching page 3 for Biology (2023) with ML/AI ID C119857082...
📦 Fetching page 4 for Biology (2023) with ML/AI ID C119857082...
📦 Fetching page 5 for Biology (2023) with ML/AI ID C119857082...
📦 Fetching page 1 for Biology (2023) with ML/AI ID C154945302...
📦 Fetching page 2 for Biology (2023) with ML/AI ID C154945302...
📦 Fetching page 3 for Biology (2023) with ML/AI ID C154945302...
📦 Fetching page 4 for Biology (2023) with ML/AI ID C154945302...
📦 Fetching page 5 for Biology (2023) with ML/AI ID C154945302...
📦 Fetching page 1 for Biology (2023) with ML/AI ID C203014093...
📦 Fetching page 2 for Biology (2023) with ML/AI ID C203014093...
📦 Fetching page 3 for Biology (2023) with ML/AI ID C203014093...
📦 Fetching page 4 for Biology (2023) with ML/AI ID C203014093...
📦 Fetching page 5 for Biology (2023) with ML/AI ID C203014093...
📦 Fetching page 1 for Psy

In [10]:
# Load the dataset
input_file = "openalex non-cs.csv"
df = pd.read_csv(input_file)

# Group by publication year and save each group as a separate CSV
for year, group in df.groupby("publication_year"):
    output_file = f"openalex_non_cs_{year}.csv"
    group.to_csv(output_file, index=False)
    print(f"✅ Saved: {output_file} ({len(group)} rows)")

✅ Saved: openalex_non_cs_2023.csv (1505 rows)
✅ Saved: openalex_non_cs_2024.csv (1606 rows)
