# In [5]:
import requests
import pandas as pd
import ast

def fetch_openalex_data(query, per_page=50):
    """Fetch one page of works from the OpenAlex ``/works`` endpoint.

    The request sends a fixed ``filter`` made of ``title.search``,
    ``abstract.search`` and ``has_abstract:true`` clauses.

    Args:
        query: Currently unused. The search expression is hard-coded in
            the filter below; the parameter is kept so existing callers
            keep working.
            NOTE(review): decide whether the filter should be built from
            this value instead.
        per_page: Number of results requested per page.

    Returns:
        The list stored under the response's ``"results"`` key, or an
        empty list when the request fails, the status is not 200, the
        body is not valid JSON, or no results are present.
    """
    base_url = "https://api.openalex.org/works"
    params = {
        "filter": "title.search:(polarization OR divisive) AND language,abstract.search:(polarization OR divisive) AND language,has_abstract:true",
        "per_page": per_page,
    }

    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(base_url, params=params, timeout=30)
    except requests.RequestException as exc:
        # Report transport failures the same way as HTTP errors below.
        print("Error fetching data:", exc)
        return []

    if response.status_code != 200:
        print("Error fetching data:", response.status_code, response.text)
        return []

    try:
        data = response.json()
    except ValueError as exc:
        # requests' JSON decode errors derive from ValueError.
        print("Error fetching data:", exc)
        return []

    print("API Response:", data)
    # ``or []`` also covers a "results" key that is present but null.
    return data.get("results") or []

def _author_name(authorship):
    """Return the author's display name from one authorship entry, or "N/A"."""
    author = authorship.get("author") if isinstance(authorship, dict) else None
    name = author.get("display_name") if isinstance(author, dict) else None
    return name or "N/A"


def extract_data(articles):
    """Flatten OpenAlex work records into rows for tabular export.

    Missing keys and keys whose value is null are both replaced by the
    placeholder ("N/A", or 0 for the citation count).

    Args:
        articles: Iterable of work dicts as returned by the API.

    Returns:
        A ``(data, raw_abstracts)`` tuple where each ``data`` row is
        ``[title, publication_date, authors, journal, cited_by_count,
        has_abstract]`` and each ``raw_abstracts`` row is
        ``[title, has_abstract, abstract_inverted_index or "N/A"]``.
    """
    data = []
    raw_abstracts = []
    for article in articles or []:
        # ``or`` (rather than a .get default) also covers explicit nulls.
        title = article.get("display_name") or "N/A"
        publication_date = article.get("publication_date") or "N/A"

        # Authors: comma-separated display names, "N/A" per unnamed author.
        authorships = article.get("authorships")
        if authorships is None:
            authors = "N/A"
        else:
            authors = ", ".join(_author_name(entry) for entry in authorships)

        # Journal: only available when primary_location and its source exist.
        journal = "N/A"
        primary_location = article.get("primary_location")
        if isinstance(primary_location, dict):
            source = primary_location.get("source")
            if isinstance(source, dict):
                journal = source.get("display_name") or "N/A"

        cited_by_count = article.get("cited_by_count") or 0  # citation count

        # Decide whether an abstract exists.
        # NOTE(review): OpenAlex documents has_abstract as a filter; work
        # records may not carry the flag, so fall back to the presence of
        # the inverted index when the flag is absent — confirm against API.
        inverted_index = article.get("abstract_inverted_index")
        has_abstract = article.get("has_abstract")
        if has_abstract is None:
            has_abstract = bool(inverted_index)
        else:
            has_abstract = bool(has_abstract)

        raw_abstract = (inverted_index or "N/A") if has_abstract else "N/A"
        raw_abstracts.append([title, has_abstract, raw_abstract])

        data.append([title, publication_date, authors, journal, cited_by_count, has_abstract])

    return data, raw_abstracts

def save_to_excel(data, raw_abstracts, filename="divisive_language_articles.xlsx", abstract_filename="abstract_inverted_index.xlsx"):
    """Write article rows and raw-abstract rows to two Excel workbooks.

    Args:
        data: Rows of six values matching the article column headers below.
        raw_abstracts: Rows of three values matching the abstract headers.
        filename: Output path for the article workbook.
        abstract_filename: Output path for the raw-abstract workbook.
    """
    article_columns = [
        "Title",
        "Publication Date",
        "Authors",
        "Journal",
        "Cited By Count",
        "Has Abstract",
    ]
    abstract_columns = ["Title", "Has Abstract", "Abstract Inverted Index"]

    # Article metadata workbook (no index column).
    articles_frame = pd.DataFrame(data, columns=article_columns)
    articles_frame.to_excel(filename, index=False)
    print(f"Saved {len(data)} articles to {filename}")

    # Raw abstract workbook, written after the metadata one.
    abstracts_frame = pd.DataFrame(raw_abstracts, columns=abstract_columns)
    abstracts_frame.to_excel(abstract_filename, index=False)
    print(f"Saved raw abstracts to {abstract_filename}")

# Script entry point: fetch works, flatten them, and export to Excel.
if __name__ == "__main__":
    # NOTE(review): fetch_openalex_data as written sends a hard-coded filter
    # and does not read this value — confirm which expression is intended.
    query = "polarization OR divisive AND language"
    # List of work records (empty when the request fails).
    articles = fetch_openalex_data(query)
    # data: one metadata row per work; raw_abstracts: one abstract row per work.
    data, raw_abstracts = extract_data(articles)
    # Writes both workbooks using the default file names.
    save_to_excel(data, raw_abstracts)

API Response: {'meta': {'count': 26, 'db_response_time_ms': 152, 'page': 1, 'per_page': 50, 'groups_count': None}, 'results': [{'id': 'https://openalex.org/W2066387397', 'doi': 'https://doi.org/10.1353/eca.2012.0017', 'title': 'Political Polarization and the Dynamics of Political Language: Evidence from 130 Years of Partisan Speech', 'display_name': 'Political Polarization and the Dynamics of Political Language: Evidence from 130 Years of Partisan Speech', 'relevance_score': 253.99365, 'publication_year': 2012, 'publication_date': '2012-09-01', 'ids': {'openalex': 'https://openalex.org/W2066387397', 'doi': 'https://doi.org/10.1353/eca.2012.0017', 'mag': '2066387397'}, 'language': 'en', 'primary_location': {'is_oa': False, 'landing_page_url': 'https://doi.org/10.1353/eca.2012.0017', 'pdf_url': None, 'source': {'id': 'https://openalex.org/S4210173904', 'display_name': 'Brookings Papers on Economic Activity', 'issn_l': '0007-2303', 'issn': ['0007-2303', '1533-4465'], 'is_oa': False, 'is_i