In [32]:
import requests

def query_openalex_by_id(arxiv_id, timeout=30):
    """Fetch the OpenAlex work record for an arXiv paper.

    The paper is looked up through its arXiv DOI, ``10.48550/arxiv.<arxiv_id>``.

    Args:
        arxiv_id: arXiv identifier, e.g. ``"2312.05942"``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` waits indefinitely on a stalled connection.

    Returns:
        The decoded JSON body on HTTP 200, otherwise ``None``. ``None`` is also
        returned when the request itself fails or the body is not valid JSON;
        the reason is printed in every failure case.
    """
    url = f"https://api.openalex.org/works/doi:10.48550/arxiv.{arxiv_id}"
    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Connection errors, timeouts, etc. are reported like any other failure
        # instead of propagating to the caller.
        print(f"Request failed for {arxiv_id}: {exc}")
        return None

    if response.status_code == 200:
        try:
            return response.json()
        except ValueError as exc:
            # A 200 response with a non-JSON body is treated as a failed lookup.
            print(f"Invalid JSON for {arxiv_id}: {exc}")
            return None

    print(f"Error {response.status_code}: {response.text}")
    return None


def extract_affiliations(response_data):
    """Collect per-author affiliation details from a work record.

    Args:
        response_data: Mapping that may hold an ``"authorships"`` list. ``None``
            or a mapping without that key is treated as having no authors.

    Returns:
        A list with one dict per authorship entry, each with the keys
        ``"author_name"`` (``"Unknown"`` when no name is available),
        ``"raw_affiliations"`` and ``"institutions"``. The two lists contain
        only non-empty strings, so they can always be passed to ``str.join``.
    """
    affiliations = []
    # `or` fallbacks cover both a missing key and an explicit null value.
    for authorship in (response_data or {}).get("authorships") or []:
        author_info = authorship.get("author") or {}
        author_name = author_info.get("display_name") or "Unknown"

        raw_affiliations = [
            text
            for text in authorship.get("raw_affiliation_strings") or []
            if text
        ]
        # Drop institutions without a usable display name; a None entry would
        # make a later ', '.join(...) raise TypeError.
        institutions = [
            institution["display_name"]
            for institution in authorship.get("institutions") or []
            if institution and institution.get("display_name")
        ]

        affiliations.append({
            "author_name": author_name,
            "raw_affiliations": raw_affiliations,
            "institutions": institutions
        })
    return affiliations

# Example usage: look up one arXiv paper and list each author's affiliations.
arxiv_id = "2312.05942"  # Replace with your arXiv ID
response = query_openalex_by_id(arxiv_id)

# A falsy response means the lookup returned nothing, so there is nothing to list.
if response:
    affiliations = extract_affiliations(response)
    for idx, aff in enumerate(affiliations, start=1):
        print("Author " + str(idx) + ": " + f"{aff['author_name']}")
        # Empty lists are rendered as the literal text 'None'.
        print("  Raw Affiliations: " + (', '.join(aff['raw_affiliations']) if aff['raw_affiliations'] else 'None'))
        print("  Institutions: " + (', '.join(aff['institutions']) if aff['institutions'] else 'None'))


Author 1: Jayashree Behera
  Raw Affiliations: None
  Institutions: None
Author 2: Mehdi Rezaie
  Raw Affiliations: None
  Institutions: None
Author 3: Lado Samushia
  Raw Affiliations: None
  Institutions: None
Author 4: J. Ereza
  Raw Affiliations: None
  Institutions: None


In [41]:
import requests
import csv

def query_openalex_by_id(arxiv_id, timeout=30):
    """Query the OpenAlex API for an arXiv paper via its arXiv DOI.

    Args:
        arxiv_id: arXiv identifier, e.g. ``"2312.05803"``.
        timeout: Seconds to wait for the server before giving up. Without a
            timeout, ``requests`` waits indefinitely on a stalled connection,
            which would stall a whole batch run.

    Returns:
        The decoded JSON body on HTTP 200, otherwise ``None``. Non-200 responses
        stay silent; network failures and undecodable bodies also return
        ``None`` but are printed so they are not mistaken for missing records.
    """
    url = f"https://api.openalex.org/works/doi:10.48550/arxiv.{arxiv_id}"
    try:
        response = requests.get(url, timeout=timeout)
        if response.status_code == 200:
            return response.json()
    except (requests.RequestException, ValueError) as exc:
        # One failed lookup should not abort a loop over many papers.
        # ValueError covers a 200 response whose body is not valid JSON.
        print(f"Request failed for {arxiv_id}: {exc}")
    return None

def extract_affiliations(response_data):
    """Extract affiliation information from an OpenAlex response.

    Only authors with at least one raw affiliation string or one named
    institution are kept.

    Args:
        response_data: Mapping that may hold an ``"authorships"`` list, or
            ``None`` when the lookup failed.

    Returns:
        A list of dicts with the keys ``"author"`` (``"Unknown"`` when no name
        is available), ``"raw_affiliations"`` and ``"institutions"``, or
        ``None`` when no author has any affiliation data. The two lists contain
        only non-empty strings, so they can always be passed to ``str.join``.
    """
    if not response_data:
        return None

    affiliations = []
    # `or` fallbacks cover both a missing key and an explicit null value.
    for authorship in response_data.get("authorships") or []:
        author_info = authorship.get("author") or {}
        author_name = author_info.get("display_name") or "Unknown"

        raw_affiliations = [
            text
            for text in authorship.get("raw_affiliation_strings") or []
            if text
        ]
        # Drop institutions without a usable display name; a None entry would
        # count as affiliation data and later break ', '.join(...).
        institutions = [
            institution["display_name"]
            for institution in authorship.get("institutions") or []
            if institution and institution.get("display_name")
        ]

        if raw_affiliations or institutions:
            affiliations.append({
                "author": author_name,
                "raw_affiliations": raw_affiliations,
                "institutions": institutions
            })

    return affiliations if affiliations else None

def preprocess_csv(input_csv):
    """Read a CSV file and keep only the first row seen for each paper ID.

    Args:
        input_csv: Path to a UTF-8 CSV file with a header row that includes a
            ``paper_id`` column.

    Returns:
        A list of row dicts (as produced by ``csv.DictReader``) in file order,
        with later rows that repeat an already-seen ``paper_id`` dropped.

    Raises:
        KeyError: If the file has no ``paper_id`` column.
    """
    seen_paper_ids = set()
    unique_rows = []

    # newline="" leaves line-ending handling to the csv module, so line breaks
    # embedded in quoted fields are read back unchanged.
    with open(input_csv, mode="r", encoding="utf-8", newline="") as file:
        for row in csv.DictReader(file):
            paper_id = row["paper_id"]
            if paper_id in seen_paper_ids:
                continue
            seen_paper_ids.add(paper_id)
            unique_rows.append(row)

    return unique_rows

def process_papers(input_csv, output_txt, max_papers=50):
    """Look up affiliations for the papers in a CSV and write a text report.

    The CSV is de-duplicated by paper ID, truncated to the first ``max_papers``
    rows, and each remaining paper is queried one by one. Papers for which
    affiliation data comes back are written to ``output_txt``; all others are
    only counted.

    Args:
        input_csv: Path to the CSV file holding a ``paper_id`` column.
        output_txt: Path of the text report to create or overwrite.
        max_papers: Upper bound on the number of unique papers to process.

    Returns:
        A tuple ``(total_papers, papers_without_affiliations,
        papers_with_affiliations)``.
    """
    # De-duplicate first so the limit applies to unique paper IDs.
    selected_rows = preprocess_csv(input_csv)[:max_papers]
    total_papers = len(selected_rows)

    papers_with_affiliations = []
    no_affiliation_count = 0

    for position, row in enumerate(selected_rows, start=1):
        arxiv_id = row["paper_id"]
        print(f"Processing paper {position}/{total_papers}: {arxiv_id}")

        affiliations = extract_affiliations(query_openalex_by_id(arxiv_id))
        if not affiliations:
            # Covers failed lookups as well as records without affiliation data.
            no_affiliation_count += 1
            continue
        papers_with_affiliations.append({"paper_id": arxiv_id, "affiliations": affiliations})

    # Write one block per paper, with three lines per author and a blank
    # separator line after each paper.
    with open(output_txt, mode="w", encoding="utf-8") as output_file:
        for paper in papers_with_affiliations:
            output_file.write(f"Paper ID: {paper['paper_id']}\n")
            for aff in paper["affiliations"]:
                output_file.write(f"  Author: {aff['author']}\n")
                raw_text = ', '.join(aff['raw_affiliations']) if aff['raw_affiliations'] else 'None'
                output_file.write(f"  Raw Affiliations: {raw_text}\n")
                institution_text = ', '.join(aff['institutions']) if aff['institutions'] else 'None'
                output_file.write(f"  Institutions: {institution_text}\n")
            output_file.write("\n")

    with_affiliation_count = len(papers_with_affiliations)

    # Report the run statistics.
    print(f"Total papers processed: {total_papers}")
    print(f"Papers without affiliations: {no_affiliation_count}")
    print(f"Papers with affiliations: {with_affiliation_count}")

    return total_papers, no_affiliation_count, with_affiliation_count

# Usage: run the affiliation lookup over the CSV and print a short summary.
input_csv = "2312_scopus_16493.csv"  # Input CSV file
output_txt = "papers_with_affiliations_test.txt"  # Output file for papers with affiliations
total, no_affiliation, with_affiliation = process_papers(
    input_csv,
    output_txt,
    max_papers=5000,
)

# One print call; sep="\n" puts each part on its own line.
print(
    "\nSummary:",
    f"Out of {total} papers:",
    f" - {no_affiliation} papers have no affiliation information.",
    f" - {with_affiliation} papers have affiliation information (saved to {output_txt}).",
    sep="\n",
)


Processing paper 1/5000: 2312.05803
Processing paper 2/5000: 2312.10339
Processing paper 3/5000: 2312.15320
Processing paper 4/5000: 2312.15540
Processing paper 5/5000: 2312.11880
Processing paper 6/5000: 2312.10033
Processing paper 7/5000: 2312.09814
Processing paper 8/5000: 2312.13645
Processing paper 9/5000: 2312.00717
Processing paper 10/5000: 2312.00135
Processing paper 11/5000: 2312.07813
Processing paper 12/5000: 2312.10619
Processing paper 13/5000: 2312.08210
Processing paper 14/5000: 2312.02297
Processing paper 15/5000: 2312.05341
Processing paper 16/5000: 2312.03144
Processing paper 17/5000: 2312.04820
Processing paper 18/5000: 2312.10315
Processing paper 19/5000: 2312.14361
Processing paper 20/5000: 2312.15318
Processing paper 21/5000: 2312.13151
Processing paper 22/5000: 2312.11780
Processing paper 23/5000: 2312.03797
Processing paper 24/5000: 2312.02371
Processing paper 25/5000: 2312.13603
Processing paper 26/5000: 2312.05988
Processing paper 27/5000: 2312.11035
Processing

Summary:
Out of 5000 papers:
 - 4377 papers have no affiliation information.
 - 623 papers have affiliation information (saved to papers_with_affiliations_test.txt).