In [6]:
import requests

def query_openalex_by_id(arxiv_id):
    url = f"https://api.openalex.org/works/doi:10.48550/arxiv.{arxiv_id}"
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()
    else:
        print(f"Error {response.status_code}: {response.text}")
        return None

def extract_affiliations(response_data):
    # Extract affiliation information from the response
    affiliations = []
    if "authorships" in response_data:
        for author in response_data["authorships"]:
            author_name = author["author"]["display_name"] if author.get("author") else "Unknown"
            raw_affiliations = author.get("raw_affiliation_strings", [])
            institutions_info = [
                {
                    "display_name": institution.get("display_name"),
                    "ror": institution.get("ror")
                } 
                for institution in author.get("institutions", [])
            ]
            
            affiliations.append({
                "author_name": author_name,
                "raw_affiliations": raw_affiliations,
                "institutions_info": institutions_info
            })
    return affiliations

# Example usage
arxiv_id = "2312.14731"  # Replace with your arXiv ID
response = query_openalex_by_id(arxiv_id)
if response:
    affiliations = extract_affiliations(response)
    for idx, aff in enumerate(affiliations, start=1):
        print(f"Author {idx}: {aff['author_name']}")
        print(f"  Raw Affiliations: {', '.join(aff['raw_affiliations']) if aff['raw_affiliations'] else 'None'}")
        if aff['institutions_info']:
            for inst in aff['institutions_info']:
                print(f"  Institution: {inst['display_name']} (ROR ID: {inst['ror']})")
        else:
            print("  Institutions: None")


Author 1: Mingxiang Gao
  Raw Affiliations: Microwaves and Antennas Group, Ecole Polytechnique Federale de Lausanne, Lausanne, Switzerland
  Institution: École Polytechnique Fédérale de Lausanne (ROR ID: https://ror.org/02s376052)
Author 2: Sujith Raman
  Raw Affiliations: Microwaves and Antennas Group, Ecole Polytechnique Federale de Lausanne, Lausanne, Switzerland, Radio Systems Group, University of Twente, Enschede, Netherlands
  Institution: University of Twente (ROR ID: https://ror.org/006hf6230)
  Institution: École Polytechnique Fédérale de Lausanne (ROR ID: https://ror.org/02s376052)
Author 3: Zvonimir Šipuš
  Raw Affiliations: Faculty of Electrical Engineering and Computing, University of Zagreb, Zagreb, Croatia
  Institution: University of Zagreb (ROR ID: https://ror.org/00mv6sv71)
Author 4: Anja K. Skrivervik
  Raw Affiliations: Microwaves and Antennas Group, Ecole Polytechnique Federale de Lausanne, Lausanne, Switzerland
  Institution: École Polytechnique Fédérale de Lausan

In [41]:
import requests
import csv

def query_openalex_by_id(arxiv_id):
    """Query OpenAlex API using arXiv ID."""
    url = f"https://api.openalex.org/works/doi:10.48550/arxiv.{arxiv_id}"
    response = requests.get(url)
    if response.status_code == 200:
        return response.json()
    else:
        return None

def extract_affiliations(response_data):
    """Extract affiliation information from OpenAlex response."""
    if not response_data or "authorships" not in response_data:
        return None

    affiliations = []
    for author in response_data["authorships"]:
        author_name = author["author"]["display_name"] if author.get("author") else "Unknown"
        raw_affiliations = author.get("raw_affiliation_strings", [])
        institutions = [
            institution.get("display_name") 
            for institution in author.get("institutions", [])
        ]
        if raw_affiliations or institutions:
            affiliations.append({
                "author": author_name,
                "raw_affiliations": raw_affiliations,
                "institutions": institutions
            })

    return affiliations if affiliations else None

def preprocess_csv(input_csv):
    """Preprocess CSV to remove duplicate paper IDs."""
    unique_paper_ids = set()
    unique_rows = []

    with open(input_csv, mode="r", encoding="utf-8") as file:
        reader = csv.DictReader(file)
        for row in reader:
            paper_id = row["paper_id"]
            if paper_id not in unique_paper_ids:
                unique_paper_ids.add(paper_id)
                unique_rows.append(row)

    return unique_rows

def process_papers(input_csv, output_txt, max_papers=50):
    """Process papers and collect statistics, limited to max_papers."""
    # Preprocess CSV to ensure unique paper IDs
    rows = preprocess_csv(input_csv)[:max_papers]

    total_papers = len(rows)
    no_affiliation_count = 0
    papers_with_affiliations = []

    for idx, row in enumerate(rows, start=1):
        arxiv_id = row["paper_id"]
        print(f"Processing paper {idx}/{total_papers}: {arxiv_id}")
        response = query_openalex_by_id(arxiv_id)
        affiliations = extract_affiliations(response)

        if affiliations:
            papers_with_affiliations.append({"paper_id": arxiv_id, "affiliations": affiliations})
        else:
            no_affiliation_count += 1

    # Save papers with affiliations to a file
    with open(output_txt, mode="w", encoding="utf-8") as output_file:
        for paper in papers_with_affiliations:
            output_file.write(f"Paper ID: {paper['paper_id']}\n")
            for aff in paper["affiliations"]:
                output_file.write(f"  Author: {aff['author']}\n")
                output_file.write(f"  Raw Affiliations: {', '.join(aff['raw_affiliations']) if aff['raw_affiliations'] else 'None'}\n")
                output_file.write(f"  Institutions: {', '.join(aff['institutions']) if aff['institutions'] else 'None'}\n")
            output_file.write("\n")

    # Print statistics
    print(f"Total papers processed: {total_papers}")
    print(f"Papers without affiliations: {no_affiliation_count}")
    print(f"Papers with affiliations: {len(papers_with_affiliations)}")

    return total_papers, no_affiliation_count, len(papers_with_affiliations)

# Usage
input_csv = "2312_scopus_16493.csv"  # Input CSV file
output_txt = "papers_with_affiliations_test.txt"  # Output file for papers with affiliations
total, no_affiliation, with_affiliation = process_papers(input_csv, output_txt, max_papers=5000)

print("\nSummary:")
print(f"Out of {total} papers:")
print(f" - {no_affiliation} papers have no affiliation information.")
print(f" - {with_affiliation} papers have affiliation information (saved to {output_txt}).")


Processing paper 1/5000: 2312.05803
Processing paper 2/5000: 2312.10339
Processing paper 3/5000: 2312.15320
Processing paper 4/5000: 2312.15540
Processing paper 5/5000: 2312.11880
Processing paper 6/5000: 2312.10033
Processing paper 7/5000: 2312.09814
Processing paper 8/5000: 2312.13645
Processing paper 9/5000: 2312.00717
Processing paper 10/5000: 2312.00135
Processing paper 11/5000: 2312.07813
Processing paper 12/5000: 2312.10619
Processing paper 13/5000: 2312.08210
Processing paper 14/5000: 2312.02297
Processing paper 15/5000: 2312.05341
Processing paper 16/5000: 2312.03144
Processing paper 17/5000: 2312.04820
Processing paper 18/5000: 2312.10315
Processing paper 19/5000: 2312.14361
Processing paper 20/5000: 2312.15318
Processing paper 21/5000: 2312.13151
Processing paper 22/5000: 2312.11780
Processing paper 23/5000: 2312.03797
Processing paper 24/5000: 2312.02371
Processing paper 25/5000: 2312.13603
Processing paper 26/5000: 2312.05988
Processing paper 27/5000: 2312.11035
Processing

Summary:
Out of 5000 papers:
 - 4377 papers have no affiliation information.
 - 623 papers have affiliation information (saved to papers_with_affiliations_test.txt).

In [45]:
import csv

def load_affiliations_from_txt(txt_file):
    """Load affiliations and institutions from the TXT file."""
    paper_affiliations = {}
    current_paper_id = None

    with open(txt_file, mode="r", encoding="utf-8") as file:
        for line in file:
            line = line.strip()
            if line.startswith("Paper ID:"):
                current_paper_id = line.split(":")[1].strip()
                paper_affiliations[current_paper_id] = []
            elif line.startswith("Institutions:") and current_paper_id:
                institution = line.split(":")[1].strip()
                if institution != "None":
                    paper_affiliations[current_paper_id].append(institution)

    return paper_affiliations

def load_primary_org_from_csv(csv_file):
    """Load Primary Org Name and paper IDs from the CSV file."""
    primary_orgs = {}
    with open(csv_file, mode="r", encoding="utf-8") as file:
        reader = csv.DictReader(file)
        for row in reader:
            paper_id = row["paper_id"]
            primary_org_name = row["Primary Org Name"]
            if paper_id not in primary_orgs:
                primary_orgs[paper_id] = []
            primary_orgs[paper_id].append(primary_org_name)
    return primary_orgs

def compare_affiliations_with_substring_match(affiliations, primary_orgs):
    """Compare affiliations and primary orgs with substring matching and generate match statistics."""
    total_papers = 0
    total_matches = 0
    unmatched_papers = 0
    unmatched_affiliations = []

    for paper_id, institutions in affiliations.items():
        total_papers += 1
        if paper_id in primary_orgs:
            primary_names = primary_orgs[paper_id]
            matches = []
            mismatches = []

            for inst in institutions:
                if any(inst in primary_org or primary_org in inst for primary_org in primary_names):
                    matches.append(inst)
                else:
                    mismatches.append(inst)
            
            if matches:
                total_matches += 1
            else:
                unmatched_papers += 1
            
            unmatched_affiliations.append((paper_id, institutions, primary_names, matches, mismatches))
        else:
            unmatched_papers += 1
            unmatched_affiliations.append((paper_id, institutions, [], [], []))

    return {
        "total_papers": total_papers,
        "total_matches": total_matches,
        "unmatched_papers": unmatched_papers,
        "unmatched_affiliations": unmatched_affiliations,
    }


def main_with_substring_match(txt_file, csv_file, unmatched_output_file):
    # Load data
    affiliations = load_affiliations_from_txt(txt_file)
    primary_orgs = load_primary_org_from_csv(csv_file)

    # Compare affiliations with substring matching
    stats = compare_affiliations_with_substring_match(affiliations, primary_orgs)

    # Print statistics
    print(f"Total papers examined: {stats['total_papers']}")
    print(f"Total papers with a match: {stats['total_matches']}")
    print(f"Total papers without a match: {stats['unmatched_papers']}")

    # Save unmatched papers to a file
    with open(unmatched_output_file, mode="w", encoding="utf-8") as file:
        for paper_id, institutions, primary_names, matches, mismatches in stats["unmatched_affiliations"]:
            file.write(f"Paper ID: {paper_id}\n")
            file.write(f"  Institutions in TXT:\n    {', '.join(institutions) if institutions else 'None'}\n")
            file.write(f"  Primary Org Names in CSV:\n    {', '.join(primary_names) if primary_names else 'None'}\n")
            if matches:
                file.write(f"  Matched Institutions:\n    {', '.join(matches)}\n")
            if mismatches:
                file.write(f"  Mismatched Institutions:\n    {', '.join(mismatches)}\n")
            file.write("\n")

    print(f"Unmatched papers saved to {unmatched_output_file}")


# Usage
txt_file = "papers_with_affiliations_test.txt"  # Input TXT file
csv_file = "2312_scopus_16493.csv"  # Input CSV file
unmatched_output_file = "unmatched_papers_with_substring.txt"  # Output file for unmatched papers

main_with_substring_match(txt_file, csv_file, unmatched_output_file)


Total papers examined: 623
Total papers with a match: 502
Total papers without a match: 121
Unmatched papers saved to unmatched_papers_with_substring.txt


In [9]:
import re
import requests
import json
import time

# Define a data structure to hold the extracted data
affiliations_data = {}
queried_ror_ids = {}  # Cache to store already queried institutions and their ROR IDs

# Function to query ROR API to fetch ROR ID
def fetch_ror_id(institution_name):
    if institution_name in queried_ror_ids:
        return queried_ror_ids[institution_name]

    try:
        ror_api_url = f"https://api.ror.org/organizations?query={institution_name}"
        response = requests.get(ror_api_url)
        response.raise_for_status()
        data = response.json()
        if data and "items" in data and len(data["items"]) > 0:
            ror_id = data["items"][0]["id"]  # Return the first ROR ID
            queried_ror_ids[institution_name] = ror_id
            return ror_id
    except Exception as e:
        print(f"Error fetching ROR ID for institution '{institution_name}': {e}")
    queried_ror_ids[institution_name] = None  # Cache the result as None if failed
    return None

# Function to parse the text file
def parse_affiliations(file_path):
    global affiliations_data

    with open(file_path, "r") as file:
        lines = file.readlines()

    current_paper_id = None
    i = 0  # Use an explicit index
    total_lines = len(lines)
    print(f"Total lines to process: {total_lines}")
    while i < total_lines:
        line = lines[i].strip()

        # Match Paper ID
        if line.startswith("Paper ID:"):
            current_paper_id = line.split(":")[1].strip()
            affiliations_data[current_paper_id] = []
            print(f"Processing Paper ID: {current_paper_id}")

        # Match Author and Affiliation
        elif line.startswith("Author:"):
            author = line.split(":")[1].strip()
            # Check the next lines for Raw Affiliations and Institutions
            if i + 1 < total_lines and lines[i + 1].strip().startswith("Raw Affiliations:"):
                raw_affiliation = lines[i + 1].strip().split(":", 1)[1].strip()
                if i + 2 < total_lines and lines[i + 2].strip().startswith("Institutions:"):
                    institution = lines[i + 2].strip().split(":", 1)[1].strip()

                    # Fetch ROR ID for the institution
                    print(f"Fetching ROR ID for: {institution}")
                    ror_id = fetch_ror_id(institution)
                    print(f"Fetched ROR ID: {ror_id} for Institution: {institution}")

                    # Append the parsed data to the affiliations_data structure
                    affiliations_data[current_paper_id].append({
                        "author": author,
                        "raw_affiliation": raw_affiliation,
                        "institution": institution,
                        "ror_id": ror_id,
                    })

        i += 1  # Increment the index
        if i % 10 == 0:  # Show progress every 10 lines
            print(f"Processed {i}/{total_lines} lines...")

# Path to the input file
input_file = "papers_with_affiliations_test.txt"

# Parse the file
start_time = time.time()
parse_affiliations(input_file)
end_time = time.time()

# Output the structured data
print(f"Parsing completed in {end_time - start_time:.2f} seconds")
print(affiliations_data)

# Optionally, save to a JSON file for future use
output_file = "parsed_affiliations.json"
with open(output_file, "w") as json_file:
    json.dump(affiliations_data, json_file, indent=4)


Total lines to process: 22839
Processing Paper ID: 2312.10033
Fetching ROR ID for: Institut d'Astrophysique de Paris
Fetched ROR ID: https://ror.org/022bnxw24 for Institution: Institut d'Astrophysique de Paris
Processing Paper ID: 2312.09814
Fetching ROR ID for: Institut d'Astrophysique de Paris
Fetched ROR ID: https://ror.org/022bnxw24 for Institution: Institut d'Astrophysique de Paris
Fetching ROR ID for: Institut d'Astrophysique de Paris
Fetched ROR ID: https://ror.org/022bnxw24 for Institution: Institut d'Astrophysique de Paris
Processed 10/22839 lines...
Fetching ROR ID for: Institut d'Astrophysique de Paris
Fetched ROR ID: https://ror.org/022bnxw24 for Institution: Institut d'Astrophysique de Paris
Processing Paper ID: 2312.00717
Fetching ROR ID for: Laboratoire AstroParticule et Cosmologie
Fetched ROR ID: https://ror.org/03tnjrr49 for Institution: Laboratoire AstroParticule et Cosmologie
Processed 20/22839 lines...
Fetching ROR ID for: Institut d'Astrophysique de Paris
Fetched R

In [15]:
# Load the CSV file into a DataFrame with proper encoding
org_data = pd.read_csv("acm_id_with_ror.csv", encoding="ISO-8859-1")

# Convert the ROR ID column to a dictionary for quick lookup
ror_to_primary_org = org_data.set_index("ROR ID")["Primary Org Id"].to_dict()

# Add Primary Org ID to the affiliations_data dictionary
unmatched_ror_ids = []  # Store ROR IDs that don't have a matching Primary Org ID
updated_affiliations_data = {}  # New dictionary to store filtered data

for paper_id, authors in affiliations_data.items():
    updated_authors = []
    for author_data in authors:
        ror_id = author_data.get("ror_id")
        if ror_id in ror_to_primary_org:
            # Add Primary Org ID to the dictionary
            author_data["primary_org_id"] = ror_to_primary_org[ror_id]
            updated_authors.append(author_data)  # Include this author in the filtered data
        else:
            # Track unmatched ROR IDs
            unmatched_ror_ids.append(ror_id)
    if updated_authors:  # Only include paper if it has authors with Primary Org ID
        updated_affiliations_data[paper_id] = updated_authors

# Calculate percentage of unmatched ROR IDs
total_ror_ids = sum(len(authors) for authors in affiliations_data.values())
unmatched_percentage = (len(unmatched_ror_ids) / total_ror_ids) * 100

# Display the percentage of unmatched ROR IDs
print(f"Percentage of ROR IDs without a matching Primary Org ID: {unmatched_percentage:.2f}%")

# Remove duplicates from unmatched ROR IDs
unmatched_ror_ids = list(set(unmatched_ror_ids))

# Save the updated data to a new file
updated_affiliations_file = "updated_affiliations_with_acm_id.json"
with open(updated_affiliations_file, "w") as f:
    json.dump(updated_affiliations_data, f, indent=4)

# Save unmatched ROR IDs to a file for further review
unmatched_file = "unmatched_ror_ids.json"
with open(unmatched_file, "w") as f:
    json.dump(unmatched_ror_ids, f, indent=4)

# Display confirmation of saved files
print(f"Updated affiliations data saved to {updated_affiliations_file}")
print(f"Unmatched ROR IDs saved to {unmatched_file}")


Percentage of ROR IDs without a matching Primary Org ID: 73.53%
Updated affiliations data saved to updated_affiliations_with_acm_id.json
Unmatched ROR IDs saved to unmatched_ror_ids.json


In [16]:
# Load the Scopus ground truth data
scopus_file = "2312_scopus_16493.csv"
scopus_data = pd.read_csv(scopus_file)

# Group Scopus data by paper_id for easier comparison
scopus_grouped = scopus_data.groupby("paper_id")["Primary Org Id"].apply(list).to_dict()

# Perform comparison
total_papers = len(updated_affiliations_data)
matching_papers = 0
mismatched_papers = {}

for paper_id, authors in updated_affiliations_data.items():
    # Extract Primary Org IDs from our data
    our_primary_org_ids = {author["primary_org_id"] for author in authors}
    
    # Get Primary Org IDs from Scopus data
    scopus_primary_org_ids = set(scopus_grouped.get(paper_id, []))
    
    # Check if the sets of Primary Org IDs match
    if our_primary_org_ids == scopus_primary_org_ids:
        matching_papers += 1
    else:
        # Store mismatched papers for analysis
        mismatched_papers[paper_id] = {
            "our_ids": list(our_primary_org_ids),
            "scopus_ids": list(scopus_primary_org_ids),
        }

# Calculate the percentage of matching papers
matching_percentage = (matching_papers / total_papers) * 100

# Output results
print(f"Total papers compared: {total_papers}")
print(f"Papers with matching affiliations: {matching_papers}")
print(f"Percentage of matching papers: {matching_percentage:.2f}%")
print(f"Papers with mismatched affiliations: {len(mismatched_papers)}")

# Save mismatched papers to a file for further review
mismatched_file = "mismatched_papers.json"
with open(mismatched_file, "w") as f:
    json.dump(mismatched_papers, f, indent=4)

print(f"Mismatched papers saved to {mismatched_file}")


Total papers compared: 220
Papers with matching affiliations: 0
Percentage of matching papers: 0.00%
Papers with mismatched affiliations: 220
Mismatched papers saved to mismatched_papers.json


In [20]:
# Clean and normalize the paper_id column
scopus_data["paper_id"] = scopus_data["paper_id"].astype(str).str.strip()

# Ensure paper_id alignment by stripping version numbers and handling NaN values
scopus_data["base_paper_id"] = scopus_data["paper_id"].apply(
    lambda x: x.split("v")[0].strip() if isinstance(x, str) else None
)

# Drop rows with missing base_paper_id to avoid issues during comparison
scopus_data = scopus_data.dropna(subset=["base_paper_id"])

# Debugging outputs
print("Scopus Data (After Cleaning):")
print(scopus_data.head())

# Group Scopus data by base_paper_id for easier comparison
scopus_grouped = scopus_data.groupby("base_paper_id")["Primary Org Id"].apply(list).to_dict()

# Debugging grouped data
print("Grouped Scopus Data Sample:")
print({k: scopus_grouped[k] for k in list(scopus_grouped.keys())[:5]})

# Perform comparison
total_papers = len(updated_affiliations_data)
matching_papers = 0
mismatched_papers = {}

for paper_id, authors in updated_affiliations_data.items():
    # Extract Primary Org IDs from our data
    our_primary_org_ids = {author["primary_org_id"] for author in authors}
    
    # Get Primary Org IDs from Scopus data using the base paper_id
    scopus_primary_org_ids = set(scopus_grouped.get(paper_id, []))
    
    # Check if the sets of Primary Org IDs match
    if our_primary_org_ids == scopus_primary_org_ids:
        matching_papers += 1
    else:
        # Store mismatched papers for analysis
        mismatched_papers[paper_id] = {
            "our_ids": list(our_primary_org_ids),
            "scopus_ids": list(scopus_primary_org_ids),
        }

# Calculate percentage of matching papers
matching_percentage = (matching_papers / total_papers) * 100

# Output results
print(f"Total papers compared: {total_papers}")
print(f"Papers with matching affiliations: {matching_papers}")
print(f"Percentage of matching papers: {matching_percentage:.2f}%")
print(f"Papers with mismatched affiliations: {len(mismatched_papers)}")

# Save mismatched papers to a file for further review
mismatched_file = "mismatched_papers.json"
with open(mismatched_file, "w") as f:
    json.dump(mismatched_papers, f, indent=4)

print(f"Mismatched papers saved to {mismatched_file}")


Scopus Data (After Cleaning):
Empty DataFrame
Columns: [Primary Org Id, Primary Org Name, Primary Org City, Primary Org State, Primary Org Country, ArXiv Id, Affiliation Sequence Number, paper_id, base_paper_id]
Index: []
Grouped Scopus Data Sample:
{}
Total papers compared: 220
Papers with matching affiliations: 0
Percentage of matching papers: 0.00%
Papers with mismatched affiliations: 220
Mismatched papers saved to mismatched_papers.json


To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  scopus_grouped = scopus_data.groupby("base_paper_id")["Primary Org Id"].apply(list).to_dict()
