In [1]:
import tarfile
import io
import zipfile
import importlib
import regex as re
import pyperclip  
import TexSoup as TS
from TexSoup.tokens import MATH_ENV_NAMES
import os


In [3]:
import pandas as pd
from extractors.trie_extractor import TrieExtractor
from utils.file_reader import read_file
from tqdm import tqdm  # Import tqdm for progress bar

csv_path = "data/2201.00_scopus_931_with_ror.csv"
df = pd.read_csv(csv_path)



In [4]:
# for shuchang evalutaion
import json
with open("final_affiliations_2000_parallel.json", 'r', encoding='utf-8') as f:
    final_affiliations = json.load(f)

target_file_ids = set(entry['File ID'] for entry in final_affiliations)  # 例如 '2311.00001'


In [8]:
import zipfile
import re
import io

def find_doc_class(fp, name_match=False):
    """Search for document class related lines in a file and return a code to represent the type"""
    doc_class_pat = re.compile(r"^\s*\\document(?:style|class)")
    sub_doc_class = re.compile(r"^\s*\\document(?:style|class).*(?:\{standalone\}|\{subfiles\})")

    file_content = fp.read()
    try:
        file_text = file_content.decode('utf-8')
    except UnicodeDecodeError:
        file_text = file_content.decode('latin-1')

    for line in file_text.splitlines():
        if doc_class_pat.search(line):
            if name_match:
                if sub_doc_class.search(line):
                    return -99999
                return 1  # Found document class line
    return 0  # No document class line found

def find_main_tex_source_in_folder(zip_file, folder_name):
    """Find the main .tex file inside a folder within the zip archive"""
    tex_names = {"paper", "main", "ms.", "article"}
    
    # Get all .tex files in the folder
    tex_files = [f for f in zip_file.namelist() if f.startswith(folder_name + '/') and f.endswith('.tex')]

    if len(tex_files) == 1:
        return tex_files[0]  # If there's only one .tex file, return it

    main_files = {}
    for tex_file in tex_files:
        depth = tex_file.count('/') - folder_name.count('/')  # Depth relative to folder
        has_main_name = any(kw in tex_file for kw in tex_names)
        
        with zip_file.open(tex_file) as fp:
            main_files[tex_file] = find_doc_class(fp, name_match=has_main_name) - depth

    return max(main_files, key=main_files.get) if main_files else None

def pre_format(text):
    """Format LaTeX text by adding spaces where necessary"""
    # return text.replace('\\}\\', '\\} \\').replace(')}', ') }').replace(')$', ') $')
    return text

def source_from_zip(zip_file, folder_name):
    """Extract and decode the main .tex file from a folder inside the zip archive"""
    tex_main = find_main_tex_source_in_folder(zip_file, folder_name)
    if tex_main:
        with zip_file.open(tex_main) as fp:
            file_content = fp.read()
            try:
                source_text = pre_format(file_content.decode('utf-8'))
            except UnicodeDecodeError:
                source_text = pre_format(file_content.decode('latin-1'))
            return source_text
    return None

def extract_before_abstract(source_text):
    """Extract text before the abstract section"""
    no_comments_text = re.sub(r'(?<!\\)%.*', '', source_text)  # Remove comments
    # no_usepackage_text = re.sub(r'\\usepackage\s*\{[^}]+\}', '', no_comments_text)  # Remove usepackage
    # text = re.sub(r'\\[a-zA-Z]+\{[^}]*\}', '', no_usepackage_text)  # Remove LaTeX commands
    text = no_comments_text
    # text = re.sub(r'\\[a-zA-Z]+\[[^\]]*\]\{[^}]*\}', '', text)
    # text = re.sub(r'\$[^$]*\$', '', text)  # Remove inline math
    # text = text.replace('{', '').replace('}', '').replace('\n', ' ')
    text = ' '.join(text.split())

    # output_file.write(f"text: {text}\n")

    abstract_word_match = re.search(r'begin{abstract}', text, re.IGNORECASE)
    if abstract_word_match:
        return text[:abstract_word_match.start()].strip()

    abstract_match = re.search(r'abstract{', text)
    if abstract_match:
        return text[:abstract_match.start()].strip()
    
    nomacro_abstract_match_2 = re.search(r'abstract', text)
    if nomacro_abstract_match_2:
        return text[:nomacro_abstract_match_2.start()].strip()
    
    nomacro_abstract_match_1 = re.search(r'Abstract', text)
    if nomacro_abstract_match_1:
        return text[:nomacro_abstract_match_1.start()].strip()
    
    one_third_length = len(text) // 3
    return text[:one_third_length].strip()

zip_file_path = "./2311_tex.zip"
output_path = "./output.txt"


with zipfile.ZipFile(zip_file_path, 'r') as zip_file:
    base_folder = zip_file.namelist()[0]    
    folders = {name.rstrip('/') for name in zip_file.namelist() if name.startswith(base_folder) and name.endswith('/') and name != base_folder}
    filtered_folders = {folder for folder in folders if folder.split('/')[-1] in target_file_ids}

    with open(output_path, 'w', encoding='utf-8') as output_file:
        for folder in filtered_folders:
            source_text = source_from_zip(zip_file, folder)
            # output_file.write(f"{folder}:source text: {source_text}\n")
            if source_text:
                content_before_abstract = extract_before_abstract(source_text)
                if content_before_abstract:
                    output_file.write(f"Content before abstract in {folder}:\n{content_before_abstract}\n\n")
                else:
                    output_file.write(f"No abstract found in {folder}, or no content before abstract.\n\n")
            else:
                output_file.write(f"No source found in {folder}.\n\n")

In [9]:
# Read input from a text file and filter out files without valid content before the abstract
input_file_path = 'output.txt'  # Replace with your actual file path
output_file_path = 'filtered_files_with_content_macro.txt'

# Variables to keep track of statistics
total_files_count_author = 0
valid_files_count = 0
valid_files_with_content = []

# Reading and processing the input file
with open(input_file_path, 'r') as infile:
    content_blocks = infile.read().split('Content before abstract in ')
    
    for block in content_blocks:
        if block.strip():  # Ensure we are not processing an empty block
            total_files_count_author += 1
            lines = block.split('\n', 1)  # Split to separate the file name from its content
            if len(lines) > 1:
                file_name = lines[0].strip().replace(':', '')
                content = lines[1].strip()
                
                # Check if the content does not indicate "No abstract found"
                if 'No abstract found' not in content and 'no content before abstract' not in content.lower():
                    valid_files_count += 1
                    valid_files_with_content.append((file_name, content))

# Writing the filtered results to an output file
with open(output_file_path, 'w') as outfile:
    for file_name, content in valid_files_with_content:
        outfile.write(f"Content before abstract in {file_name}:\n{content}\n\n")

# Print or save the statistics summary
print(f"Total number of files processed: {total_files_count_author}")
print(f"Total number of files with valid content before abstract: {valid_files_count}")
print(f"Filtered output saved in: {output_file_path}")


Total number of files processed: 1998
Total number of files with valid content before abstract: 1996
Filtered output saved in: filtered_files_with_content_macro.txt


In [10]:
import re
import os
import json  # For structured storage

# Define file paths
input_file_path = 'filtered_files_with_content_macro.txt'
output_file_path = './tagged_outputs/all_extracted_tags_cleaned.txt'  # Single output file

# Ensure output directory exists
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

# Tags to search for (without leading backslashes for dictionary keys)
tags_to_search = [
    'orgname','institution', 'affiliations', 'affiliation', 'icmlaffiliation',
    'institute', 'affil', 'aff', 'AFF', 'university', 'address'
]

# Dictionary to store extracted data
extracted_data = {}

# Function to extract content inside the first level of braces `{}` (handling optional `[]`)
def extract_tag_content(tag, content):
    pattern = rf"(\\{tag})(\[[^\]]*\])?\{{"
    results = []
    start = 0

    while (match := re.search(pattern, content[start:])) is not None:
        start_idx = start + match.end()  # Start after the macro (past `{`)
        brace_level = 1
        end_idx = start_idx

        # Find the matching closing brace
        while brace_level > 0 and end_idx < len(content):
            if content[end_idx] == '{':
                brace_level += 1
            elif content[end_idx] == '}':
                brace_level -= 1
            end_idx += 1

        # Extract only the content inside the first `{}` (excluding the macro name)
        extracted_content = content[start_idx:end_idx - 1].strip()  # Remove trailing `}`
        results.append(extracted_content)
        start = end_idx  # Move to next occurrence

    return results

# Function to process and store extracted content
def extract_and_store_tags(file_name, content, tags):
    extracted_data[file_name] = {}  # Initialize storage for this file
    found_any_tag = False  # Flag to check if we found any tag

    for tag in tags:
        matches = extract_tag_content(tag, content)

        if matches:
            extracted_data[file_name][tag] = matches  # Store under cleaned key
            found_any_tag = True  # At least one macro was found
            break

    # If no macros were found, store the whole pre-abstract content
    if not found_any_tag:
        extracted_data[file_name]["full_content"] = content

# Process input file
total_files_count = 0

with open(input_file_path, 'r') as infile:
    content_blocks = infile.read().split('Content before abstract in ')

    for block in content_blocks:
        if block.strip():
            total_files_count += 1
            lines = block.split('\n', 1)

            if len(lines) > 1:
                file_name = lines[0].strip().replace(':', '')
                content = lines[1].strip()
                extract_and_store_tags(file_name, content, tags_to_search)

json_output = json.dumps(extracted_data, indent=4, ensure_ascii=False)
# json_output = json_output.replace("\\\\", "\\")  # Convert double backslashes to single

with open(output_file_path, 'w', encoding='utf-8') as outfile:
    outfile.write(json_output)


# Print summary
print(f"Total number of files processed: {total_files_count}")
print(f"Extracted tag data saved in: {output_file_path}")


Total number of files processed: 1996
Extracted tag data saved in: ./tagged_outputs/all_extracted_tags_cleaned.txt


In [11]:
import requests
import json
import logging
import os
from tenacity import retry, stop_after_attempt, wait_fixed

# Configure logging
logging.basicConfig(level=logging.INFO)

# Configure ROR API information
ROR_SEARCH_URL = 'https://api.ror.org/organizations'

# Initialize ROR cache
ror_cache = {}

# Load persistent cache
cache_file = 'ror_cache.json'
if os.path.exists(cache_file):
    with open(cache_file, 'r', encoding='utf-8') as f:
        ror_cache = json.load(f)
    logging.info("Loaded persistent ROR cache.")
else:
    logging.info("No persistent ROR cache found. Using empty cache.")

@retry(stop=stop_after_attempt(3), wait=wait_fixed(5))
def query_ror(institution_name):
    """
    Query ROR information, returning best match from top 3 candidates.
    """
    if institution_name in ror_cache:
        return ror_cache[institution_name]

    params = {'query': institution_name}

    try:
        response = requests.get(ROR_SEARCH_URL, params=params, timeout=10)
        if response.status_code == 200:
            data = response.json()
            items = data.get('items', [])
            
            if not items:
                logging.warning(f"No ROR information found for '{institution_name}'.")
                ror_cache[institution_name] = None
                return None

            # Only consider top 3 results
            candidates = items[:3]
            chosen = None

            logging.info(f"🔍 Query: {institution_name}")
            for idx, item in enumerate(candidates):
                candidate_name = item.get('name', '').lower()
                if candidate_name in institution_name.lower():
                    chosen = item
                    logging.info(f"✅ Match found with candidate {idx+1}: {item.get('name')}")
                    break
                else:
                    logging.info(f"❌ No match for candidate {idx+1}: {item.get('name')}")

            if not chosen:
                # Fallback: take first one
                chosen = candidates[0]
                logging.info(f"⚠️ No match found, fallback to first candidate: {chosen.get('name')}")

            ror_cache[institution_name] = {
                'ROR_ID': chosen.get('id', 'N/A'),
                'Name': chosen.get('name', 'N/A'),
                'Country': chosen.get('country', {}).get('country_name', 'N/A'),
                'Type': ', '.join(chosen.get('types', []))
            }
            return ror_cache[institution_name]

        else:
            logging.error(f"ROR API query failed with status code: {response.status_code} for institution: {institution_name}")
            ror_cache[institution_name] = None
            return None

    except Exception as e:
        logging.error(f"ROR API query exception for '{institution_name}', Error: {e}")
        ror_cache[institution_name] = None
        return None

# Save cache to a file after all queries
def save_cache():
    with open(cache_file, 'w', encoding='utf-8') as f:
        json.dump(ror_cache, f)
    logging.info("Saved ROR cache to file.")

INFO:root:Loaded persistent ROR cache.


In [12]:
import json
import os
from extractors.trie_extractor import TrieExtractor
from utils.file_reader import read_file

# Define file paths
input_file_path = 'tagged_outputs/all_extracted_tags_cleaned.txt'
output_file_path = 'institution_output_with_ror.json'

# Read the file and replace invalid escape sequences
with open(input_file_path, 'r', encoding='utf-8') as infile:
    raw_data = infile.read()

# Fix invalid escape sequences by double-escaping backslashes
# fixed_data = raw_data.replace("\\", "\\\\")
fixed_data = raw_data

# Try loading again
try:
    extracted_data = json.loads(fixed_data)
    print("✅ JSON successfully loaded!")
except json.JSONDecodeError as e:
    print(f"❌ JSON loading failed: {e}")

✅ JSON successfully loaded!


In [13]:
# Load TrieExtractor for full content/address processing
extractor = TrieExtractor(
    data_path="data/1.34_extracted_ror_data.csv",
    common_words_path="data/common_english_words.txt"
)


In [14]:

import re
import spacy
# Load the spaCy model for Named Entity Recognition (NER)
nlp = spacy.load("en_core_web_sm")

In [15]:
def remove_address_spacy(text):
    """Uses spaCy NER to remove location entities from an institution name"""
    doc = nlp(text)
    cleaned_parts = []
    
    for ent in doc.ents:
        if ent.label_ in {"GPE", "LOC"}:  # GPE = Geo-Political Entity (City, Country, etc.)
            continue  # Skip locations
        cleaned_parts.append(ent.text)

    return " ".join(cleaned_parts)


In [19]:
import re
from pylatexenc.latex2text import LatexNodes2Text

converter = LatexNodes2Text()

def remove_address_spacy(text):
    """Uses spaCy NER to remove location entities from an institution name"""
    doc = nlp(text)
    cleaned_parts = []

    for ent in doc.ents:
        if ent.label_ in {"GPE", "LOC"}:
            continue  # Skip locations
        cleaned_parts.append(ent.text)

    return " ".join(cleaned_parts)

def latex_to_unicode(text):
    """Safely converts LaTeX special characters to Unicode, handling errors."""
    if not text or text.strip() == "":
        return ""  # Avoid empty input issues

    try:
        return converter.latex_to_text(text)
    except IndexError as e:
        print(f"⚠️ Warning: Error parsing LaTeX in text: {text}")
        print("\n⚠️ latex_to_unicode parse error encountered!")
        print("Full input string:")
        print("--------------------------------------------------")
        print(text)
        print("--------------------------------------------------")
        print(f"Error message: {e}\n")
        return text  # fallback: just return the original text

def mild_clean_affiliation(affiliation):
    parts = [p.strip() for p in affiliation.split(',')]
    if len(parts) > 3:
        for i, part in enumerate(parts):
            if 'university' in part.lower():
                if i < len(parts) - 1:
                    return f"{part}, {part}, {parts[i+1]}"
                else:
                    return part
        affiliation = ', '.join(parts[:3])
    return affiliation.strip()


import re

def split_affiliations_before_unicode(text):
    # 1. 预处理: 兼容一些奇怪的latex写法，比如\$^\dag\$ 这种
    text = text.replace('\\$', '$').replace('\\\\', '\\')
    
    # 2. 正则表达式：匹配 $^*  $^\dag$  $^1$ 这种开头
    pattern = re.compile(r'\$\\?\^\s*(?:\d+|\\dag|\\star|\*)\$?')
    
    # 找到所有匹配的位置
    matches = list(pattern.finditer(text))

    # 如果没有找到任何分割标记，直接返回整体
    if not matches:
        return [text.strip()]

    # 根据匹配位置进行切割
    results = []
    last_end = 0
    for match in matches:
        start = match.start()
        if start > last_end:
            chunk = text[last_end:start].strip()
            if chunk:
                results.append(chunk)
        last_end = match.end()

    # 处理最后一段
    if last_end < len(text):
        chunk = text[last_end:].strip()
        if chunk:
            results.append(chunk)

    return results



def smart_pick_best_ror(query_text, candidates):
    """Choose the best matching ROR result based on exact match priority."""
    for idx, item in enumerate(candidates):
        name = item.get('name', '').lower()
        if name and name in query_text.lower():
            print(f"✅ Match found with candidate {idx+1}: {name}")
            return item
        else:
            print(f"❌ No match for candidate {idx+1}: {name}")
    print("⚠️ No candidate matched exactly, fallback to first.")
    return candidates[0] if candidates else None


    
stringa1 = r"$^*$Physics Department, Arizona State University, Tempe, Arizona 85287, USA. \\ $^\\dag$Theoretical Physics Department, CERN, 1211 Geneva 23, Switzerland."
stringa2 = r"$^1$ Robotics and Artificial Intelligence Group, Department of Computer Science, Electrical and Space Engineering, Lule\\aa \,\, University of Technology, Sweden (e-mail: (vissan, sumsat, geonik)@ltu.se) \\ $^2$ Department of Computer Science, University of Helsinki, Finland."

print("\nOriginal input:")
print(stringa2)

split_result = split_affiliations_before_unicode(stringa2)

print("\nSplit affiliations before unicode:")
for idx, aff in enumerate(split_result, 1):
    print(f"Institution {idx}: {aff}")



Original input:
$^1$ Robotics and Artificial Intelligence Group, Department of Computer Science, Electrical and Space Engineering, Lule\\aa \,\, University of Technology, Sweden (e-mail: (vissan, sumsat, geonik)@ltu.se) \\ $^2$ Department of Computer Science, University of Helsinki, Finland.

Split affiliations before unicode:
Institution 1: Robotics and Artificial Intelligence Group, Department of Computer Science, Electrical and Space Engineering, Lule\aa \,\, University of Technology, Sweden (e-mail: (vissan, sumsat, geonik)@ltu.se) \
Institution 2: Department of Computer Science, University of Helsinki, Finland.


In [20]:
results = {}

# Process each paper
for paper, macros in extracted_data.items():
    unique_rors = {}

    if isinstance(macros, dict) and set(macros.keys()) in [{"full_content"}, {"address"}]:
        text = "\n".join(macros.get("full_content", [])) + "\n" + "\n".join(macros.get("address", []))
        text = latex_to_unicode(text)
        extracted_ror_ids = extractor.extract_affiliations(text)

        if extracted_ror_ids:
            unique_rors.update({ror_id: {"ROR_ID": ror_id} for ror_id in extracted_ror_ids})

    else:
        for content_list in macros.values():
            for institution_name in content_list:
                
                raw_institutions = re.split(r'(?:\\\\)?\\and', institution_name)

                for raw_inst in raw_institutions:
                    raw_inst = raw_inst.strip()

                    split_by_superscript = re.split(r'\s*\^\d+', raw_inst)
                    split_by_superscript = [inst.strip() for inst in split_by_superscript if inst.strip()]

                    for raw_piece in split_by_superscript:
                        if not raw_piece:
                            continue

                        cleaned_piece = latex_to_unicode(raw_piece)

                        cleaned_piece = cleaned_piece.replace("{", " ").replace("}", " ").replace("^", " ").replace("[", " ").replace("]", " ").strip()
                        cleaned_piece = mild_clean_affiliation(cleaned_piece)
                        cleaned_piece = re.sub(r'^(.*?\b(Department of|School of|Faculty of|College of|Department)\b[^,]*),', '', cleaned_piece, flags=re.IGNORECASE)

                        if cleaned_piece:
                            ror_info = query_ror(cleaned_piece)
                            if ror_info:
                                unique_rors[ror_info["ROR_ID"]] = ror_info  # Store unique RORs

    if unique_rors:
        results[paper] = list(unique_rors.values())

with open(output_file_path, 'w', encoding='utf-8') as outfile:
    json.dump(results, outfile, indent=4, ensure_ascii=False)

print(f"✅ JSON export completed. Results saved to {output_file_path}")


INFO:root:🔍 Query: addressline=The University of Manchester, addressline=The University of Manchester, city=Manchester
INFO:root:✅ Match found with candidate 1: University of Manchester
INFO:root:🔍 Query: addressline=Chulalongkorn University, addressline=Chulalongkorn University, city=Bangkok
INFO:root:✅ Match found with candidate 1: Chulalongkorn University
INFO:root:🔍 Query: Denison University, Denison University, Granville
INFO:root:✅ Match found with candidate 1: Denison University
INFO:root:🔍 Query: Crimean Astrophysical Observatory, 298409 Nauchny, Crimea
INFO:root:✅ Match found with candidate 1: Crimean Astrophysical Observatory
INFO:root:🔍 Query: Shanghai Jiao Tong University, Shanghai Jiao Tong University, 800 Dongchuan Road
INFO:root:✅ Match found with candidate 1: Shanghai Jiao Tong University
INFO:root:🔍 Query: Ningxia University, Ningxia University, Yinchuan
INFO:root:✅ Match found with candidate 1: Ningxia University
INFO:root:🔍 Query: Shanghai Research Center for Quantum


⚠️ latex_to_unicode parse error encountered!
Full input string:
--------------------------------------------------
Boston University \& NTT Research \\\href{mailto:luowen@qcry.pt}{\email{luowen@qcry.pt}}
--------------------------------------------------
Error message: list index out of range



INFO:root:🔍 Query: Boston University \& NTT Research \\\href mailto:luowen@qcry.pt  \email luowen@qcry.pt
INFO:root:❌ No match for candidate 1: Neelan Tiruchelvam Trust
INFO:root:✅ Match found with candidate 2: Boston University
INFO:root:🔍 Query: The Pennsylvania State University, The Pennsylvania State University, University Park
INFO:root:✅ Match found with candidate 1: Pennsylvania State University
INFO:root:🔍 Query: Université Paris-Saclay, UVSQ, CNRS
INFO:root:✅ Match found with candidate 1: Université Paris-Saclay
ERROR:root:ROR API query failed with status code: 500 for institution: European Space Agency, European Space Astronomy Centre, Camino Bajo del Castillo s/n
INFO:root:🔍 Query: European Space Agency (ESA), ESA Office, Space Telescope Science Institute
INFO:root:✅ Match found with candidate 1: Space Telescope Science Institute
INFO:root:🔍 Query: University of Exeter, University of Exeter, Exeter EX4 4QL
INFO:root:✅ Match found with candidate 1: University of Exeter
INFO:r


⚠️ latex_to_unicode parse error encountered!
Full input string:
--------------------------------------------------
\if #1\empty\else#1 \fi E-Mail: \href{mailto:#3}{#3}
--------------------------------------------------
Error message: list index out of range



INFO:root:🔍 Query: \if #1\empty\else#1 \fi E-Mail: \href mailto:#3  #3
INFO:root:❌ No match for candidate 1: Forecasting International (United States)
INFO:root:❌ No match for candidate 2: Istituto Nazionale di Fisica Nucleare, Sezione di Firenze
INFO:root:❌ No match for candidate 3: Maj Institute of Pharmacology
INFO:root:⚠️ No match found, fallback to first candidate: Forecasting International (United States)
INFO:root:🔍 Query: Dipartimento di Matematica, Dipartimento di Eccellenza 2023-2027, Università di Genova
INFO:root:❌ No match for candidate 1: University of Genoa
INFO:root:❌ No match for candidate 2: Federal Department of Justice and Police
INFO:root:❌ No match for candidate 3: University of Ragusa
INFO:root:⚠️ No match found, fallback to first candidate: University of Genoa
INFO:root:🔍 Query: Istituto Nazionale di Fisica Nucleare, Sezione di Genova, Via Dodecaneso 33
INFO:root:✅ Match found with candidate 1: Istituto Nazionale di Fisica Nucleare, Sezione di Genova
INFO:root:🔍

Departamento de An\'alisis Matem\'atico, Universidad de Sevilla, C/Tarfia s/n, Campus Reina Mercedes, 41012, Sevilla, Spain. \href{mailto:egarcia12@us.es}{egarcia12@us.es}
Department of Mathematics, Brown University, Kassar House,151 Thayer St. Providence, RI 02912, USA. \href{mailto:susanna_haziot@brown.edu}{susanna\_haziot@brown.edu}

⚠️ latex_to_unicode parse error encountered!
Full input string:
--------------------------------------------------

Departamento de An\'alisis Matem\'atico, Universidad de Sevilla, C/Tarfia s/n, Campus Reina Mercedes, 41012, Sevilla, Spain. \href{mailto:egarcia12@us.es}{egarcia12@us.es}
Department of Mathematics, Brown University, Kassar House,151 Thayer St. Providence, RI 02912, USA. \href{mailto:susanna_haziot@brown.edu}{susanna\_haziot@brown.edu}
--------------------------------------------------
Error message: list index out of range



INFO:root:🔍 Query:  Korea Advanced Institute of Science and Technology
INFO:root:✅ Match found with candidate 1: Korea Advanced Institute of Science and Technology
INFO:root:🔍 Query:  University of Colorado Boulder
INFO:root:✅ Match found with candidate 1: University of Colorado Boulder
INFO:root:🔍 Query: University of Minnesota
INFO:root:✅ Match found with candidate 1: University of Minnesota
INFO:root:🔍 Query: Southwestern Institute of Physics, Post Office Box 432, Chengdu 610041
INFO:root:✅ Match found with candidate 1: Southwestern Institute of Physics
INFO:root:🔍 Query: Southern University of Science and Technology, Southern University of Science and Technology, 518055 Shenzhen
INFO:root:✅ Match found with candidate 1: Southern University of Science and Technology
INFO:root:🔍 Query: CEA, IRFM, F-13108 Saint-Paul-lez-Durance
INFO:root:❌ No match for candidate 1: Institut de Recherche sur la Fusion par Confinement Magnétique
INFO:root:❌ No match for candidate 2: Saint Paul Universit


⚠️ latex_to_unicode parse error encountered!
Full input string:
--------------------------------------------------
\href{mailto:kart1712@nmsu.edu}{kart1712@nmsu.edu}, \href{mailto:mababneh@nmsu.edu@nmsu.edu}{mababneh@nmsu.edu}, \href{mailto:roopav@nmsu.edu}{roopav@nmsu.edu}
--------------------------------------------------
Error message: list index out of range



INFO:root:🔍 Query: Inria, IRISA, University of Rennes
INFO:root:❌ No match for candidate 1: Inria Rennes - Bretagne Atlantique Research Centre
INFO:root:❌ No match for candidate 2: Institut de Recherche en Informatique et Systèmes Aléatoires
INFO:root:❌ No match for candidate 3: Université de Rennes
INFO:root:⚠️ No match found, fallback to first candidate: Inria Rennes - Bretagne Atlantique Research Centre
INFO:root:🔍 Query: University of Rennes, Inria, IRISA
INFO:root:❌ No match for candidate 1: Inria Rennes - Bretagne Atlantique Research Centre
INFO:root:❌ No match for candidate 2: Institut de Recherche en Informatique et Systèmes Aléatoires
INFO:root:❌ No match for candidate 3: Université de Rennes
INFO:root:⚠️ No match found, fallback to first candidate: Inria Rennes - Bretagne Atlantique Research Centre
INFO:root:🔍 Query: University Rennes 2, Inria, M2S
INFO:root:❌ No match for candidate 1: Université Rennes 2
INFO:root:❌ No match for candidate 2: Inria Rennes - Bretagne Atlantiqu


⚠️ latex_to_unicode parse error encountered!
Full input string:
--------------------------------------------------
European Southern Observatory, Karl-Schwarzschild-Str 2, 85748 Garching, Germany\\ \email{\href{mailto:Haochang.Jiang@eso.org}{hjiang@eso.org}}
--------------------------------------------------
Error message: list index out of range



INFO:root:🔍 Query: European Southern Observatory, Karl-Schwarzschild-Str 2, 85748 Garching
INFO:root:❌ No match for candidate 1: Thüringer Landessternwarte Tautenburg
INFO:root:✅ Match found with candidate 2: European Southern Observatory
INFO:root:🔍 Query: Tsinghua University, Tsinghua University, 30 Shuangqing Rd
INFO:root:✅ Match found with candidate 1: Tsinghua University
INFO:root:🔍 Query: Instituto de Radioastronomía y Astrofísica (IRyA), Universidad Nacional Autónoma de México (UNAM), Mexico
INFO:root:✅ Match found with candidate 1: Universidad Nacional Autónoma de México
INFO:root:🔍 Query: Southwest University, Southwest University, Chongqing 400715
INFO:root:✅ Match found with candidate 1: Southwest University
INFO:root:🔍 Query: RIKEN Nishina Center, Wako 351-0198, Japan
INFO:root:✅ Match found with candidate 1: RIKEN Nishina Center
INFO:root:🔍 Query: University of California, University of California, San Diego
INFO:root:✅ Match found with candidate 1: University of Californi


⚠️ latex_to_unicode parse error encountered!
Full input string:
--------------------------------------------------
\href{https://normalcomputing.ai/}{Normal Computing}
--------------------------------------------------
Error message: list index out of range



ERROR:root:ROR API query failed with status code: 500 for institution: \href https://normalcomputing.ai/  Normal Computing
INFO:root:🔍 Query: Harvard University
INFO:root:✅ Match found with candidate 1: Harvard University
INFO:root:🔍 Query: Ming Li Institute of Data Science, National University of Singapore and Sea AI Lab 
 ming.li@u.nus.edu
INFO:root:❌ No match for candidate 1: AI Singapore
INFO:root:✅ Match found with candidate 2: National University of Singapore
INFO:root:🔍 Query:  Singapore Management University 
 panzhou@smu.edu.sg
INFO:root:✅ Match found with candidate 1: Singapore Management University
INFO:root:🔍 Query: Jia-Wei Liu Show Lab, National University of Singapore 
 jiawei.liu@u.nus.edu
INFO:root:✅ Match found with candidate 1: National University of Singapore
INFO:root:🔍 Query: Jussi Keppo Business School, National University of Singapore 
 keppo@nus.edu.sg
INFO:root:✅ Match found with candidate 1: National University of Singapore
INFO:root:🔍 Query: Min Lin Sea AI La


⚠️ latex_to_unicode parse error encountered!
Full input string:
--------------------------------------------------
Perimeter Institute for Theoretical Physics, 31 Caroline St. N., Waterloo ON, Canada, N2L 2Y5\\ \href{mailto:ciambelli.luca@gmail.com}{ciambelli.luca@gmail.com}
--------------------------------------------------
Error message: list index out of range



INFO:root:🔍 Query: Cranberry-Lemon University, Cranberry-Lemon University, Pittsburgh
INFO:root:❌ No match for candidate 1: Cranberry Institute
INFO:root:❌ No match for candidate 2: Lemon Grove School District
INFO:root:❌ No match for candidate 3: University of Pittsburgh
INFO:root:⚠️ No match found, fallback to first candidate: Cranberry Institute
INFO:root:🔍 Query: Mount-Sheikh University, Mount-Sheikh University, Santa Narimana
INFO:root:❌ No match for candidate 1: Sheikh Bahaei University
INFO:root:❌ No match for candidate 2: Sheikh Hasina University
INFO:root:❌ No match for candidate 3: Sheikh Hasina Medical University
INFO:root:⚠️ No match found, fallback to first candidate: Sheikh Bahaei University
INFO:root:🔍 Query: Nicolaus Copernicus Astronomical Center, Polish Academy of Sciences, Bartycka 18
INFO:root:✅ Match found with candidate 1: Nicolaus Copernicus Astronomical Center
INFO:root:🔍 Query: LESIA (UMR 8109), Observatoire de Paris, PSL
INFO:root:✅ Match found with candidate 

✅ JSON export completed. Results saved to institution_output_with_ror.json


In [21]:
import json

# 输入文件路径
input_file = "institution_output_with_ror.json"
# 输出文件路径
output_file = "xk_final_affiliations_2000_parallel.json"

# 读取原来的results
with open(input_file, "r", encoding="utf-8") as f:
    original_results = json.load(f)

new_format_results = []

for paper_path, institutions in original_results.items():
    # 去掉前缀，比如 "2311_tex/2311.02188" -> "2311.02188"
    paper_id = paper_path.split('/')[-1]
    
    converted_institutions = []
    for inst in institutions:
        ror_url = inst.get('ROR_ID', '')
        if ror_url.startswith('https://ror.org/'):
            ror_id = ror_url.replace('https://ror.org/', '')
        else:
            ror_id = ror_url  # 防止意外情况

        converted_institutions.append({
            "name": inst.get('Name', 'N/A'),
            "ror_id": ror_id
        })

    new_format_results.append({
        "File ID": paper_id,
        "institutions_with_ror": converted_institutions
    })

# 保存成新的JSON
with open(output_file, "w", encoding="utf-8") as f:
    json.dump(new_format_results, f, indent=2, ensure_ascii=False)

print(f"✅ New prediction file generated: {output_file}")


✅ New prediction file generated: xk_final_affiliations_2000_parallel.json


In [None]:
import json

with open("xk_final_affiliations_2000_parallel.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# 统计有多少个 file_id
file_id_count = len(data)

print(f"✅ Total number of File IDs: {file_id_count}")


✅ Total number of File IDs: 923


In [26]:
import json

with open('ror/v1.52-2024-09-16-ror-data_schema_v2.json', 'r', encoding='utf-8') as f:
    ror_data = json.load(f)

ror_relationships = {}

for record in ror_data:
    ror_id = record['id']
    related_ids = set()
    for rel in record.get('relationships', []):
        related_ids.add(rel['id'])
    ror_relationships[ror_id] = related_ids


In [27]:
def is_match(ror1, ror2, ror_relationships):
    related1 = ror_relationships.get(ror1, set())
    related2 = ror_relationships.get(ror2, set())
    return (ror1 == ror2) or (ror2 in related1) or (ror1 in related2)


In [28]:
import json
import pandas as pd

ground_truth = pd.read_csv("data/2311_with_ror.csv")

with open("institution_output_with_ror.json", 'r', encoding='utf-8') as f:
    extracted_results = json.load(f)

ground_truth['ROR ID'] = ground_truth['ROR ID'].fillna('').astype(str)
ground_truth['paper_id_clean'] = ground_truth['paper_id'].astype(str).str.strip().str.replace(r'v\\d+$', '', regex=True)

gt_rors_per_paper = (
    ground_truth.groupby('paper_id_clean')['ROR ID']
    .apply(lambda x: set(ror.strip() for ror in x if ror.strip() != ''))
    .to_dict()
)

correct_paper_level = 0
total_papers = 0
correct_affiliations = 0
total_affiliations = 0
evaluation_details = []

for paper_id, extracted_rors_list in extracted_results.items():
    paper_id_clean = paper_id.split('/')[-1]
    extracted_rors = set(ror_info['ROR_ID'] for ror_info in extracted_rors_list)
    gt_rors = gt_rors_per_paper.get(str(paper_id_clean), set())

    paper_correct = all(
        any(is_match(gt_ror, extracted_ror, ror_relationships) for extracted_ror in extracted_rors)
        for gt_ror in gt_rors
    ) and all(
        any(is_match(extracted_ror, gt_ror, ror_relationships) for gt_ror in gt_rors)
        for extracted_ror in extracted_rors
    )

    if paper_correct:
        correct_paper_level += 1
    total_papers += 1

    real_missed = []
    for gt_ror in gt_rors:
        total_affiliations += 1
        if any(is_match(gt_ror, extracted_ror, ror_relationships) for extracted_ror in extracted_rors):
            correct_affiliations += 1
        else:
            real_missed.append(gt_ror)

    real_extra = []
    for ext_ror in extracted_rors:
        if not any(is_match(ext_ror, gt_ror, ror_relationships) for gt_ror in gt_rors):
            real_extra.append(ext_ror)

    evaluation_details.append({
        "paper_id": paper_id_clean,
        "ground_truth_rors": list(gt_rors),
        "extracted_rors": list(extracted_rors),
        "matched_affiliations": list(gt_rors.intersection(extracted_rors)),  # optional
        "missed_affiliations": real_missed,
        "extra_affiliations": real_extra,
        "paper_exact_match": paper_correct
    })

paper_level_accuracy = correct_paper_level / total_papers if total_papers > 0 else 0
affiliation_level_accuracy = correct_affiliations / total_affiliations if total_affiliations > 0 else 0

with open("evaluation_details_relationship.json", "w", encoding="utf-8") as f:
    json.dump(evaluation_details, f, indent=4, ensure_ascii=False)

print(f"\n Paper-level accuracy: {paper_level_accuracy * 100:.2f}% ({correct_paper_level}/{total_papers})")
print(f" Affiliation-level accuracy: {affiliation_level_accuracy * 100:.2f}% ({correct_affiliations}/{total_affiliations})")
print(" Detailed evaluation saved to evaluation_details.json")



 Paper-level accuracy: 34.67% (320/923)
 Affiliation-level accuracy: 66.21% (2242/3386)
 Detailed evaluation saved to evaluation_details.json
