In [26]:
import tarfile
import io
import zipfile
import importlib
import regex as re
import pyperclip  
import TexSoup as TS
from TexSoup.tokens import MATH_ENV_NAMES
import os


In [None]:
import pandas as pd
from extractors.trie_extractor import TrieExtractor
from utils.file_reader import read_file
from tqdm import tqdm  # Import tqdm for progress bar

333 cLoad the CSV file
csv_path = "data/2201.00_scopus_931_with_ror.csv"
df = pd.read_csv(csv_path)



In [None]:
import zipfile
import re
import io

def find_doc_class(fp, name_match=False):
    """Search for document class related lines in a file and return a code to represent the type"""
    doc_class_pat = re.compile(r"^\s*\\document(?:style|class)")
    sub_doc_class = re.compile(r"^\s*\\document(?:style|class).*(?:\{standalone\}|\{subfiles\})")

    file_content = fp.read()
    try:
        file_text = file_content.decode('utf-8')
    except UnicodeDecodeError:
        file_text = file_content.decode('latin-1')

    for line in file_text.splitlines():
        if doc_class_pat.search(line):
            if name_match:
                if sub_doc_class.search(line):
                    return -99999
                return 1  # Found document class line
    return 0  # No document class line found

def find_main_tex_source_in_folder(zip_file, folder_name):
    """Find the main .tex file inside a folder within the zip archive"""
    tex_names = {"paper", "main", "ms.", "article"}
    
    # Get all .tex files in the folder
    tex_files = [f for f in zip_file.namelist() if f.startswith(folder_name + '/') and f.endswith('.tex')]

    if len(tex_files) == 1:
        return tex_files[0]  # If there's only one .tex file, return it

    main_files = {}
    for tex_file in tex_files:
        depth = tex_file.count('/') - folder_name.count('/')  # Depth relative to folder
        has_main_name = any(kw in tex_file for kw in tex_names)
        
        with zip_file.open(tex_file) as fp:
            main_files[tex_file] = find_doc_class(fp, name_match=has_main_name) - depth

    return max(main_files, key=main_files.get) if main_files else None

def pre_format(text):
    """Format LaTeX text by adding spaces where necessary"""
    # return text.replace('\\}\\', '\\} \\').replace(')}', ') }').replace(')$', ') $')
    return text

def source_from_zip(zip_file, folder_name):
    """Extract and decode the main .tex file from a folder inside the zip archive"""
    tex_main = find_main_tex_source_in_folder(zip_file, folder_name)
    if tex_main:
        with zip_file.open(tex_main) as fp:
            file_content = fp.read()
            try:
                source_text = pre_format(file_content.decode('utf-8'))
            except UnicodeDecodeError:
                source_text = pre_format(file_content.decode('latin-1'))
            return source_text
    return None

def extract_before_abstract(source_text):
    """Extract text before the abstract section"""
    no_comments_text = re.sub(r'(?<!\\)%.*', '', source_text)  # Remove comments
    # no_usepackage_text = re.sub(r'\\usepackage\s*\{[^}]+\}', '', no_comments_text)  # Remove usepackage
    # text = re.sub(r'\\[a-zA-Z]+\{[^}]*\}', '', no_usepackage_text)  # Remove LaTeX commands
    text = no_comments_text
    # text = re.sub(r'\\[a-zA-Z]+\[[^\]]*\]\{[^}]*\}', '', text)
    # text = re.sub(r'\$[^$]*\$', '', text)  # Remove inline math
    # text = text.replace('{', '').replace('}', '').replace('\n', ' ')
    text = ' '.join(text.split())

    # output_file.write(f"text: {text}\n")

    abstract_word_match = re.search(r'begin{abstract}', text, re.IGNORECASE)
    if abstract_word_match:
        return text[:abstract_word_match.start()].strip()

    abstract_match = re.search(r'abstract{', text)
    if abstract_match:
        return text[:abstract_match.start()].strip()
    
    nomacro_abstract_match_2 = re.search(r'abstract', text)
    if nomacro_abstract_match_2:
        return text[:nomacro_abstract_match_2.start()].strip()
    
    nomacro_abstract_match_1 = re.search(r'Abstract', text)
    if nomacro_abstract_match_1:
        return text[:nomacro_abstract_match_1.start()].strip()
    
    return None

zip_file_path = "./2311_tex_test.zip"
output_path = "./output.txt"


with zipfile.ZipFile(zip_file_path, 'r') as zip_file:
    base_folder = zip_file.namelist()[0]
    folders = {name.rstrip('/') for name in zip_file.namelist() if name.startswith(base_folder) and name.endswith('/') and name != base_folder}  # Get folder names

    with open(output_path, 'w', encoding='utf-8') as output_file:
        for folder in folders:
            source_text = source_from_zip(zip_file, folder)
            # output_file.write(f"{folder}:source text: {source_text}\n")
            if source_text:
                content_before_abstract = extract_before_abstract(source_text)
                if content_before_abstract:
                    output_file.write(f"Content before abstract in {folder}:\n{content_before_abstract}\n\n")
                else:
                    output_file.write(f"No abstract found in {folder}, or no content before abstract.\n\n")
            else:
                output_file.write(f"No source found in {folder}.\n\n")

In [30]:
# Read input from a text file and filter out files without valid content before the abstract
input_file_path = 'output.txt'  # Replace with your actual file path
output_file_path = 'filtered_files_with_content_macro.txt'

# Variables to keep track of statistics
total_files_count_author = 0
valid_files_count = 0
valid_files_with_content = []

# Reading and processing the input file
with open(input_file_path, 'r') as infile:
    content_blocks = infile.read().split('Content before abstract in ')
    
    for block in content_blocks:
        if block.strip():  # Ensure we are not processing an empty block
            total_files_count_author += 1
            lines = block.split('\n', 1)  # Split to separate the file name from its content
            if len(lines) > 1:
                file_name = lines[0].strip().replace(':', '')
                content = lines[1].strip()
                
                # Check if the content does not indicate "No abstract found"
                if 'No abstract found' not in content and 'no content before abstract' not in content.lower():
                    valid_files_count += 1
                    valid_files_with_content.append((file_name, content))

# Writing the filtered results to an output file
with open(output_file_path, 'w') as outfile:
    for file_name, content in valid_files_with_content:
        outfile.write(f"Content before abstract in {file_name}:\n{content}\n\n")

# Print or save the statistics summary
print(f"Total number of files processed: {total_files_count_author}")
print(f"Total number of files with valid content before abstract: {valid_files_count}")
print(f"Filtered output saved in: {output_file_path}")


Total number of files processed: 125
Total number of files with valid content before abstract: 111
Filtered output saved in: filtered_files_with_content_macro.txt


In [31]:
import re
import os
import json  # For structured storage

# Define file paths
input_file_path = 'filtered_files_with_content_macro.txt'
output_file_path = './tagged_outputs/all_extracted_tags_cleaned.txt'  # Single output file

# Ensure output directory exists
os.makedirs(os.path.dirname(output_file_path), exist_ok=True)

# Tags to search for (without leading backslashes for dictionary keys)
tags_to_search = [
    'orgname','institution', 'affiliations', 'affiliation', 'icmlaffiliation',
    'institute', 'affil', 'aff', 'AFF', 'university', 'address'
]

# Dictionary to store extracted data
extracted_data = {}

# Function to extract content inside the first level of braces `{}` (handling optional `[]`)
def extract_tag_content(tag, content):
    pattern = rf"(\\{tag})(\[[^\]]*\])?\{{"
    results = []
    start = 0

    while (match := re.search(pattern, content[start:])) is not None:
        start_idx = start + match.end()  # Start after the macro (past `{`)
        brace_level = 1
        end_idx = start_idx

        # Find the matching closing brace
        while brace_level > 0 and end_idx < len(content):
            if content[end_idx] == '{':
                brace_level += 1
            elif content[end_idx] == '}':
                brace_level -= 1
            end_idx += 1

        # Extract only the content inside the first `{}` (excluding the macro name)
        extracted_content = content[start_idx:end_idx - 1].strip()  # Remove trailing `}`
        results.append(extracted_content)
        start = end_idx  # Move to next occurrence

    return results

# Function to process and store extracted content
def extract_and_store_tags(file_name, content, tags):
    extracted_data[file_name] = {}  # Initialize storage for this file
    found_any_tag = False  # Flag to check if we found any tag

    for tag in tags:
        matches = extract_tag_content(tag, content)

        if matches:
            extracted_data[file_name][tag] = matches  # Store under cleaned key
            found_any_tag = True  # At least one macro was found
            break

    # If no macros were found, store the whole pre-abstract content
    if not found_any_tag:
        extracted_data[file_name]["full_content"] = content

# Process input file
total_files_count = 0

with open(input_file_path, 'r') as infile:
    content_blocks = infile.read().split('Content before abstract in ')

    for block in content_blocks:
        if block.strip():
            total_files_count += 1
            lines = block.split('\n', 1)

            if len(lines) > 1:
                file_name = lines[0].strip().replace(':', '')
                content = lines[1].strip()
                extract_and_store_tags(file_name, content, tags_to_search)

json_output = json.dumps(extracted_data, indent=4, ensure_ascii=False)
# json_output = json_output.replace("\\\\", "\\")  # Convert double backslashes to single

with open(output_file_path, 'w', encoding='utf-8') as outfile:
    outfile.write(json_output)


# Print summary
print(f"Total number of files processed: {total_files_count}")
print(f"Extracted tag data saved in: {output_file_path}")


Total number of files processed: 111
Extracted tag data saved in: ./tagged_outputs/all_extracted_tags_cleaned.txt


In [58]:
import requests
import json
import logging
import os
from tenacity import retry, stop_after_attempt, wait_fixed

# Configure logging
logging.basicConfig(level=logging.INFO)

# Configure ROR API information
ROR_SEARCH_URL = 'https://api.ror.org/organizations'

# Initialize ROR cache
ror_cache = {}

# Load persistent cache
cache_file = 'ror_cache.json'
if os.path.exists(cache_file):
    with open(cache_file, 'r', encoding='utf-8') as f:
        ror_cache = json.load(f)
    logging.info("Loaded persistent ROR cache.")
else:
    logging.info("No persistent ROR cache found. Using empty cache.")

@retry(stop=stop_after_attempt(3), wait=wait_fixed(5))
def query_ror(institution_name):
    """
    Query ROR information, returning best match from top 3 candidates.
    """
    if institution_name in ror_cache:
        return ror_cache[institution_name]

    params = {'query': institution_name}

    try:
        response = requests.get(ROR_SEARCH_URL, params=params, timeout=10)
        if response.status_code == 200:
            data = response.json()
            items = data.get('items', [])
            
            if not items:
                logging.warning(f"No ROR information found for '{institution_name}'.")
                ror_cache[institution_name] = None
                return None

            # Only consider top 3 results
            candidates = items[:3]
            chosen = None

            logging.info(f"🔍 Query: {institution_name}")
            for idx, item in enumerate(candidates):
                candidate_name = item.get('name', '').lower()
                if candidate_name in institution_name.lower():
                    chosen = item
                    logging.info(f"✅ Match found with candidate {idx+1}: {item.get('name')}")
                    break
                else:
                    logging.info(f"❌ No match for candidate {idx+1}: {item.get('name')}")

            if not chosen:
                # Fallback: take first one
                chosen = candidates[0]
                logging.info(f"⚠️ No match found, fallback to first candidate: {chosen.get('name')}")

            ror_cache[institution_name] = {
                'ROR_ID': chosen.get('id', 'N/A'),
                'Name': chosen.get('name', 'N/A'),
                'Country': chosen.get('country', {}).get('country_name', 'N/A'),
                'Type': ', '.join(chosen.get('types', []))
            }
            return ror_cache[institution_name]

        else:
            logging.error(f"ROR API query failed with status code: {response.status_code} for institution: {institution_name}")
            ror_cache[institution_name] = None
            return None

    except Exception as e:
        logging.error(f"ROR API query exception for '{institution_name}', Error: {e}")
        ror_cache[institution_name] = None
        return None

# Save cache to a file after all queries
def save_cache():
    with open(cache_file, 'w', encoding='utf-8') as f:
        json.dump(ror_cache, f)
    logging.info("Saved ROR cache to file.")

INFO:root:Loaded persistent ROR cache.


In [33]:
import json
import os
from extractors.trie_extractor import TrieExtractor
from utils.file_reader import read_file

# Define file paths
input_file_path = 'tagged_outputs/all_extracted_tags_cleaned.txt'
output_file_path = 'institution_output_with_ror.json'

# Read the file and replace invalid escape sequences
with open(input_file_path, 'r', encoding='utf-8') as infile:
    raw_data = infile.read()

# Fix invalid escape sequences by double-escaping backslashes
# fixed_data = raw_data.replace("\\", "\\\\")
fixed_data = raw_data

# Try loading again
try:
    extracted_data = json.loads(fixed_data)
    print("✅ JSON successfully loaded!")
except json.JSONDecodeError as e:
    print(f"❌ JSON loading failed: {e}")

✅ JSON successfully loaded!


In [34]:
# Load TrieExtractor for full content/address processing
extractor = TrieExtractor(
    data_path="data/1.34_extracted_ror_data.csv",
    common_words_path="data/common_english_words.txt"
)


In [35]:

import re
import spacy
# Load the spaCy model for Named Entity Recognition (NER)
nlp = spacy.load("en_core_web_sm")

In [36]:
abname = ' $^1$ Robotics and Artificial Intelligence Group, Department of Computer Science, Electrical and Space Engineering, Lule\aa \,\, University of Technology, Sweden (e-mail: (vissan, sumsat, geonik)@ltu.se)'
abname = re.sub(r'\b(Faculty of|Department of|School of|College of)\b[^,]*,', '', abname, flags=re.IGNORECASE)
print(abname)

 $^1$ Robotics and Artificial Intelligence Group,  Electrical and Space Engineering, Lulea \,\, University of Technology, Sweden (e-mail: (vissan, sumsat, geonik)@ltu.se)


In [37]:
stringa = "$^*$Physics Department, Arizona State University, Tempe, Arizona 85287, USA.\\\\ $^\\dag$Theoretical Physics Department, CERN, 1211 Geneva 23, Switzerland."
print(latex_to_unicode(stringa))

^*Physics Department, Arizona State University, Tempe, Arizona 85287, USA.
 ^†Theoretical Physics Department, CERN, 1211 Geneva 23, Switzerland.


In [None]:
def remove_address_spacy(text):
    """Uses spaCy NER to remove location entities from an institution name"""
    doc = nlp(text)
    cleaned_parts = []
    
    for ent in doc.ents:
        if ent.label_ in {"GPE", "LOC"}:  # GPE = Geo-Political Entity (City, Country, etc.)
            continue  # Skip locations
        cleaned_parts.append(ent.text)

    return " ".join(cleaned_parts)

from pylatexenc.latex2text import LatexNodes2Text

converter = LatexNodes2Text()

def latex_to_unicode(text):
    """Safely converts LaTeX special characters to Unicode, handling errors."""
    if not text or text.strip() == "":
        return ""  # Avoid empty input issues
    
    try:
        return converter.latex_to_text(text)
    except IndexError as e:
        print(f"⚠️ Warning: Error parsing LaTeX in text: {text}")
        print("\n⚠️ latex_to_unicode parse error encountered!")
        print("Full input string:")
        print("--------------------------------------------------")
        print(text)
        print("--------------------------------------------------")
        print(f"Error message: {e}\n")
        return text  # fallback: just return the original text
    
import re

def mild_clean_affiliation(affiliation):
    parts = [p.strip() for p in affiliation.split(',')]
    if len(parts) > 3:
        # If any part contains 'university', return that part + next part (if exists)
        for i, part in enumerate(parts):
            if 'university' in part.lower():
                if i < len(parts) - 1:
                    return f"{part}, {parts[i+1]}"
                else:
                    return part
        affiliation = ', '.join(parts[:3])
    return affiliation.strip()

import re



def split_affiliations_before_unicode(raw_text):
    pattern = re.compile(r'(\$\\?\^(?:\\dag|\\star|\*|\d+))')
    positions = [m.start() for m in pattern.finditer(raw_text)] + [len(raw_text)]

    results = []
    for i in range(len(positions) - 1):
        segment = raw_text[positions[i]:positions[i+1]].strip()
        cleaned_segment = pattern.sub('', segment, count=1).strip()
        if cleaned_segment:
            results.append(cleaned_segment)

    return results

stringa = r"$^*$Physics Department, Arizona State University, Tempe, Arizona 85287, USA. \\ $^\\dag$Theoretical Physics Department, CERN, 1211 Geneva 23, Switzerland."

print("\nOriginal input:")
print(stringa)

split_result = split_affiliations_before_unicode(stringa)

print("\nSplit affiliations before unicode:")
for idx, aff in enumerate(split_result, 1):
    print(f"Institution {idx}: {aff}")




Original input:
$^*$Physics Department, Arizona State University, Tempe, Arizona 85287, USA. \\ $^\\dag$Theoretical Physics Department, CERN, 1211 Geneva 23, Switzerland.

Split affiliations before unicode:
Institution 1: $Physics Department, Arizona State University, Tempe, Arizona 85287, USA. \\ $^\\dag$Theoretical Physics Department, CERN, 1211 Geneva 23, Switzerland.


In [44]:
import re
from pylatexenc.latex2text import LatexNodes2Text

converter = LatexNodes2Text()

def remove_address_spacy(text):
    """Uses spaCy NER to remove location entities from an institution name"""
    doc = nlp(text)
    cleaned_parts = []

    for ent in doc.ents:
        if ent.label_ in {"GPE", "LOC"}:
            continue  # Skip locations
        cleaned_parts.append(ent.text)

    return " ".join(cleaned_parts)

def latex_to_unicode(text):
    """Safely converts LaTeX special characters to Unicode, handling errors."""
    if not text or text.strip() == "":
        return ""  # Avoid empty input issues

    try:
        return converter.latex_to_text(text)
    except IndexError as e:
        print(f"⚠️ Warning: Error parsing LaTeX in text: {text}")
        print("\n⚠️ latex_to_unicode parse error encountered!")
        print("Full input string:")
        print("--------------------------------------------------")
        print(text)
        print("--------------------------------------------------")
        print(f"Error message: {e}\n")
        return text  # fallback: just return the original text

def mild_clean_affiliation(affiliation):
    parts = [p.strip() for p in affiliation.split(',')]
    if len(parts) > 3:
        for i, part in enumerate(parts):
            if 'university' in part.lower():
                if i < len(parts) - 1:
                    return f"{part}, {part}, {parts[i+1]}"
                else:
                    return part
        affiliation = ', '.join(parts[:3])
    return affiliation.strip()


def split_affiliations_before_unicode(raw_text):
    temp_text = re.sub(r'(\\?\\?\$\\?\\^)(\\?\\dag|\\?\\star|\\?\*|\\?\\)', r'$^SPECIAL', raw_text)
    split_result = re.split(r'\\?\\?\$\\?\\^(?:SPECIAL|\d+)', temp_text)
    cleaned = [chunk.strip(' .\\') for chunk in split_result if chunk.strip()]
    return cleaned


def smart_pick_best_ror(query_text, candidates):
    """Choose the best matching ROR result based on exact match priority."""
    for idx, item in enumerate(candidates):
        name = item.get('name', '').lower()
        if name and name in query_text.lower():
            print(f"✅ Match found with candidate {idx+1}: {name}")
            return item
        else:
            print(f"❌ No match for candidate {idx+1}: {name}")
    print("⚠️ No candidate matched exactly, fallback to first.")
    return candidates[0] if candidates else None


    
stringa1 = r"$^*$Physics Department, Arizona State University, Tempe, Arizona 85287, USA. \\ $^\\dag$Theoretical Physics Department, CERN, 1211 Geneva 23, Switzerland."
stringa2 = r"$^1$ Robotics and Artificial Intelligence Group, Department of Computer Science, Electrical and Space Engineering, Lule\\aa \,\, University of Technology, Sweden (e-mail: (vissan, sumsat, geonik)@ltu.se) \\ $^2$ Department of Computer Science, University of Helsinki, Finland."

print("\nOriginal input:")
print(stringa2)

split_result = split_affiliations_before_unicode(stringa2)

print("\nSplit affiliations before unicode:")
for idx, aff in enumerate(split_result, 1):
    print(f"Institution {idx}: {aff}")



Original input:
$^1$ Robotics and Artificial Intelligence Group, Department of Computer Science, Electrical and Space Engineering, Lule\\aa \,\, University of Technology, Sweden (e-mail: (vissan, sumsat, geonik)@ltu.se) \\ $^2$ Department of Computer Science, University of Helsinki, Finland.

Split affiliations before unicode:
Institution 1: $^1$ Robotics and Artificial Intelligence Group, Department of Computer Science, Electrical and Space Engineering, Lule\\aa \,\, University of Technology, Sweden (e-mail: (vissan, sumsat, geonik)@ltu.se) \\ $^2$ Department of Computer Science, University of Helsinki, Finland


In [None]:
results = {}

# Process each paper
for paper, macros in extracted_data.items():
    unique_rors = {}

    if isinstance(macros, dict) and set(macros.keys()) in [{"full_content"}, {"address"}]:
        text = "\n".join(macros.get("full_content", [])) + "\n" + "\n".join(macros.get("address", []))
        text = latex_to_unicode(text)
        extracted_ror_ids = extractor.extract_affiliations(text)

        if extracted_ror_ids:
            unique_rors.update({ror_id: {"ROR_ID": ror_id} for ror_id in extracted_ror_ids})

    else:
        for content_list in macros.values():
            for institution_name in content_list:
                
                raw_institutions = re.split(r'(?:\\\\)?\\and', institution_name)

                for raw_inst in raw_institutions:
                    raw_inst = raw_inst.strip()

                    split_by_superscript = re.split(r'\s*\^\d+', raw_inst)
                    split_by_superscript = [inst.strip() for inst in split_by_superscript if inst.strip()]

                    for raw_piece in split_by_superscript:
                        if not raw_piece:
                            continue

                        cleaned_piece = latex_to_unicode(raw_piece)

                        cleaned_piece = cleaned_piece.replace("{", " ").replace("}", " ").replace("^", " ").replace("[", " ").replace("]", " ").strip()
                        cleaned_piece = mild_clean_affiliation(cleaned_piece)
                        cleaned_piece = re.sub(r'^(.*?\b(Department of|School of|Faculty of|College of|Department)\b[^,]*),', '', cleaned_piece, flags=re.IGNORECASE)

                        if cleaned_piece:
                            ror_info = query_ror(cleaned_piece)
                            if ror_info:
                                unique_rors[ror_info["ROR_ID"]] = ror_info  # Store unique RORs

    if unique_rors:
        results[paper] = list(unique_rors.values())

with open(output_file_path, 'w', encoding='utf-8') as outfile:
    json.dump(results, outfile, indent=4, ensure_ascii=False)

print(f"✅ JSON export completed. Results saved to {output_file_path}")


INFO:root:🔍 Query: Instituto de Astrofísica e Ciências do Espaço, Universidade do Porto, CAUP
INFO:root:❌ No match for candidate 1: Institute of Astrophysics and Space Sciences
INFO:root:❌ No match for candidate 2: Instituto de Ciências da Terra e do Espaço
INFO:root:❌ No match for candidate 3: Centre for Astrophysics of the University of Porto
INFO:root:⚠️ No match found, fallback to first candidate: Institute of Astrophysics and Space Sciences
INFO:root:🔍 Query: Instituto de Astrofísica de Canarias (IAC), E-38205 La Laguna, Tenerife
INFO:root:✅ Match found with candidate 1: Instituto de Astrofísica de Canarias
INFO:root:🔍 Query: Universidad de La Laguna (ULL), Departamento de Astrofísica, E-38206 La Laguna
INFO:root:✅ Match found with candidate 1: Universidad de La Laguna
INFO:root:🔍 Query: Ecole Centrale-Supelec, Université Paris-Saclay, 91190 Gif-sur-Yvette
INFO:root:✅ Match found with candidate 1: Université Paris-Saclay
INFO:root:🔍 Query: INAF – Osservatorio Astrofisico di Catani


⚠️ latex_to_unicode parse error encountered!
Full input string:
--------------------------------------------------
\href{http://www.ncbj.gov.pl}{National Centre for Nuclear Research}
--------------------------------------------------
Error message: list index out of range



INFO:pylatexenc.latexwalker:Ignoring parse error (tolerant parsing mode): End of input while parsing arguments of macro "\" @(1,93)
Open LaTeX blocks:
           @(1,93)  arguments of macro "\"

INFO:pylatexenc.latexwalker:Ignoring parse error (tolerant parsing mode): End of input while parsing arguments of macro "\" @(1,93)
Open LaTeX blocks:
           @(1,93)  arguments of macro "\"

INFO:pylatexenc.latexwalker:Ignoring parse error (tolerant parsing mode): End of input while parsing arguments of macro "\" @(1,82)
Open LaTeX blocks:
           @(1,82)  arguments of macro "\"

INFO:pylatexenc.latexwalker:Ignoring parse error (tolerant parsing mode): End of input while parsing arguments of macro "\" @(1,123)
Open LaTeX blocks:
          @(1,123)  arguments of macro "\"



✅ JSON export completed. Results saved to institution_output_with_ror.json


In [None]:
import json
import pandas as pd

ground_truth = pd.read_csv("data/2311_with_ror.csv")

with open("institution_output_with_ror.json", 'r', encoding='utf-8') as f:
    extracted_results = json.load(f)

ground_truth['ROR ID'] = ground_truth['ROR ID'].fillna('').astype(str)
ground_truth['paper_id_clean'] = ground_truth['paper_id'].astype(str).str.strip().str.replace(r'v\\d+$', '', regex=True)

gt_rors_per_paper = (
    ground_truth.groupby('paper_id_clean')['ROR ID']
    .apply(lambda x: set(ror.strip() for ror in x if ror.strip() != ''))
    .to_dict()
)

correct_paper_level = 0
total_papers = 0
correct_affiliations = 0
total_affiliations = 0
evaluation_details = []

for paper_id, extracted_rors_list in extracted_results.items():
    paper_id_clean = paper_id.split('/')[-1]
    extracted_rors = set(ror_info['ROR_ID'] for ror_info in extracted_rors_list)
    gt_rors = gt_rors_per_paper.get(str(paper_id_clean), set())

    paper_correct = extracted_rors == gt_rors
    if paper_correct:
        correct_paper_level += 1

    total_papers += 1

    matched_affiliations = extracted_rors.intersection(gt_rors)
    missed_affiliations = gt_rors - extracted_rors
    extra_affiliations = extracted_rors - gt_rors

    for gt_ror in gt_rors:
        total_affiliations += 1
        if gt_ror in extracted_rors:
            correct_affiliations += 1

    evaluation_details.append({
        "paper_id": paper_id_clean,
        "ground_truth_rors": list(gt_rors),
        "extracted_rors": list(extracted_rors),
        "matched_affiliations": list(matched_affiliations),
        "missed_affiliations": list(missed_affiliations),
        "extra_affiliations": list(extra_affiliations),
        "paper_exact_match": paper_correct
    })

paper_level_accuracy = correct_paper_level / total_papers if total_papers > 0 else 0
affiliation_level_accuracy = correct_affiliations / total_affiliations if total_affiliations > 0 else 0

# Save evaluation details to JSON
with open("evaluation_details.json", "w", encoding="utf-8") as f:
    json.dump(evaluation_details, f, indent=4, ensure_ascii=False)

print(f"\n Paper-level accuracy: {paper_level_accuracy * 100:.2f}% ({correct_paper_level}/{total_papers})")
print(f" Affiliation-level accuracy: {affiliation_level_accuracy * 100:.2f}% ({correct_affiliations}/{total_affiliations})")
print(" Detailed evaluation saved to evaluation_details.json")



 Paper-level accuracy: 21.43% (12/56)
 Affiliation-level accuracy: 53.52% (114/213)
 Detailed evaluation saved to evaluation_details.json


In [42]:
import requests
import json

def test_ror_query(institution_name):
    ROR_SEARCH_URL = 'https://api.ror.org/organizations'
    params = {'query': institution_name}
    
    response = requests.get(ROR_SEARCH_URL, params=params, timeout=10)
    if response.status_code == 200:
        data = response.json()
        print(f"\n🔎 Query: {institution_name}")
        print(f"Number of results: {len(data.get('items', []))}")
        for item in data.get('items', []):
            print(f"Name: {item.get('name')}")
            print(f"Score: {item.get('score')}")
            print(f"Country: {item.get('country', {}).get('country_name')}")
            print(f"ROR ID: {item.get('id')}")
            print("------")
    else:
        print(f"Request failed with status code {response.status_code}")

# Example test cases:
test_ror_query("Department of Physics, University of Illinois at Urbana-Champaign, Urbana, Illinois 61801-3080, USA")
test_ror_query("Department of Radiology, Wake Forest University School of Medicine, Winston-Salem, North Carolina, USA")
test_ror_query("Institute of Astronomy and Space Physics, Buenos Aires, Argentina")



🔎 Query: Department of Physics, University of Illinois at Urbana-Champaign, Urbana, Illinois 61801-3080, USA
Number of results: 20
Name: University of Illinois Urbana-Champaign
Score: None
Country: United States
ROR ID: https://ror.org/047426m28
------
Name: Urbana University
Score: None
Country: United States
ROR ID: https://ror.org/04kp3hw27
------
Name: University of Illinois Chicago, Rockford campus
Score: None
Country: United States
ROR ID: https://ror.org/02437s643
------
Name: University of Illinois Chicago
Score: None
Country: United States
ROR ID: https://ror.org/02mpq6x41
------
Name: Peoria campus of the University of Illinois System
Score: None
Country: United States
ROR ID: https://ror.org/02qrdc062
------
Name: University of Illinois at Springfield
Score: None
Country: United States
ROR ID: https://ror.org/0126qma51
------
Name: Illinois Department of Agriculture
Score: None
Country: United States
ROR ID: https://ror.org/00pd5rv22
------
Name: Illinois Department of Tran

In [None]:
import json

with open('ror/v1.52-2024-09-16-ror-data_schema_v2.json', 'r', encoding='utf-8') as f:
    ror_data = json.load(f)

ror_relationships = {}

for record in ror_data:
    ror_id = record['id']
    related_ids = set()
    for rel in record.get('relationships', []):
        related_ids.add(rel['id'])
    ror_relationships[ror_id] = related_ids


In [52]:
def is_match(ror1, ror2, ror_relationships):
    related1 = ror_relationships.get(ror1, set())
    related2 = ror_relationships.get(ror2, set())
    return (ror1 == ror2) or (ror2 in related1) or (ror1 in related2)


In [None]:
import json
import pandas as pd

ground_truth = pd.read_csv("data/2311_with_ror.csv")

with open("institution_output_with_ror.json", 'r', encoding='utf-8') as f:
    extracted_results = json.load(f)

ground_truth['ROR ID'] = ground_truth['ROR ID'].fillna('').astype(str)
ground_truth['paper_id_clean'] = ground_truth['paper_id'].astype(str).str.strip().str.replace(r'v\\d+$', '', regex=True)

gt_rors_per_paper = (
    ground_truth.groupby('paper_id_clean')['ROR ID']
    .apply(lambda x: set(ror.strip() for ror in x if ror.strip() != ''))
    .to_dict()
)

correct_paper_level = 0
total_papers = 0
correct_affiliations = 0
total_affiliations = 0
evaluation_details = []

for paper_id, extracted_rors_list in extracted_results.items():
    paper_id_clean = paper_id.split('/')[-1]
    extracted_rors = set(ror_info['ROR_ID'] for ror_info in extracted_rors_list)
    gt_rors = gt_rors_per_paper.get(str(paper_id_clean), set())

    paper_correct = all(
        any(is_match(gt_ror, extracted_ror, ror_relationships) for extracted_ror in extracted_rors)
        for gt_ror in gt_rors
    ) and all(
        any(is_match(extracted_ror, gt_ror, ror_relationships) for gt_ror in gt_rors)
        for extracted_ror in extracted_rors
    )

    if paper_correct:
        correct_paper_level += 1
    total_papers += 1

    real_missed = []
    for gt_ror in gt_rors:
        total_affiliations += 1
        if any(is_match(gt_ror, extracted_ror, ror_relationships) for extracted_ror in extracted_rors):
            correct_affiliations += 1
        else:
            real_missed.append(gt_ror)

    real_extra = []
    for ext_ror in extracted_rors:
        if not any(is_match(ext_ror, gt_ror, ror_relationships) for gt_ror in gt_rors):
            real_extra.append(ext_ror)

    evaluation_details.append({
        "paper_id": paper_id_clean,
        "ground_truth_rors": list(gt_rors),
        "extracted_rors": list(extracted_rors),
        "matched_affiliations": list(gt_rors.intersection(extracted_rors)),  # optional
        "missed_affiliations": real_missed,
        "extra_affiliations": real_extra,
        "paper_exact_match": paper_correct
    })

paper_level_accuracy = correct_paper_level / total_papers if total_papers > 0 else 0
affiliation_level_accuracy = correct_affiliations / total_affiliations if total_affiliations > 0 else 0

with open("evaluation_details_relationship.json", "w", encoding="utf-8") as f:
    json.dump(evaluation_details, f, indent=4, ensure_ascii=False)

print(f"\n Paper-level accuracy: {paper_level_accuracy * 100:.2f}% ({correct_paper_level}/{total_papers})")
print(f" Affiliation-level accuracy: {affiliation_level_accuracy * 100:.2f}% ({correct_affiliations}/{total_affiliations})")
print(" Detailed evaluation saved to evaluation_details.json")



 Paper-level accuracy: 39.29% (22/56)
 Affiliation-level accuracy: 75.12% (160/213)
 Detailed evaluation saved to evaluation_details.json


In [2]:
import json
import pandas as pd

# Step 1: Load ground truth
ground_truth = pd.read_csv("data/2311_with_ror.csv")

# Step 2: Load extracted results from new file
with open("final_affiliations_2000_parallel.json", 'r', encoding='utf-8') as f:
    extracted_results_raw = json.load(f)

# Step 3: Preprocess ground truth
ground_truth['ROR ID'] = ground_truth['ROR ID'].fillna('').astype(str)
ground_truth['paper_id_clean'] = ground_truth['paper_id'].astype(str).str.strip().str.replace(r'v\\d+$', '', regex=True)

gt_rors_per_paper = (
    ground_truth.groupby('paper_id_clean')['ROR ID']
    .apply(lambda x: set(ror.strip() for ror in x if ror.strip() != ''))
    .to_dict()
)

# Step 4: Preprocess extracted results into a {paper_id: set(rors)} format
extracted_results = {}
for item in extracted_results_raw:
    paper_id = item.get("File ID", "").strip()
    extracted_rors = set()
    for inst in item.get("institutions_with_ror", []):
        ror_id = inst.get("ror_id", "").strip()
        if ror_id:
            # Add full URL form if needed
            if not ror_id.startswith("https://ror.org/"):
                ror_id = f"https://ror.org/{ror_id}"
            extracted_rors.add(ror_id)
    extracted_results[paper_id] = extracted_rors

# Step 5: Load ROR relationships file
with open("ror/v1.52-2024-09-16-ror-data_schema_v2.json", 'r', encoding='utf-8') as f:
    ror_full_data = json.load(f)

# Build relationship dictionary
ror_relationships = {}
for entry in ror_full_data:
    ror_id = entry.get("id", "")
    if ror_id:
        children = set(rel["id"] for rel in entry.get("relationships", []) if rel["type"] == "child")
        parents = set(rel["id"] for rel in entry.get("relationships", []) if rel["type"] == "parent")
        ror_relationships[ror_id] = {"children": children, "parents": parents}

# Step 6: Define is_match function
def is_match(ror_a, ror_b, relationships):
    if ror_a == ror_b:
        return True
    a_rel = relationships.get(ror_a, {"children": set(), "parents": set()})
    b_rel = relationships.get(ror_b, {"children": set(), "parents": set()})
    if ror_b in a_rel["children"] or ror_b in a_rel["parents"]:
        return True
    if ror_a in b_rel["children"] or ror_a in b_rel["parents"]:
        return True
    return False

# Step 7: Evaluate
correct_paper_level = 0
total_papers = 0
correct_affiliations = 0
total_affiliations = 0
evaluation_details = []

for paper_id, extracted_rors in extracted_results.items():
    gt_rors = gt_rors_per_paper.get(paper_id, set())

    paper_correct = all(
        any(is_match(gt_ror, extracted_ror, ror_relationships) for extracted_ror in extracted_rors)
        for gt_ror in gt_rors
    ) and all(
        any(is_match(extracted_ror, gt_ror, ror_relationships) for gt_ror in gt_rors)
        for extracted_ror in extracted_rors
    )

    if paper_correct:
        correct_paper_level += 1
    total_papers += 1

    real_missed = []
    for gt_ror in gt_rors:
        total_affiliations += 1
        if any(is_match(gt_ror, extracted_ror, ror_relationships) for extracted_ror in extracted_rors):
            correct_affiliations += 1
        else:
            real_missed.append(gt_ror)

    real_extra = []
    for ext_ror in extracted_rors:
        if not any(is_match(ext_ror, gt_ror, ror_relationships) for gt_ror in gt_rors):
            real_extra.append(ext_ror)

    evaluation_details.append({
        "paper_id": paper_id,
        "ground_truth_rors": list(gt_rors),
        "extracted_rors": list(extracted_rors),
        "matched_affiliations": list(gt_rors.intersection(extracted_rors)),
        "missed_affiliations": real_missed,
        "extra_affiliations": real_extra,
        "paper_exact_match": paper_correct
    })

paper_level_accuracy = correct_paper_level / total_papers if total_papers > 0 else 0
affiliation_level_accuracy = correct_affiliations / total_affiliations if total_affiliations > 0 else 0

# Step 8: Save evaluation results
with open("evaluation_details_relationship.json", "w", encoding="utf-8") as f:
    json.dump(evaluation_details, f, indent=4, ensure_ascii=False)

print(f"\n✅ Paper-level accuracy: {paper_level_accuracy * 100:.2f}% ({correct_paper_level}/{total_papers})")
print(f"✅ Affiliation-level accuracy: {affiliation_level_accuracy * 100:.2f}% ({correct_affiliations}/{total_affiliations})")
print("✅ Detailed evaluation saved to evaluation_details_relationship.json")



✅ Paper-level accuracy: 41.25% (825/2000)
✅ Affiliation-level accuracy: 57.71% (3965/6870)
✅ Detailed evaluation saved to evaluation_details_relationship.json
