In [1]:
import tarfile
import io
import zipfile
import importlib
import regex as re
import pyperclip  
import TexSoup as TS
from TexSoup.tokens import MATH_ENV_NAMES
import os


In [2]:
def find_doc_class(fp, name_match=False):
    '''Search for document class related lines in a file and return a code to represent the type'''
    doc_class_pat = re.compile(r"^\s*\\document(?:style|class)")
    sub_doc_class = re.compile(r"^\s*\\document(?:style|class).*(?:\{standalone\}|\{subfiles\})")

    # Read the content as bytes
    file_content = fp.read()
    try:
        # Try decoding with UTF-8
        file_text = file_content.decode('utf-8')
    except UnicodeDecodeError:
        # Fallback to latin-1 encoding if UTF-8 fails
        file_text = file_content.decode('latin-1')

    for line in file_text.splitlines():
        if doc_class_pat.search(line):
            if name_match:
                if sub_doc_class.search(line):
                    return -99999
                return 1  # Found document class line
    return 0  # No document class line found

def find_main_tex_source_in_tar(tar_file, encoding='utf-8'):
    tex_names = set(["paper", "main", "ms.", "article"])
    tex_files = [f for f in tar_file.getnames() if f.endswith('.tex')]

    if len(tex_files) == 1:
        return tex_files[0]

    main_files = {}
    for tf in tex_files:
        depth = len(tf.split('/')) - 1
        has_main_name = any(kw in tf for kw in tex_names)
        fp = tar_file.extractfile(tf)
        if fp:
            main_files[tf] = find_doc_class(fp, name_match=has_main_name) - depth
            fp.close()

    return max(main_files, key=main_files.get) if main_files else None

def pre_format(text):
    source_text = text.replace('\\}\\', '\\} \\').replace(')}', ') }').replace(')$', ') $')
    return source_text

def source_from_tar(tar_file, encoding='utf-8'):
    tex_main = find_main_tex_source_in_tar(tar_file, encoding=encoding)
    if tex_main:
        fp = tar_file.extractfile(tex_main)
        if fp is not None:
            file_content = fp.read()  # Read as bytes to keep it in memory
            try:
                # Attempt to decode using UTF-8
                source_text = pre_format(file_content.decode(encoding))
            except UnicodeDecodeError:
                # Fallback to latin-1 encoding if UTF-8 fails
                source_text = pre_format(file_content.decode('latin-1'))
            return source_text
    return None

def extract_before_abstract(source_text):
    no_comments_text = re.sub(r'(?<!\\)%.*', '', source_text)
    no_usepackage_text = re.sub(r'\\usepackage\s*\{[^}]+\}', '', no_comments_text)
    text = re.sub(r'\\[a-zA-Z]+\{[^}]*\}', '', no_usepackage_text)
    text = re.sub(r'\\[a-zA-Z]+\[[^\]]*\]\{[^}]*\}', '', no_usepackage_text)
    text = re.sub(r'\$[^$]*\$', '', no_usepackage_text)
    text = no_usepackage_text.replace('{', '').replace('}', '').replace('\n', ' ')
    text = ' '.join(no_usepackage_text.split())
    abstract_match = re.search(r'\\begin\s*\{\s*abstract\s*\}', text)

    if abstract_match:
        return text[:abstract_match.start()].strip()
    
    abstract_word_match = re.search(r'\babstract\b', text, re.IGNORECASE)
    if abstract_word_match:
        return text[:abstract_word_match.start()].strip()
    return None

zip_file_path = "./2401.zip"
output_path = "./firstoutput.txt"

latest_versions = {}
# Track the latest version of each identifier
with zipfile.ZipFile(zip_file_path, 'r') as zip_file:
    tar_files = [f for f in zip_file.namelist() if f.endswith('.tar.gz')]
    for tar_name in tar_files:
        base_name, version = tar_name.rsplit("v", 1)
        version_num = int(version.split('.')[0])  # Extract version number
        if base_name not in latest_versions or version_num > latest_versions[base_name][1]:
            latest_versions[base_name] = (tar_name, version_num)

with open(output_path, 'w', encoding='utf-8') as output_file:
    for base_name, (tar_name, version) in latest_versions.items():
        with zipfile.ZipFile(zip_file_path, 'r') as zip_file:
            with zip_file.open(tar_name) as tar_bytes:
                tar_file = tarfile.open(fileobj=io.BytesIO(tar_bytes.read()), mode='r:gz')
                source_text = source_from_tar(tar_file)
                if source_text:
                    content_before_abstract = extract_before_abstract(source_text)
                    if content_before_abstract:
                        output_file.write(f"Content before abstract in {tar_name}:\n{content_before_abstract}\n\n")
                    else:
                        output_file.write(f"No abstract found in {tar_name}, or no content before abstract.\n\n")
                tar_file.close()


NameError: name 'zipfile' is not defined

In [3]:
# Read input from a text file and filter out files without valid content before the abstract
input_file_path = 'firstoutput.txt'  # Replace with your actual file path
output_file_path = 'filtered_files_with_content_macro.txt'

# Variables to keep track of statistics
total_files_count_author = 0
valid_files_count = 0
valid_files_with_content = []

# Reading and processing the input file
with open(input_file_path, 'r') as infile:
    content_blocks = infile.read().split('Content before abstract in ')
    
    for block in content_blocks:
        if block.strip():  # Ensure we are not processing an empty block
            total_files_count_author += 1
            lines = block.split('\n', 1)  # Split to separate the file name from its content
            if len(lines) > 1:
                file_name = lines[0].strip().replace(':', '')
                content = lines[1].strip()
                
                # Check if the content does not indicate "No abstract found"
                if 'No abstract found' not in content and 'no content before abstract' not in content.lower():
                    valid_files_count += 1
                    valid_files_with_content.append((file_name, content))

# Writing the filtered results to an output file
with open(output_file_path, 'w') as outfile:
    for file_name, content in valid_files_with_content:
        outfile.write(f"Content before abstract in {file_name}:\n{content}\n\n")

# Print or save the statistics summary
# print(f"Total number of files processed: {total_files_count}")
# print(f"Total number of files with valid content before abstract: {valid_files_count}")
# print(f"Filtered output saved in: {output_file_path}")


1818 files with content
183 no abstract found (probably wrong tex file)

In [31]:
import re
import os

# Define the input file path and output file directory
input_file_path = 'filtered_files_with_content_macro.txt'
output_dir = './tagged_outputs/'

# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Tags to search for, their output files, and counters
tags_to_search = [
    r'\\institution',
    r'\\affiliations', r'\\affiliation', r'\\icmlaffiliation',  
    r'\\institute', r'\\affil', r'\\aff', r'\\AFF', r'\\address'
]
tag_files = {tag: [] for tag in tags_to_search}  # Dictionary to store file content for each tag
tag_counts = {tag: 0 for tag in tags_to_search}   # Counter for unique papers containing each tag

# Function to extract the content within the first level of braces after each tag
def extract_tag_content(tag, content):
    pattern = rf"({tag}\{{)"
    results = []
    start = 0
    while (match := re.search(pattern, content[start:])) is not None:
        # Find the opening brace position and keep the tag itself
        start_idx = start + match.start()
        tag_with_brace = match.group(1)  # Keep the matched tag including the opening brace
        brace_level = 1
        end_idx = start_idx + len(tag_with_brace)

        # Find the matching closing brace
        while brace_level > 0 and end_idx < len(content):
            if content[end_idx] == '{':
                brace_level += 1
            elif content[end_idx] == '}':
                brace_level -= 1
            end_idx += 1

        # Extract and store the full tag with content in braces
        results.append(content[start_idx:end_idx])
        start = end_idx  # Move the start index to continue searching

    return results

# Function to process and assign content for each tag in a file
def extract_and_assign_tag_content(file_name, content, tags):
    tag_found = set()  # Track tags found in this paper to avoid counting duplicates

    for tag in tags:
        # Extract content with braces for each occurrence of the tag
        matches = extract_tag_content(tag, content)

        # If matches are found, add them to the corresponding tag's list
        if matches:
            extracted_content = f"Content in {tag} for {file_name}:\n" + "\n".join(matches) + "\n"
            tag_files[tag].append(extracted_content)

            # Only increment the count if this tag hasn't been counted for this paper yet
            if tag not in tag_found:
                tag_counts[tag] += 1
                tag_found.add(tag)  # Mark this tag as found for this paper

    return bool(tag_found)  # Return True if any tags were found

# Process the input file
total_files_count = 0

with open(input_file_path, 'r') as infile:
    content_blocks = infile.read().split('Content before abstract in ')

    for block in content_blocks:
        if block.strip():  # Ensure we are not processing an empty block
            total_files_count += 1
            lines = block.split('\n', 1)
            if len(lines) > 1:
                file_name = lines[0].strip().replace(':', '')
                content = lines[1].strip()

                # Extract and assign content within tags
                extract_and_assign_tag_content(file_name, content, tags_to_search)

# Write each tag's extracted content to its respective output file
for tag in tags_to_search:
    output_file_path = "{}{}_output.txt".format(output_dir, tag.replace('\\', ''))
    with open(output_file_path, 'w') as outfile:
        outfile.write('\n'.join(tag_files[tag]))

# Print statistics
print(f"Total number of files processed: {total_files_count}")
for tag, count in tag_counts.items():
    print(f"Number of papers with tag '{tag}': {count}")
print(f"Tagged output files saved in directory: {output_dir}")


Total number of files processed: 1818
Number of papers with tag '\\institution': 104
Number of papers with tag '\\affiliations': 35
Number of papers with tag '\\affiliation': 451
Number of papers with tag '\\icmlaffiliation': 37
Number of papers with tag '\\institute': 93
Number of papers with tag '\\affil': 30
Number of papers with tag '\\aff': 7
Number of papers with tag '\\AFF': 5
Number of papers with tag '\\address': 146
Tagged output files saved in directory: ./tagged_outputs/


In [None]:
import re
import os
import json
import requests
import logging
from tenacity import retry, stop_after_attempt, wait_fixed

# Configure logging
logging.basicConfig(level=logging.INFO)

# Configure ROR API information
ROR_SEARCH_URL = 'https://api.ror.org/organizations'

# Initialize ROR cache
ror_cache = {}

# Load persistent cache
cache_file = 'ror_cache.json'
if os.path.exists(cache_file):
    with open(cache_file, 'r', encoding='utf-8') as f:
        ror_cache = json.load(f)
    logging.info("Loaded persistent ROR cache.")
else:
    logging.info("No persistent ROR cache found. Using empty cache.")

@retry(stop=stop_after_attempt(3), wait=wait_fixed(5))
def query_ror(institution_name):
    """
    Query ROR information, returning details only if the name exactly matches.
    """
    if institution_name in ror_cache:
        return ror_cache[institution_name]
    
    # ROR API search endpoint
    params = {'query': institution_name}
    
    try:
        response = requests.get(ROR_SEARCH_URL, params=params, timeout=10)
        if response.status_code == 200:
            data = response.json()
            if data.get('items'):
                # Iterate through all returned items to find an exact name match
                for item in data['items']:
                    ror_name = item.get('name', '').strip()
                    # Compare names, case-insensitive and ignore leading/trailing spaces
                    if ror_name.lower() == institution_name.strip().lower():
                        ror_cache[institution_name] = {
                            'ROR_ID': item.get('id', 'N/A'),
                            'Name': ror_name,
                            'Country': item.get('country', {}).get('country_name', 'N/A'),
                            'Type': ', '.join(item.get('types', []))
                        }
                        logging.info(f"Found ROR ID for '{institution_name}': {item.get('id', 'N/A')}")
                        return ror_cache[institution_name]
                
                # If no exact match found
                logging.warning(f"No exact match found for institution '{institution_name}' in ROR.")
                ror_cache[institution_name] = None
                return None
            else:
                logging.warning(f"No ROR information found for institution '{institution_name}'.")
                ror_cache[institution_name] = None
                return None
        else:
            logging.error(f"ROR API query failed with status code: {response.status_code} for institution: {institution_name}")
            ror_cache[institution_name] = None
            return None
    except Exception as e:
        logging.error(f"ROR API query exception for institution: {institution_name}, Error: {e}")
        ror_cache[institution_name] = None
        return None

# Save cache to a file after all queries
def save_cache():
    with open(cache_file, 'w', encoding='utf-8') as f:
        json.dump(ror_cache, f)
    logging.info("Saved ROR cache to file.")

# Process institution_output.txt to add ROR details
input_file_path = 'tagged_outputs/institution_output.txt'
output_file_path = 'institution_output_with_ror.txt'

with open(input_file_path, 'r', encoding='utf-8') as infile, open(output_file_path, 'w', encoding='utf-8') as outfile:
    current_file = None
    for line in infile:
        if line.startswith("Content in \\institution for"):
            # Write header line to output
            outfile.write(line)
            current_file = line.strip()
        elif line.startswith("\\institution{"):
            # Extract institution name within braces
            institution_name = re.search(r'\\institution\{(.*?)\}', line).group(1).strip()
            
            # Query ROR API for this institution name
            ror_info = query_ror(institution_name)
            
            # Write the original line to output
            outfile.write(line)
            
            # Write ROR information if available
            if ror_info:
                ror_details = (
                    f"  - ROR_ID: {ror_info['ROR_ID']}\n"
                    f"  - Name: {ror_info['Name']}\n"
                    f"  - Country: {ror_info['Country']}\n"
                    f"  - Type: {ror_info['Type']}\n"
                )
                outfile.write(ror_details)
            else:
                outfile.write("  - No exact match found in ROR.\n")
        elif line.strip() == '':
            # Separate sections for each file
            outfile.write("\n")
        else:
            # Write any other lines as-is
            outfile.write(line)

# Save the ROR cache after processing all institutions
save_cache()


INFO:root:No persistent ROR cache found. Using empty cache.
INFO:root:Found ROR ID for 'Charles University': https://ror.org/024d6js02
INFO:root:Found ROR ID for 'Xidian University': https://ror.org/05s92vm98
INFO:root:Found ROR ID for 'Nanyang Technological University': https://ror.org/02e7b5302
INFO:root:Found ROR ID for 'Nanjing University': https://ror.org/01rxvg760
INFO:root:Found ROR ID for 'Singapore Management University': https://ror.org/050qmg959
INFO:root:Found ROR ID for 'University of Stuttgart': https://ror.org/04vnq7t77
INFO:root:Found ROR ID for 'Zhejiang University': https://ror.org/00a2xv884
INFO:root:Found ROR ID for 'Aalborg University': https://ror.org/04m5j1k67
INFO:root:Found ROR ID for 'Case Western Reserve University': https://ror.org/051fd9666
INFO:root:Found ROR ID for 'Zhejiang Lab': https://ror.org/02m2h7991
INFO:root:Found ROR ID for 'Tongji University': https://ror.org/03rc6as71
INFO:root:Found ROR ID for 'Hangzhou Dianzi University': https://ror.org/0576

In [None]:
import re
import csv
import os

# Define file paths
ror_data_path = '1.34_extracted_ror_data.csv'
input_file_path = './tagged_outputs/institution_output.txt'
output_file_path = './tagged_outputs/institution_output_with_ror_matches.txt'

# Step 1: Load ROR data into a dictionary for quick lookups
ror_data = {}

with open(ror_data_path, newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        ror_id = row['id']
        name = row['name']
        # aliases = row['aliases'].split(',') if row['aliases'] else []
        # Combine name and aliases in a list for each ID
        ror_data[ror_id] = {
            "name": name,
            "all_terms": [name]
        }

# Step 2: Process institution_output.txt and find ROR matches for each entry
with open(input_file_path, 'r', encoding='utf-8') as infile, open(output_file_path, 'w', encoding='utf-8') as outfile:
    content_blocks = infile.read().split('Content in \\institution for')

    for block in content_blocks:
        if block.strip():  # Ignore empty blocks
            # Write the header for each block
            outfile.write(f"Content in \\institution for{block.strip()}\n")

            # Extract each \institution{...} tag's content
            affiliations = re.findall(r'(\\institution\{(.*?)\})', block, re.DOTALL)

            # Step 3: For each institution entry, check for ROR matches
            for full_tag, affiliation in affiliations:
                # Write the original \institution{...} content
                outfile.write(f"\n{full_tag}\n")

                # Track matches for the current institution
                affiliation_matches = []

                # Check each ROR entry for a match
                for ror_id, ror_info in ror_data.items():
                    for term in ror_info['all_terms']:
                        if term and term.lower() in affiliation.lower():  # Case-insensitive substring match
                            affiliation_matches.append(f"  - ROR_ID: {ror_id}\n  - Name: {ror_info['name']}")
                            break  # Stop at the first match for this ROR entry

                # Append matches or "No exact match found in ROR"
                if affiliation_matches:
                    for match in affiliation_matches:
                        outfile.write(f"{match}\n")
                else:
                    outfile.write("  - No exact match found in ROR.\n")

            # Separate each block for readability
            outfile.write("\n")

print(f"Results saved to {output_file_path}")


Results saved to ./tagged_outputs/institution_output_with_ror_matches.txt


number of papers with tag \institution: 104
total number of tag \institution: 526
no exact match found for a tag: 186
accuracy: 64%

In [26]:
import re
import csv
import os

# Define file paths
ror_data_path = '1.34_extracted_ror_data.csv'
input_file_path = './tagged_outputs/affiliation_output.txt'
output_file_path = './tagged_outputs/affiliation_output_with_ror_matches.txt'

# Step 1: Load ROR data into a dictionary for quick lookups
ror_data = {}

with open(ror_data_path, newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        ror_id = row['id']
        name = row['name']
        # aliases = row['aliases'].split(',') if row['aliases'] else []
        # Combine name and aliases in a list for each ID
        ror_data[ror_id] = {
            "name": name,
            "all_terms": [name]  # Add aliases here if necessary
        }

# Step 2: Process affiliation_output.txt and find ROR matches for each entry
with open(input_file_path, 'r', encoding='utf-8') as infile, open(output_file_path, 'w', encoding='utf-8') as outfile:
    content_blocks = infile.read().split('Content in \\affiliation for')  # Correctly split on "affiliation"

    for block in content_blocks:
        if block.strip():  # Ignore empty blocks
            # Write the header for each block
            outfile.write(f"Content in \\affiliation for{block.strip()}\n")  # Correctly label as "affiliation"

            # Extract each \affiliation{...} tag's content
            affiliations = re.findall(r'(\\affiliation\{(.*?)\})', block, re.DOTALL)

            # Step 3: For each affiliation entry, check for ROR matches
            for full_tag, affiliation in affiliations:
                # Write the original \affiliation{...} content
                outfile.write(f"\n{full_tag}\n")

                # Track matches for the current affiliation
                affiliation_matches = []

                # Check each ROR entry for a match
                for ror_id, ror_info in ror_data.items():
                    for term in ror_info['all_terms']:
                        if term and term.lower() in affiliation.lower():  # Case-insensitive substring match
                            affiliation_matches.append(f"  - ROR_ID: {ror_id}\n  - Name: {ror_info['name']}")
                            break  # Stop at the first match for this ROR entry

                # Append matches or "No exact match found in ROR"
                if affiliation_matches:
                    for match in affiliation_matches:
                        outfile.write(f"{match}\n")
                else:
                    outfile.write("  - No exact match found in ROR.\n")

            # Separate each block for readability
            outfile.write("\n")

print(f"Results saved to {output_file_path}")


Results saved to ./tagged_outputs/affiliation_output_with_ror_matches.txt


no exact match: 628



In [None]:
import re
import csv
import os

# Define file paths
ror_data_path = '1.34_extracted_ror_data.csv'
input_file_path = './tagged_outputs/address_output.txt'
output_file_path = './tagged_outputs/address_output_with_ror_matches.txt'

# Step 1: Load ROR data into a dictionary for quick lookups
ror_data = {}

with open(ror_data_path, newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        ror_id = row['id']
        name = row['name']
        # aliases = row['aliases'].split(',') if row['aliases'] else []
        # Combine name and aliases in a list for each ID
        ror_data[ror_id] = {
            "name": name,
            "all_terms": [name]
        }

# Step 2: Process institution_output.txt and find ROR matches for each entry
with open(input_file_path, 'r', encoding='utf-8') as infile, open(output_file_path, 'w', encoding='utf-8') as outfile:
    content_blocks = infile.read().split('Content in \\institution for')

    for block in content_blocks:
        if block.strip():  # Ignore empty blocks
            # Write the header for each block
            outfile.write(f"Content in \\institution for{block.strip()}\n")

            # Extract each \institution{...} tag's content
            affiliations = re.findall(r'(\\affiliation\{(.*?)\})', block, re.DOTALL)

            # Step 3: For each institution entry, check for ROR matches
            for full_tag, affiliation in affiliations:
                # Write the original \institution{...} content
                outfile.write(f"\n{full_tag}\n")

                # Track matches for the current institution
                affiliation_matches = []

                # Check each ROR entry for a match
                for ror_id, ror_info in ror_data.items():
                    for term in ror_info['all_terms']:
                        if term and term.lower() in affiliation.lower():  # Case-insensitive substring match
                            affiliation_matches.append(f"  - ROR_ID: {ror_id}\n  - Name: {ror_info['name']}")
                            break  # Stop at the first match for this ROR entry

                # Append matches or "No exact match found in ROR"
                if affiliation_matches:
                    for match in affiliation_matches:
                        outfile.write(f"{match}\n")
                else:
                    outfile.write("  - No exact match found in ROR.\n")

            # Separate each block for readability
            outfile.write("\n")

print(f"Results saved to {output_file_path}")


no exa

In [22]:
import re
import csv
import os

# Define file paths
ror_data_path = '1.34_extracted_ror_data.csv'
input_file_path = './tagged_outputs/affiliation_output.txt'
output_file_path = './tagged_outputs/affiliation_output_with_ror_matches.txt'

# Helper function for consistent text normalization
def normalize_text(text):
    """Normalize text by converting to lowercase, removing punctuation, and extra spaces."""
    text = re.sub(r'[^\w\s]', '', text)  # Remove all non-alphanumeric and non-space characters
    return text.lower().strip()

# Step 1: Load ROR data into a dictionary for fast lookups
ror_dict = {}

with open(ror_data_path, newline='', encoding='utf-8') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        ror_id = row['id']
        name = row['name']
        normalized_name = normalize_text(name)

        # Skip single-word institution names
        if len(normalized_name.split()) > 1:  # Only consider multi-word names
            ror_dict[normalized_name] = (ror_id, name)  # Map normalized name to its ID and original name

# Step 2: Process affiliation_output.txt and find ROR matches for each entry
with open(input_file_path, 'r', encoding='utf-8') as infile, open(output_file_path, 'w', encoding='utf-8') as outfile:
    content_blocks = infile.read().split('Content in \\affiliation for')

    for block in content_blocks:
        if block.strip():  # Ignore empty blocks
            # Write the header for each block
            outfile.write(f"Content in \\affiliation for{block.strip()}\n")

            # Extract each \affiliation{...} tag's content
            affiliations = re.findall(r'(\\affiliation\{(.*?)\})', block, re.DOTALL)

            # Step 3: For each affiliation entry, check for ROR matches
            for full_tag, affiliation in affiliations:
                # Write the original \affiliation{...} content
                outfile.write(f"\n{full_tag}\n")

                # Normalize the affiliation text for comparison
                normalized_affiliation = normalize_text(affiliation)
                affiliation_words = set(normalized_affiliation.split())

                # Track matches for the current affiliation
                matched_ror_ids = set()  # Ensure no duplicate matches for the same affiliation
                affiliation_matches = []

                # Check each normalized ROR name for matches
                for ror_name, (ror_id, original_name) in ror_dict.items():
                    ror_words = set(ror_name.split())

                    # Match only if all words in the ROR name are found in the affiliation
                    if ror_words.issubset(affiliation_words) and ror_id not in matched_ror_ids:
                        affiliation_matches.append(f"  - ROR_ID: {ror_id}\n  - Name: {original_name}")
                        matched_ror_ids.add(ror_id)  # Mark this ROR ID as matched

                # Append matches or "No exact match found in ROR"
                if affiliation_matches:
                    for match in affiliation_matches:
                        outfile.write(f"{match}\n")
                else:
                    outfile.write("  - No exact match found in ROR.\n")

            # Separate each block for readability
            outfile.write("\n")

print(f"Results saved to {output_file_path}")


KeyboardInterrupt: 

In [20]:
import re
import csv
import os
from compare import Trie

# Define file paths
ror_data_path = '1.34_extracted_ror_data.csv'
common_words_path = 'common english word.txt'
input_file_path = './tagged_outputs/affiliation_output.txt'
output_file_path = './tagged_outputs/affiliation_output_with_ror_matches_trie.txt'

# Helper function for consistent text normalization
def normalize_text(text):
    """Normalize text by converting to lowercase, removing punctuation, and extra spaces."""
    text = re.sub(r'[^\w\s]', '', text)  # Remove all non-alphanumeric and non-space characters
    return text.lower().strip()

# Initialize Tries
ror_trie = Trie()
common_words_trie = Trie()

# Step 1: Load common words into Trie for filtering
def load_common_words(common_words_path):
    with open(common_words_path, 'r', encoding='utf-8') as file:
        for line in file:
            word, _ = line.split()
            common_words_trie.insert(word.upper())

# Step 2: Load ROR data into Trie
def load_ror_data(ror_data_path):
    ror_dict = {}
    with open(ror_data_path, newline='', encoding='utf-8') as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            ror_id = row['id']
            name = row['name']
            aliases = row['aliases'].split(';') if row['aliases'] else []

            # Add official name and aliases to the Trie
            names = [name] + aliases
            for institution_name in names:
                normalized_name = normalize_text(institution_name).upper()
                if len(normalized_name.split()) > 1:  # Skip single-word names
                    ror_trie.insert(normalized_name, ror_id)
                    ror_dict[normalized_name] = (ror_id, institution_name)
    return ror_dict

# Step 3: Match affiliations against ROR Trie
def match_affiliations(input_file_path, ror_dict):
    with open(input_file_path, 'r', encoding='utf-8') as infile, open(output_file_path, 'w', encoding='utf-8') as outfile:
        content_blocks = infile.read().split('Content in \\affiliation for')

        for block in content_blocks:
            if block.strip():  # Ignore empty blocks
                # Write the header for each block
                outfile.write(f"Content in \\affiliation for{block.strip()}\n")

                # Extract each \affiliation{...} tag's content
                affiliations = re.findall(r'(\\affiliation\{(.*?)\})', block, re.DOTALL)

                # Step 4: Match each affiliation against the Trie
                for full_tag, affiliation in affiliations:
                    # Write the original \affiliation{...} content
                    outfile.write(f"\n{full_tag}\n")

                    # Normalize the affiliation text for comparison
                    normalized_affiliation = normalize_text(affiliation).upper()

                    # Match words against ROR Trie
                    matched_ids = set()
                    words = normalized_affiliation.split()
                    for i in range(len(words)):
                        for j in range(i + 1, len(words) + 1):
                            phrase = " ".join(words[i:j])
                            if common_words_trie.search(phrase):  # Skip common English words
                                continue
                            match = ror_trie.search(phrase)
                            if match and match.is_word:
                                matched_ids.update(match.matchedIds)

                    # Append matches or "No exact match found in ROR"
                    if matched_ids:
                        for ror_id in matched_ids:
                            _, institution_name = ror_dict.get(ror_id, ("Unknown", "Unknown"))
                            outfile.write(f"  - ROR_ID: {ror_id}\n  - Name: {institution_name}\n")
                    else:
                        outfile.write("  - No exact match found in ROR.\n")

                # Separate each block for readability
                outfile.write("\n")

# Step 4: Main execution
if __name__ == "__main__":
    print("Initializing common words and ROR data...")
    load_common_words(common_words_path)
    ror_dict = load_ror_data(ror_data_path)

    print("Matching affiliations...")
    match_affiliations(input_file_path, ror_dict)

    print(f"Results saved to {output_file_path}")


ImportError: cannot import name 'Trie' from 'compare' (/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/compare.py)