In [2]:
import tarfile
import io
import importlib
import os
import regex as re
import glob
import pandas as pd
import itertools as itr
import pyperclip  
from tqdm.auto import tqdm
from IPython.core.interactiveshell import InteractiveShell
# pretty print all cell's output and not just the last one
InteractiveShell.ast_node_interactivity = "all"
import TexSoup as TS
from TexSoup.tokens import MATH_ENV_NAMES
TS.__file__

def find_doc_class(wrapped_file, name_match=False):
    '''Search for document class related lines in a file  and return a code to represent the type'''
    doc_class_pat = re.compile(r"^\s*\\document(?:style|class)")
    sub_doc_class = re.compile(r"^\s*\\document(?:style|class).*(?:\{standalone\}|\{subfiles\})")

    for line in wrapped_file:
        if doc_class_pat.search(line):
            if name_match:
                # we can miss if there are two or more lines with documentclass 
                # and the first one is not the one that has standalone/subfile
                if sub_doc_class.search(line):
                    return -99999
                return 1 #main_files[tf] = 1
            
    return 0 #main_files[tf] = 0


def find_main_tex_source_in_tar(tar_path, encoding='uft-8'):
    '''Identify the main Tex file in a tarfile.
    
    Args:
        tar_path: A gzipped tar archive of a directory containing tex source and support files.
    '''
    
    tex_names = set(["paper", "main", "ms.", "article"])

    with tarfile.open(tar_path, 'r') as in_tar:
        tex_files = [f for f in in_tar.getnames() if f.endswith('.tex')]
        
        # got one file
        if len(tex_files) == 1:
            return tex_files[0]

        main_files = {}
        for tf in tex_files:
            depth = len(tf.split('/')) - 1
            has_main_name = any(kw in tf for kw in tex_names)
            fp = in_tar.extractfile(tf)
            wrapped_file = io.TextIOWrapper(fp, newline=None, encoding=encoding) #universal newlines
            # does it have a doc class?
            # get the type
            main_files[tf] = find_doc_class(wrapped_file, name_match = has_main_name) - depth 
            wrapped_file.close() 
        
        # got one file with doc class
        if len(main_files) == 1:
            return(main_files.keys()[0])
        
        # account for multi-file submissions
        return(max(main_files, key=main_files.get))

def pre_format(text):
    '''Apply some substititions to make LaTeX easier to parse'''
    source_text = (
        text
        .replace('\\}\\', '\\} \\')  # Due to escape rules \\ is equivalent to \
        .replace(')}', ') }')
        .replace(')$', ') $')
        #.replace(r'\left [', r'\left[ ')
        #.replace(r'\left (', r'\left( ')
        #.replace(r'\left \{', r'\left\{ ')
    )
    return source_text
    #clean_lines = []
    #for line in source_text.splitlines(False):
    #    cleanline = line.strip()
    #    if cleanline.startswith(r'\newcommand'):
    #        cleanline = r'%' + cleanline
    #    elif cleanline.startswith(r'\def'):
    #        cleanline = r'%' + cleanline
    #    clean_lines.append(cleanline)
    #return '\n'.join(clean_lines)

def soup_from_tar(tar_path, encoding='utf-8', tolerance=0):
    tex_main = find_main_tex_source_in_tar(tar_path, encoding=encoding)
    with tarfile.open(tar_path, 'r') as in_tar:
        fp = in_tar.extractfile(tex_main)
        wrapped_file = io.TextIOWrapper(fp, newline=None, encoding=encoding) #universal newlines
        source_text = pre_format(wrapped_file.read())
        soup = TS.TexSoup(source_text, tolerance=tolerance, skip_envs=MATH_ENV_NAMES)
        return soup
    
def source_from_tar(tar_path, encoding='utf-8'):
    tex_main = find_main_tex_source_in_tar(tar_path, encoding=encoding)
    with tarfile.open(tar_path, 'r') as in_tar:
        fp = in_tar.extractfile(tex_main)
        wrapped_file = io.TextIOWrapper(fp, newline=None, encoding=encoding) #universal newlines
        source_text = pre_format(wrapped_file.read())
        return source_text
    
def extract_before_abstract(source_text):
    # Extract all content before the \begin{abstract}, removing comments and \usepackage commands
    
    # Remove LaTeX comments (lines starting with %)
    no_comments_text = re.sub(r'(?<!\\)%.*', '', source_text)
    
    # Remove all \usepackage{xxx} lines
    no_usepackage_text = re.sub(r'\\usepackage\s*\{[^}]+\}', '', no_comments_text)
    # Remove LaTeX commands like \command{...}
    text = re.sub(r'\\[a-zA-Z]+\{[^}]*\}', '', no_usepackage_text)
    text = re.sub(r'\\[a-zA-Z]+\[[^\]]*\]\{[^}]*\}', '', no_usepackage_text)
    # Remove mathematical formulas like $...$
    text = re.sub(r'\$[^$]*\$', '', no_usepackage_text)
    # Remove braces and newlines
    text = no_usepackage_text.replace('{', '').replace('}', '').replace('\n', ' ')
    # Remove extra spaces
    text = ' '.join(no_usepackage_text.split())
    # Use regex to locate the position of \begin{abstract}
    abstract_match = re.search(r'\\begin\s*\{\s*abstract\s*\}', text)
    
    if abstract_match:
        # Extract everything before \begin{abstract}, without comments or \usepackage
        return text[:abstract_match.start()].strip()
    
    # If \begin{abstract} is not found, search for the word "abstract"
    abstract_word_match = re.search(r'\babstract\b', text, re.IGNORECASE)
    
    if abstract_word_match:
        # Extract everything before the word "abstract"
        return text[:abstract_word_match.start()].strip()
    return None

directory = "./2201_samp/"

tar_files = glob.glob(os.path.join(directory, "*.tar.gz"))

for infile_path in tar_files:
    source_text = source_from_tar(infile_path)  # Get raw source text
    pyperclip.copy(source_text)  # Copy the raw text to clipboard if needed
    
    # Extract content before \begin{abstract} using raw LaTeX source text
    content_before_abstract = extract_before_abstract(source_text)
    
    if content_before_abstract:
        print(f"Content before abstract in {infile_path}:\n{content_before_abstract}\n")
    else:
        print(f"No abstract found in {infile_path}, or no content before abstract.\n")



Content before abstract in ./2201_samp/2201.00040v2.tar.gz:
\documentclass{article} \newcommand*\circled[1]{\tikz[baseline=(char.base)]{ \draw (0,0.25) -- (0,0.75); \draw (1,0.25) -- (1,0.5); \draw (2,0.5) -- (2,0.75); \draw (-0.25,0.5) -- (1,0.5); \draw (2,0.5) -- (2.25,0.5); \draw [black, fill=black] (1,1) circle(2pt); \draw [black, fill=black] (2,0) circle(2pt); \node[shape=circle,draw,inner sep=2pt] at (0,0) (char) {#1}; \node[shape=circle,draw,inner sep=2pt] at (0,1) (char) {#1}; \node[shape=circle,draw,inner sep=2pt] at (1,0) (char) {#1}; \node[shape=circle,draw,inner sep=2pt] at (2,1) (char) {#1}; }} \newcommand*\circledtwo[2]{\tikz[baseline=(char.base)]{ \draw (2,0.25) -- (2,0.5); \draw (0,0.5) -- (0,0.75); \draw (0,0.5) -- (2,0.5); \draw [black, fill=black] (0,0) circle(2pt); \draw [black, fill=black] (1,1) circle(2pt); \draw [black, fill=black] (2,1) circle(2pt); \node[shape=circle,draw,inner sep=2pt] at (1,0) (char) {#1}; \node[shape=circle,draw,inner sep=2pt] at (2,0) (char

In [16]:
# 10.20
import json
import random
import spacy
from spacy.training import Example

# Load ROR dataset
with open('ror-onename.json', 'r') as f:
    ror_data = json.load(f)

# Extract institution names (this is the one-name-one-insti version)
def extract_institution_names(ror_data):
    institution_names = []
    for entry in ror_data:
        institution_names.append(entry['name'])
    return institution_names

institution_names = extract_institution_names(ror_data)

# Create training examples from the ROR institution names
# def create_training_data_from_ror(institution_names, num_samples=1000):
#     templates = [
#         "{} is a leading research institute.",
#         "The work was done at {}.",
#         "The collaboration between {} and other universities is remarkable.",
#         "{} has been a pioneer in the field."
#     ]
    
#     training_data = []
#     for _ in range(num_samples):
#         institution = random.choice(institution_names)
#         template = random.choice(templates)
#         text = template.format(institution)
#         entity_start = text.find(institution)
#         entity_end = entity_start + len(institution)
#         training_data.append((text, {"entities": [(entity_start, entity_end, "INSTITUTION")]}))
    
#     return training_data

# # Create training data
# TRAIN_DATA = create_training_data_from_ror(institution_names)


In [15]:
# use real training set
import tarfile
import io
import os
import re
import json
import spacy
from spacy.training import Example
import glob
from tqdm.auto import tqdm

# Define directories
directory_tar = "./2201_samp/"
directory_json = "./2201_samp_test_set/"
tar_files = glob.glob(os.path.join(directory_tar, "*.tar.gz"))
json_files = glob.glob(os.path.join(directory_json, "*.json"))



# Load JSON and find affiliations
def load_json_data(json_file_path):
    '''Load the JSON file with the author data and extract affiliations.'''
    with open(json_file_path, 'r') as f:
        return json.load(f)

# Create training examples based on affiliation data and source text
def create_training_data_from_real_data(tar_files, directory_json):
    training_data = []
    
    for tar_file in tqdm(tar_files):
        # Extract the text before the abstract from the .tar.gz file
        source_text = source_from_tar(tar_file)
        content_before_abstract = extract_before_abstract(source_text)
        
        # If no content was found before the abstract, skip this file
        if content_before_abstract is None:
            print(f"No content before abstract found in {tar_file}. Skipping.")
            continue
        
        # Get the corresponding JSON file (assuming matching filenames)
        base_name = os.path.basename(tar_file).replace('.tar.gz', '')
        json_file_path = os.path.join(directory_json, f"{base_name}.json")
        
        if not os.path.exists(json_file_path):
            print(f"JSON file for {tar_file} not found. Skipping.")
            continue
        
        # Load the JSON data
        json_data = load_json_data(json_file_path)
        
        # Process each author and find their affiliation in the text
        for author, details in json_data.items():
            for affiliation in details['Affiliation']:
                # Search for the affiliation in the extracted content
                entity_start = content_before_abstract.find(affiliation)
                if entity_start != -1:
                    entity_end = entity_start + len(affiliation)
                    training_data.append((
                        content_before_abstract, 
                        {"entities": [(entity_start, entity_end, "INSTITUTION")]}
                    ))
                else:
                    print(f"Affiliation '{affiliation}' not found in {tar_file}.")
    
    return training_data


# Create training data
training_data = create_training_data_from_real_data(tar_files, directory_json)

# Create a blank spaCy model
nlp = spacy.blank("en")
ner = nlp.add_pipe("ner")

# Add the "INSTITUTION" label to the NER model
ner.add_label("INSTITUTION")

# Prepare the training data for spaCy
def train_ner_model(train_data):
    optimizer = nlp.begin_training()
    for itn in range(10):  # Train for 10 iterations
        random.shuffle(train_data)
        losses = {}
        for text, annotations in train_data:
            doc = nlp.make_doc(text)
            example = Example.from_dict(doc, annotations)
            nlp.update([example], drop=0.5, losses=losses)
        print(f"Iteration {itn} - Losses: {losses}")

# Train the NER model with the generated data
train_ner_model(training_data)


  0%|          | 0/17 [00:00<?, ?it/s]

Affiliation 'Max-Planck-Institut für Radioastronomie' not found in ./2201_samp/2201.00021v2.tar.gz.
Affiliation 'Max-Planck-Institut für Radioastronomie' not found in ./2201_samp/2201.00021v2.tar.gz.
Affiliation 'Max-Planck-Institut für Radioastronomie' not found in ./2201_samp/2201.00021v2.tar.gz.
Affiliation 'Max-Planck-Institut für Radioastronomie' not found in ./2201_samp/2201.00021v2.tar.gz.
Affiliation 'Max-Planck-Institut für Radioastronomie' not found in ./2201_samp/2201.00021v2.tar.gz.
Affiliation 'Max-Planck-Institut für Radioastronomie' not found in ./2201_samp/2201.00021v2.tar.gz.
No content before abstract found in ./2201_samp/2201.00022v1.tar.gz. Skipping.
Affiliation 'School of Mathematics, Georgia Tech' not found in ./2201_samp/2201.00045v1.tar.gz.
Affiliation 'Laboratoire de Physique de l'Ecole Normale Superieure, ENS, Universite PSL, CNRS, Sorbonne Universite, Universite de Paris' not found in ./2201_samp/2201.00032v1.tar.gz.
Affiliation 'Laboratoire de Physique de l'

1



Iteration 0 - Losses: {'ner': 14879.993637331985}
Iteration 1 - Losses: {'ner': 335.685229165806}
Iteration 2 - Losses: {'ner': 314.06838901501965}
Iteration 3 - Losses: {'ner': 124.19993905295995}
Iteration 4 - Losses: {'ner': 298.6189173506343}
Iteration 5 - Losses: {'ner': 135.54480493965687}
Iteration 6 - Losses: {'ner': 101.63419068081365}
Iteration 7 - Losses: {'ner': 130.19315916549846}
Iteration 8 - Losses: {'ner': 94.433108413656}
Iteration 9 - Losses: {'ner': 98.47840632453511}
Model saved to ./trained_ner_model


In [18]:

ruler = nlp.add_pipe("entity_ruler", after="ner")
patterns = [{"label": "INSTITUTION", "pattern": institution} for institution in institution_names]
ruler.add_patterns(patterns)

# Save the trained model
output_dir = "./trained_ner_model"
nlp.to_disk(output_dir)
print(f"Model saved to {output_dir}")


Model saved to ./trained_ner_model


In [5]:
import os
import re
import spacy
import json
from spacy.training import Example
from spacy import displacy
from sklearn.metrics import precision_score, recall_score, f1_score
import glob
from tqdm.auto import tqdm

# Load the trained model
output_dir = "./trained_ner_model"
nlp = spacy.load(output_dir)

# Define directories for test files
test_directory_tar = "./2201_samp_1/"
test_directory_json = "./2201_samp_test_set_1/"
test_tar_files = glob.glob(os.path.join(test_directory_tar, "*.tar.gz"))

# Load the test JSON data for evaluation
def load_json_data(json_file_path):
    '''Load the JSON file with the author data and extract affiliations.'''
    with open(json_file_path, 'r') as f:
        return json.load(f)

# Extract institutions from the JSON file (ground truth)
def extract_institutions_from_json(json_data):
    '''Extract all affiliations from the JSON file for comparison.'''
    institutions = []
    for author, details in json_data.items():
        institutions.extend(details['Affiliation'])
    return institutions

# Compare the extracted affiliations with model-predicted entities
def evaluate_model_on_test_data(test_tar_files, test_directory_json):
    true_entities = []
    predicted_entities = []

    for tar_file in tqdm(test_tar_files):
        # Extract text before abstract (same as training process)
        source_text = source_from_tar(tar_file)
        content_before_abstract = extract_before_abstract(source_text)
        
        if content_before_abstract:
            print(f"Content before abstract in {tar_file}:\n{content_before_abstract}\n")
        else:
            print(f"No abstract found in {tar_file}, or no content before abstract.\n")
        
        # Get the corresponding JSON file (assuming matching filenames)
        base_name = os.path.basename(tar_file).replace('.tar.gz', '')
        json_file_path = os.path.join(test_directory_json, f"{base_name}.json")
        
        if not os.path.exists(json_file_path):
            print(f"JSON file for {tar_file} not found. Skipping.")
            continue
        
        # Load the JSON data
        json_data = load_json_data(json_file_path)
        
        # Extract true institutions from the JSON file
        true_institutions = extract_institutions_from_json(json_data)
        true_entities.extend(true_institutions)
        
        # Run the trained model on the extracted text
        doc = nlp(content_before_abstract)
        
        # Extract predicted institutions from the model
        predicted_institutions = [ent.text for ent in doc.ents if ent.label_ == "INSTITUTION"]
        predicted_entities.extend(predicted_institutions)
        
        # Print comparison for debugging/analysis
        print(f"--- Test Case: {tar_file} ---")
        print("True Institutions:", true_institutions)
        print("Predicted Institutions:", predicted_institutions)
        print("-----------------------------------\n")
    
    return true_entities, predicted_entities

# Function to calculate and print accuracy metrics
def calculate_metrics(true_entities, predicted_entities):
    # Precision, Recall, and F1 are hard to directly calculate with non-exact matches (e.g., partial matches)
    # We will compute the precision/recall on exact matches, but you could extend this with fuzzy matching.
    
    true_set = set(true_entities)
    predicted_set = set(predicted_entities)
    
    true_positive = true_set.intersection(predicted_set)
    false_positive = predicted_set - true_set
    false_negative = true_set - predicted_set
    
    precision = len(true_positive) / (len(true_positive) + len(false_positive)) if len(true_positive) + len(false_positive) > 0 else 0
    recall = len(true_positive) / (len(true_positive) + len(false_negative)) if len(true_positive) + len(false_negative) > 0 else 0
    f1 = 2 * (precision * recall) / (precision + recall) if precision + recall > 0 else 0
    
    print(f"Precision: {precision * 100:.2f}%")
    print(f"Recall: {recall * 100:.2f}%")
    print(f"F1-Score: {f1 * 100:.2f}%")
    
    print(f"\nTrue Positives (Correctly Identified): {true_positive}")
    print(f"False Positives (Wrongly Identified): {false_positive}")
    print(f"False Negatives (Missed Entities): {false_negative}")

# Run the evaluation on the test data
true_entities, predicted_entities = evaluate_model_on_test_data(test_tar_files, test_directory_json)

# Calculate and display metrics
calculate_metrics(true_entities, predicted_entities)


  0%|          | 0/18 [00:00<?, ?it/s]

Content before abstract in ./2201_samp_1/2201.00048v1.tar.gz:
\documentclass[ preprint, amsmath,amssymb, aps, showkeys, showpacs ]{revtex4-2} \usepackage[utf8]{inputenc} \bibliographystyle{utphys} \newcommand{\GeV}{{\rm \,GeV}} \newcommand{\TeV}{{\rm \,TeV}} \newcommand{\MeV}{{\rm \,MeV}} \newcommand{\KeV}{{\rm \,KeV}} \newcommand{\eV}{{\rm \,eV}} \newcommand{\cm}{{\rm \,cm}} \newcommand{\km}{{\rm \,km}} \newcommand{\s}{{\rm \,s}} \newcommand{\erf}{{\rm \,Erf}} \def\be {\begin{equation}} \def\ee {\end{equation}} \def\ba {\begin{array}} \def\ea {\end{array}} \def\bea {\begin{eqnarray}} \def\eea {\end{eqnarray}} \def\bean {\begin{eqnarray*}} \def\eean {\end{eqnarray*}} \def\nn{\nonumber} \newcommand{\Msun}{M_\odot} \newcommand{\Mstar}{M_\star} \newcommand{\Rstar}{R_\star} \newcommand{\vstar}{v_\star} \newcommand{\tstar}{t_\star} \newcommand{\Tstar}{T_\star} \newcommand{\fMB}{f_{\rm MB}} \newcommand{\fFD}{f_{\rm FD}} \newcommand{\m}{{\rm \,m}} \newcommand{\sigmath}{\sigma_{th}} \begin{doc