In [1]:
import tarfile
import io
import zipfile
import importlib
import regex as re
import pyperclip  
import TexSoup as TS
from TexSoup.tokens import MATH_ENV_NAMES
import os


In [2]:
def find_doc_class(fp, name_match=False):
    '''Search for document class related lines in a file and return a code to represent the type'''
    doc_class_pat = re.compile(r"^\s*\\document(?:style|class)")
    sub_doc_class = re.compile(r"^\s*\\document(?:style|class).*(?:\{standalone\}|\{subfiles\})")

    # Read the content as bytes
    file_content = fp.read()
    try:
        # Try decoding with UTF-8
        file_text = file_content.decode('utf-8')
    except UnicodeDecodeError:
        # Fallback to latin-1 encoding if UTF-8 fails
        file_text = file_content.decode('latin-1')

    for line in file_text.splitlines():
        if doc_class_pat.search(line):
            if name_match:
                if sub_doc_class.search(line):
                    return -99999
                return 1  # Found document class line
    return 0  # No document class line found

def find_main_tex_source_in_tar(tar_file, encoding='utf-8'):
    tex_names = set(["paper", "main", "ms.", "article"])
    tex_files = [f for f in tar_file.getnames() if f.endswith('.tex')]

    if len(tex_files) == 1:
        return tex_files[0]

    main_files = {}
    for tf in tex_files:
        depth = len(tf.split('/')) - 1
        has_main_name = any(kw in tf for kw in tex_names)
        fp = tar_file.extractfile(tf)
        if fp:
            main_files[tf] = find_doc_class(fp, name_match=has_main_name) - depth
            fp.close()

    return max(main_files, key=main_files.get) if main_files else None

def pre_format(text):
    source_text = text.replace('\\}\\', '\\} \\').replace(')}', ') }').replace(')$', ') $')
    return source_text

def source_from_tar(tar_file, encoding='utf-8'):
    tex_main = find_main_tex_source_in_tar(tar_file, encoding=encoding)
    if tex_main:
        fp = tar_file.extractfile(tex_main)
        if fp is not None:
            file_content = fp.read()  # Read as bytes to keep it in memory
            try:
                # Attempt to decode using UTF-8
                source_text = pre_format(file_content.decode(encoding))
            except UnicodeDecodeError:
                # Fallback to latin-1 encoding if UTF-8 fails
                source_text = pre_format(file_content.decode('latin-1'))
            return source_text
    return None

def extract_before_abstract(source_text):
    no_comments_text = re.sub(r'(?<!\\)%.*', '', source_text)
    no_usepackage_text = re.sub(r'\\usepackage\s*\{[^}]+\}', '', no_comments_text)
    text = re.sub(r'\\[a-zA-Z]+\{[^}]*\}', '', no_usepackage_text)
    text = re.sub(r'\\[a-zA-Z]+\[[^\]]*\]\{[^}]*\}', '', no_usepackage_text)
    text = re.sub(r'\$[^$]*\$', '', no_usepackage_text)
    text = no_usepackage_text.replace('{', '').replace('}', '').replace('\n', ' ')
    text = ' '.join(no_usepackage_text.split())
    abstract_match = re.search(r'\\begin\s*\{\s*abstract\s*\}', text)

    if abstract_match:
        return text[:abstract_match.start()].strip()
    
    abstract_word_match = re.search(r'\babstract\b', text, re.IGNORECASE)
    if abstract_word_match:
        return text[:abstract_word_match.start()].strip()
    return None

zip_file_path = "./2401.zip"
output_path = "./firstoutput.txt"

latest_versions = {}
# Track the latest version of each identifier
with zipfile.ZipFile(zip_file_path, 'r') as zip_file:
    tar_files = [f for f in zip_file.namelist() if f.endswith('.tar.gz')]
    for tar_name in tar_files:
        base_name, version = tar_name.rsplit("v", 1)
        version_num = int(version.split('.')[0])  # Extract version number
        if base_name not in latest_versions or version_num > latest_versions[base_name][1]:
            latest_versions[base_name] = (tar_name, version_num)

with open(output_path, 'w', encoding='utf-8') as output_file:
    for base_name, (tar_name, version) in latest_versions.items():
        with zipfile.ZipFile(zip_file_path, 'r') as zip_file:
            with zip_file.open(tar_name) as tar_bytes:
                tar_file = tarfile.open(fileobj=io.BytesIO(tar_bytes.read()), mode='r:gz')
                source_text = source_from_tar(tar_file)
                if source_text:
                    content_before_abstract = extract_before_abstract(source_text)
                    if content_before_abstract:
                        output_file.write(f"Content before abstract in {tar_name}:\n{content_before_abstract}\n\n")
                    else:
                        output_file.write(f"No abstract found in {tar_name}, or no content before abstract.\n\n")
                tar_file.close()


In [3]:
# Read input from a text file and filter out files without valid content before the abstract
input_file_path = 'firstoutput.txt'  # Replace with your actual file path
output_file_path = 'filtered_files_with_content_macro.txt'

# Variables to keep track of statistics
total_files_count_author = 0
valid_files_count = 0
valid_files_with_content = []

# Reading and processing the input file
with open(input_file_path, 'r') as infile:
    content_blocks = infile.read().split('Content before abstract in ')
    
    for block in content_blocks:
        if block.strip():  # Ensure we are not processing an empty block
            total_files_count_author += 1
            lines = block.split('\n', 1)  # Split to separate the file name from its content
            if len(lines) > 1:
                file_name = lines[0].strip().replace(':', '')
                content = lines[1].strip()
                
                # Check if the content does not indicate "No abstract found"
                if 'No abstract found' not in content and 'no content before abstract' not in content.lower():
                    valid_files_count += 1
                    valid_files_with_content.append((file_name, content))

# Writing the filtered results to an output file
with open(output_file_path, 'w') as outfile:
    for file_name, content in valid_files_with_content:
        outfile.write(f"Content before abstract in {file_name}:\n{content}\n\n")

# Print or save the statistics summary
# print(f"Total number of files processed: {total_files_count}")
# print(f"Total number of files with valid content before abstract: {valid_files_count}")
# print(f"Filtered output saved in: {output_file_path}")


1818 files with content
183 no abstract found (probably wrong tex file)

In [15]:
import re
import os

# Define the input file path and output file directory
input_file_path = 'filtered_files_with_content_macro.txt'
output_dir = './tagged_outputs/'

# Ensure output directory exists
os.makedirs(output_dir, exist_ok=True)

# Tags to search for, their output files, and counters
tags_to_search = [
    r'\\affiliations', r'\\affiliation', r'\\icmlaffiliation',  
    r'\\institute', r'\\affil', r'\\aff', r'\\AFF', r'\\address'
]
tag_files = {tag: [] for tag in tags_to_search}  # Dictionary to store file content for each tag
tag_counts = {tag: 0 for tag in tags_to_search}

# Function to extract the content within the first level of braces after each tag
def extract_tag_content(tag, content):
    pattern = rf"({tag}\{{)"
    results = []
    start = 0
    while (match := re.search(pattern, content[start:])) is not None:
        # Find the opening brace position and keep the tag itself
        start_idx = start + match.start()
        tag_with_brace = match.group(1)  # Keep the matched tag including the opening brace
        brace_level = 1
        end_idx = start_idx + len(tag_with_brace)

        # Find the matching closing brace
        while brace_level > 0 and end_idx < len(content):
            if content[end_idx] == '{':
                brace_level += 1
            elif content[end_idx] == '}':
                brace_level -= 1
            end_idx += 1

        # Extract and store the full tag with content in braces
        results.append(content[start_idx:end_idx])
        start = end_idx  # Move the start index to continue searching

    return results

# Function to process and assign content for each tag in a file
def extract_and_assign_tag_content(file_name, content, tags):
    for tag in tags:
        # Extract content with braces for each occurrence of the tag
        matches = extract_tag_content(tag, content)

        # If matches are found, add them to the corresponding tag's list
        if matches:
            extracted_content = f"Content in {tag} for {file_name}:\n" + "\n".join(matches) + "\n"
            tag_files[tag].append(extracted_content)
            tag_counts[tag] += len(matches)  # Count each match found
            return True  # Stop after the first matching tag
    return False

# Process the input file
total_files_count = 0

with open(input_file_path, 'r') as infile:
    content_blocks = infile.read().split('Content before abstract in ')

    for block in content_blocks:
        if block.strip():  # Ensure we are not processing an empty block
            total_files_count += 1
            lines = block.split('\n', 1)
            if len(lines) > 1:
                file_name = lines[0].strip().replace(':', '')
                content = lines[1].strip()

                # Extract and assign content within tags
                extract_and_assign_tag_content(file_name, content, tags_to_search)

# Write each tag's extracted content to its respective output file
for tag in tags_to_search:
    output_file_path = "{}{}_output.txt".format(output_dir, tag.replace('\\', ''))
    with open(output_file_path, 'w') as outfile:
        outfile.write('\n'.join(tag_files[tag]))

# Print statistics
print(f"Total number of files processed: {total_files_count}")
for tag, count in tag_counts.items():
    print(f"Number of papers with tag '{tag}': {count}")
print(f"Tagged output files saved in directory: {output_dir}")


Total number of files processed: 1818
Number of papers with tag '\\affiliations': 53
Number of papers with tag '\\affiliation': 2269
Number of papers with tag '\\icmlaffiliation': 85
Number of papers with tag '\\institute': 97
Number of papers with tag '\\affil': 37
Number of papers with tag '\\aff': 3
Number of papers with tag '\\AFF': 11
Number of papers with tag '\\address': 261
Tagged output files saved in directory: ./tagged_outputs/
