In [5]:
import tarfile
import io
import importlib
import os
import regex as re
import glob
import pandas as pd
import itertools as itr
import pyperclip  
from tqdm.auto import tqdm
from IPython.core.interactiveshell import InteractiveShell
# pretty print all cell's output and not just the last one
InteractiveShell.ast_node_interactivity = "all"
import TexSoup as TS
from TexSoup.tokens import MATH_ENV_NAMES
TS.__file__

def find_doc_class(wrapped_file, name_match=False):
    '''Search for document class related lines in a file  and return a code to represent the type'''
    doc_class_pat = re.compile(r"^\s*\\document(?:style|class)")
    sub_doc_class = re.compile(r"^\s*\\document(?:style|class).*(?:\{standalone\}|\{subfiles\})")

    for line in wrapped_file:
        if doc_class_pat.search(line):
            if name_match:
                # we can miss if there are two or more lines with documentclass 
                # and the first one is not the one that has standalone/subfile
                if sub_doc_class.search(line):
                    return -99999
                return 1 #main_files[tf] = 1
            
    return 0 #main_files[tf] = 0


def find_main_tex_source_in_tar(tar_path, encoding='uft-8'):
    '''Identify the main Tex file in a tarfile.
    
    Args:
        tar_path: A gzipped tar archive of a directory containing tex source and support files.
    '''
    
    tex_names = set(["paper", "main", "ms.", "article"])

    with tarfile.open(tar_path, 'r') as in_tar:
        tex_files = [f for f in in_tar.getnames() if f.endswith('.tex')]
        
        # got one file
        if len(tex_files) == 1:
            return tex_files[0]

        main_files = {}
        for tf in tex_files:
            depth = len(tf.split('/')) - 1
            has_main_name = any(kw in tf for kw in tex_names)
            fp = in_tar.extractfile(tf)
            wrapped_file = io.TextIOWrapper(fp, newline=None, encoding=encoding) #universal newlines
            # does it have a doc class?
            # get the type
            main_files[tf] = find_doc_class(wrapped_file, name_match = has_main_name) - depth 
            wrapped_file.close() 
        
        # got one file with doc class
        if len(main_files) == 1:
            return(main_files.keys()[0])
        
        # account for multi-file submissions
        return(max(main_files, key=main_files.get))

def pre_format(text):
    '''Apply some substititions to make LaTeX easier to parse'''
    source_text = (
        text
        .replace('\\}\\', '\\} \\')  # Due to escape rules \\ is equivalent to \
        .replace(')}', ') }')
        .replace(')$', ') $')
        #.replace(r'\left [', r'\left[ ')
        #.replace(r'\left (', r'\left( ')
        #.replace(r'\left \{', r'\left\{ ')
    )
    return source_text
    #clean_lines = []
    #for line in source_text.splitlines(False):
    #    cleanline = line.strip()
    #    if cleanline.startswith(r'\newcommand'):
    #        cleanline = r'%' + cleanline
    #    elif cleanline.startswith(r'\def'):
    #        cleanline = r'%' + cleanline
    #    clean_lines.append(cleanline)
    #return '\n'.join(clean_lines)

def soup_from_tar(tar_path, encoding='utf-8', tolerance=0):
    tex_main = find_main_tex_source_in_tar(tar_path, encoding=encoding)
    with tarfile.open(tar_path, 'r') as in_tar:
        fp = in_tar.extractfile(tex_main)
        wrapped_file = io.TextIOWrapper(fp, newline=None, encoding=encoding) #universal newlines
        source_text = pre_format(wrapped_file.read())
        soup = TS.TexSoup(source_text, tolerance=tolerance, skip_envs=MATH_ENV_NAMES)
        return soup
    
def source_from_tar(tar_path, encoding='utf-8'):
    tex_main = find_main_tex_source_in_tar(tar_path, encoding=encoding)
    with tarfile.open(tar_path, 'r') as in_tar:
        fp = in_tar.extractfile(tex_main)
        wrapped_file = io.TextIOWrapper(fp, newline=None, encoding=encoding) #universal newlines
        source_text = pre_format(wrapped_file.read())
        return source_text

'/Library/Frameworks/Python.framework/Versions/3.11/lib/python3.11/site-packages/TexSoup/__init__.py'

In [21]:
def extract_before_abstract(source_text):
    # Extract all content before the \begin{abstract}, removing comments and \usepackage commands
    
    # Remove LaTeX comments (lines starting with %)
    no_comments_text = re.sub(r'(?<!\\)%.*', '', source_text)
    
    # Remove all \usepackage{xxx} lines
    no_usepackage_text = re.sub(r'\\usepackage\s*\{[^}]+\}', '', no_comments_text)
    
    # Use regex to locate the position of \begin{abstract}
    abstract_match = re.search(r'\\begin\s*\{\s*abstract\s*\}', no_usepackage_text)
    
    if abstract_match:
        # Extract everything before \begin{abstract}, without comments or \usepackage
        return no_usepackage_text[:abstract_match.start()].strip()
    return None  # Return None if no \begin{abstract} is found

In [22]:
directory = "./2201_samp/"

tar_files = glob.glob(os.path.join(directory, "*.tar.gz"))

for infile_path in tar_files:
    source_text = source_from_tar(infile_path)  # Get raw source text
    pyperclip.copy(source_text)  # Copy the raw text to clipboard if needed
    
    # Extract content before \begin{abstract} using raw LaTeX source text
    content_before_abstract = extract_before_abstract(source_text)
    
    if content_before_abstract:
        print(f"Content before abstract in {infile_path}:\n{content_before_abstract}\n")
    else:
        print(f"No abstract found in {infile_path}, or no content before abstract.\n")



Content before abstract in ./2201_samp/2201.00048v1.tar.gz:
\documentclass[





preprint,




 amsmath,amssymb,
 aps,
showkeys,
showpacs
]{revtex4-2}
\usepackage[utf8]{inputenc}
 
\bibliographystyle{utphys}











\newcommand{\GeV}{{\rm \,GeV}}
\newcommand{\TeV}{{\rm \,TeV}}
\newcommand{\MeV}{{\rm \,MeV}}
\newcommand{\KeV}{{\rm \,KeV}}
\newcommand{\eV}{{\rm \,eV}}
\newcommand{\cm}{{\rm \,cm}}
\newcommand{\km}{{\rm \,km}}
\newcommand{\s}{{\rm \,s}}

\newcommand{\erf}{{\rm \,Erf}}

 \def\be   {\begin{equation}}   \def\ee   {\end{equation}}
 \def\ba   {\begin{array}}      \def\ea   {\end{array}}
 \def\bea  {\begin{eqnarray}}   \def\eea  {\end{eqnarray}}
 
 \def\bean {\begin{eqnarray*}}  \def\eean {\end{eqnarray*}}
 
 \def\nn{\nonumber}
 
 \newcommand{\Msun}{M_\odot}
\newcommand{\Mstar}{M_\star}
\newcommand{\Rstar}{R_\star}
\newcommand{\vstar}{v_\star}
\newcommand{\tstar}{t_\star}
\newcommand{\Tstar}{T_\star}

\newcommand{\fMB}{f_{\rm MB}}
\newcommand{\fFD}{f_{\rm FD}}

\newcommand{\m