In [1]:
#pip install arxiv_latex_cleaner
import os
import re
import shutil

In [2]:
# One LaTeX environment: a \begin{...} up to the nearest following \end{...}.
# NOTE: the names inside the braces are not compared, so a nested environment is
# only protected up to the first inner \end{...}.
_ENV_PATTERN = re.compile(r'(\\begin\{.*?\}.*?\\end\{.*?\})', re.DOTALL)

# A single whitespace character that directly follows a period.
_SENTENCE_BREAK = re.compile(r'(?<=\.)\s')

# A '%' preceded by an even number of backslashes, i.e. a real comment start
# rather than the escaped percent sign '\%'. Group 1 is the '%' itself.
_COMMENT_START = re.compile(r'(?<!\\)(?:\\\\)*(%)')


def _split_sentences(line: str) -> str:
    """Put each sentence of an already-stripped, non-empty line on its own line.

    Only the part before a LaTeX comment is split. Everything from the first
    unescaped '%' to the end of the line stays on the last output line, so a
    comment is never broken up and turned into document text.
    """
    comment_match = _COMMENT_START.search(line)
    if comment_match:
        cut = comment_match.start(1)
        text, comment = line[:cut], line[cut:]
    else:
        text, comment = line, ''

    body = text.rstrip()
    gap = text[len(body):]  # whitespace between the text and the comment
    sentences = [part.strip() for part in _SENTENCE_BREAK.split(body) if part.strip()]
    return '\n'.join(sentences) + gap + comment


def _reflow_text(text: str, follows_env: bool, precedes_env: bool) -> str:
    """Reflow one run of non-environment text, one sentence per line.

    Each line is stripped of surrounding whitespace, except for whitespace that
    sits on the same physical line as a neighbouring environment: that is kept
    verbatim so inline environments do not get glued to the adjacent words.
    """
    lines = text.split('\n')
    last = len(lines) - 1
    reflowed = []
    for idx, raw in enumerate(lines):
        touches_start = follows_env and idx == 0
        touches_end = precedes_env and idx == last
        body = raw.strip()
        if not body:
            # Blank lines are preserved; whitespace next to an environment is kept as is.
            reflowed.append(raw if (touches_start or touches_end) else '')
            continue
        prefix = raw[:len(raw) - len(raw.lstrip())] if touches_start else ''
        suffix = raw[len(raw.rstrip()):] if touches_end else ''
        reflowed.append(prefix + _split_sentences(body) + suffix)
    return '\n'.join(reflowed)


def process_latex_file(input_filepath: str, output_filepath: str) -> None:
    """Rewrite a LaTeX file so that each sentence sits on its own line.

    The file is read as UTF-8 and cut into environments (``\\begin{...}`` up to
    the nearest ``\\end{...}``) and the text between them. Environments are
    copied unchanged. In the remaining text every line is stripped of leading
    and trailing whitespace and broken after each period that is followed by
    whitespace; blank lines are preserved. LaTeX comments are never split, and
    whitespace directly adjacent to an environment on the same line is kept.

    Args:
        input_filepath: Path of the ``.tex`` file to read.
        output_filepath: Path the processed text is written to (overwritten
            if it already exists).

    Raises:
        OSError: If the input cannot be read or the output cannot be written.
    """
    with open(input_filepath, 'r', encoding='utf-8') as infile:
        latex_content = infile.read()

    # With a single capturing group, re.split alternates text / environment,
    # so odd positions are exactly the matched environments.
    chunks = _ENV_PATTERN.split(latex_content)
    last = len(chunks) - 1
    processed_chunks = [
        chunk if idx % 2 else _reflow_text(chunk, idx > 0, idx < last)
        for idx, chunk in enumerate(chunks)
    ]

    with open(output_filepath, 'w', encoding='utf-8') as outfile:
        outfile.write(''.join(processed_chunks))


def process_all_tex_files(root_dir: str, output_dir: str):
    """Run ``process_latex_file`` on every ``.tex`` file found under ``root_dir``.

    The tree below ``root_dir`` is walked recursively. Files whose name contains
    ``_processed`` are skipped, so earlier outputs are not processed again.
    Each result is written directly into ``output_dir`` as
    ``<name>_processed.tex``; the sub-directory structure is not reproduced, so
    equally named files from different folders overwrite one another.

    Args:
        root_dir: Directory that is searched for ``.tex`` files.
        output_dir: Directory receiving the processed files. It is created
            when the first file is written if it does not exist yet.
    """
    suffix = '.tex'
    for dirpath, _dirnames, filenames in os.walk(root_dir):
        for filename in filenames:
            if not filename.endswith(suffix) or "_processed" in filename:
                continue

            # Replace only the trailing extension; str.replace would also
            # rewrite a '.tex' that occurs earlier in the name (e.g. 'a.text.tex').
            stem = filename[:-len(suffix)]
            input_filepath = os.path.join(dirpath, filename)
            output_filepath = os.path.join(output_dir, stem + '_processed' + suffix)

            # An empty output_dir means the current directory, which always exists.
            if output_dir:
                os.makedirs(output_dir, exist_ok=True)
            process_latex_file(input_filepath, output_filepath)

In [None]:
# Demo run: process every .tex file below the project folder and write the
# "*_processed.tex" results back into that same folder.
root_dir = r"Latex_Project_A/"
output_dir = root_dir
process_all_tex_files(root_dir=root_dir, output_dir=output_dir)