In [4]:
#pip install arxiv_latex_cleaner
import os
import re
import shutil

#### Run this after step 4, and make sure the input .tex file contains no comments.

In [5]:
import re

# An unescaped "%" (i.e. one preceded by an even number of backslashes) starts
# a LaTeX comment; "\%" is a literal percent sign and must not count.
_COMMENT_START = re.compile(r'(?<!\\)(?:\\\\)*%')


def _starts_in_comment(latex_code, index):
    """Return True if position *index* lies after an unescaped ``%`` on its own line."""
    line_start = latex_code.rfind("\n", 0, index) + 1
    return _COMMENT_START.search(latex_code[line_start:index]) is not None


def extract_environments(latex_code, environments):
    r"""
    Extract specified environments from the LaTeX code, ignoring commented-out environments or those with only commented content.

    Environment names are matched literally, so starred names such as
    ``align*`` can be requested. A ``\begin{...}`` is treated as commented out
    when an unescaped ``%`` appears anywhere before it on the same line. An
    environment is dropped when its body contains at least one comment line
    and no other non-blank line; empty bodies are kept.

    :param latex_code: A string containing raw LaTeX code.
    :param environments: A list of environment names to extract.
    :return: A dictionary with environment names as keys and a list of extracted environments as values.
    """
    extracted = {}
    for env in environments:
        # Escape the name: "*" in "align*" would otherwise act as a regex quantifier.
        name = re.escape(env)
        pattern = re.compile(
            r'\\begin\{' + name + r'\}(.*?)\\end\{' + name + r'\}',
            re.DOTALL,
        )

        kept = []
        pos = 0
        while True:
            match = pattern.search(latex_code, pos)
            if match is None:
                break

            if _starts_in_comment(latex_code, match.start()):
                # Resume just past the commented \begin so that a real
                # environment following it is not swallowed by this match.
                pos = match.start() + 1
                continue
            pos = match.end()

            body = match.group(1)
            content_lines = [line.strip() for line in body.splitlines()]
            content_lines = [line for line in content_lines if line]
            # Blank lines are neither content nor comments, so ignore them
            # when deciding whether everything inside is commented out.
            if content_lines and all(line.startswith("%") for line in content_lines):
                continue

            kept.append(f"\\begin{{{env}}}{body}\\end{{{env}}}")

        extracted[env] = kept

    return extracted

def extract_graphics_path(latex_code):
    r"""
    Extract the complete \graphicspath command from the LaTeX code.

    The argument of \graphicspath is a brace group holding one nested brace
    group per directory (e.g. ``\graphicspath{{figs/}{img/}}``), so the end of
    the command is found by tracking brace depth instead of stopping at the
    first ``}``. Only the first occurrence is returned.

    :param latex_code: A string containing raw LaTeX code.
    :return: The full, brace-balanced \graphicspath command if found, else an
        empty string (also when the argument is never closed).
    """
    opener = re.search(r'\\graphicspath\{', latex_code)
    if opener is None:
        return ''

    depth = 0
    idx = opener.end() - 1  # index of the opening "{" of the argument
    while idx < len(latex_code):
        char = latex_code[idx]
        if char == '\\':
            # Skip the escaped character so "\{" / "\}" do not affect depth.
            idx += 2
            continue
        if char == '{':
            depth += 1
        elif char == '}':
            depth -= 1
            if depth == 0:
                return latex_code[opener.start():idx + 1]
        idx += 1

    return ''


def write_to_new_file(extracted, output_filename, graphics_path):
    r"""
    Write the extracted code to a new LaTeX file with a basic preamble.

    :param extracted: A dictionary with extracted environments.
    :param output_filename: The name of the output file.
    :param graphics_path: The \graphicspath command from the original file.
        An empty string omits the line entirely. A command that is missing
        closing braces is completed so the preamble stays balanced.
    """
    preamble_lines = [
        "",
        "\\documentclass{article}",
        "\\usepackage{amsmath}",
        "\\usepackage{graphicx}",
        "\\usepackage{booktabs}",
        "\\usepackage{caption}",
        "\\usepackage{subcaption}",
    ]
    if graphics_path:
        # Append only the braces that are actually missing; an unconditional
        # "}" would unbalance an already complete command.
        missing = graphics_path.count("{") - graphics_path.count("}")
        closing = "}" * max(missing, 0)
        preamble_lines.append(
            f"{graphics_path}{closing}  % Preserve original graphics path"
        )
    preamble_lines.append("\\begin{document}")
    preamble = "\n".join(preamble_lines) + "\n"
    postamble = "\n\\end{document}"

    with open(output_filename, 'w', encoding='utf-8') as f:
        f.write(preamble)
        for env, blocks in extracted.items():
            if blocks:  # Only write non-empty environments
                f.write(f'\n% Extracted {env} environments\n')
                f.write('\n\n'.join(blocks) + "\n")
        f.write(postamble)

In [6]:
# Example usage
# The output file sits next to the input, with the ".tex" suffix replaced by
# "_floats_only.tex".
input_folder = 'Latex_projects_arXiv/'
input_filename = f'{input_folder}Manuscript_main_condensed_v2_regular.tex'
output_filename = f'{input_filename[:-4]}_floats_only.tex'

# Load the whole manuscript into memory
with open(input_filename, encoding='utf-8') as f:
    latex_code = f.read()

# Pull the \graphicspath command out of the source
graphics_path = extract_graphics_path(latex_code)

# Collect the equation, align, table, and figure environments
extracted = extract_environments(
    latex_code,
    ['equation', 'align', 'table', 'figure'],
)

# Emit a standalone LaTeX file containing only the extracted pieces
write_to_new_file(extracted, output_filename, graphics_path)
