In [1]:
#pip install arxiv_latex_cleaner
import os
import re
import shutil

### [in-situ] Finds all the dependencies of a LaTeX file and copies them into the folder named by `new_fig_folder`, which is created next to the working copy of the .tex file

In [2]:
# Project folder and main .tex file to process -- replace with your own.
# The folder name carries its trailing slash because paths below are built
# by plain string concatenation.
input_folder = 'Latex_projects_arXiv/'
source_file = 'Manuscript_main_condensed_v2_regular.tex'
source_file_name = f"{input_folder}{source_file}"

In [None]:
# Copy the source file first
def copy_tex_file(source_tex, destination_dir):
    """Copy an existing .tex file into ``destination_dir`` as ``<name>_copy.tex``.

    Args:
        source_tex: Path of the .tex file to duplicate.
        destination_dir: Directory that receives the copy; created if missing.

    Returns:
        Path of the file that was actually written, i.e. the copy inside
        ``destination_dir``.
    """
    os.makedirs(destination_dir, exist_ok=True)
    # splitext removes whatever extension is present instead of blindly
    # chopping the last four characters.
    stem = os.path.splitext(os.path.basename(source_tex))[0]
    destination_tex = os.path.join(destination_dir, stem + '_copy.tex')
    shutil.copy2(source_tex, destination_tex)
    print(f"Copied {source_tex} to {destination_tex}")

    # Return the destination, not a path derived from the source directory:
    # the two differ whenever destination_dir is not the source's own folder.
    return destination_tex


# All later reads and writes go through this copy of the manuscript
filename = copy_tex_file(source_tex=source_file_name, destination_dir=input_folder)

In [4]:
# Base directory of the .tex file; dependency paths are resolved against it
base_dir = os.path.dirname(os.path.abspath(filename))

# Read the whole .tex source. The encoding is stated explicitly so that it
# matches the UTF-8 used when the modified content is written back, rather
# than depending on the platform's default encoding.
with open(filename, 'r', encoding='utf-8') as file:
    content = file.read()

In [None]:
# Patterns
# \includegraphics: group(1) is the command together with any optional [...]
# argument, group(2) is the file name inside the braces
includegraphics_pattern = re.compile(r'(\\includegraphics(?:\[[^\]]*\])?)\{([^}]+)\}')

# Commands whose single braced argument names another file
input_pattern = re.compile(r'\\input\{(.+?)\}')
include_pattern = re.compile(r'\\include\{(.+?)\}')
bibliography_pattern = re.compile(r'\\bibliography\{(.+?)\}')

# \graphicspath may list several directories: \graphicspath{{dirA/}{dirB/}}.
# The pattern captures everything between the opening "{{" and the closing
# "}}", so a multi-directory declaration comes back as "dirA/}{dirB/";
# splitting on "}{" recovers the individual directories.
graphicspath_pattern = re.compile(r'\\graphicspath\{\{(.+?)\}\}')
graphics_paths = [
    directory
    for captured in graphicspath_pattern.findall(content)
    for directory in re.split(r'\}\s*\{', captured)
]

# Destination folder for every copied dependency
new_fig_folder_name = 'Figures_'
new_fig_folder = input_folder + new_fig_folder_name

os.makedirs(new_fig_folder, exist_ok=True)

# Extensions tried when an image is referenced without one
image_extensions = ['.pdf', '.png', '.jpg', '.jpeg', '.eps']
# Maps each original reference to the file name it was copied under
file_map = {}

def get_unique_filename(folder, original_basename):
    """Pick a file name that does not yet exist inside `folder`.

    The original name is returned when it is free; otherwise ``_1``, ``_2``,
    ... is inserted before the extension until an unused name is found.
    """
    stem, suffix = os.path.splitext(original_basename)
    candidate = original_basename
    attempt = 0
    while os.path.exists(os.path.join(folder, candidate)):
        attempt += 1
        candidate = f"{stem}_{attempt}{suffix}"
    return candidate

def update_references(content, old_name, new_name):
    r"""Rename a key inside \ref, \cref, \eqref and \label commands.

    Only exact ``\cmd{old_name}`` occurrences are rewritten; other commands
    and keys that merely contain ``old_name`` are left alone.

    Args:
        content: LaTeX source text.
        old_name: Key currently used inside the braces.
        new_name: Key to put in its place.

    Returns:
        The text with every matching command rewritten.
    """
    # The docstring is a raw string: in a normal string "\r" would be a
    # carriage return and "\c" / "\e" / "\l" are invalid escape sequences.
    for cmd in ("ref", "cref", "eqref", "label"):
        content = content.replace(f"\\{cmd}{{{old_name}}}", f"\\{cmd}{{{new_name}}}")
    return content

def find_file(old_filename):
    """Locate the image file referenced by `old_filename`.

    Reads the module-level `base_dir`, `graphics_paths` and
    `image_extensions`. A name that already ends in a known image extension
    is looked up as given in `base_dir` and by its basename in every graphics
    path; otherwise each known extension is appended in turn.

    Args:
        old_filename: File name as written in the LaTeX source.

    Returns:
        Path of the first candidate that exists, or None when none does.
    """
    # Compare case-insensitively so that e.g. "plot.PNG" is recognised as
    # already carrying an extension instead of being probed as "plot.PNG.pdf".
    known_extensions = tuple(ext.lower() for ext in image_extensions)
    has_ext = old_filename.lower().endswith(known_extensions)

    if has_ext:
        # Extension present: try the name directly
        candidates = [os.path.join(base_dir, old_filename)]
        candidates.extend(
            os.path.join(base_dir, gpath, os.path.basename(old_filename))
            for gpath in graphics_paths
        )
    else:
        # No extension: try every known image extension
        candidates = [
            os.path.join(base_dir, old_filename + ext)
            for ext in image_extensions
        ]
        candidates.extend(
            os.path.join(base_dir, gpath, old_filename + ext)
            for gpath in graphics_paths
            for ext in image_extensions
        )

    # Return the first existing file found
    for candidate in candidates:
        if os.path.exists(candidate):
            return candidate
    return None

# Handle \includegraphics first: its optional [...] argument is captured as
# part of the prefix so it survives the rewrite untouched.
for prefix, old_filename in includegraphics_pattern.findall(content):
    if old_filename not in file_map:
        # First time this file is seen: locate it and copy it over
        found_path = find_file(old_filename)
        if not found_path:
            print(f"Image file not found: {old_filename}")
            continue
        unique_filename = get_unique_filename(new_fig_folder, os.path.basename(found_path))
        shutil.copy(found_path, os.path.join(new_fig_folder, unique_filename))
        file_map[old_filename] = unique_filename

    # Rewrite every occurrence of this exact prefix + file name pair
    new_filename = file_map[old_filename]
    content = re.sub(
        rf'({re.escape(prefix)})\{{{re.escape(old_filename)}}}',
        rf'\1{{{new_filename}}}',
        content
    )
    content = update_references(content, old_filename, new_filename)

# Process the remaining file-referencing commands (\input, \include, \bibliography).
# Each entry is (command name, compiled pattern, default extension); the
# default extension is the one LaTeX/BibTeX append when the reference has none.
for command, pattern, default_ext in [
    ("input", input_pattern, ".tex"),
    ("include", include_pattern, ".tex"),
    ("bibliography", bibliography_pattern, ".bib"),
]:
    for match in pattern.findall(content):
        # Exact text of the command as it appears in the source. Replacements
        # are limited to this text so that unrelated occurrences of the bare
        # name elsewhere in the document are never touched.
        old_reference = f"\\{command}{{{match}}}"
        has_default_ext = match.endswith(default_ext)

        if match not in file_map:
            match_with_ext = match if has_default_ext else match + default_ext
            if command == "bibliography":
                # Bibliographies are only looked up with the .bib extension
                names = [match_with_ext]
            else:
                # input/include: try the name as written, then with .tex
                names = [match, match_with_ext]

            # Try base dir and graphics_paths as well
            candidates = [os.path.join(base_dir, name) for name in names]
            for gpath in graphics_paths:
                candidates.extend(
                    os.path.join(base_dir, gpath, os.path.basename(name))
                    for name in names
                )
            # isfile (not exists): a directory sharing the reference's name
            # must not be picked up and handed to shutil.copy
            found_path = next((c for c in candidates if os.path.isfile(c)), None)

            if found_path is None:
                print(f"File not found: {match}")
                continue

            unique_filename = get_unique_filename(new_fig_folder, os.path.basename(found_path))
            shutil.copy(found_path, os.path.join(new_fig_folder, unique_filename))
            file_map[match] = unique_filename

        new_filename = file_map[match]
        # Keep the reference in the form the author used: when the original
        # had no extension, drop the default one again, since LaTeX/BibTeX
        # add it themselves.
        if has_default_ext or not new_filename.endswith(default_ext):
            new_reference_name = new_filename
        else:
            new_reference_name = new_filename[:-len(default_ext)]

        # NOTE(review): the rewritten reference is a bare file name although
        # the file was copied into new_fig_folder, and \graphicspath does not
        # apply to these commands -- confirm the intended final layout.
        content = content.replace(old_reference, f"\\{command}{{{new_reference_name}}}")
        # NOTE(review): label/ref renaming is kept as before; presumably some
        # labels mirror file names -- confirm this is still wanted.
        content = update_references(content, match, new_filename)

# After all processing, point \graphicspath at new_fig_folder only.
# Remove the existing declaration(s) first.
content = re.sub(r'\\graphicspath\{\{.+?\}\}', '', content)
graphicspath_line = '\\graphicspath{{' + new_fig_folder_name + '/}}\n'

# \graphicspath is provided by the graphics/graphicx package, so the new line
# must come after the package is loaded. Optional arguments and package lists
# are accepted, e.g. \usepackage[pdftex]{graphicx} or \usepackage{amsmath,graphicx}.
# (The variable keeps its earlier name; it holds the graphicx \usepackage match.)
documentclass_match = re.search(
    r'(\\usepackage(?:\[[^\]]*\])?\{[^}]*\bgraphicx\b[^}]*\}.*?\n)', content
)
if documentclass_match:
    insert_pos = documentclass_match.end()
else:
    # No recognisable graphicx line: fall back to the end of the preamble,
    # which is after every package load. Only when \begin{document} is
    # missing too is the line prepended at the very start.
    begin_document_pos = content.find('\\begin{document}')
    insert_pos = begin_document_pos if begin_document_pos != -1 else 0
content = content[:insert_pos] + graphicspath_line + content[insert_pos:]

# Write the modified content
with open(filename, 'w', encoding='utf-8') as file:
    file.write(content)