In [5]:
# !pip install arxiv_latex_cleaner

In [2]:
#pip install arxiv_latex_cleaner
import os
import re
import shutil
import subprocess

In [None]:
# Run arxiv_latex_cleaner and wait for it to finish before later cells touch
# the files. os.popen() returned immediately with a pipe that was never read
# or closed, so failures went unnoticed and the tool could still be running.
cleaner_result = subprocess.run(
    ["arxiv_latex_cleaner", "reassembled_document.tex", "--keep_bib"],
    capture_output=True,
    text=True,
)
print(cleaner_result.stdout, end="")
if cleaner_result.returncode != 0:
    # Report the failure without aborting the notebook (the original call
    # never raised on a failing command either).
    print(
        f"arxiv_latex_cleaner exited with status {cleaner_result.returncode}:\n"
        f"{cleaner_result.stderr}"
    )

### Code that finds all the dependencies of a LaTeX file and puts them, along with the LaTeX file, in a folder called new_folder

In [8]:
# Name of the .tex file whose dependencies are collected below.
# Replace with the name of your own .tex file.
filename = 'reassembled_document.tex'

In [9]:
# Base directory of the .tex file; dependency paths are resolved against it
base_dir = os.path.dirname(os.path.abspath(filename))

# Read the whole document into memory. The modified content is written back
# with encoding='utf-8', so decode it as UTF-8 here as well; relying on the
# platform default would re-encode non-ASCII characters incorrectly on save.
with open(filename, 'r', encoding='utf-8') as file:
    content = file.read()

In [None]:
# Patterns
# \includegraphics command with optional arguments captured as group(1)
# and filename as group(2)
includegraphics_pattern = re.compile(r'(\\includegraphics(?:\[[^\]]*\])?)\{([^}]+)\}')

# \input, \include and \bibliography: group(1) is the brace argument
input_pattern = re.compile(r'\\input\{(.+?)\}')
include_pattern = re.compile(r'\\include\{(.+?)\}')
bibliography_pattern = re.compile(r'\\bibliography\{(.+?)\}')

# \graphicspath takes a list of brace-wrapped directories, e.g.
# \graphicspath{{figs/}{plots/}}. group(1) is the whole inner list
# ("{figs/}{plots/}"); each directory is then pulled out separately so that
# a multi-directory declaration yields one entry per directory instead of a
# single bogus "figs/}{plots/" entry.
graphicspath_pattern = re.compile(r'\\graphicspath\{((?:\s*\{[^{}]*\})+)\s*\}')
graphics_paths = [
    directory
    for path_list in graphicspath_pattern.findall(content)
    for directory in re.findall(r'\{([^{}]*)\}', path_list)
]

# Destination folder for every copied dependency
new_folder = 'new_folder'
os.makedirs(new_folder, exist_ok=True)

# Extensions tried, in this order, when an image is referenced without one
image_extensions = ['.pdf', '.png', '.jpg', '.jpeg', '.eps']
# Maps each name as written in the .tex file to the name of its copy in new_folder
file_map = {}

def get_unique_filename(folder, original_basename):
    """Pick a name for `original_basename` that does not yet exist in `folder`.

    The name is returned unchanged when it is free. Otherwise `_1`, `_2`, ...
    is inserted before the extension until an unused name is found.
    """
    stem, suffix = os.path.splitext(original_basename)
    candidate = original_basename
    attempt = 0
    while os.path.exists(os.path.join(folder, candidate)):
        attempt += 1
        candidate = f"{stem}_{attempt}{suffix}"
    return candidate

def update_references(content, old_name, new_name):
    r"""Rename the argument of \ref, \cref, \eqref and \label commands.

    Every exact occurrence of ``\<cmd>{old_name}`` in `content` is replaced by
    ``\<cmd>{new_name}`` for each of the four commands. Only the literal
    single-argument form is matched; the updated text is returned.

    The docstring is a raw string so that ``\ref`` is not read as a carriage
    return and ``\c`` / ``\e`` / ``\l`` do not trigger invalid-escape warnings.
    """
    for cmd in ("ref", "cref", "eqref", "label"):
        content = content.replace(f"\\{cmd}{{{old_name}}}", f"\\{cmd}{{{new_name}}}")
    return content

def find_file(old_filename):
    """Resolve a referenced file name to an existing path, or return None.

    `base_dir` is searched first, then each directory in `graphics_paths`
    (taken relative to `base_dir`). A name that already ends in one of
    `image_extensions` is looked up as written in `base_dir` and by its
    basename in the graphics paths. A name without such an extension is tried
    with every extension in `image_extensions`, in list order.
    """
    if any(old_filename.endswith(ext) for ext in image_extensions):
        # Extension already present: look for the file as named
        basename = os.path.basename(old_filename)
        search_order = [os.path.join(base_dir, old_filename)]
        search_order += [
            os.path.join(base_dir, gpath, basename) for gpath in graphics_paths
        ]
    else:
        # No known extension: try each one, base_dir before the graphics paths
        search_order = [
            os.path.join(base_dir, old_filename + ext) for ext in image_extensions
        ]
        search_order += [
            os.path.join(base_dir, gpath, old_filename + ext)
            for gpath in graphics_paths
            for ext in image_extensions
        ]

    # First candidate that exists on disk wins
    return next((path for path in search_order if os.path.exists(path)), None)

# Handle \includegraphics first; its optional [..] arguments are kept intact
# by carrying the command prefix through the substitution as group 1.
for prefix, old_filename in includegraphics_pattern.findall(content):
    new_filename = file_map.get(old_filename)

    if new_filename is None:
        # First time this name is seen: locate it and copy it into new_folder
        found_path = find_file(old_filename)
        if not found_path:
            print(f"Image file not found: {old_filename}")
            continue
        new_filename = get_unique_filename(new_folder, os.path.basename(found_path))
        shutil.copy(found_path, os.path.join(new_folder, new_filename))
        file_map[old_filename] = new_filename

    # Point this exact command (same prefix, same argument) at the copied file
    content = re.sub(
        rf'({re.escape(prefix)})\{{{re.escape(old_filename)}}}',
        rf'\1{{{new_filename}}}',
        content
    )
    content = update_references(content, old_filename, new_filename)

# Process other patterns (\input, \include, \bibliography)
# These typically have no optional arguments and don't need special handling
for command, pattern in (
    ("input", input_pattern),
    ("include", include_pattern),
    ("bibliography", bibliography_pattern),
):
    for match in pattern.findall(content):
        new_filename = file_map.get(match)

        if new_filename is None:
            # \bibliography arguments are usually written without ".bib", so
            # add it for the lookup. \input / \include names are looked up
            # exactly as written.
            # NOTE(review): an extension-less \input{intro} is therefore not
            # resolved to intro.tex -- confirm whether a ".tex" fallback is wanted.
            is_bib = (command == "bibliography" and not match.endswith('.bib'))
            match_with_ext = match + '.bib' if is_bib else match

            # Look in base_dir first, then in each graphics path by basename
            candidates = [os.path.join(base_dir, match_with_ext)]
            for gpath in graphics_paths:
                candidates.append(
                    os.path.join(base_dir, gpath, os.path.basename(match_with_ext))
                )
            found_path = next((c for c in candidates if os.path.exists(c)), None)

            if found_path is None:
                print(f"File not found: {match}")
                continue

            new_filename = get_unique_filename(new_folder, os.path.basename(found_path))
            shutil.copy(found_path, os.path.join(new_folder, new_filename))
            file_map[match] = new_filename

        # Rewrite only the command's own argument. Replacing the bare name
        # everywhere in the document would also rewrite unrelated text (e.g.
        # "intro" inside "introduction") and would be re-applied for repeated
        # commands, producing names like "refs.bib.bib".
        # NOTE(review): the new argument keeps the copied file's extension
        # (e.g. "refs.bib"); confirm the LaTeX/BibTeX toolchain accepts that.
        content = content.replace(
            f"\\{command}{{{match}}}", f"\\{command}{{{new_filename}}}"
        )
        content = update_references(content, match, new_filename)

# After all processing, update the \graphicspath to point only to new_folder
content = re.sub(r'\\graphicspath\{\{.+?\}\}', '', content)
graphicspath_line = '\\graphicspath{{' + new_folder + '/}}\n'

# Preferred position: right after the line that loads graphicx. The package
# may be loaded with options or in a comma-separated list, e.g.
# \usepackage[pdftex]{graphicx} or \usepackage{amsmath,graphicx}.
graphicx_match = re.search(
    r'\\usepackage(?:\[[^\]]*\])?\{[^}]*\bgraphicx\b[^}]*\}[^\n]*\n', content
)
if graphicx_match:
    insert_pos = graphicx_match.end()
else:
    # No recognizable graphicx line: insert just before \begin{document} so
    # the command still sits in the preamble rather than ahead of
    # \documentclass. Only a file without \begin{document} gets it prepended.
    begin_document_match = re.search(r'\\begin\{document\}', content)
    insert_pos = begin_document_match.start() if begin_document_match else 0
content = content[:insert_pos] + graphicspath_line + content[insert_pos:]

# Write the modified content (this overwrites the original .tex file in place)
with open(filename, 'w', encoding='utf-8') as file:
    file.write(content)


### Python code to split each section into a separate .tex file

In [None]:
# Manuscript to split and the directory that receives one .tex file per section
input_file = 'Manuscript.tex'
output_dir = 'sections'
os.makedirs(output_dir, exist_ok=True)

# Accumulators filled while scanning the manuscript:
#   section_content  - the \include{...} line generated for each section
#   section_files    - the matching paths (no extension), in document order
#   unmatched_lines  - heading-like lines that produced no usable file name
section_content, section_files, unmatched_lines = [], [], []

# Text collected before \begin{document}, and the bibliography commands found
preamble_content = bib_commands = ''

# Scanner position flags
inside_preamble = True
inside_abstract = begin_abstract_in_line = end_abstract_in_line = False

# Handle of the section file currently being written (None when none is open)
outfile = None

# Scan the manuscript line by line: text before \begin{document} goes to
# preamble_content, the abstract and every \section / \appendix / \preamble
# heading start a new file in output_dir, and bibliography commands are
# collected so they can be re-emitted in the main file.
try:
    with open(input_file, 'r') as infile:
        for line in infile:

            # Handling Bibliography
            bib_match = re.search(r'\\bibliography{(.+?)}', line)
            bib_filename = bib_match.group(1) if bib_match else None
            if bib_filename:
                bib_commands += f"\\bibliography{{{bib_filename}}}\n"

            # Handling Bibliography Style
            bibstyle_match = re.search(r'\\bibliographystyle{(.+?)}', line)
            bibstyle_filename = bibstyle_match.group(1) if bibstyle_match else None
            if bibstyle_filename:
                bib_commands += f"\\bibliographystyle{{{bibstyle_filename}}}\n"

            if '\\begin{abstract}' in line:
                # Start the abstract file. This line itself is skipped; the
                # \begin{abstract} text is re-emitted with the next written line.
                inside_abstract = True
                begin_abstract_in_line = True
                output_file = 'abstract'
                if outfile:
                    outfile.close()
                    outfile = None
                output_file_path = os.path.join(output_dir, f'{output_file}.tex')
                outfile = open(output_file_path, 'w')
                continue

            if '\\end{abstract}' in line:
                # Close the abstract file and register it as a section
                inside_abstract = False
                end_abstract_in_line = True
                if outfile:
                    outfile.write('\n\\end{abstract}')
                    outfile.close()
                    outfile = None
                section_content.append(f"\\include{{sections/{output_file}}}")
                section_files.append(f"sections/{output_file}")
                continue

            # Handle other sections or appendices
            match = re.match(r'\\(section\*?|appendix|preamble)\{(.+?)\}', line)
            if line.strip() == '\\begin{document}':
                inside_preamble = False
            elif inside_preamble:
                preamble_content += line
            elif match or line.strip() in ['\\end{document}']:
                # A new heading (or the end of the document) ends the current file
                if outfile:
                    outfile.close()
                    outfile = None

                if line.strip() == '\\end{document}':
                    break
                # The heading text, with spaces replaced by underscores, is the file name
                output_file = match.group(2).strip().replace(" ", "_") if match else None
                if output_file:  # Ensuring output_file is not None or empty
                    output_file_path = os.path.join(output_dir, f'{output_file}.tex')
                    outfile = open(output_file_path, 'w')
                    # Register the section only when a file was really opened;
                    # otherwise the main file would \include a missing
                    # "sections/" or "sections/None" entry.
                    section_content.append(f"\\include{{sections/{output_file}}}")
                    section_files.append(f"sections/{output_file}")
                else:  # Collect unmatched lines that seem like sections
                    unmatched_lines.append(line.strip())

            if outfile and not inside_preamble:
                # Bibliography commands are moved to the main file, not the sections
                if '\\bibliographystyle{' not in line and '\\bibliography{' not in line:
                    if begin_abstract_in_line:
                        outfile.write('\\begin{abstract}\n'+line)
                        begin_abstract_in_line = False
                    else:
                        outfile.write(line)
except Exception as e:
    # Keep the message for later inspection, but also report it: silently
    # discarding the error leaves a partial split with no hint of the cause.
    error_message = str(e)
    print(f"Section splitting stopped early: {error_message}")
finally:
    # After exiting the loop, close the last outfile if it is open
    if outfile:
        outfile.close()

# One \include line per generated section file, in document order
all_sections_content = '\n'.join(f"\\include{{{itr}}}" for itr in section_files)

# Add \includeonly to the preamble
preamble_content = preamble_content.split('\\begin{document}')[0]  # Removing everything after \begin{document}
if section_files:
    preamble_content += '\n\\includeonly{'
    preamble_content += '\n'.join([f"{itr}," for itr in section_files[:-1]])
    preamble_content += '\n'+f"{section_files[-1]}"
    preamble_content += "\n"+"}"+"\n"
else:
    # Nothing was split off (e.g. the manuscript could not be read); indexing
    # section_files[-1] would raise IndexError, so skip \includeonly instead.
    print("No section files were found; writing the main file without \\includeonly.")

# \let\clearpage\relax inside the group stops each \include from starting a
# new page. Backslashes are written as "\\" throughout: "\l" and "\c" are not
# valid Python escape sequences and trigger warnings in a normal string.
preamble_content += '\\begin{document}\n\\maketitle\n'+'\\begingroup\n\\let\\clearpage\\relax'

# Write the main file
new_main_file_path = 'main3.tex'
main_file_content = preamble_content + "\n\n" + all_sections_content +'\n\\endgroup' +f"\n{bib_commands}" + "\n\\end{document}"

# Writing the main file content to new_main_file_path
with open(new_main_file_path, 'w') as main_file:
    main_file.write(main_file_content)

# Reading the main file back to verify what was written
with open(new_main_file_path, 'r') as main_file:
    corrected_new_main_file_content = main_file.read()

# Displaying the main file content as the cell output
corrected_new_main_file_content

### Put every sentence on a new line

In [None]:
# def process_latex_file(input_filepath: str, output_filepath: str):
#     with open(input_filepath, 'r') as infile:
#         latex_content = infile.read()

#     # Regular expression to match \begin{} and \end{} environments
#     env_pattern = re.compile(r'(\\begin\{.*?\}.*?\\end\{.*?\})', re.DOTALL)

#     # Split content into environments and non-environment text
#     chunks = re.split(env_pattern, latex_content)

#     # Process only the non-environment text
#     processed_chunks = []
#     for chunk in chunks:
#         if chunk.strip().startswith('\\begin'):  # if it's an environment, keep it unchanged
#             processed_chunks.append(chunk)
#         else:  # otherwise, split into sentences and join with newlines
#             sentences = re.split(r'(?<=\.)\s', chunk)
#             processed_chunks.append('\n'.join(sentences))

#     # Join the processed chunks
#     processed_content = ''.join(processed_chunks)

#     # Write the processed content to the output file
#     with open(output_filepath, 'w') as outfile:
#         outfile.write(processed_content)


# def process_all_tex_files(root_dir: str, output_dir: str):
#     for dirpath, dirnames, filenames in os.walk(root_dir):
#         for filename in filenames:
#             if filename.endswith('.tex') and "_processed" not in filename:
#                 input_filepath = os.path.join(dirpath, filename)
#                 output_filepath = os.path.join(output_dir, filename.replace('.tex', '_processed.tex'))
#                 process_latex_file(input_filepath, output_filepath)



# # Example usage
# root_dir = r"D:\OneDrive - Texas A&M University\Academic\Acads IITK\Paper\SB\Experimental_damping_estimation_SB_v4\Experimental_damping_estimation_SB_AC_v1" # replace with the path to your .tex files
# output_dir = root_dir#'output'  # replace with the path where you want to save processed files
# process_all_tex_files(root_dir, output_dir)

In [13]:
def process_latex_file(input_filepath: str, output_filepath: str):
    """Rewrite a LaTeX file so that each sentence starts on its own line.

    Text matched as a ``\\begin{...} ... \\end{...}`` span is copied unchanged.
    All other text is processed line by line: each line is stripped of leading
    and trailing whitespace, blank lines are kept, and the line is broken after
    every period that is followed by whitespace.

    The part of a line starting at an unescaped ``%`` is a LaTeX comment and is
    never split. Splitting it would move the tail of the comment onto a new,
    uncommented line and turn it into document text.

    Known limitation: the environment pattern pairs a ``\\begin`` with the
    nearest following ``\\end`` regardless of the environment name, so nested
    environments are only protected up to the first inner ``\\end``.

    :param input_filepath: path of the UTF-8 .tex file to read.
    :param output_filepath: path the processed text is written to (UTF-8).
    """
    with open(input_filepath, 'r', encoding='utf-8') as infile:
        latex_content = infile.read()

    # Regex pattern to detect LaTeX environments
    env_pattern = re.compile(r'(\\begin\{.*?\}.*?\\end\{.*?\})', re.DOTALL)
    # A sentence ends at a period followed by whitespace
    sentence_break = re.compile(r'(?<=\.)\s')
    # First '%' not preceded by a backslash starts a comment
    comment_start = re.compile(r'(?<!\\)%')

    processed_chunks = []
    # Split content into environment and non-environment parts
    for chunk in re.split(env_pattern, latex_content):
        # If this chunk starts with '\begin', it is an environment; leave it unchanged
        if chunk.strip().startswith('\\begin'):
            processed_chunks.append(chunk)
            continue

        processed_lines = []
        for line in chunk.split('\n'):
            line = line.strip()
            if not line:
                # Preserve blank lines
                processed_lines.append('')
                continue

            # Separate the sentence text from a trailing comment, if any
            comment = comment_start.search(line)
            if comment:
                text, remark = line[:comment.start()], line[comment.start():]
            else:
                text, remark = line, ''

            # Keep the whitespace between the text and the comment as it was
            body = text.rstrip()
            gap = text[len(body):]

            sentences = [s.strip() for s in sentence_break.split(body) if s.strip()]
            processed_lines.append('\n'.join(sentences) + gap + remark)

        # Join the processed lines back together with newlines
        processed_chunks.append('\n'.join(processed_lines))

    # Join all chunks together
    processed_content = ''.join(processed_chunks)

    with open(output_filepath, 'w', encoding='utf-8') as outfile:
        outfile.write(processed_content)


def process_all_tex_files(root_dir: str, output_dir: str):
    """Run process_latex_file on every .tex file found under `root_dir`.

    Files whose name contains "_processed" are skipped, so earlier output is
    not processed again. Each result is written directly into `output_dir`
    (not mirrored into subfolders) as ``<name>_processed.tex``.

    :param root_dir: directory tree searched recursively for .tex files.
    :param output_dir: directory receiving the processed files; created if missing.
    """
    if output_dir:
        # open() does not create missing directories
        os.makedirs(output_dir, exist_ok=True)

    for dirpath, _dirnames, filenames in os.walk(root_dir):
        for filename in filenames:
            if filename.endswith('.tex') and "_processed" not in filename:
                input_filepath = os.path.join(dirpath, filename)
                # Swap only the trailing ".tex"; str.replace would also rewrite
                # a ".tex" occurring earlier in the name (e.g. "my.text.tex").
                output_name = filename[:-len('.tex')] + '_processed.tex'
                output_filepath = os.path.join(output_dir, output_name)
                process_latex_file(input_filepath, output_filepath)


# Example usage
# NOTE(review): this is an absolute path on one specific machine; change it
# before running the notebook elsewhere.
root_dir = r"D:\OneDrive - Texas A&M University\Academic\Acads IITK\Paper\SB\Experimental_damping_estimation_SB_v4\Experimental_damping_estimation_SB_AC_v1"
# Processed files are written into root_dir itself. This also rebinds the
# notebook-level name `output_dir`, which the section-splitting cell sets to 'sections'.
output_dir =root_dir
process_all_tex_files(root_dir, output_dir)

In [None]:
#
# for %f in (*.tex) do latexindent "%f" > "%~nf_indented.tex"

### Reassemble

In [7]:
# Input files and directories
main_file_path = 'Main.tex'
# NOTE(review): sections_dir is not read by any code visible in this notebook;
# the \include paths inside the main file decide where sections are loaded from.
sections_dir = 'Sections'

# Output file: the single .tex document with every \include expanded in place
output_file_path = 'reassembled_document.tex'

# Open the main file and read its content
with open(main_file_path, 'r') as main_file:
    main_content = main_file.read()


# Substitution callback: swaps an \include{name} command for the text of name.tex
def replacer(match):
    r"""Return the contents of the file named by an \include{...} match.

    :param match: regex match whose group(1) is the path without ".tex".
    :return: the full text of ``<group(1)>.tex``.
    """
    include_target = match.group(1)
    with open(f"{include_target}.tex", 'r') as included:
        included_text = included.read()
    return included_text

# Expand every \include{filename} through `replacer`, then drop the
# \includeonly{...} list (which may span several lines, hence DOTALL)
include_command = re.compile(r'\\include\{(.+?)\}')
includeonly_command = re.compile(r'\\includeonly\{.*?\}', flags=re.DOTALL)
replaced_content = includeonly_command.sub(
    '', include_command.sub(replacer, main_content)
)

# Save the single-file document
with open(output_file_path, 'w') as output_file:
    output_file.write(replaced_content)

### Look over the figures, equations, and tables carefully one last time. Generate a LaTeX file that contains all of them and check them separately. If needed, mark changes in the PDF and then apply them to the .tex file.

In [3]:
def extract_environments(latex_code, environments):
    """
    Extract specified environments from the LaTeX code.

    Each name is matched literally, so starred names such as 'align*' can be
    requested; 'align' and 'align*' are treated as different environments.
    A match runs from a \\begin{name} to the nearest following \\end{name}.

    :param latex_code: A string containing raw LaTeX code.
    :param environments: A list of environment names to extract.
    :return: A dictionary with environment names as keys and a list of extracted environments as values.
    """
    extracted = {}
    for env in environments:
        # Escape the name: unescaped, the '*' in 'align*' is a regex quantifier
        # and the pattern would match \begin{align} instead of \begin{align*}.
        name = re.escape(env)
        pattern = re.compile(r'\\begin\{' + name + r'\}.*?\\end\{' + name + r'\}', re.DOTALL)
        extracted[env] = pattern.findall(latex_code)

    return extracted

def write_to_new_file(extracted, output_filename):
    """
    Save the extracted environments as a small standalone LaTeX document.

    The file starts with a fixed article preamble, then lists each environment
    group under a "% Extracted <name> environments" comment with the group's
    entries separated by blank lines, and ends with \\end{document}.

    :param extracted: A dictionary mapping environment names to lists of extracted text.
    :param output_filename: The name of the output file.
    """
    header_lines = [
        "",
        "\\documentclass{article}",
        "\\usepackage{amsmath}",
        "\\usepackage{graphicx}",
        "\\usepackage{booktabs}",
        "\\usepackage{caption}",
        "\\usepackage{subcaption}",
        "\\begin{document}",
        "",
    ]

    with open(output_filename, 'w') as f:
        f.write("\n".join(header_lines))
        for env_name, blocks in extracted.items():
            f.write(f'\n% Extracted {env_name} environments\n')
            f.write('\n\n'.join(blocks))
        f.write("\n\\end{document}")

# Example usage: build Paper_check.tex containing only the equations, aligns,
# tables and figures of Paper.tex so they can be reviewed on their own
input_filename = 'Paper.tex'
output_filename = 'Paper_check.tex'

# Read the input LaTeX file
with open(input_filename, 'r') as f:
    latex_code = f.read()

# Extract equation, align, table, and figure environments
# (the names are matched exactly, so starred variants such as align* or
# figure* are not picked up by this list)
extracted = extract_environments(latex_code, ['equation', 'align', 'table', 'figure'])

# Write the extracted code to a new LaTeX file
write_to_new_file(extracted, output_filename)


### Copy all the LaTeX style files to a specific folder

In [None]:
def find_and_copy_latex_style_files(latex_file, output_folder, search_paths=None):
    r"""Copy the .sty file of every package loaded by `latex_file`.

    Packages are taken from \usepackage commands, with or without [options]
    and including comma-separated lists such as \usepackage{amsmath,amssymb}.
    For each package the first ``<package>.sty`` found under `search_paths`
    is copied into `output_folder`. A warning is printed for packages with no
    style file under the search paths.

    :param latex_file: path of the .tex file to scan.
    :param output_folder: directory receiving the copies; created if missing.
    :param search_paths: directories searched recursively for .sty files.
        Defaults to the TeX Live 2022 ``tex/latex`` tree used previously.
    """
    # Regular expression to find \usepackage commands, allowing [options]
    usepackage_re = re.compile(r'\\usepackage\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}')

    # Open and read the LaTeX file
    with open(latex_file, 'r') as file:
        data = file.read()

    # Collect package names, splitting comma-separated lists and dropping repeats
    packages = []
    for package_list in usepackage_re.findall(data):
        for name in package_list.split(','):
            name = name.strip()
            if name and name not in packages:
                packages.append(name)

    # Search paths for LaTeX style files
    if search_paths is None:
        search_paths = ['/usr/local/texlive/2022/texmf-dist/tex/latex/']

    # shutil.copy onto a missing folder would create a single *file* with that
    # name and overwrite it for every package, so make sure the folder exists.
    os.makedirs(output_folder, exist_ok=True)

    # Walk the search paths once and remember the first location of each .sty
    # file, instead of re-walking the whole tree for every package.
    style_locations = {}
    for path in search_paths:
        for root, _dirs, files in os.walk(path):
            for fname in files:
                if fname.endswith('.sty'):
                    style_locations.setdefault(fname, os.path.join(root, fname))

    # Find and copy the style files
    for package in packages:
        source = style_locations.get(f'{package}.sty')
        if source is not None:
            shutil.copy(source, output_folder)
        else:
            print(f"Warning: Style file for package '{package}' not found.")
            
# Example usage:
# find_and_copy_latex_style_files('path/to/latex/file.tex', 'path/to/output/folder')