# Data preparation

Go through every .tex file
* Extract clean section names

Put data in a pandas dataframe that we can easily use for Word2Vec latter

In [13]:
import os
import re
import pandas as pd

# Function to parse LaTeX files and extract section titles
def parse_latex_files(directory):
    data = []
    section_regex = r'\\(?:sub)*section\*?\{([^\}]+)\}|\\begin\{abstract\}'

    # Walk through the directory
    for root, dirs, files in os.walk(directory):
        for file in files:
            if file.endswith(".tex"):
                file_path = os.path.join(root, file)
                try:
                    with open(file_path, 'r', encoding='utf-8') as f:
                        content = f.read()
                        content = re.sub(r'\n', ' ', content)  # Replace newlines with spaces
                        titles = re.findall(section_regex, content)
                        for title in titles:
                            if title == 'abstract':
                                title = 'Abstract'  # Normalize the abstract title
                            data.append({'file': file, 'section_title': title})
                except UnicodeDecodeError:
                    # Some texts are in french and fail with utf-8 due to special characters
                    # Try reading with 'latin-1' encoding if utf-8 fails
                    try:
                        with open(file_path, 'r', encoding='latin-1') as f:
                            content = f.read()
                            content = re.sub(r'\n', ' ', content)  # Replace newlines with spaces
                            titles = re.findall(section_regex, content)
                            for title in titles:
                                if title == 'abstract':
                                    title = 'Abstract'  # Normalize the abstract title
                                data.append({'file': file, 'section_title': title})
                    except Exception as e:
                        print(f"Failed to read file {file} with error: {e}")
    return pd.DataFrame(data)


Let's correctly set the directory to the dataset

In [15]:
# Current working dir
print(os.getcwd())

/Users/christophzweifel/Downloads/Word2Vec


Do the actual extraction

In [16]:
# Parse the LaTeX files in the extracted directory
latex_data = parse_latex_files('/Users/christophzweifel/Downloads/Word2Vec/dataset/train')
latex_data

Unnamed: 0,file,section_title
0,1905.00526v2.tex,
1,1905.00526v2.tex,Introduction
2,1905.00526v2.tex,Related Work
3,1905.00526v2.tex,Radar Region Proposal Network
4,1905.00526v2.tex,Perspective Transformation
...,...,...
205630,1209.0359.tex,Communicating Processes
205631,1209.0359.tex,Recursive Communicating Processes
205632,1209.0359.tex,Topologies with Decidable State Reachability
205633,1209.0359.tex,Eager \qcp and the Mutex Restriction


And save it in a .csv file (intermediary result)

In [17]:
# Path for the CSV file
csv_path = 'section_titles.csv'  # Update this path to where you want to save the CSV

# Save the dataframe to a CSV file
latex_data.to_csv(csv_path, index=False)

print(f"Data saved to CSV at {csv_path}")

Data saved to CSV at section_titles.csv
