In [1]:
# Scraping dependencies. lxml is required because the arXiv Atom feed is
# parsed with BeautifulSoup's 'xml' parser, which is backed by lxml.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install requests beautifulsoup4 lxml



In [2]:
import json
import os

# JSON file holding the IDs of papers that were already processed, so that a
# later run can skip them.
PROCESSED_IDS_FILE = "processed_papers.json"

def load_processed_ids():
    """Return the set of paper IDs stored in PROCESSED_IDS_FILE.

    Returns:
        set: the IDs read from the JSON file, or an empty set when the file
        does not exist yet.
    """
    # Guard clause: nothing has been recorded before the first run.
    if not os.path.exists(PROCESSED_IDS_FILE):
        return set()

    with open(PROCESSED_IDS_FILE, 'r') as handle:
        stored_ids = json.load(handle)
    return set(stored_ids)

def save_processed_ids(processed_ids):
    """Persist processed paper IDs to PROCESSED_IDS_FILE as a JSON list.

    The list is written to a temporary sibling file first and then moved over
    the real file with os.replace, so an interruption in the middle of a save
    cannot leave a truncated JSON file behind. IDs are written in sorted
    order so the file content is stable from run to run.

    Args:
        processed_ids: iterable of paper IDs to store.
    """
    tmp_path = f"{PROCESSED_IDS_FILE}.tmp"
    try:
        with open(tmp_path, 'w') as f:
            # key=str keeps the sort well-defined even for mixed ID types.
            json.dump(sorted(processed_ids, key=str), f)
        os.replace(tmp_path, PROCESSED_IDS_FILE)
    finally:
        # Only still present when the dump or the replace failed.
        if os.path.exists(tmp_path):
            os.remove(tmp_path)

In [3]:
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install regex



In [4]:
import os
import regex
import csv
import time
import requests
import tarfile
import logging
from bs4 import BeautifulSoup

# Configure logging: INFO-level root configuration plus the module logger
# that ArxivTikzProcessor uses for its error reports.
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Endpoint of the arXiv query API used by ArxivTikzProcessor.search_arxiv.
ARXIV_API = "http://export.arxiv.org/api/query"

class ArxivTikzProcessor:
    """Collect TikZ figures from the LaTeX sources of arXiv papers.

    The processor searches the arXiv API, downloads each paper's source
    archive into ``sources/``, extracts every outermost ``tikzpicture`` from
    the ``.tex`` members, writes each one as a standalone document under
    ``self.output_dir`` and records it through a CSV writer.
    """

    # Packages / TikZ libraries every generated document loads. Kept as
    # ordered tuples so the generated preamble is identical from run to run.
    ESSENTIAL_PACKAGES = ('amsmath', 'amssymb', 'tikz', 'xcolor')
    ESSENTIAL_LIBRARIES = ('arrows.meta', 'calc', 'positioning', 'scopes')

    # Seconds to wait for an HTTP response before giving up on a request.
    REQUEST_TIMEOUT = 60

    def __init__(self):
        """Create the output directory for the generated .tex files."""
        self.output_dir = "tikz_output"
        os.makedirs(self.output_dir, exist_ok=True)

    def search_arxiv(self, query, start=0, max_results=20000):
        """Search arXiv for papers matching the query.

        Args:
            query: arXiv API ``search_query`` expression.
            start: zero-based offset of the first result to return.
            max_results: maximum number of results requested.

        Returns:
            list[str]: paper identifiers (last path component of each entry
            id), newest submissions first. Empty when the request fails, so
            callers that stop on an empty page also stop on an error.
        """
        params = {
            "search_query": query,
            "start": start,
            "max_results": max_results,
            "sortBy": "submittedDate",
            "sortOrder": "descending"
        }
        try:
            response = requests.get(ARXIV_API, params=params, timeout=self.REQUEST_TIMEOUT)
            response.raise_for_status()
        except requests.RequestException as e:
            logger.error(f"arXiv search failed (start={start}): {e}")
            return []

        soup = BeautifulSoup(response.content, 'xml')
        # NOTE(review): keeping only the last path component drops the archive
        # prefix of old-style identifiers (e.g. "math/0301234") — confirm
        # whether such papers need to be supported.
        return [
            entry.id.text.split('/')[-1]
            for entry in soup.find_all('entry')
            if entry.id is not None
        ]

    def download_source(self, paper_id):
        """Download the source files of a paper from arXiv.

        Makes up to three attempts, pausing five seconds between them.

        Args:
            paper_id: arXiv identifier as returned by ``search_arxiv``.

        Returns:
            str | None: path of the saved archive under ``sources/``, or None
            when every attempt failed.
        """
        source_url = f"https://arxiv.org/e-print/{paper_id}"
        retries = 3

        for attempt in range(1, retries + 1):
            try:
                response = requests.get(source_url, timeout=self.REQUEST_TIMEOUT)
                if response.status_code == 200:
                    output_dir = "sources"
                    os.makedirs(output_dir, exist_ok=True)
                    tar_path = os.path.join(output_dir, f"{paper_id}.tar.gz")
                    with open(tar_path, 'wb') as f:
                        f.write(response.content)
                    return tar_path
                logger.warning(
                    f"Download of {paper_id} returned HTTP {response.status_code} "
                    f"(attempt {attempt}/{retries})"
                )
            except Exception as e:
                logger.error(f"Error downloading {paper_id}: {e}")
            # Back off before retrying; no point sleeping after the last try.
            if attempt < retries:
                time.sleep(5)

        return None

    @staticmethod
    def _split_names(raw):
        """Split a comma-separated LaTeX name list into stripped names."""
        # Drop "%..." line comments, which may sit inside multi-line lists.
        without_comments = regex.sub(r'%[^\n]*', '', raw)
        return [name.strip() for name in without_comments.split(',') if name.strip()]

    @staticmethod
    def _read_braced(text, open_idx):
        """Return the content of the brace group opening at text[open_idx].

        Nested groups are kept intact and backslash-escaped characters (such
        as ``\\{``) are not counted as braces. Returns None when the group is
        never closed.
        """
        depth = 0
        i = open_idx
        while i < len(text):
            ch = text[i]
            if ch == '\\':
                i += 2  # skip the escaped character as well
                continue
            if ch == '{':
                depth += 1
            elif ch == '}':
                depth -= 1
                if depth == 0:
                    return text[open_idx + 1:i]
            i += 1
        return None

    @classmethod
    def _merge_unique(cls, required, found):
        """Merge name lists, required names first, dropping duplicates.

        Entries may themselves be comma-separated lists; order of first
        appearance is preserved.
        """
        merged = []
        seen = set()
        for entry in list(required) + list(found):
            for name in cls._split_names(entry):
                if name not in seen:
                    seen.add(name)
                    merged.append(name)
        return merged

    def extract_tikz_metadata(self, content):
        """Extract LaTeX metadata from file content.

        Args:
            content: text of one .tex file.

        Returns:
            dict with three lists of strings, each in order of appearance:
            ``packages`` (names from ``\\usepackage``, with or without an
            option list; options themselves are not kept), ``libraries``
            (names from ``\\usetikzlibrary``) and ``captions`` (full
            brace-balanced ``\\caption`` bodies).
        """
        packages = []
        for raw in regex.findall(r'\\usepackage\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}', content):
            packages.extend(self._split_names(raw))

        libraries = []
        for raw in regex.findall(r'\\usetikzlibrary\s*\{([^}]+)\}', content):
            libraries.extend(self._split_names(raw))

        captions = []
        # Find each "\caption{" opener (optionally starred / with a short
        # caption), then read to the matching brace so that nested groups
        # like \textbf{...} do not cut the caption short.
        for opener in regex.finditer(r'\\caption\*?\s*(?:\[[^\]]*\])?\s*\{', content):
            body = self._read_braced(content, opener.end() - 1)
            if body is not None:
                captions.append(body)

        return {
            'packages': packages,
            'libraries': libraries,
            'captions': captions
        }

    def extract_tikz_code(self, content):
        """Extract outermost TikZ blocks with every tikzpicture turned into a scope.

        Each returned block spans one outermost, balanced
        ``\\begin{tikzpicture}`` ... ``\\end{tikzpicture}`` pair. Both the
        outer environment and any nested ones are rewritten to ``scope``
        environments, because ``create_compilable_tex`` wraps the block in a
        fresh ``tikzpicture`` and nested pictures must not remain inside it.

        Args:
            content: text of one .tex file.

        Returns:
            list[str]: converted blocks in order of appearance.
        """
        token_pattern = regex.compile(r'\\(begin|end)\{tikzpicture\}')

        # Pair each \end with the nearest unmatched \begin. An unmatched
        # \begin simply stays on the stack and does not hide the balanced
        # pictures that follow it.
        open_starts = []
        spans = []
        for token in token_pattern.finditer(content):
            if token.group(1) == 'begin':
                open_starts.append(token.start())
            elif open_starts:
                spans.append((open_starts.pop(), token.end()))

        blocks = []
        covered_until = -1
        for span_start, span_end in sorted(spans):
            if span_start < covered_until:
                continue  # nested inside the outermost block already taken
            covered_until = span_end
            code = content[span_start:span_end]
            code = code.replace('\\begin{tikzpicture}', '\\begin{scope}')
            code = code.replace('\\end{tikzpicture}', '\\end{scope}')
            blocks.append(code)

        return blocks

    def create_compilable_tex(self, tikz_code, packages, libraries, output_path):
        """Write a standalone LaTeX document around a TikZ block.

        Args:
            tikz_code: block as returned by ``extract_tikz_code``.
            packages: package names found in the source file.
            libraries: TikZ library names found in the source file.
            output_path: path of the .tex file to write (UTF-8).

        The essential packages/libraries are always loaded first, followed by
        the supplied ones in their given order without duplicates, so the
        same input always yields the same file.
        """
        all_packages = self._merge_unique(self.ESSENTIAL_PACKAGES, packages)
        all_libraries = self._merge_unique(self.ESSENTIAL_LIBRARIES, libraries)

        doc_header = (
            "\\documentclass[crop,tikz]{standalone}\n"
            + "\n".join([f"\\usepackage{{{pkg}}}" for pkg in all_packages])
            + f"\n\\usetikzlibrary{{{','.join(all_libraries)}}}"
            + "\n\\begin{document}\n"
        )

        # The extracted block consists of scopes, so it needs one enclosing
        # picture to compile.
        full_code = (
            "\\begin{tikzpicture}[every scope/.style={scale=1}]\n"
            + tikz_code
            + "\n\\end{tikzpicture}"
        )

        with open(output_path, 'w', encoding='utf-8') as f:
            f.write(doc_header + full_code + "\n\\end{document}")

    def process_tex_file(self, content, paper_id, csv_writer, start_index=0):
        """Process a single TeX file and its TikZ content.

        Writes one standalone .tex file and one CSV row per extracted block.
        The i-th caption found in the file is paired with the i-th block.

        Args:
            content: text of the .tex file.
            paper_id: identifier used as file-name prefix and CSV key.
            csv_writer: object with a ``writerow`` method.
            start_index: number of figures already written for this paper;
                figure numbering continues from here so that several .tex
                files of one paper do not overwrite each other's output.

        Returns:
            int: number of TikZ blocks found in the file.
        """
        metadata = self.extract_tikz_metadata(content)
        tikz_blocks = self.extract_tikz_code(content)

        for idx, block in enumerate(tikz_blocks):
            try:
                caption = metadata['captions'][idx] if idx < len(metadata['captions']) else ""
                # Clean caption: drop control words, keep their arguments.
                caption = regex.sub(r'\\[a-zA-Z]+', '', caption).strip()

                # Generate filename
                tex_filename = f"{paper_id}_figure_{start_index + idx + 1}.tex"
                tex_path = os.path.join(self.output_dir, tex_filename)

                # Create compilable document
                self.create_compilable_tex(
                    block,
                    metadata['packages'],
                    metadata['libraries'],
                    tex_path
                )

                # Write to CSV
                csv_writer.writerow([
                    paper_id,
                    tex_path,
                    caption,
                    ','.join(metadata['packages']),
                    ','.join(metadata['libraries'])
                ])

            except Exception as e:
                logger.error(f"Error processing block {idx} in {paper_id}: {str(e)}")

        return len(tikz_blocks)

    def process_paper(self, paper_id, csv_writer):
        """Process a single arXiv paper.

        Downloads the source archive and runs ``process_tex_file`` on every
        ``.tex`` member.

        Returns:
            int: number of TikZ blocks found, or 0 when the download or the
            archive handling failed (the error is logged).
        """
        try:
            tar_path = self.download_source(paper_id)
            if not tar_path:
                return 0

            total_figures = 0
            with tarfile.open(tar_path, 'r:gz') as tar:
                tex_files = [m for m in tar.getmembers() if m.name.endswith('.tex')]

                for tex_file in tex_files:
                    handle = tar.extractfile(tex_file)
                    if handle is None:
                        continue  # not a regular file (e.g. a directory)
                    # Sources are not guaranteed to be UTF-8; replace bad
                    # bytes instead of abandoning the whole paper.
                    content = handle.read().decode('utf-8', errors='replace')
                    total_figures += self.process_tex_file(
                        content, paper_id, csv_writer, start_index=total_figures
                    )

            return total_figures

        except Exception as e:
            logger.error(f"Error processing {paper_id}: {str(e)}")
            return 0

if __name__ == "__main__":
    # Driver: page through arXiv search results, extract TikZ figures from
    # every paper not handled by an earlier run, and stop once TARGET_FIGURES
    # figures were collected or the search returns no more results.
    processor = ArxivTikzProcessor()

    # Load previously processed papers
    processed_ids = load_processed_ids()
    new_processed_ids = set()

    with open('tikz_collection.csv', 'a', newline='', encoding='utf-8') as csvfile:
        writer = csv.writer(csvfile)
        # Write header if new file
        if os.stat('tikz_collection.csv').st_size == 0:
            writer.writerow(['source_id', 'code_path', 'caption', 'used_packages', 'used_libraries'])

        # Example search query
        query = "all:(tikz OR 'LaTeX diagram' OR 'LaTeX figure' OR pgfplots OR 'code illustration')"
        total_figures = 0
        start = 0
        BATCH_SIZE = 2000  # Max allowed by arXiv API
        TARGET_FIGURES = 10

        try:
            while total_figures < TARGET_FIGURES:
                paper_ids = processor.search_arxiv(query, start=start, max_results=BATCH_SIZE)
                if not paper_ids:
                    break

                # Process each paper in batch
                for pid in paper_ids:
                    if pid in processed_ids:
                        print(f"Skipping already processed paper: {pid}")
                        continue

                    figures = processor.process_paper(pid, writer)
                    total_figures += figures
                    new_processed_ids.add(pid)
                    # Push the rows of this paper to disk right away.
                    csvfile.flush()

                    # Early exit if target reached
                    if total_figures >= TARGET_FIGURES:
                        break

                    # Rate limiting
                    time.sleep(1.5)  # Reduced from 3s for faster processing

                # Update pagination
                start += len(paper_ids)
                save_processed_ids(processed_ids.union(new_processed_ids))

                # Pause between consecutive search requests; without it a
                # batch consisting only of skipped papers would trigger the
                # next API call immediately.
                if total_figures < TARGET_FIGURES:
                    time.sleep(3)
        finally:
            # Record progress even when the run is interrupted or fails
            # part-way through a batch.
            save_processed_ids(processed_ids.union(new_processed_ids))

In [None]:
# Bundle the generated .tex files in tikz_output/ into a single archive.
!zip -r arvixv2.zip tikz_output

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  adding: tikz_output/2102.00276v1_figure_617.tex (deflated 65%)
  adding: tikz_output/1512.08296v3_figure_14.tex (deflated 52%)
  adding: tikz_output/2102.00276v1_figure_13.tex (deflated 45%)
  adding: tikz_output/2106.01855v1_figure_41.tex (deflated 52%)
  adding: tikz_output/2007.15675v3_figure_52.tex (deflated 67%)
  adding: tikz_output/1512.08296v3_figure_107.tex (deflated 50%)
  adding: tikz_output/2106.14211v4_figure_36.tex (deflated 66%)
  adding: tikz_output/1709.06005v2_figure_3.tex (deflated 56%)
  adding: tikz_output/1608.04259v2_figure_53.tex (deflated 57%)
  adding: tikz_output/1911.04630v3_figure_41.tex (deflated 52%)
  adding: tikz_output/1709.06005v2_figure_149.tex (deflated 54%)
  adding: tikz_output/1702.08108v3_figure_165.tex (deflated 52%)
  adding: tikz_output/1308.5150v2_figure_150.tex (deflated 72%)
  adding: tikz_output/2102.00276v1_figure_174.tex (deflated 63%)
  adding: tikz_output/1306.0074v4_f

In [None]:
from google.colab import files

# Download the CSV index of the collected figures.
# NOTE(review): the collection cell writes 'tikz_collection.csv' (no "v2")
# and the zip cell creates 'arvixv2.zip' — confirm that the file named below
# actually exists before running this cell.
files.download('tikz_collectionv2.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>