In [None]:
import csv
import hashlib
import html
import os
import re
import time

import requests
from bs4 import BeautifulSoup

# Configure API parameters
API_URL = "https://api.stackexchange.com/2.3/questions"
PARAMS = {
    "order": "desc",
    "sort": "activity",
    "tagged": "tikz-pgf",
    "site": "tex",
    "pagesize": 100,
    "filter": "withbody",
    # The application key is read from the environment rather than being
    # committed with the notebook. When STACKEXCHANGE_API_KEY is unset (or
    # empty) the value is None; `requests` leaves None-valued params out of
    # the query string, so the crawl then runs without a key.
    # NOTE(review): keyless requests are presumably subject to a smaller
    # daily quota - export the variable before long crawls.
    "key": os.environ.get("STACKEXCHANGE_API_KEY") or None,
}

# Column order of every row written to the dataset CSV
CSV_COLUMNS = [
    'code', 'caption', 'source_url',
    'libraries', 'packages',
    'question_id', 'answer_id',
    'latex_file_path',  # location of the standalone .tex file written for the row
]

# Dependencies merged into every generated LaTeX document
ESSENTIAL_PACKAGES = {'tikz', 'xcolor', 'amsmath', 'pgfplots'}
ESSENTIAL_LIBRARIES = {'arrows.meta', 'calc', 'positioning'}

# Output layout: generated .tex files live in <OUTPUT_DIR>/code
OUTPUT_DIR = "tikz_data"
CODE_DIR = os.path.join(OUTPUT_DIR, "code")
os.makedirs(CODE_DIR, exist_ok=True)  # idempotent: safe to re-run the cell

def _split_dependency_names(entries):
    """Flatten dependency entries into a sorted list of clean, unique names.

    Each entry may itself be a comma-separated list (e.g. "amsmath, amssymb").
    Whitespace around names is stripped and empty fragments are dropped.
    """
    unique = set()
    for entry in entries:
        for name in entry.split(','):
            name = name.strip()
            if name:
                unique.add(name)
    return sorted(unique)


def create_compilable_latex(tikz_code, packages, libraries):
    """Wrap a TikZ snippet in a complete ``standalone`` LaTeX document.

    Args:
        tikz_code: Content to place between ``\\begin{tikzpicture}`` and
            ``\\end{tikzpicture}`` (the environment markers are added here).
        packages: Iterable of package names to load in addition to
            ESSENTIAL_PACKAGES.
        libraries: Iterable of TikZ library names to load in addition to
            ESSENTIAL_LIBRARIES.

    Returns:
        The document source as a single string.
    """
    # Normalise and sort the names: iterating a set directly yields an
    # arbitrary order, which made the generated file differ between runs, and
    # un-stripped names such as " calc" were treated as distinct from "calc".
    all_packages = _split_dependency_names(ESSENTIAL_PACKAGES.union(packages))
    all_libraries = _split_dependency_names(ESSENTIAL_LIBRARIES.union(libraries))

    # One \usepackage line per package
    package_block = "\n".join(f"\\usepackage{{{pkg}}}" for pkg in all_packages)

    # A single \usetikzlibrary line carrying every library
    library_line = (
        f"\\usetikzlibrary{{{','.join(all_libraries)}}}" if all_libraries else ""
    )

    return f"""\\documentclass[crop,tikz]{{standalone}}
{package_block}
{library_line}
\\begin{{document}}
\\begin{{tikzpicture}}
{tikz_code}
\\end{{tikzpicture}}
\\end{{document}}"""


def save_latex_file(tikz_code, question_id, answer_id, packages, libraries):
    """Write a TikZ snippet to CODE_DIR as a standalone LaTeX document.

    Args:
        tikz_code: Snippet placed inside the generated tikzpicture environment.
        question_id: Question identifier used in the file name.
        answer_id: Answer identifier used in the file name.
        packages: Package names forwarded to create_compilable_latex.
        libraries: TikZ library names forwarded to create_compilable_latex.

    Returns:
        Path of the written file, or None when building or writing the
        document failed (the error is printed, not raised).
    """
    try:
        # The built-in hash() of a str is salted per interpreter process, so it
        # gave the same snippet a different file name on every run. A content
        # digest keeps names stable, and re-running a crawl overwrites files
        # instead of piling up duplicates.
        digest = hashlib.sha256(tikz_code.encode('utf-8')).hexdigest()
        snippet_id = int(digest, 16) % 1000000
        filename = f"tikz_{question_id}_{answer_id}_{snippet_id}.tex"
        filepath = os.path.join(CODE_DIR, filename)

        full_doc = create_compilable_latex(tikz_code, packages, libraries)

        with open(filepath, 'w', encoding='utf-8') as f:
            f.write(full_doc)

        return filepath

    except Exception as e:
        # Best effort: one bad snippet must not abort the whole crawl
        print(f"Error saving LaTeX file: {str(e)}")
        return None

def extract_tikz_code(body):
    """Extract the contents of every ``tikzpicture`` environment in ``body``.

    Args:
        body: Answer body text. HTML character references (``&gt;``,
            ``&amp;``, ...) are decoded first so that the returned LaTeX
            contains the literal characters.

    Returns:
        List of snippet strings (text between ``\\begin{tikzpicture}`` and
        ``\\end{tikzpicture}``), stripped of surrounding whitespace. Empty
        environments are omitted.
    """
    if not body:
        return []

    text = html.unescape(body)
    blocks = re.findall(
        r'\\begin\{tikzpicture\}(.*?)\\end\{tikzpicture\}',
        text,
        re.DOTALL
    )
    # The previous filter required the substring "tikz" inside the captured
    # body, which excludes the environment markers themselves, so ordinary
    # pictures (plain \draw / \node code) were discarded. Only empty blocks
    # are dropped now.
    stripped = (block.strip() for block in blocks)
    return [block for block in stripped if block]


def _parse_name_list(raw):
    """Split the argument of a \\usepackage / \\usetikzlibrary into clean names."""
    # Drop LaTeX line comments ("%" up to end of line, unless escaped) so that
    # lists written one name per line with trailing "%" parse correctly.
    without_comments = re.sub(r'(?<!\\)%[^\n]*', '', raw)
    names = (name.strip() for name in without_comments.split(','))
    return {name for name in names if name}


def extract_dependencies(body):
    """Collect the LaTeX packages and TikZ libraries referenced in ``body``.

    Args:
        body: Text to scan for ``\\usepackage[...]{...}`` and
            ``\\usetikzlibrary{...}`` declarations.

    Returns:
        Tuple ``(packages, libraries)`` of sets of names. Comma-separated
        declarations are split into individual names with whitespace
        stripped, so " calc" and "calc" are not reported as two libraries.
    """
    packages = set()
    libraries = set()

    # Optional [options] are skipped; only the braced name list is captured
    for pkg_group in re.findall(r'\\usepackage\s*(?:\[[^\]]*\])?\s*\{([^}]+)\}', body):
        packages.update(_parse_name_list(pkg_group))

    # A single declaration may carry several libraries
    for lib_group in re.findall(r'\\usetikzlibrary\s*\{([^}]+)\}', body):
        libraries.update(_parse_name_list(lib_group))

    return packages, libraries

def process_questions(output_file, start_page=1):
    """Crawl the tagged questions page by page and record their TikZ snippets.

    Args:
        output_file: Path of the CSV to write. The file is opened in write
            mode, so an existing file at that path is replaced even when
            ``start_page`` is greater than 1.
        start_page: Page number of the first API request.

    The loop ends when the API reports no further pages, on a status other
    than 200/429, when ``quota_remaining`` falls below 10, or after
    ``max_retries`` consecutive failed attempts.
    """
    request_timeout = 30      # seconds; a stalled connection must not hang the crawl
    default_retry_after = 60  # seconds to wait on 429 when no usable header is sent

    with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=CSV_COLUMNS)
        writer.writeheader()

        page = start_page
        has_more = True
        retries = 0
        max_retries = 5

        while has_more and retries < max_retries:
            try:
                # Build the query per request instead of writing the page
                # number into the shared module-level PARAMS dict.
                query = {**PARAMS, 'page': page}
                response = requests.get(API_URL, params=query, timeout=request_timeout)

                if response.status_code == 429:
                    # Retry-After may be absent or not a plain integer
                    try:
                        retry_after = int(response.headers.get('Retry-After', default_retry_after))
                    except ValueError:
                        retry_after = default_retry_after
                    print(f"Rate limited. Retrying after {retry_after} seconds")
                    time.sleep(retry_after)
                    retries += 1
                    continue

                if response.status_code != 200:
                    print(f"Error: {response.status_code}")
                    break

                data = response.json()
                has_more = data.get('has_more', False)

                # Check remaining quota
                quota_remaining = data.get('quota_remaining', 1)
                if quota_remaining < 10:
                    print(f"Daily quota exhausted. Remaining: {quota_remaining}")
                    break

                # Process questions
                for question in data.get('items', []):
                    process_question(question, writer)
                    time.sleep(1)  # delay between questions

                # Honour the API's backoff hint (plus one second of slack)
                backoff = data.get('backoff', 1)
                print(f"Backing off for {backoff} seconds")
                time.sleep(backoff + 1)

                # Report the page that was just finished, then advance.
                # (Previously the counter was incremented first, so the log
                # named the *next* page.)
                print(f"Processed page {page}")
                page += 1
                retries = 0  # reset retry counter on success

            except (requests.exceptions.RequestException, ValueError) as e:
                # ValueError covers an undecodable JSON body
                print(f"Request failed: {str(e)}")
                retries += 1
                time.sleep(min(2 ** retries, 60))  # exponential backoff, capped

        if retries >= max_retries:
            print(f"Giving up on page {page} after {max_retries} failed attempts")

def process_question(question, writer):
    """Fetch the answers of one question and process each of them.

    Args:
        question: Question item from the API; ``question_id`` is required.
        writer: ``csv.DictWriter`` passed through to process_answer.

    Any failure (network error, bad status, malformed item) is printed and
    swallowed so that the crawl continues with the next question.
    """
    try:
        answers_url = f"{API_URL}/{question['question_id']}/answers"
        response = requests.get(
            answers_url,
            params={
                "site": "tex",
                "filter": "withbody",
                "key": PARAMS["key"]
            },
            timeout=30,  # seconds; avoid hanging forever on a stalled connection
        )

        if response.status_code == 429:
            raise requests.exceptions.HTTPError("Rate limit exceeded")
        if response.status_code != 200:
            # Previously an error response was silently read as "no answers"
            raise requests.exceptions.HTTPError(
                f"Unexpected status {response.status_code}"
            )

        payload = response.json()
        for answer in payload.get('items', []):
            process_answer(question, answer, writer)
            time.sleep(0.5)  # delay between answers

        # The response wrapper may ask clients to pause before calling this
        # method again; respect it when present.
        backoff = payload.get('backoff')
        if backoff:
            time.sleep(backoff)

    except Exception as e:
        print(f"Error processing question: {str(e)}")


def process_answer(question, answer, writer):
    """Extract every TikZ snippet from one answer, save it and log a CSV row.

    Args:
        question: Question item; ``title`` and ``question_id`` are read.
        answer: Answer item; ``body``, ``answer_id`` and ``link`` are read.
        writer: ``csv.DictWriter`` configured with CSV_COLUMNS.

    For each non-empty ``tikzpicture`` body a standalone .tex file is written
    via save_latex_file. Snippets whose file could not be written are skipped.
    """
    # Decode HTML character references (&gt;, &amp;, ...) so the stored
    # snippets contain real LaTeX characters such as "->" and "&".
    body = html.unescape(answer.get('body') or '')

    # Dependencies are collected from the entire answer body
    packages, libraries = extract_dependencies(body)

    # Extract TikZ code blocks
    tikz_blocks = re.findall(r'\\begin{tikzpicture}(.*?)\\end{tikzpicture}', body, re.DOTALL)

    # Fall back to the site's short answer URL when the item carries no link
    answer_id = answer.get('answer_id')
    source_url = answer.get('link') or (
        f"https://tex.stackexchange.com/a/{answer_id}" if answer_id is not None else ''
    )

    for code in tikz_blocks:
        code = code.strip()
        if not code:
            continue

        file_path = save_latex_file(
            code,
            question.get('question_id', 'unknown'),
            answer.get('answer_id', 'unknown'),
            packages,
            libraries
        )

        if not file_path:
            continue

        writer.writerow({
            'code': code,
            'caption': question.get('title', ''),
            'source_url': source_url,
            # Sorted so the CSV does not depend on set iteration order
            'libraries': ', '.join(sorted(libraries)),
            'packages': ', '.join(sorted(packages)),
            'question_id': question.get('question_id', ''),
            'answer_id': answer.get('answer_id', ''),
            'latex_file_path': file_path
        })

if __name__ == "__main__":
    # The CSV index is written into OUTPUT_DIR; the crawl starts at API page 108
    output_csv = os.path.join(OUTPUT_DIR, "tex_stackexchange_tikz.csv")
    process_questions(output_file=output_csv, start_page=108)
    print(f"Data saved to {output_csv} and LaTeX files in {CODE_DIR}")

Backing off for 1 seconds
Processed page 109
Backing off for 1 seconds
Processed page 110
Backing off for 1 seconds
Processed page 111
Backing off for 1 seconds
Processed page 112
Backing off for 1 seconds
Processed page 113
Backing off for 1 seconds
Processed page 114
Backing off for 1 seconds
Processed page 115
Backing off for 1 seconds
Processed page 116
Backing off for 1 seconds
Processed page 117
Backing off for 1 seconds
Processed page 118
Backing off for 1 seconds
Processed page 119
Backing off for 1 seconds
Processed page 120
Backing off for 1 seconds
Processed page 121
Backing off for 1 seconds
Processed page 122
Backing off for 1 seconds
Processed page 123
Backing off for 1 seconds
Processed page 124
Backing off for 1 seconds
Processed page 125
Backing off for 1 seconds
Processed page 126
Backing off for 1 seconds
Processed page 127
Backing off for 1 seconds
Processed page 128
Backing off for 1 seconds
Processed page 129
Backing off for 1 seconds
Processed page 130
Backing of

In [None]:
!zip -r stackexchange2.zip '/content/tikz_data'

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
  adding: content/tikz_data/code/tikz_595157_595194_998810.tex (deflated 61%)
  adding: content/tikz_data/code/tikz_572051_572056_837236.tex (deflated 50%)
  adding: content/tikz_data/code/tikz_593437_593874_68474.tex (deflated 46%)
  adding: content/tikz_data/code/tikz_633668_633724_123003.tex (deflated 45%)
  adding: content/tikz_data/code/tikz_621640_621656_953602.tex (deflated 70%)
  adding: content/tikz_data/code/tikz_602869_602874_420770.tex (deflated 49%)
  adding: content/tikz_data/code/tikz_268830_269029_996117.tex (deflated 39%)
  adding: content/tikz_data/code/tikz_449911_450439_110531.tex (deflated 68%)
  adding: content/tikz_data/code/tikz_654451_654488_258234.tex (deflated 54%)
  adding: content/tikz_data/code/tikz_625993_626000_805869.tex (deflated 42%)
  adding: content/tikz_data/code/tikz_621321_621358_621688.tex (deflated 65%)
  adding: content/tikz_data/code/tikz_617805_617920_3777.tex (deflated 59%)
  

In [None]:
# Colab-only helper module; this import fails outside Google Colab
from google.colab import files

# Start a browser download of the stackexchange2.zip archive
files.download('stackexchange2.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>