In [None]:
!pip install -r requirements.txt

Collecting pygithub (from -r requirements.txt (line 4))
  Downloading PyGithub-2.6.1-py3-none-any.whl.metadata (3.9 kB)
Collecting pynacl>=1.4.0 (from pygithub->-r requirements.txt (line 4))
  Downloading PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl.metadata (8.6 kB)
Downloading PyGithub-2.6.1-py3-none-any.whl (410 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m410.5/410.5 kB[0m [31m9.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading PyNaCl-1.5.0-cp36-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.manylinux_2_24_x86_64.whl (856 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m856.7/856.7 kB[0m [31m33.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pynacl, pygithub
Successfully installed pygithub-2.6.1 pynacl-1.5.0


In [None]:
import os
from getpass import getpass

# Never store the token in the notebook itself: reuse GITHUB_TOKEN when the
# environment already provides it, otherwise prompt for it without echoing.
if not os.environ.get("GITHUB_TOKEN"):
    os.environ["GITHUB_TOKEN"] = getpass("GitHub personal access token: ")

In [None]:
#!/usr/bin/env python3
"""
GitHub TikZ Code Crawler WITH RANDOMNESS

This script systematically crawls GitHub repositories to extract TikZ code,
corresponding images, and captions. It uses the GitHub API to search for
repositories containing TikZ code and then processes the results.
"""

import base64
import hashlib
import json
import logging
import os
import random
import re
import time
from collections import deque
from io import BytesIO
from urllib.parse import urlparse

import pandas as pd
import requests
from github import Github
from github.GithubException import RateLimitExceededException, UnknownObjectException
from PIL import Image

# Configure logging: every record goes both to tikz_crawler.log and to the
# cell output.
# force=True is required inside notebooks: the host (Jupyter/Colab) has usually
# installed a root handler already, and without it basicConfig() silently does
# nothing, so the level, format and file handler below would never apply.
# It also makes re-running this cell safe (old handlers are closed and replaced).
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler("tikz_crawler.log"),
        logging.StreamHandler()
    ],
    force=True,
)
logger = logging.getLogger(__name__)

class GitHubTikZCrawler:
    """
    Crawl GitHub repositories for TikZ pictures and persist them as a dataset.

    A crawl runs one randomized repository search, walks every repository that
    has not been processed before, extracts each ``tikzpicture`` environment
    from its LaTeX files, writes every picture as a standalone LaTeX document
    under ``<output_dir>/code`` and finally merges all collected records into
    ``tikz_dataset.csv`` and ``tikz_dataset.json``.
    """

    # Free-text search terms; one is drawn at random per search to vary results.
    SEARCH_KEYWORDS = (
        "TikZ graphics",
        "TikZ flowchart",
        "TikZ scientific figure",
        "LaTeX diagrams",
        "pgfplots examples",
        "TikZ neural network",
        "TikZ machine learning",
        "TikZ graph theory",
        "TikZ tree diagram",
        "TikZ data visualization",
        "TikZ publication-ready figures",
        "\\begin{tikzpicture}",
        "\\usetikzlibrary",
    )
    # Optional topic filter; None means "do not restrict by topic". Only one is
    # applied per query because search qualifiers are ANDed together.
    SEARCH_TOPICS = (None, "latex", "visualization")
    # Sort fields accepted by the repository search endpoint.
    SORT_ORDERS = ("stars", "forks", "updated")
    # File suffixes inspected for TikZ code and the markers that identify it.
    TIKZ_EXTENSIONS = (".tex", ".tikz")
    TIKZ_MARKERS = ('\\begin{tikzpicture}', '\\usetikzlibrary')
    # Columns that identify one extracted picture; used to de-duplicate both
    # the CSV and the JSON dataset.
    DEDUP_KEYS = ("repo_name", "file_path", "code")
    # Upper bound for the readable part of generated file names, so deep
    # repository paths cannot exceed the file system's name limit.
    MAX_FILENAME_STEM = 150

    _TIKZ_BLOCK_RE = re.compile(r'\\begin{tikzpicture}(.*?)\\end{tikzpicture}', re.DOTALL)
    _CAPTION_RE = re.compile(r'\\caption{(.*?)}', re.DOTALL)
    _FIGURE_BEGIN_RE = re.compile(r'\\begin\{figure\*?\}')
    _FIGURE_END_RE = re.compile(r'\\end\{figure\*?\}')

    def __init__(self, github_token, output_dir="tikz_data"):
        """
        Initialize the GitHub TikZ crawler.

        Args:
            github_token (str): GitHub API token for authentication
            output_dir (str): Directory to save extracted data
        """
        self.github_token = github_token
        self.output_dir = output_dir
        self.g = Github(github_token)
        self.tikz_data = []

        # Create output directories
        os.makedirs(output_dir, exist_ok=True)
        os.makedirs(os.path.join(output_dir, "code"), exist_ok=True)

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    def _seconds_until_reset(self, limits):
        """Return the seconds until ``limits`` resets, plus a 10 s safety margin."""
        return limits.reset.timestamp() - time.time() + 10

    def _wait_for_rate_limit(self, resource, threshold=10):
        """
        Sleep until the rate limit resets when fewer than ``threshold`` calls remain.

        Args:
            resource (str): Attribute of the rate-limit object to inspect
                ("search" or "core").
            threshold (int): Remaining-call count below which to pause.
        """
        limits = getattr(self.g.get_rate_limit(), resource)
        if limits.remaining >= threshold:
            return
        sleep_time = self._seconds_until_reset(limits)
        logger.info(f"Approaching rate limit, sleeping for {sleep_time} seconds")
        time.sleep(max(1, sleep_time))

    def _build_search_query(self):
        """
        Assemble one randomized repository-search query.

        Only repository-search qualifiers are used. ``extension:`` belongs to
        code search, and requiring several topics at once leaves almost nothing
        to match, so neither is emitted. The sort field is returned separately
        because it is passed to the API as a parameter rather than in the query.

        Returns:
            tuple: ``(query, sort_by)``
        """
        keyword = random.choice(self.SEARCH_KEYWORDS)
        min_stars = random.randint(1, 100)
        sort_by = random.choice(self.SORT_ORDERS)
        topic = random.choice(self.SEARCH_TOPICS)

        parts = [keyword, "language:tex", f"stars:>={min_stars}", "size:<50000"]
        if topic:
            parts.append(f"topic:{topic}")
        return " ".join(parts), sort_by

    @staticmethod
    def _decode_content(file_content):
        """Decode the base64 payload of a GitHub content object to text."""
        return base64.b64decode(file_content.content).decode('utf-8', errors='ignore')

    def _find_caption(self, file_data, start, end):
        """
        Return the cleaned caption of the figure enclosing ``file_data[start:end]``.

        The caption is looked up only inside the ``figure``/``figure*``
        environment that surrounds the picture: first after the picture, then
        before it. A picture outside any figure environment has no caption.

        Returns:
            str: Caption text with LaTeX commands stripped, or "" if none.
        """
        figure_begin = None
        for figure_begin in self._FIGURE_BEGIN_RE.finditer(file_data, 0, start):
            pass  # keep the last \begin{figure} that precedes the picture
        if figure_begin is None:
            return ""

        figure_end = self._FIGURE_END_RE.search(file_data, figure_begin.end())
        if figure_end is None or figure_end.start() < end:
            # The nearest figure was closed before this picture ended.
            return ""

        found = (
            self._CAPTION_RE.search(file_data, end, figure_end.start())
            or self._CAPTION_RE.search(file_data, figure_begin.end(), start)
        )
        if not found:
            return ""

        # Clean up the caption (remove LaTeX commands)
        return re.sub(r'\\[a-zA-Z]+(\[[^\]]*\])?(\{[^}]*\})?', '', found.group(1)).strip()

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def search_repositories(self, max_repos=1):
        """
        Search for repositories containing TikZ code with added randomness.

        Args:
            max_repos (int): Maximum number of repositories to process

        Returns:
            list: List of repository objects (possibly partial if the search
            rate limit was hit; empty when ``max_repos`` is not positive)
        """
        logger.info(f"Searching for repositories with randomness")
        repositories = []
        if max_repos <= 0:
            return repositories

        try:
            query, sort_by = self._build_search_query()
            logger.info(f"Repository search query: {query!r} (sort: {sort_by})")
            results = self.g.search_repositories(query=query, sort=sort_by)

            for repo in results:
                repositories.append(repo)
                if len(repositories) >= max_repos:
                    break

                # Check if approaching rate limit
                self._wait_for_rate_limit("search")

            logger.info(f"Found {len(repositories)} repositories")
            return repositories

        except RateLimitExceededException:
            logger.warning("Rate limit exceeded, waiting for reset")
            sleep_time = self._seconds_until_reset(self.g.get_rate_limit().search)
            time.sleep(max(1, sleep_time))
            return repositories

    def find_tikz_files(self, repo, max_files=50):
        """
        Find files containing TikZ code in a repository.

        The repository tree is walked breadth-first. If the walk fails part-way
        (for example on a rate-limit error) the files found so far are still
        returned instead of being thrown away.

        Args:
            repo: GitHub repository object
            max_files (int): Maximum number of TikZ files to collect

        Returns:
            list: List of content file objects
        """
        logger.info(f"Searching for TikZ files in {repo.full_name}")
        tikz_files = []
        try:
            # Queue of entries still to inspect, seeded with the repository root.
            pending = deque(repo.get_contents(""))

            while pending and len(tikz_files) < max_files:
                entry = pending.popleft()

                if entry.type == "dir":
                    # Queue the directory's entries for later inspection.
                    try:
                        pending.extend(repo.get_contents(entry.path))
                    except Exception as e:
                        logger.warning(f"Could not access directory {entry.path}: {str(e)}")
                elif entry.name.endswith(self.TIKZ_EXTENSIONS):
                    # Check if file contains TikZ code
                    try:
                        file_data = self._decode_content(entry)
                        if any(marker in file_data for marker in self.TIKZ_MARKERS):
                            tikz_files.append(entry)
                    except Exception as e:
                        logger.warning(f"Could not decode file {entry.path}: {str(e)}")

                # Check if approaching rate limit
                self._wait_for_rate_limit("core")

        except Exception as e:
            logger.error(f"Error processing repository {repo.full_name}: {str(e)}")

        logger.info(f"Found {len(tikz_files)} TikZ files in {repo.full_name}")
        return tikz_files

    def extract_tikz_code(self, file_content):
        """
        Extract TikZ code blocks from a file.

        Each ``tikzpicture`` environment becomes one record. The caption is
        taken from the figure environment enclosing that particular picture;
        libraries and packages are collected once for the whole file.

        Args:
            file_content: GitHub file content object

        Returns:
            list: List of dictionaries containing TikZ code blocks and metadata
            (empty if the file cannot be processed)
        """
        try:
            file_data = self._decode_content(file_content)
            repo = file_content.repository

            # File-level metadata is the same for every picture: collect it once.
            libraries = re.findall(r'\\usetikzlibrary{(.*?)}', file_data)
            used_packages = re.findall(r'\\usepackage{(.*?)}', file_data)

            results = []
            for picture in self._TIKZ_BLOCK_RE.finditer(file_data):
                results.append({
                    # group(0) is the complete environment including its
                    # \begin/\end delimiters.
                    'code': picture.group(0),
                    'caption': self._find_caption(file_data, picture.start(), picture.end()),
                    # Copies, so that editing one record never affects another.
                    'libraries': list(libraries),
                    'repo_name': repo.full_name,
                    'file_path': file_content.path,
                    'file_url': file_content.html_url,
                    'raw_url': file_content.download_url,
                    'used_packages': list(used_packages),
                })

            return results

        except Exception as e:
            logger.error(f"Error extracting TikZ code from {file_content.path}: {str(e)}")
            return []

    def save_tikz_code(self, tikz_data):
        """
        Save one TikZ picture as a standalone, compilable LaTeX document.

        The file name combines the repository name, the full in-repository path
        and a short hash of the picture's code, so several pictures from one
        file, or equally named files in different directories, no longer
        overwrite each other. On success the path is stored in
        ``tikz_data['local_complete_code_path']``.

        Args:
            tikz_data (dict): TikZ code data

        Returns:
            bool: True if the document was written, False otherwise
        """
        try:
            # Generate a unique, file-system-safe filename
            repo_name_safe = tikz_data['repo_name'].replace('/', '_')
            path_stem = os.path.splitext(tikz_data['file_path'])[0]
            file_path_safe = path_stem.replace('/', '_')
            digest = hashlib.sha256(tikz_data['code'].encode('utf-8')).hexdigest()[:10]
            stem = f"{repo_name_safe}_{file_path_safe}"[:self.MAX_FILENAME_STEM]
            code_filename = f"{stem}_{digest}.tex"

            # Create a complete LaTeX document for compilation
            libraries = ','.join(tikz_data.get('libraries', []))
            used_packages = tikz_data.get('used_packages', [])

            lines = [
                '\\documentclass[crop, tikz]{standalone}',
                '\\usepackage{tikz}',
            ]
            if libraries:
                lines.append(f'\\usetikzlibrary{{{libraries}}}')
            lines.extend(f'\\usepackage{{{package}}}' for package in used_packages)
            lines.extend(['\\begin{document}', tikz_data['code'], '\\end{document}'])
            document = '\n'.join(lines)

            # Save the complete document
            complete_code_path = os.path.join(self.output_dir, "code", f"complete_{code_filename}")
            with open(complete_code_path, 'w', encoding='utf-8') as f:
                f.write(document)

            # Update TikZ data
            tikz_data['local_complete_code_path'] = complete_code_path

            logger.info(f"Successfully saved TikZ code to {complete_code_path}")
            return True

        except Exception as e:
            logger.error(f"Error saving TikZ code: {str(e)}")
            return False

    def crawl(self, max_repos=3, max_files_per_repo=100):
        """
        Crawl GitHub repositories for TikZ code.

        Repositories listed in ``<output_dir>/processed_repos.txt`` are skipped;
        every repository handled without an exception is appended to that file.
        Extracted records accumulate in ``self.tikz_data`` and the dataset files
        are rewritten at the end of each call.

        Args:
            max_repos (int): Maximum number of repositories to process
            max_files_per_repo (int): Maximum number of files to process per repository
        """
        logger.info("Starting TikZ crawler")

        # Load previously processed repository names (if available)
        processed_repos_path = os.path.join(self.output_dir, "processed_repos.txt")
        processed_repos = set()
        if os.path.exists(processed_repos_path):
            with open(processed_repos_path, 'r', encoding='utf-8') as f:
                processed_repos = set(f.read().splitlines())

        # Search for new repositories with a randomized query.
        repositories = self.search_repositories(max_repos=max_repos)

        for repo in repositories:
            if repo.full_name in processed_repos:
                logger.info(f"Skipping already processed repository: {repo.full_name}")
                continue

            try:
                # Find TikZ files in the repository.
                tikz_files = self.find_tikz_files(repo, max_files=max_files_per_repo)

                for file_content in tikz_files:
                    # Extract TikZ code blocks from each file.
                    for tikz_block in self.extract_tikz_code(file_content):
                        # The record is kept even if writing the .tex file failed.
                        self.save_tikz_code(tikz_block)
                        self.tikz_data.append(tikz_block)

                # Mark repository as processed.
                with open(processed_repos_path, 'a', encoding='utf-8') as f:
                    f.write(repo.full_name + '\n')
                processed_repos.add(repo.full_name)

            except Exception as e:
                logger.error(f"Error processing repository {repo.full_name}: {str(e)}")

        # Save the dataset after crawling is complete.
        self.save_dataset()

        logger.info(f"Crawling complete. Extracted {len(self.tikz_data)} TikZ code blocks.")

    def save_dataset(self):
        """
        Save the dataset to a CSV file and a JSON file, appending new data if the file exists
        and ensuring no duplicates are added.

        Records are considered duplicates when they agree on ``DEDUP_KEYS``
        (repository, file path and code); the first occurrence is kept. When
        nothing has been collected the existing files are left untouched, so
        no empty, unreadable CSV is ever written.
        """
        if not self.tikz_data:
            logger.info("No TikZ data collected; dataset files left unchanged")
            return

        try:
            # Paths for CSV and JSON files
            csv_path = os.path.join(self.output_dir, "tikz_dataset.csv")
            json_path = os.path.join(self.output_dir, "tikz_dataset.json")
            dedup_keys = list(self.DEDUP_KEYS)

            # Handle CSV file: existing rows first so they win on duplicates.
            frames = []
            if os.path.exists(csv_path):
                frames.append(pd.read_csv(csv_path))
            frames.append(pd.DataFrame(self.tikz_data))
            df_combined = pd.concat(frames, ignore_index=True).drop_duplicates(subset=dedup_keys)
            df_combined.to_csv(csv_path, index=False)

            # Handle JSON file
            existing_data = []
            if os.path.exists(json_path):
                with open(json_path, 'r', encoding='utf-8') as f:
                    existing_data = json.load(f)

            # Records contain lists (libraries, packages) and are therefore not
            # hashable as a whole; key them on the identifying string fields.
            unique_records = {}
            for record in existing_data + self.tikz_data:
                key = tuple(record.get(name) for name in dedup_keys)
                unique_records.setdefault(key, record)

            with open(json_path, 'w', encoding='utf-8') as f:
                json.dump(list(unique_records.values()), f, indent=2)

            logger.info(f"Dataset saved to {csv_path} and {json_path}")

        except Exception as e:
            logger.error(f"Error saving dataset: {str(e)}")

def main():
    """
    Entry point: run ten consecutive crawl rounds with one crawler instance.

    The GitHub token is read from the GITHUB_TOKEN environment variable. When
    it is missing or empty an error is logged and nothing else happens.
    """
    token = os.environ.get("GITHUB_TOKEN")
    if token:
        tikz_crawler = GitHubTikZCrawler(token)
        for round_number in range(10):
            # Echo the round counter so progress is visible in the cell output.
            print(round_number)
            tikz_crawler.crawl(max_repos=3, max_files_per_repo=100)
        return

    logger.error("GitHub token not found. Please set the GITHUB_TOKEN environment variable.")

# Start the crawl when this code is executed directly. Inside a notebook cell
# __name__ is "__main__" as well, so running the cell launches main().
if __name__ == "__main__":
    main()

0


ERROR:__main__:Error saving dataset: unhashable type: 'list'


6698
1768
1


ERROR:__main__:Error saving dataset: unhashable type: 'list'


6698
1768
2


ERROR:__main__:Error saving dataset: unhashable type: 'list'


6698
1768
3


ERROR:__main__:Error saving dataset: unhashable type: 'list'


6698
1768
4


ERROR:__main__:Error saving dataset: unhashable type: 'list'


6698
1768
5


ERROR:__main__:Error saving dataset: unhashable type: 'list'


6698
1768
6


ERROR:__main__:Error saving dataset: unhashable type: 'list'


6698
1768
7


ERROR:__main__:Error saving dataset: unhashable type: 'list'


6698
1768
8


ERROR:__main__:Error saving dataset: unhashable type: 'list'


6698
1768
9


ERROR:__main__:Error saving dataset: unhashable type: 'list'


6698
1768


In [None]:
from google.colab import files as colab_files

# Hand the merged JSON dataset to the browser as a download (Colab only).
colab_files.download("tikz_data/tikz_dataset.json")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
!zip -r code.zip "tikz_data"

  adding: tikz_data/ (stored 0%)
  adding: tikz_data/pdfs/ (stored 0%)
  adding: tikz_data/pdfs/Exam_images_mock-img04.tex/ (stored 0%)
  adding: tikz_data/pdfs/Exam_images_mock-img04.tex/Exam_images_mock-img04.tex.tex (deflated 68%)
  adding: tikz_data/pdfs/Exam_images_mock-img04.tex/Exam_images_mock-img04.tex_page_1.png (deflated 17%)
  adding: tikz_data/pdfs/Exam_images_mock-img04.tex/Exam_images_mock-img04.tex.pdf (deflated 3%)
  adding: tikz_data/pdfs/advanced-math_exercise_7-integral-calculus-of-multivariate-functions_integral-calculus-of-multivariate-functions.tex/ (stored 0%)
  adding: tikz_data/pdfs/advanced-math_exercise_7-integral-calculus-of-multivariate-functions_integral-calculus-of-multivariate-functions.tex/advanced-math_exercise_7-integral-calculus-of-multivariate-functions_integral-calculus-of-multivariate-functions.tex_page_1.png (deflated 7%)
  adding: tikz_data/pdfs/advanced-math_exercise_7-integral-calculus-of-multivariate-functions_integral-calculus-of-multivaria

In [None]:
from google.colab import files as colab_files

# Hand the zipped output directory to the browser as a download (Colab only).
colab_files.download("code.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>