In [None]:
# src/config.py
# Shared settings for the pipeline cells below.

# Exam / dataset identifier.
EXAM_ID = 5
# Previous image cap, kept for reference:
#MAX_IMAGES = 250
# Cap on the number of images. NOTE(review): where this is enforced is not
# visible in this section.
MAX_IMAGES = 30
# Base URL for arXiv PDFs; the downloader appends "<id>.pdf" to it.
ARXIV_BASE_URL = "https://export.arxiv.org/pdf/"
# Minimum image area; same value as the default `min_area` of
# extract_images_from_pdf().
MIN_IMAGE_AREA = 12_000   # important later
# NOTE(review): presumably the render DPI for vector figures; note that
# extract_vector_circuit_images() defaults to dpi=120 - confirm at the caller.
VECTOR_DPI = 250


In [None]:
# src/download_all_pdfs.py

import time
import requests
from pathlib import Path
import logging
from PIL import Image

# -----------------------------
# Configuration
# -----------------------------
# Exam / dataset identifier; used in the images_<EXAM_ID> directory name below.
EXAM_ID = 5
# download_pdf() appends "<id>.pdf" to this base URL.
ARXIV_BASE_URL = "https://export.arxiv.org/pdf/"
# Pause applied by download_pdf() after each request and after a failed attempt.
REQUEST_DELAY = 10       # seconds (arXiv friendly)
# Number of download attempts per paper.
MAX_RETRIES = 3
# Downloads smaller than this are rejected as invalid PDFs.
MIN_PDF_SIZE = 5_000     # bytes
# Project root. NOTE(review): machine-specific absolute path; the commented
# alternatives below are other locations that were used.
BASE_DIR = Path("/Volumes/TEJASWI MS/NLP_examid_5_Tejaswi_Duptala")
#BASE_DIR = Path("/Users/tejaswiduptala/Desktop/NLP")

#BASE_DIR = Path.cwd()
# Directory layout derived from BASE_DIR.
DATA_DIR = BASE_DIR / "data"
PDF_DIR = DATA_DIR / "pdfs"
LOG_DIR = BASE_DIR / "outputs" / "logs"
PAPER_LIST_PATH = BASE_DIR / "paper_list_5.txt"
OUTPUT_DIR = BASE_DIR / "outputs"
# Side effect at import/cell-run time: the directories are created if missing.
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)
IMAGE_DIR = BASE_DIR / f"images_{EXAM_ID}"
IMAGE_DIR.mkdir(parents=True, exist_ok=True)
PDF_DIR.mkdir(parents=True, exist_ok=True)
LOG_DIR.mkdir(parents=True, exist_ok=True)


# Output artefact locations (written by code outside this section).
DATASET_JSON_PATH = OUTPUT_DIR / "dataset_5.json"
COUNTS_CSV_PATH = OUTPUT_DIR / "paper_list_counts_5.csv"

# Gate keywords: upper-case gate names.
# NOTE(review): not referenced by the functions visible in this section;
# gate detection further down uses BASE_GATES instead - confirm whether this
# list is still needed.
GATE_KEYWORDS = [
    "H", "X", "Y", "Z", "CNOT", "CZ", "SWAP", "CCNOT", "TOFFOLI",
    "RX", "RY", "RZ", "T", "S"
]

# Algorithm keywords: lower-case trigger substring -> display name.
# Two triggers ("fourier", "qft") map to the same algorithm.
# NOTE(review): not referenced by the functions visible in this section.
ALGO_KEYWORDS = {
    "teleport": "Quantum Teleportation",
    "grover": "Grover Search",
    "fourier": "Quantum Fourier Transform",
    "qft": "Quantum Fourier Transform",
    "phase estimation": "Phase Estimation",
    "shor": "Shor's Algorithm",
    "vqe": "Variational Quantum Eigensolver",
    "qaoa": "QAOA"
}



# -----------------------------
# Logging
# -----------------------------

# Configure the root logger to write INFO-and-above records, timestamped,
# to LOG_DIR/download_all.log.
logging.basicConfig(
    filename=LOG_DIR / "download_all.log",
    level=logging.INFO,
    format="%(asctime)s [%(levelname)s] %(message)s"
)
# The string literal below is an inert expression statement: it holds a
# switched-off line that would disable the root logger if it were real code.
'''
logging.getLogger().disabled = True
'''

def log(msg, logfile=None, level=logging.INFO):
    """Report *msg* on stdout, on the root logger and, optionally, in a file.

    Parameters
    ----------
    msg : str
        Message text.
    logfile : str or None
        File name inside LOG_DIR to append the message to; skipped when falsy.
    level : int
        Logging level used for the root-logger record.
    """
    print(msg)
    logging.log(level, msg)

    if not logfile:
        return

    target = LOG_DIR / logfile
    with open(target, "a", encoding="utf-8") as handle:
        handle.write(msg + "\n")

def load_paper_list(path=PAPER_LIST_PATH):
    """Return the non-blank lines of *path*, stripped, in file order.

    Each remaining line is expected to be one arXiv paper ID.
    """
    paper_ids = []
    with open(path, "r", encoding="utf-8") as handle:
        for raw_line in handle:
            entry = raw_line.strip()
            if entry:
                paper_ids.append(entry)
    return paper_ids

def normalize_arxiv_id(arxiv_id: str) -> str:
    """Normalize an arXiv ID for use in URLs and file names.

    Surrounding whitespace is removed and a leading ``arXiv:`` prefix
    (any letter case) is dropped.

    Parameters
    ----------
    arxiv_id : str
        Raw identifier, e.g. ``"arXiv:2101.00001"`` or ``" 2101.00001 "``.

    Returns
    -------
    str
        The bare identifier, e.g. ``"2101.00001"``.
    """
    arxiv_id = arxiv_id.strip()
    if arxiv_id.lower().startswith("arxiv:"):
        # Strip again after removing the prefix: an entry written as
        # "arXiv: 2101.00001" would otherwise keep a leading space that ends
        # up inside the download URL and the output file name.
        arxiv_id = arxiv_id.split(":", 1)[1].strip()
    return arxiv_id

# -----------------------------
# PDF Downloader
# -----------------------------
# One shared HTTP session (connection reuse) with an identifying User-Agent.
SESSION = requests.Session()
SESSION.headers.update({"User-Agent": "NLP-QuantumCircuitDataset/1.0"})


def download_pdf(arxiv_id: str, timeout=30):
    """Download one arXiv PDF into PDF_DIR with caching, retries and rate limiting.

    Parameters
    ----------
    arxiv_id : str
        arXiv identifier, with or without an ``arXiv:`` prefix.
    timeout : int or float
        Timeout in seconds passed to the HTTP request.

    Returns
    -------
    pathlib.Path or None
        Path of the cached or freshly downloaded PDF, or None when every one
        of the MAX_RETRIES attempts failed. Failures are logged, not raised.
    """
    clean_id = normalize_arxiv_id(arxiv_id)
    url = f"{ARXIV_BASE_URL}{clean_id}.pdf"
    out_path = PDF_DIR / f"{clean_id}.pdf"

    # Cache hit: use the same size threshold as the post-download check below,
    # so a file that was accepted once is never downloaded again.
    if out_path.exists() and out_path.stat().st_size >= MIN_PDF_SIZE:
        log(f"[CACHE] {clean_id}")
        return out_path

    # Download to a temporary name first so an interrupted transfer can never
    # be mistaken for a complete PDF by the cache check.
    tmp_path = out_path.with_suffix(".tmp")

    for attempt in range(1, MAX_RETRIES + 1):
        log(f"[REQUEST] ({attempt}/{MAX_RETRIES}) {clean_id}")

        try:
            # Old-style IDs such as "quant-ph/0301001" contain a slash, so the
            # target lives in a sub-directory that has to exist before writing.
            out_path.parent.mkdir(parents=True, exist_ok=True)

            # The context manager releases the streamed connection on every
            # exit path, including exceptions raised while reading the body.
            with SESSION.get(url, timeout=timeout, stream=True) as resp:
                time.sleep(REQUEST_DELAY)

                if resp.status_code != 200:
                    log(f"[ERROR] HTTP {resp.status_code} for {clean_id}", logfile="download_errors.log")
                    continue

                with open(tmp_path, "wb") as f:
                    for chunk in resp.iter_content(chunk_size=4096):
                        if chunk:
                            f.write(chunk)

            if tmp_path.stat().st_size < MIN_PDF_SIZE:
                log(f"[ERROR] PDF too small for {clean_id}", logfile="download_errors.log")
                tmp_path.unlink(missing_ok=True)
                continue

            # replace() overwrites a stale (e.g. undersized) target on every
            # platform; rename() fails on Windows when the target exists.
            tmp_path.replace(out_path)
            log(f"[SUCCESS] Saved {out_path.name}")
            return out_path

        except Exception as e:
            log(f"[EXCEPTION] {clean_id}: {e}", logfile="download_errors.log")
            # Best-effort removal of a partially written temp file.
            try:
                tmp_path.unlink(missing_ok=True)
            except OSError:
                pass
            time.sleep(REQUEST_DELAY)

    log(f"[FAILED] Could not download {clean_id}", logfile="download_errors.log")
    return None







In [None]:
def find_descriptions_for_figure(
    full_text: str,
    page_texts: list,
    page_number: int,
    figure_number: str | None,
    window: int = 600
):
    """
    Memory-safe description extraction.
    Operates ONLY on the current page text.
    """
    if figure_number is None:
        return [], []

    page_text = dict(page_texts).get(page_number, "")
    if not page_text:
        return [], []

    import re
    pat = re.compile(
        rf"(fig\.?|figure)\s*{re.escape(str(figure_number))}",
        re.IGNORECASE
    )

    m = pat.search(page_text)
    if not m:
        return [], []

    start = max(0, m.start() - window)
    end = min(len(page_text), m.end() + window)

    snippet = page_text[start:end].strip()

    # Map snippet into global text position (approximate, documented)
    global_start = full_text.find(snippet)
    if global_start == -1:
        return [snippet], []

    return [snippet], [[global_start, global_start + len(snippet)]]


In [None]:
# Extract full text
def extract_full_text(pdf_path):
    """
    Extract full text and per-page text from a PDF.

    Whitespace inside each page is collapsed to single spaces.

    Returns
    -------
    full_text : str
        Text of all pages joined with newlines.
    page_texts : list of (page_number, page_text)
        One tuple per page; page numbers are 1-based.
    """
    page_texts = []

    # The context manager closes the document even when a page fails to parse.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            clean = " ".join(page.get_text("text").split())
            page_texts.append((page_number, clean))

    full_text = "\n".join(text for _, text in page_texts)
    return full_text, page_texts


In [None]:

def save_pixmap_as_png(pixmap, filename: str, out_dir: Path):
    """Write *pixmap* as a PNG file and return the file's path.

    Parameters
    ----------
    pixmap : fitz.Pixmap
        Source image; any colourspace, with or without alpha.
    filename : str
        Target file name inside *out_dir*.
    out_dir : Path
        Existing output directory.

    Returns
    -------
    Path
        ``out_dir / filename``.
    """
    pix = pixmap

    # Decide by colour-channel count, not by the total component count:
    # n == 4 can be RGB+alpha *or* CMYK, and reading CMYK bytes as "RGBA"
    # silently produces wrong colours. Anything that is not exactly three
    # colour channels (grayscale, CMYK, ...) is converted to RGB first.
    if pix.n - pix.alpha != 3:
        pix = fitz.Pixmap(fitz.csRGB, pix)

    mode = "RGBA" if pix.alpha else "RGB"
    img = Image.frombytes(mode, (pix.width, pix.height), pix.samples)

    out_path = out_dir / filename
    img.save(out_path, format="PNG")
    return out_path


In [None]:
# Raster images
def extract_images_from_pdf(pdf_path: Path, min_area: int = 12_000):
    """
    Extract embedded raster images from a PDF, filtering out
    tiny images (icons, equations, glyphs).

    Parameters
    ----------
    pdf_path : Path
        Path to the PDF file.
    min_area : int
        Minimum on-page area (PDF coordinate space) an image
        must occupy to be considered a valid candidate.

    Returns
    -------
    List[dict]
        Each dict contains:
        - page_number : int (1-based)
        - image_index : int (0-based index on that page)
        - pixmap      : fitz.Pixmap
        - rects       : List[fitz.Rect] (locations on page)
    """
    results = []

    # The context manager guarantees the document is closed even if a page
    # raises while its images are being listed or located.
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            for img_idx, img_info in enumerate(page.get_images(full=True)):
                xref = img_info[0]

                # Where does this image appear on the page?
                rects = page.get_image_rects(xref)
                if not rects:
                    continue

                # Reject tiny on-page images (equations, icons): the largest
                # placement of the image decides.
                if max(r.get_area() for r in rects) < min_area:
                    continue

                try:
                    pix = fitz.Pixmap(doc, xref)
                except Exception as e:
                    log(f"[ERROR] Failed to extract image xref={xref} "
                        f"on page {page_number}: {e}",
                        logfile="parsing_errors.log")
                    continue

                results.append({
                    "page_number": page_number,
                    "image_index": img_idx,
                    "pixmap": pix,
                    "rects": rects
                })

    log(f"[INFO] Extracted {len(results)} raster image candidates from {pdf_path.name}")
    return results

def is_raster_circuit_image(pixmap, page_text: str = "") -> bool:
    """
    Conservative rule-based classifier for raster quantum circuit images.
    High precision by requiring multiple independent signals.

    Parameters
    ----------
    pixmap : object with ``width`` and ``height`` attributes
        The extracted image (only its pixel size is inspected).
    page_text : str
        Text of the page the image sits on; matched case-insensitively.

    Returns
    -------
    bool
        True only when the image is large and wide enough, the page has no
        plot vocabulary, at least two circuit keywords occur and at least
        one gate name occurs as a whole word.
    """
    # Imported locally so the function does not depend on module-level
    # import order.
    import re

    # ---------------------------
    # 1. Geometry constraints
    # ---------------------------
    w, h = pixmap.width, pixmap.height
    if w < 200 or h < 100:
        return False

    aspect = w / max(h, 1)
    if aspect < 1.3:
        return False

    # ---------------------------
    # 2. Reject plot-heavy pages
    # ---------------------------
    text = (page_text or "").lower()

    PLOT_KEYWORDS = [
        "energy", "loss", "fidelity", "probability",
        "iteration", "epoch", "accuracy", "temperature",
        "distance", "spectrum", "histogram", "counts"
    ]
    if any(k in text for k in PLOT_KEYWORDS):
        return False

    # ---------------------------
    # 3. Circuit & gate evidence
    # ---------------------------
    # "qubits" is intentionally absent: it can never match without "qubit"
    # matching as well, so it only ever counted one mention twice and let a
    # single word satisfy the two-keyword requirement.
    CIRCUIT_KEYWORDS = [
        "quantum circuit",
        "circuit diagram",
        "quantum gate",
        "gate sequence",
        "quantum register",
        "qubit",

        "ansatz",
        "variational circuit",
        "parameterized circuit",
        "parametrized circuit",
        "layered circuit",
        "hardware efficient",

        "gate decomposition",
        "circuit decomposition",
        "compiled circuit",
        "circuit compilation",
        "mapped circuit",
        "transpiled circuit",

        "entangling gate",
        "two-qubit gate",
        "multi-qubit gate",
        "controlled gate",
        "control qubit",
        "target qubit",

        "measurement circuit",
        "readout circuit",
        "basis rotation",
        "measurement basis",

        "circuit schematic",
        "schematic diagram",
        "logic circuit",
        "quantum logic",
    ]

    GATE_TOKENS = [
        "rx", "ry", "rz",
        "cnot", "cx", "cz",
        "ccx", "ccnot", "toffoli",
        "swap", "xx", "yy", "zz"
    ]

    keyword_hits = sum(1 for k in CIRCUIT_KEYWORDS if k in text)

    # Gate names must appear as whole words (optionally plural). Plain
    # substring matching made "ry" fire on "every"/"theory" and "rx" on
    # "arxiv", so the gate requirement was satisfied by almost any page.
    gate_pattern = r"\b(?:" + "|".join(re.escape(g) for g in GATE_TOKENS) + r")s?\b"
    has_gate = re.search(gate_pattern, text) is not None

    return keyword_hits >= 2 and has_gate




In [None]:
import fitz
import cv2
import pytesseract
import numpy as np
from pathlib import Path
from PIL import Image
import re

def pil_from_pixmap(pix: fitz.Pixmap) -> Image.Image:
    """Convert a fitz.Pixmap of any colourspace into an RGB PIL image.

    Image.frombytes("RGB", ...) needs exactly three bytes per pixel, so the
    pixmap is normalised first:

    * an alpha channel is dropped (converting the colourspace alone would
      leave four components per pixel);
    * grayscale, CMYK and other non-RGB data is converted to RGB.
    """
    if pix.alpha:
        pix = fitz.Pixmap(pix, 0)           # copy without the alpha channel
    if pix.n != 3:
        pix = fitz.Pixmap(fitz.csRGB, pix)  # gray / CMYK -> RGB
    return Image.frombytes("RGB", (pix.width, pix.height), pix.samples)

#===========
#model a
#===========
def horizontal_line_score(img: Image.Image) -> float:
    """
    Share of detected long line segments that are (nearly) horizontal.
    Circuits -> high
    Plots -> low

    Returns 0.0 when the Hough transform finds no segment at all.
    """
    gray = np.array(img.convert("L"))
    edge_map = cv2.Canny(gray, 50, 150)

    # Probabilistic Hough transform; only segments spanning at least a
    # quarter of the image width are reported.
    segments = cv2.HoughLinesP(
        edge_map,
        rho=1,
        theta=np.pi/180,
        threshold=100,
        minLineLength=gray.shape[1] // 4,
        maxLineGap=10
    )
    if segments is None:
        return 0.0

    flat_segments = 0
    for segment in segments:
        _, y_start, _, y_end = segment[0]
        # A segment counts as horizontal when its end points differ by
        # fewer than 5 pixels vertically.
        if abs(y_start - y_end) < 5:
            flat_segments += 1

    return flat_segments / max(len(segments), 1)

def is_vector_crop_candidate(img: Image.Image) -> bool:
    """Cheap sanity check that a rendered crop could plausibly be a circuit.

    Rejects crops that are too small (under 220x100 px), not wider than tall
    (aspect below 1.05), almost blank (dark-pixel fraction below 0.005) or
    almost filled (dark-pixel fraction above 0.40).
    """
    width, height = img.size

    too_small = width < 220 or height < 100
    if too_small:
        return False

    if width / max(height, 1) < 1.05:
        return False

    dark_fraction = line_density_score(img)

    # Keep only crops that are neither whitespace nor a solid/filled region.
    return not (dark_fraction < 0.005 or dark_fraction > 0.40)


#===========
#ocr based model b
#===========
def ocr_text_from_image(img: Image.Image) -> str:
    """
    Extract text from image using OCR.
    Used only for rejection, never sole acceptance.

    Returns the recognised text in lower case. OCR is best-effort: on any
    failure an empty string is returned, but the failure is logged so that a
    systematic problem (for example a missing Tesseract installation, which
    would silently disable every OCR-based rejection) does not go unnoticed.
    """
    try:
        return pytesseract.image_to_string(img).lower()
    except Exception as exc:
        logging.warning("OCR failed; treating image as text-free: %s", exc)
        return ""
def line_density_score(img: Image.Image) -> float:
    """
    Fraction of dark pixels (grayscale value below 160).
    Circuits -> low
    Plots -> high
    """
    grayscale = np.array(img.convert("L"))
    dark_mask = grayscale < 160
    return np.mean(dark_mask)
def strong_horizontal_wire_count(img: Image.Image) -> int:
    """
    Count long horizontal lines (qubit wires).

    Only Hough segments spanning at least 60% of the image width are
    considered; a segment is horizontal when its end points differ by fewer
    than 4 pixels vertically. Returns 0 when no segment is found.
    """
    gray = np.array(img.convert("L"))
    edge_map = cv2.Canny(gray, 50, 150)
    min_length = int(gray.shape[1] * 0.6)

    segments = cv2.HoughLinesP(
        edge_map,
        rho=1,
        theta=np.pi / 180,
        threshold=120,
        minLineLength=min_length,
        maxLineGap=5
    )
    if segments is None:
        return 0

    return sum(
        abs(segment[0][1] - segment[0][3]) < 4
        for segment in segments
    )

#=========
#common for both models
#=========
def horizontal_band_count(
    img: Image.Image,
    percentile: float = 95,
    min_peaks: int = 6) -> int:
    """
    Count horizontal intensity transitions (parallel bands).
    Circuits -> many bands
    Plots/text -> few

    Parameters
    ----------
    img : PIL.Image.Image
        Image to analyse (converted to grayscale internally).
    percentile : float
        Percentile of the row-to-row intensity changes used as the adaptive
        threshold; only changes strictly above it are counted.
    min_peaks : int
        Accepted for backward compatibility; not used by the computation
        (callers apply their own minimum to the returned count).

    Returns
    -------
    int
        Number of row transitions above the threshold; 0 for images with
        fewer than two rows.
    """
    # Convert to grayscale
    gray = np.array(img.convert("L"))

    # Mean intensity per row, then the absolute row-to-row change
    row_mean = gray.mean(axis=1)
    diff = np.abs(np.diff(row_mean))

    # An image with fewer than two rows has no transitions, and a percentile
    # of an empty array is undefined.
    if diff.size == 0:
        return 0

    # Adaptive threshold
    thresh = np.percentile(diff, percentile)

    # Count significant transitions
    return int(np.count_nonzero(diff > thresh))

def vertical_line_score(img: Image.Image) -> float:
    """Share of detected long line segments that are (nearly) vertical.

    Only Hough segments at least a quarter of the image height long are
    considered; a segment is vertical when its end points differ by fewer
    than 5 pixels horizontally. Returns 0.0 when no segment is found.
    """
    import cv2
    import numpy as np

    gray = np.array(img.convert("L"))
    edge_map = cv2.Canny(gray, 50, 150)

    segments = cv2.HoughLinesP(
        edge_map,
        rho=1,
        theta=np.pi / 180,
        threshold=100,
        minLineLength=gray.shape[0] // 4,
        maxLineGap=10
    )
    if segments is None:
        return 0.0

    upright_segments = 0
    for segment in segments:
        x_start, _, x_end, _ = segment[0]
        if abs(x_start - x_end) < 5:
            upright_segments += 1

    return upright_segments / max(len(segments), 1)
import cv2
import numpy as np
from PIL import Image

def small_box_count(
    img: Image.Image,
    min_area_ratio: float = 0.0003,
    max_area_ratio: float = 0.02,
    aspect_tol: float = 0.35
) -> int:
    """
    Count small, roughly square gate-like boxes.

    Circuits -> many small boxes
    Plots/text -> near zero

    The image is binarised (pixels darker than 200 become foreground),
    external contours are extracted, and a contour is counted when its
    bounding box

    * covers between ``min_area_ratio`` and ``max_area_ratio`` of the image
      area (inclusive), and
    * has a width/height ratio within ``aspect_tol`` of 1.0.
    """
    gray = np.array(img.convert("L"))
    _, binary = cv2.threshold(gray, 200, 255, cv2.THRESH_BINARY_INV)
    outlines, _ = cv2.findContours(
        binary, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE
    )

    rows, cols = gray.shape
    total_area = rows * cols
    smallest_area = min_area_ratio * total_area
    largest_area = max_area_ratio * total_area

    def looks_like_gate(contour) -> bool:
        _, _, box_w, box_h = cv2.boundingRect(contour)
        if not (smallest_area <= box_w * box_h <= largest_area):
            return False
        return abs(box_w / max(box_h, 1) - 1.0) <= aspect_tol

    return sum(1 for contour in outlines if looks_like_gate(contour))

#===================
#passage images removal
#===================

def is_text_page(img: Image.Image) -> bool:
    """
    Decide whether a crop is mostly text / equations / section headers
    rather than a diagram. Deliberately aggressive: any one rule suffices.
    """
    _, height = img.size

    word_count = len(ocr_text_from_image(img).split())
    density = line_density_score(img)
    boxes = small_box_count(img)
    wires = strong_horizontal_wire_count(img)

    short_crop = height < 220

    rules = (
        # 0) Very thin horizontal strip with words -> header/footer line
        short_crop and word_count >= 3,
        # 1) Paragraphs or equations: many words, almost no boxes/wires
        word_count >= 8 and wires <= 2 and boxes <= 3,
        # 2) Section headers: few words, almost no structure, very low density
        2 <= word_count <= 12 and wires == 0 and boxes <= 1 and density < 0.18,
        # 3) Medium density, no wires, few boxes -> text/equations
        0.05 < density < 0.35 and wires == 0 and boxes <= 2 and word_count >= 3,
        # 4) OCR found no words, but the geometry looks like a text line
        word_count == 0 and wires == 0 and boxes <= 1 and density < 0.25 and short_crop,
    )
    return any(rules)


# ==================
# GRAPH REJECTION 
# ==================

# ============================================================
# MODEL A - STRUCTURAL VECTOR DETECTOR
# ============================================================

def is_vector_circuit_model_A(
    descriptions,
    pil_img: Image.Image,
    max_words: int = 12,
    min_bands: int = 6,
    min_horiz_ratio: float = 0.55,
    min_boxes: int = 3,
    max_boxes: int = 40,
) -> bool:
    """
    Robust vector quantum circuit classifier.

    Parameters
    ----------
    descriptions : list[str] or None
        Caption / description snippets for the figure (may be empty).
    pil_img : PIL.Image.Image or None
        Rendered crop of the figure region.
    max_words : int
        A non-empty caption longer than this many words rejects the crop.
    min_bands : int
        Minimum horizontal_band_count() required.
    min_horiz_ratio : float
        Minimum horizontal_line_score() required.
    min_boxes, max_boxes : int
        Inclusive range allowed for small_box_count().

    Returns
    -------
    bool
        True only when every structural test passes and the caption does not
        veto the crop.
    """
    # The None check has to come before any attribute access; previously
    # `.width` was read first, so passing None raised AttributeError instead
    # of returning False.
    if pil_img is None:
        return False

    if pil_img.width < 300 or pil_img.height < 120:
        return False

    # Caption-based rejection. It only inspects text, so it runs before the
    # (much more expensive) image analysis; the outcome is unchanged.
    # NOTE(review): find_descriptions_for_figure() returns snippets of up to
    # roughly 2 * window characters, which will normally exceed max_words -
    # confirm this limit is intended for such snippets.
    text = " ".join(descriptions or []).lower().strip()
    if text:
        if len(text.split()) > max_words:
            return False

        REJECT_TERMS = [
            "plot","plot shows", "energy", "probability", "curve",
            "iterations", "accuracy", "loss",
            "graph", "chart"
        ]
        if any(t in text for t in REJECT_TERMS):
            return False

    # Visual sanity
    if not is_vector_crop_candidate(pil_img):
        return False

    # Parallel wires
    if horizontal_band_count(pil_img) < min_bands:
        return False

    # Horizontal dominance
    if horizontal_line_score(pil_img) < min_horiz_ratio:
        return False

    # Gate box structure: enough boxes to be a circuit, not so many that the
    # crop is more likely a table or dense text.
    box_count = small_box_count(pil_img)
    return min_boxes <= box_count <= max_boxes

# ============================================================
# MODEL B - OCR + WIRE DETECTOR
# ============================================================

def is_vector_circuit_model_B(
    img: Image.Image,
    caption_text: str = "",
) -> bool:
    """
    Combined Model B.

    1. Shared veto: graph/training vocabulary in the OCR text or the caption
       rejects the crop, as does a dark-pixel density above 0.35.
    2. STRICT vote: at most 30 OCR words, density at most 0.22, and at least
       3 of 4 signals (wires >= 4, boxes >= 4, vertical score > 0.15,
       density < 0.22).
    3. SOFT vote (only consulted when STRICT fails): at most 60 OCR words
       unless the OCR text mentions "circuit", density at most 0.30, and at
       least 2 of 3 signals (wires >= 2, boxes >= 2, vertical score < 0.30).
    """
    # ---------- Shared OCR + caption filter ----------
    ocr_text = ocr_text_from_image(img)
    ocr_words = ocr_text.split()
    ocr_lower = ocr_text.lower()
    cap_lower = caption_text.lower()

    # Hard reject clear graph-like / training-like wording
    HARD_REJECT_TERMS = [
        "plot", "plot shows", "graph", "chart", "curve",
        "spectrum", "histogram", "time series", "counts",
        "accuracy", "loss", "probability",
        "distribution", "training", "iterations",
        "mean", "variance", "error rate", "fidelity"
    ]
    for haystack in (ocr_lower, cap_lower):
        if any(term in haystack for term in HARD_REJECT_TERMS):
            return False

    # ---------- Geometry features ----------
    wire_count = strong_horizontal_wire_count(img)
    box_count = small_box_count(img)
    density = line_density_score(img)
    vscore = vertical_line_score(img)

    if density > 0.35:
        return False

    word_total = len(ocr_words)

    # ---------- STRICT vote ----------
    def strict_vote() -> bool:
        density_cap = 0.22
        if word_total > 30:
            return False
        if density > density_cap:
            return False

        votes = 0
        votes += wire_count >= 4
        votes += box_count >= 4
        votes += vscore > 0.15
        votes += density < density_cap
        return votes >= 3

    # ---------- SOFT vote (more permissive) ----------
    def soft_vote() -> bool:
        wordy = word_total > 60
        if wordy and "circuit" not in ocr_lower:
            return False
        if density > 0.30:
            return False

        votes = 0
        votes += wire_count >= 2
        votes += box_count >= 2
        # In circuits vertical lines should not dominate the way axes do.
        votes += vscore < 0.30
        return votes >= 2

    # ---------- Final decision ----------
    return strict_vote() or soft_vote()



#=================
#merging
#==================

#2 30%
def is_vector_circuit_final(img: Image.Image, descriptions: list[str]) -> bool:
    """
    Final accept/reject decision for a rendered vector-figure crop.

    Order of evaluation:
    1. hard vetoes (graph detector, text-page detector);
    2. a soft score over six geometry signals, at least three of which
       must look circuit-like;
    3. the two specialist models; acceptance by either one is enough.
    """
    caption_text = " ".join(descriptions)

    # 1) Hard rejections
    if is_definitely_graph(img, caption_text) or is_text_page(img):
        return False

    # 2) Geometry signals, each worth one point
    signals = (
        horizontal_line_score(img) >= 0.6,        # horizontal structure
        vertical_line_score(img) <= 0.30,         # not too many long verticals
        line_density_score(img) <= 0.35,          # not super dense / filled
        horizontal_band_count(img) >= 4,
        strong_horizontal_wire_count(img) >= 2,
        small_box_count(img) >= 2,
    )
    if sum(signals) < 3:
        return False

    # 3) Specialist models (A + B); both are evaluated.
    accepted_by_a = is_vector_circuit_model_A(
        descriptions=descriptions,
        pil_img=img
    )
    accepted_by_b = is_vector_circuit_model_B(
        img=img,
        caption_text=caption_text
    )

    return accepted_by_a or accepted_by_b





In [None]:
#graph handling 
# Words that, when found in OCR text, mark an image as a graph/plot.
GRAPH_TERMS = [
    "accuracy",
    "loss",
    "probability",
    "iterations",
    "distribution",
    "fidelity",
    "mean",
    "variance",
    "plot",
    "plot shows",
    "graph",
    "chart",
    "curve",
    "histogram",
    "spectrum",
]

# Caption substrings that indicate a graph/plot. Matching is by plain
# substring on lower-cased text; repeated entries are harmless.
GRAPH_CAPTION_KEYWORDS = [
    # very graph-ish phrases
    "as a function of",
    "dependence of",
    "dependence on",
    "versus",
    "vs.",
    "vs ",
    "variation of",
    "evolution of",
    "time trace",
    "histogram",
    "spectrum",
    "spectra",
    "density of states",
    "probability distribution",
    "probability density",
    "power spectrum",
    "fourier spectrum",
    "band structure",
    "correlation function",
    "plot",
    "plot shows",
    "graph",
    "chart",
    "curve",
    "histogram",
    "spectrum",

    # axes-ish words that almost never describe a circuit
    "real part",
    "imaginary",
    "imaginary part",
    "imaginary part",
    "re[",
    "im[",
    "x-axis",
    "y-axis",

    # physics-style labels seen in example plots
    "re(œá",
    "im(œá",
    "œá(",
    "v_d/v_s",
    "v_d / v_s",
    "log scale",
]


def caption_looks_like_graph(descriptions: list[str]) -> bool:
    """
    Tell whether the caption text is clearly describing a graph/plot.

    The snippets are joined, lower-cased and tested for any entry of
    GRAPH_CAPTION_KEYWORDS, a stand-alone "vs"/"vs." token, or the phrase
    "as a function of". An empty input is never a graph.
    """
    import re

    if not descriptions:
        return False

    caption = " ".join(descriptions).lower()

    # Strong, almost-plot-only phrases
    if any(phrase in caption for phrase in GRAPH_CAPTION_KEYWORDS):
        return True

    # Common pattern: "X vs Y"
    if re.search(r"\bvs\.?\b", caption) is not None:
        return True

    # Typical plot sentence: "Figure N shows ... as a function of ..."
    return "as a function of" in caption

def is_definitely_graph(img: Image.Image, caption_text: str = "") -> bool:
    """
    Graph / plot detector combining caption text, OCR text and geometry.

    Returns True as soon as one of these fires:
    * the caption reads like a graph caption;
    * the OCR text contains a GRAPH_TERMS entry;
    * the geometry matches an "axes + curves" look or a dense grid.
    """
    # ---------- Caption ----------
    if caption_text and caption_looks_like_graph([caption_text]):
        return True

    # ---------- OCR text ----------
    recognised = ocr_text_from_image(img).lower()
    if any(term in recognised for term in GRAPH_TERMS):
        return True

    # ---------- Geometry ----------
    density = line_density_score(img)        # fraction of dark pixels
    horiz = horizontal_line_score(img)       # share of long horizontal lines
    vert = vertical_line_score(img)          # share of long vertical lines
    boxes = small_box_count(img)             # gate-like boxes
    bands = horizontal_band_count(img)       # horizontal intensity bands

    # Graphs: some horizontals + some verticals, not boxy.
    # Circuits: mostly horizontals, very few verticals, many boxes.

    # (a) Classic "axes + curves" look
    axes_and_curves = (
        horiz < 0.85        # horizontals not overwhelmingly dominant
        and vert > 0.18     # clear vertical structure (y-axis, grid)
        and boxes <= 3      # almost no small boxes (unlike gates)
        and density > 0.15  # not extremely sparse
        and bands >= 5      # several horizontal bands across the plot area
    )

    # (b) Very dense figure with grid-like structure and few boxes
    dense_grid = density > 0.30 and vert > 0.15 and boxes <= 2

    # Otherwise: not confidently a graph
    return bool(axes_and_curves or dense_grid)

In [None]:

def is_text_strip(img: Image.Image) -> bool:
    """
    Decide whether a crop is essentially lines of text or a heading.

    Deliberately aggressive: rejecting too many crops is preferred over
    keeping paragraphs as 'figures'. A zero-sized image counts as a strip.
    """
    width, height = img.size
    if width == 0 or height == 0:
        return True

    aspect = width / float(height)
    word_count = len(ocr_text_from_image(img).split())
    density = line_density_score(img)
    boxes = small_box_count(img)
    wires = strong_horizontal_wire_count(img)

    # 1) Very wide & thin with some words -> header/footer/line of text
    wide_thin_with_words = aspect > 4.0 and height < 220 and word_count >= 3

    # 2) Paragraph-ish: several words, no wires, almost no boxes
    paragraph_like = word_count >= 5 and wires <= 0 and boxes <= 1

    # 3) Medium density, no wires, few boxes -> equations / text
    equation_like = (
        0.05 < density < 0.35 and wires == 0 and boxes <= 2 and word_count >= 3
    )

    return bool(wide_thin_with_words or paragraph_like or equation_like)


# Matches a figure label such as "Fig. 2", "FIG 3.1" or "Figure 4";
# group 2 is the figure number.
FIG_CAPTION_RE = re.compile(
    r"(fig\.?|figure)\s*\.?\s*(\d+(?:\.\d+)?)",
    re.IGNORECASE
)

def extract_vector_circuit_images(pdf_path, full_text, page_texts, dpi=120):
    """Yield one rendered figure candidate per page that carries a caption.

    For every page whose text mentions "fig", the first text block matching
    FIG_CAPTION_RE is treated as the caption. The full-width region directly
    above it (35% of the page height, clipped to the page) is rendered and
    yielded together with the descriptions found for that figure number.
    No circuit/graph/text filtering happens here; that is left to the caller.

    Parameters
    ----------
    pdf_path : path-like
        PDF to read.
    full_text : str
        Whole-document text, forwarded to find_descriptions_for_figure().
    page_texts : list of (page_number, page_text)
        Per-page text with 1-based page numbers.
    dpi : int
        Render resolution; 72 dpi corresponds to zoom factor 1.

    Yields
    ------
    dict
        Keys: "page_number", "figure_number", "pixmap", "descriptions",
        "positions".
    """
    zoom = dpi / 72
    mat = fitz.Matrix(zoom, zoom)
    page_text_dict = dict(page_texts)

    # `with` closes the document on every exit path of the generator:
    # normal exhaustion, an exception, or the consumer abandoning iteration
    # early (a plain close() after the loop would never run in that case).
    with fitz.open(pdf_path) as doc:
        for page_number, page in enumerate(doc, start=1):
            page_text = page_text_dict.get(page_number, "")

            # Only pages that even mention a figure ("figure" contains "fig").
            if "fig" not in page_text.lower():
                continue

            caption_rect = None
            figure_number = None

            # ---- find caption block + parse figure number ----
            for block in page.get_text("blocks"):
                m = FIG_CAPTION_RE.search(block[4] or "")
                if m:
                    figure_number = m.group(2)      # e.g. "2" or "3.1"
                    caption_rect = fitz.Rect(block[:4])
                    break

            if caption_rect is None:
                # no recognizable caption, skip page
                continue

            page_rect = page.rect
            crop_height = 0.35 * page_rect.height

            # Full page width, from crop_height above the caption down to the
            # caption's top edge.
            clip = fitz.Rect(
                page_rect.x0,
                max(page_rect.y0, caption_rect.y0 - crop_height),
                page_rect.x1,
                caption_rect.y0
            )

            try:
                pix = page.get_pixmap(matrix=mat, clip=clip, alpha=False)
            except Exception:
                continue

            # size sanity
            if pix.width < 300 or pix.height < 150:
                continue

            descriptions, positions = find_descriptions_for_figure(
                full_text=full_text,
                page_texts=page_texts,
                page_number=page_number,
                figure_number=figure_number
            )

            yield {
                "page_number": page_number,
                "figure_number": figure_number,
                "pixmap": pix,
                "descriptions": descriptions,
                "positions": positions,
            }

In [None]:
#meta data processing
#FIG_REGEX = re.compile(r"\b(fig\.?|figure)\s*\.?\s*(\d+(\.\d+)?)\b", re.IGNORECASE)
def semantic_info_from_image(pixmap, descriptions):
    """
    Infer gates and the quantum problem from OCR text plus descriptions.

    Parameters
    ----------
    pixmap : fitz.Pixmap
        Image data as extracted from the PDF.
    descriptions : list[str]
        Figure captions / local descriptions (may be empty).

    Returns
    -------
    gates : list[str]
        Result of detect_quantum_gates().
    problem : str
        Result of detect_quantum_problem().
    """
    recognised_text = ocr_text_from_image(pil_from_pixmap(pixmap))

    return (
        detect_quantum_gates(descriptions, ocr_text=recognised_text),
        detect_quantum_problem(descriptions, ocr_text=recognised_text),
    )

# Figure mention: "Fig. 2", "FIG 3.1", "Figure 4"; group 2 is the number.
# The leading \b keeps words that merely end in "fig" (e.g. "config 7")
# from being counted as figure mentions.
FIG_REGEX = re.compile(
    r"\b(fig|figure)\s*\.?\s*(\d+(\.\d+)?)",
    re.IGNORECASE
)

def infer_figure_number_from_page(page_text: str, local_image_index: int):

    """
    Infer the figure number for a given image on a page.

    Strategy
    --------
    1. Parse all occurrences of 'Fig. X' / 'FIG. X' / 'Figure X' in the page text.
    2. Assume that the N-th extracted image on the page corresponds to the
       N-th 'Fig.' occurrence in the text (0-based indexing).
    3. If the image index exceeds the number of figure mentions, fall back
       to the last figure mention on that page.
    4. If no figure mention is found at all, return None.

    Parameters
    ----------
    page_text : str
        The text content of the page; empty or None is treated as a page
        without figure mentions.
    local_image_index : int
        0-based index of the image on that page.

    Returns
    -------
    str or None
        The inferred figure number (e.g., '2', '3.1') or None if no figure
        reference could be found.

    Notes
    -----
    This method is heuristic and may misalign images and figure numbers in
    papers with very complex layouts (for instance when the same figure is
    mentioned several times on a page).
    """
    figure_numbers = [m.group(2) for m in FIG_REGEX.finditer(page_text or "")]
    if not figure_numbers:
        # No figure references on this page
        return None

    # Direct mapping while mentions last; beyond that, the last mention.
    index = min(local_image_index, len(figure_numbers) - 1)
    return figure_numbers[index]


# Canonical gate names reported by the detector; any other token is discarded.
BASE_GATES = [
    "H", "X", "Y", "Z",
    "RX", "RY", "RZ",
    "CNOT", "CZ", "SWAP",
    "T", "S",
    "TOFFOLI"
]
# Strong multi-qubit / parameterized gates
STRONG_GATES = {
    "CNOT", "CZ", "SWAP",
    "TOFFOLI", "CCNOT",
    "RX", "RY", "RZ"
}

# Alternative spellings mapped onto their canonical BASE_GATES name.
GATE_SYNONYMS = {
    "CX": "CNOT",
    "C-X": "CNOT",
    "C X": "CNOT",
    "CCX": "TOFFOLI",
    "CCNOT": "TOFFOLI"
}

def normalize_gate_token(token: str) -> str:
    """
    Map a raw token to its canonical gate name.

    Parameters
    ----------
    token : str
        Candidate gate spelling (any case, optional surrounding whitespace,
        optional dagger suffix such as 'S†').

    Returns
    -------
    str
        The canonical name from BASE_GATES, or "" if the token is not a
        recognised gate.
    """
    token = token.upper().strip()

    # Remove the adjoint (dagger) marker BEFORE any lookup so that spellings
    # such as 'CX†' still resolve through GATE_SYNONYMS. Both the real
    # character (U+2020) and its mis-decoded UTF-8 form are handled; the
    # mis-decoded form goes first because it itself ends in U+2020.
    for dagger in ("\u201a\u00c4\u2020", "\u2020"):
        token = token.replace(dagger, "")
    token = token.strip()

    # Collapse the underscore / space variants of the rotation gates.
    token = token.replace("R_X", "RX").replace("R_Y", "RY").replace("R_Z", "RZ")
    token = token.replace("R X", "RX").replace("R Y", "RY").replace("R Z", "RZ")

    token = GATE_SYNONYMS.get(token, token)

    return token if token in BASE_GATES else ""

def detect_quantum_gates(descriptions, ocr_text=None):
    """
    Collect the canonical gate names mentioned in captions and/or OCR text.

    Parameters
    ----------
    descriptions : iterable of str or None
        Caption / description strings for the figure.
    ocr_text : str or None
        Optional OCR output for the image, appended to the descriptions.

    Returns
    -------
    list of str
        Sorted, de-duplicated canonical gate names (see normalize_gate_token).

    Notes
    -----
    Matching is purely lexical, so a standalone capital letter such as 'H'
    or 'X' in ordinary prose is still reported as a gate.
    """
    combined = " ".join(descriptions) if descriptions else ""
    if ocr_text:
        combined += " " + ocr_text

    text = combined.upper()

    # Drop possessive / contraction suffixes ("SHOR'S", "DON'T"). Otherwise the
    # tokenizer below splits them into a standalone 'S' or 'T', which would be
    # reported as an S or T gate.
    text = re.sub(r"(?<=[A-Z0-9])['\u2019][A-Z]+\b", "", text)

    gates_found = set()

    def _consume(match):
        # Record the compound spelling and blank it out so its fragments
        # (e.g. the 'X' of 'C-X') are not counted again as separate gates.
        gate = normalize_gate_token(match.group(0))
        if gate:
            gates_found.add(gate)
        return " "

    compound_spellings = (
        r"\bC\-X\b|\bC X\b|"
        r"\bCX\b|\bCCX\b|\bCCNOT\b|"
        r"\bR_X\b|\bR_Y\b|\bR_Z\b"
    )
    text = re.sub(compound_spellings, _consume, text)

    # Everything that is not A-Z, 0-9 or '_' separates tokens.
    for tok in re.split(r"[^A-Z0-9_]+", text):
        gate = normalize_gate_token(tok)
        if gate:
            gates_found.add(gate)

    return sorted(gates_found)

# ============================================================
# 2. QUANTUM PROBLEM DETECTION (2.3.6)
# ============================================================

# Ordered (label, regex list) pairs. Order matters: detect_quantum_problem
# returns the label of the FIRST entry with a matching pattern, so specific
# algorithms are listed before the broad categories.
PROBLEM_PATTERNS = [
    ("Quantum Teleportation", [
        r"\bquantum\s+teleportation\b",
        r"\bteleportation\s+protocol\b",
        r"\bteleportation\b",
    ]),
    ("Grover Search", [
        r"\bgrover('?s)?\b",
        r"\bgrover\s+search\b",
    ]),
    ("Quantum Fourier Transform", [
        r"\bquantum\s+fourier\s+transform\b",
        r"\bqft\b",
    ]),
    ("Phase Estimation", [
        r"\bquantum\s+phase\s+estimation\b",
        r"\bphase\s+estimation\b",
        r"\bqpe\b",
        r"\bpea\b",
    ]),
    ("Shor's Algorithm", [
        r"\bshor('?s)?\b",
        r"\bshor'?s\s+algorithm\b",
        r"\border\s+finding\b",
    ]),
    ("VQE", [
        r"\bvariational\s+quantum\s+eigensolver\b",
        r"\bvqe\b",
    ]),
    ("QAOA", [
        r"\bquantum\s+approximate\s+optimization\s+algorithm\b",
        r"\bqaoa\b",
    ]),
    ("Quantum Error Correction", [
        r"\bquantum\s+error\s+correction\b",
        r"\berror\s+correction\b",
        r"\bstabilizer\b",
        r"\bsyndrome\b",
        r"\bmagic\s+state\b",
        r"\bdistillation\b",
    ]),
    ("Quantum Simulation", [
        r"\bquantum\s+simulation\b",
        r"\bhamiltonian\s+simulation\b",
        r"\btime\s+evolution\b",
        r"\btrotter\b",
        r"\btrotterization\b",
    ]),
    ("Variational / Ansatz Circuit", [
        r"\bansatz\b",
        r"\bvariational\b",
        r"\bparameteri[sz]ed\b",
    ]),
    # Labels use an explicit en dash escape (U+2013) so they survive any
    # source-file re-encoding. The patterns also accept en/em dashes because
    # PDF text usually typesets these names with a dash, not a hyphen.
    ("Deutsch\u2013Jozsa", [
        r"\bdeutsch[-\u2013\u2014\s]*jozsa\b",
        r"\bdj\s+algorithm\b",
    ]),
    ("Bernstein\u2013Vazirani", [
        r"\bbernstein[-\u2013\u2014\s]*vazirani\b",
        r"\bbv\s+algorithm\b",
    ]),
    ("Simon's Algorithm", [
        r"\bsimon('?s)?\s+algorithm\b",
        r"\bsimon\b",
    ]),
    ("Amplitude Estimation", [
        r"\bamplitude\s+estimation\b",
        r"\bqae\b",
    ]),
    ("HHL", [
        r"\bhhl\b",
    ])
]

def detect_quantum_problem(descriptions, ocr_text=None):
    """
    Guess which quantum algorithm / problem a figure illustrates.

    Parameters
    ----------
    descriptions : iterable of str or None
        Caption / description strings for the figure.
    ocr_text : str or None
        Optional OCR output for the image, appended to the descriptions.

    Returns
    -------
    str
        Label of the first PROBLEM_PATTERNS entry with a matching pattern,
        or "unspecified" when nothing matches.
    """
    combined = " ".join(descriptions) if descriptions else ""
    if ocr_text:
        combined += " " + ocr_text

    text = combined.lower()

    for label, patterns in PROBLEM_PATTERNS:
        if any(re.search(pat, text, flags=re.IGNORECASE) for pat in patterns):
            return label

    return "unspecified"



In [None]:
# Saving metadata plus the JSON and CSV outputs
# Build metadata entry (2.3.7) - demo code
import os, re, csv, logging, time, random
import json   # ‚Üê REQUIRED
from pathlib import Path

def build_metadata_entry(
    image_filename,
    arxiv_id,
    page_number,
    figure_number,
    gates,
    problem,
    descriptions,
    positions
):
    """
    Assemble the JSON-serialisable metadata record for one saved image.

    Sequences are copied into plain lists, every position is coerced to an
    ``[int, int]`` pair, and a missing/empty problem becomes "unspecified".
    If the number of descriptions differs from the number of positions,
    both are emptied so the record never carries misaligned pairs.
    """
    gate_list = list(gates) if gates else []
    description_list = list(descriptions) if descriptions else []

    # Each position becomes [start, end] with integer bounds.
    span_list = [[int(span[0]), int(span[1])] for span in (positions or [])]

    # Misaligned captions/spans: drop both rather than emit an invalid entry.
    if len(description_list) != len(span_list):
        description_list, span_list = [], []

    entry = {}
    entry["image_filename"] = image_filename
    entry["arxiv_number"] = arxiv_id
    entry["page_number"] = int(page_number)
    entry["figure_number"] = figure_number  # None or a string
    entry["quantum_gates"] = gate_list
    entry["quantum_problem"] = problem or "unspecified"
    entry["descriptions"] = description_list
    entry["text_positions"] = span_list
    return entry
#JSON & CSV Writers (3.1, 3.2) demo code 
def save_dataset_json(entries, out_path=DATASET_JSON_PATH):
    """
    Save all metadata entries to dataset_5.json.

    Parameters
    ----------
    entries : list of dict
        Metadata records (see build_metadata_entry).
    out_path : str or Path
        Destination file; parent directories are created if missing.

    Raises
    ------
    TypeError, ValueError
        If ``entries`` is not JSON-serialisable. In that case the existing
        file (if any) is left untouched.
    """
    out_path = Path(out_path)  # also accept plain string paths

    # Serialise BEFORE opening the file: opening with "w" truncates it, so a
    # failure halfway through json.dump would leave a corrupt dataset behind.
    payload = json.dumps(entries, indent=4, ensure_ascii=False)

    out_path.parent.mkdir(parents=True, exist_ok=True)
    with open(out_path, "w", encoding="utf-8") as f:
        f.write(payload)
    log(f"[INFO] Wrote dataset JSON ‚Üí {out_path}")

def save_counts_csv(paper_ids, paper_counts, out_path=COUNTS_CSV_PATH):
    """
    Write the per-paper image counts to paper_list_counts_5.csv.

    One row is written per entry of ``paper_ids``, in order:
    - papers present in ``paper_counts`` get their count (0 included)
    - papers absent from ``paper_counts`` get an empty second column
    """
    out_path.parent.mkdir(parents=True, exist_ok=True)
    with open(out_path, "w", newline="", encoding="utf-8") as f:
        writer = csv.writer(f)
        writer.writerow(["paper_id", "num_images"])
        # Blank cell marks a paper that was never inspected.
        writer.writerows(
            [pid, paper_counts[pid] if pid in paper_counts else ""]
            for pid in paper_ids
        )

    log(f"[INFO] Wrote counts CSV ‚Üí {out_path}")


In [None]:
import numpy as np
import cv2
import fitz

def compute_color_score(pix: fitz.Pixmap, debug=False) -> float:
    """
    Compute a heuristic score from colour variety and edge "curviness".

    The score is ``0.4 * norm_color + 0.6 * norm_curve`` where
    ``norm_color`` rescales the number of distinct pixel colours from
    [300, 1200] to [0, 1] and ``norm_curve`` rescales the share of Canny
    edge pixels NOT explained by Hough line segments from [0.2, 0.5] to
    [0, 1]. Both terms are clamped.

    Parameters
    ----------
    pix : fitz.Pixmap
        A PyMuPDF Pixmap object (from PDF).
    debug : bool
        Accepted for backward compatibility; it currently has no effect on
        the returned value.

    Returns
    -------
    float
        Score between 0.0 and 1.0, rounded to 3 decimals
        (high = colourful and/or curvy image).
    """

    # --- Convert pixmap to RGB numpy image ---
    # assumes rows are tightly packed (stride == width * n) -- TODO confirm
    img = np.frombuffer(pix.samples, dtype=np.uint8)
    img = img.reshape(pix.height, pix.width, pix.n)
    if pix.n < 3:
        # Grayscale (optionally with alpha): replicate the gray channel so the
        # 3-channel operations below do not fail.
        img = np.repeat(img[:, :, :1], 3, axis=2)
    elif pix.n > 3:
        img = img[:, :, :3]  # strip alpha
        # NOTE(review): a 4-channel CMYK pixmap would be misread here; confirm
        # that callers only pass gray/RGB(A) pixmaps.

    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)

    # --- Unique colors ---
    unique_colors = len(np.unique(img.reshape(-1, 3), axis=0))

    # --- Curve ratio ---
    edges = cv2.Canny(gray, 50, 150)
    total_edge_pixels = np.sum(edges > 0)

    lines = cv2.HoughLinesP(
        edges, 1, np.pi / 180,
        threshold=150,
        minLineLength=40,
        maxLineGap=5
    )

    # Total length of all detected straight segments (0 when none found).
    if lines is None:
        straight_pixels = 0.0
    else:
        segs = np.asarray(lines, dtype=np.float64).reshape(-1, 4)
        straight_pixels = float(
            np.hypot(segs[:, 2] - segs[:, 0], segs[:, 3] - segs[:, 1]).sum()
        )

    # Overlapping segments can push the straight length above the edge count;
    # the resulting negative ratio is clamped to 0 by normalize().
    curve_ratio = 1.0 - (straight_pixels / total_edge_pixels) if total_edge_pixels > 0 else 0.0

    # --- Better normalization ranges (tighter) ---
    def normalize(val, low, high):
        return min(max((val - low) / (high - low), 0.0), 1.0)

    # Aggressively penalize colorful and curvy diagrams
    norm_color = normalize(unique_colors, low=300, high=1200)     # more sensitive
    norm_curve = normalize(curve_ratio, low=0.2, high=0.5)         # curves matter earlier

    # Combine with higher weight for curves
    color_score = 0.4 * norm_color + 0.6 * norm_curve

    # Always hand back a builtin float so formatting is uniform for callers.
    return float(round(color_score, 3))


In [None]:
from typing import Tuple


def compute_text_object_ratio(pix: fitz.Pixmap) -> float:
    """
    Compute the ratio between OCR text area and contour (object) area.

    Parameters:
    -----------
    pix : fitz.Pixmap
        A PyMuPDF Pixmap image object.

    Returns:
    --------
    ratio : float
        Summed bounding-box area of OCR items with confidence > 30 divided by
        the summed area of external Canny contours larger than 10 px,
        rounded to 3 decimals. Returns -1 when no object area is found.
    """
    # 1. Convert Pixmap to PIL Image (pick the mode from the channel count so
    #    grayscale pixmaps do not fail in frombytes).
    if pix.n == 1:
        img_mode = "L"
    elif pix.n == 2:
        img_mode = "LA"
    elif pix.n == 3:
        img_mode = "RGB"
    else:
        img_mode = "RGBA"
    pil_img = Image.frombytes(img_mode, (pix.width, pix.height), pix.samples)
    img = np.array(pil_img.convert("RGB"))

    gray = cv2.cvtColor(img, cv2.COLOR_RGB2GRAY)
    blurred = cv2.GaussianBlur(gray, (3, 3), 0)

    # 2. OCR text detection (bounding boxes)
    data = pytesseract.image_to_data(pil_img, output_type=pytesseract.Output.DICT)
    text_area = 0
    for conf, w, h in zip(data["conf"], data["width"], data["height"]):
        # Confidence may arrive as int, float or a string such as '96.285';
        # int() alone raises on the float-formatted strings.
        try:
            confidence = int(float(conf))
        except (TypeError, ValueError):
            continue
        if confidence > 30:  # confidence threshold
            text_area += int(w) * int(h)

    # 3. Object detection (contours)
    edged = cv2.Canny(blurred, 50, 150)
    contours, _ = cv2.findContours(edged, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    object_area = 0
    for cnt in contours:
        area = cv2.contourArea(cnt)
        if area > 10:  # ignore small noise
            object_area += area

    # 4. Compute ratio
    if object_area == 0:
        return -1  # avoid division by zero
    return round(text_area / object_area, 3)


In [None]:
def _save_accepted_image(
    pixmap,
    image_index,
    arxiv_id,
    page_number,
    figure_number,
    gates,
    problem,
    descriptions,
    positions
):
    """
    Apply the colour / text-ratio filters to one classified image.

    Returns the metadata dict (after saving the PNG into IMAGE_DIR) when the
    image passes both filters, or None when it is rejected.
    """
    color_score = compute_color_score(pixmap)
    img_ratio = compute_text_object_ratio(pixmap)
    if not (color_score < 0.685 and img_ratio < 2.6):
        return None

    # Both scores are embedded in the file name.
    image_filename = f"image_{image_index:04d}-{color_score}-{img_ratio}.png"
    save_pixmap_as_png(pixmap, image_filename, IMAGE_DIR)

    return build_metadata_entry(
        image_filename=image_filename,
        arxiv_id=arxiv_id,
        page_number=page_number,
        figure_number=figure_number,
        gates=gates,
        problem=problem,
        descriptions=descriptions,
        positions=positions
    )


def run_pipeline():
    """
    Main processing loop (raster images first, then vector figures).

    - Iterates over papers in paper_list_5.txt (in order)
    - Downloads each PDF (if needed)
    - Extracts raster image candidates and classifies them with
      is_raster_circuit_image(pixmap, page_text)
    - Extracts vector candidates and classifies them with
      is_vector_circuit_final(img, descriptions)
    - Saves accepted images that also pass the colour / text-ratio filters
    - Builds metadata entries for each saved image
    - Writes dataset_5.json and paper_list_counts_5.csv

    Stops as soon as MAX_IMAGES images have been saved.

    Uses:
    - MAX_IMAGES, IMAGE_DIR, DATASET_JSON_PATH, COUNTS_CSV_PATH
    - load_paper_list, download_pdf, extract_full_text, extract_images_from_pdf
    - extract_vector_circuit_images, pil_from_pixmap, semantic_info_from_image
    - is_raster_circuit_image, is_vector_circuit_final
    - infer_figure_number_from_page, find_descriptions_for_figure
    - detect_quantum_gates, detect_quantum_problem
    - build_metadata_entry, save_pixmap_as_png, save_dataset_json, save_counts_csv
    """

    paper_ids = load_paper_list()
    paper_counts = {}       # arxiv_id -> number of accepted circuit images
    dataset_entries = []    # list of metadata dicts

    total_circuit_images = 0
    next_image_index = 1

    log(f"[INFO] Starting RASTER pipeline. Target MAX_IMAGES={MAX_IMAGES}")
    log(f"[INFO] Papers in list: {len(paper_ids)}")

    for arxiv_id in paper_ids:
        # Global stopping condition
        if total_circuit_images >= MAX_IMAGES:
            log("[INFO] Reached MAX_IMAGES. Stopping.")
            break

        log(f"[INFO] Processing paper {arxiv_id}")
        num_circuits_in_paper = 0

        # -----------------------------
        # 1) Download / open PDF
        # -----------------------------
        pdf_path = download_pdf(arxiv_id)
        if pdf_path is None:
            log(f"[WARN] Skipping {arxiv_id} (PDF download failed).")
            paper_counts[arxiv_id] = 0
            continue

        # -----------------------------
        # 2) Extract text once (full + per-page)
        # -----------------------------
        try:
            full_text, page_texts = extract_full_text(pdf_path)
        except Exception as e:
            log(f"[ERROR] Text extraction failed for {arxiv_id}: {e}",
                logfile="parsing_errors.log")
            paper_counts[arxiv_id] = 0
            continue

        page_text_dict = {p: t for p, t in page_texts}

        # -----------------------------
        # 3a) Extract raster image candidates
        # -----------------------------
        raster_images = extract_images_from_pdf(pdf_path)
        log(f"[DEBUG] {arxiv_id}: {len(raster_images)} raster image candidates")

        # -----------------------------
        # 4a) Classify & keep only circuits (raster)
        # -----------------------------
        for rec in raster_images:
            if total_circuit_images >= MAX_IMAGES:
                break

            page_number = rec["page_number"]
            pixmap = rec["pixmap"]
            local_idx = rec["image_index"]   # index of this image on the page

            page_text = page_text_dict.get(page_number, "")

            if not is_raster_circuit_image(pixmap, page_text):
                continue  # reject non-circuit

            # 5) Metadata: figure number, descriptions, positions
            figure_number = infer_figure_number_from_page(page_text, local_idx)

            descriptions, positions = [], []
            if figure_number is not None:
                descriptions, positions = find_descriptions_for_figure(
                    full_text=full_text,
                    page_texts=page_texts,
                    page_number=page_number,
                    figure_number=figure_number
                )

            gates = detect_quantum_gates(descriptions)
            problem = detect_quantum_problem(descriptions)

            # 6-7) Filter, save PNG, build metadata entry
            meta = _save_accepted_image(
                pixmap, next_image_index, arxiv_id, page_number,
                figure_number, gates, problem, descriptions, positions
            )
            if meta is None:
                continue
            dataset_entries.append(meta)

            # 8) Update counters
            next_image_index += 1
            total_circuit_images += 1
            num_circuits_in_paper += 1

        num_raster_in_paper = num_circuits_in_paper
        paper_counts[arxiv_id] = num_circuits_in_paper
        log(f"[INFO] Paper {arxiv_id}: {num_circuits_in_paper} raster circuit images")

        # Cap already reached: skip the vector extraction, since none of its
        # results could be kept. The check at the top of the loop then stops
        # the run.
        if total_circuit_images >= MAX_IMAGES:
            continue

        # -----------------------------
        # 3b) Extract vector candidates
        # -----------------------------
        try:
            # materialize generator so the candidate count can be logged
            vector_recs = list(
                extract_vector_circuit_images(pdf_path, full_text, page_texts)
            )
        except Exception as e:
            log(
                f"[ERROR] Vector extraction failed for {arxiv_id}: {e}",
                logfile="parsing_errors.log"
            )
            # Keep the raster count recorded above: those images are already
            # saved and present in the dataset.
            continue

        log(f"[DEBUG] {arxiv_id}: {len(vector_recs)} raw vector candidates")

        # -----------------------------
        # 4b) Classify & keep only circuits (vector)
        # -----------------------------
        for vrec in vector_recs:
            if total_circuit_images >= MAX_IMAGES:
                break

            pixmap = vrec["pixmap"]
            img = pil_from_pixmap(pixmap)

            page_number   = vrec["page_number"]
            figure_number = vrec.get("figure_number")  # may be None
            descriptions  = vrec.get("descriptions", []) or []
            positions     = vrec.get("positions", []) or []

            # Fall back to a text search when the record carries no captions.
            if not descriptions and figure_number is not None:
                try:
                    desc2, pos2 = find_descriptions_for_figure(
                        full_text=full_text,
                        page_texts=page_texts,
                        page_number=page_number,
                        figure_number=figure_number
                    )
                    if desc2:
                        descriptions = desc2
                        positions = pos2
                except Exception as e:
                    log(
                        f"[WARN] Description extraction failed for {arxiv_id} "
                        f"page {page_number}, fig {figure_number}: {e}",
                        logfile="parsing_errors.log"
                    )

            if not is_vector_circuit_final(img, descriptions):
                continue  # reject non-circuit

            # 5) Gate + problem detection (OCR + captions)
            gates, problem = semantic_info_from_image(pixmap, descriptions)

            # 6-7) Filter, save PNG, build metadata entry
            meta = _save_accepted_image(
                pixmap, next_image_index, arxiv_id, page_number,
                figure_number, gates, problem, descriptions, positions
            )
            if meta is None:
                continue
            dataset_entries.append(meta)

            # 8) Update counters
            next_image_index += 1
            total_circuit_images += 1
            num_circuits_in_paper += 1

        # The CSV gets the combined count; the log reports the vector share only.
        paper_counts[arxiv_id] = num_circuits_in_paper
        num_vector_in_paper = num_circuits_in_paper - num_raster_in_paper
        log(f"[INFO] Paper {arxiv_id}: {num_vector_in_paper} vector circuit images")

    # -----------------------------
    # 9) Save outputs
    # -----------------------------
    save_dataset_json(dataset_entries)
    save_counts_csv(paper_ids, paper_counts)
    log(f"[INFO] RASTER pipeline finished. Total images: {total_circuit_images}")


In [None]:
# Entry point: run the whole extraction pipeline when executed as a script.
if __name__ == "__main__":
    run_pipeline()