# Debug Pipeline: FlowFigTabMiner

Use this notebook to debug the pipeline step by step. Note: API keys and model selection are now read from the central `Config` (backed by `.env`), the same source `main.py` uses, rather than being overridden directly in this notebook.

In [1]:
# 1. Setup & Configuration
import os
import sys

# Ensure project root is in path.
# Guarded so that re-running this cell does not append the same directory again.
# NOTE(review): this treats the kernel's working directory as the project root — confirm
# the notebook is always launched from there.
project_root = os.getcwd()
if project_root not in sys.path:
    sys.path.append(project_root)

from config import Config
from src.llm_factory import LLMFactory

# Note: All configuration is now centrally managed by src.config.Config and .env
# This ensures consistency between this notebook and main.py

# Echo the provider and model name currently held by Config.
print(f"Current Provider: {Config.LLM_PROVIDER}")
print(f"Current Model: {Config.LLM_MODEL_NAME}")

# Test LLM Connection
# NOTE: the actual request is commented out below, so as written this block only
# prints status messages and does not contact any LLM. The except branch only
# becomes meaningful once the two commented lines are re-enabled.
try:
    print("Testing LLM connection...")
    # response = LLMFactory.create_completion("Hello, config is working!")
    # print(f"LLM Response: {response}")
    print("LLM Factory initialized (Uncomment lines above to test actual call)")
except Exception as e:
    # Report the failure as text rather than raising, so the rest of the notebook can still be run.
    print(f"LLM Connection Error: {e}")

    You are using a Python version 3.9 past its end of life. Google will update
    google-auth with critical bug fixes on a best-effort basis, but not
    with any other fixes or features. Please upgrade your Python version,
    and then update google-auth.
    
    You are using a Python version 3.9 past its end of life. Google will update
    google-auth with critical bug fixes on a best-effort basis, but not
    with any other fixes or features. Please upgrade your Python version,
    and then update google-auth.
    


Current Provider: qwen
Current Model: qwen-vl-max
Testing LLM connection...
LLM Factory initialized (Uncomment lines above to test actual call)


  from .autonotebook import tqdm as notebook_tqdm

All support for the `google.generativeai` package has ended. It will no longer be receiving 
updates or bug fixes. Please switch to the `google.genai` package as soon as possible.
See README for more details:

https://github.com/google-gemini/deprecated-generative-ai-python/blob/main/README.md

  import google.generativeai as genai


In [2]:
# 2. PDF Parsing (Step 1)
from src.parsing.docling_wrapper import parse_pdf_to_markdown

# Path of the PDF that the later cells in this notebook read.
input_pdf = "data/input/example.pdf"

if os.path.exists(input_pdf):
    print("Running Docling...")
    # The Docling call itself is disabled; uncomment to run the conversion and preview it.
    # markdown_text = parse_pdf_to_markdown(input_pdf)
    # print(f"Docling extracted {len(markdown_text)} characters")
    # print(markdown_text[:500] + "...") # Preview
else:
    print(f"Warning: {input_pdf} not found. Please place a PDF there.")

Running Docling...


In [3]:
# 3. TF-ID Extraction (Step 2 - Track B)
import src.parsing.active_area_detector
from importlib import reload
reload(src.parsing.active_area_detector)
from src.parsing.active_area_detector import ActiveAreaDetector

# Start from empty lists so the table/figure cells always find these names, and so
# a re-run without the PDF cannot silently reuse crops left over from an earlier run.
table_images = []
figure_images = []

if os.path.exists(input_pdf):
    import json

    print("Initializing TF-ID Detector (this will download models to local dir)...")
    detector = ActiveAreaDetector()
    detections = detector.process_pdf(input_pdf)
    # Dump the raw detections for inspection while debugging.
    print(json.dumps(detections, indent=2))

    # --- SAVE CROPS ---
    print("Saving detected crops to data/intermediate...")
    saved_paths = detector.save_crops(input_pdf, detections, "data/intermediate")
    print(f"Saved {len(saved_paths)} images: {saved_paths}")

    # Define variables for next steps.
    # Presumably save_crops embeds the detection label in each saved path
    # ("_table_" / "_figure_") — verify against ActiveAreaDetector.save_crops.
    table_images = [p for p in saved_paths if '_table_' in p]
    figure_images = [p for p in saved_paths if '_figure_' in p]
else:
    # Make the skip visible instead of leaving the cell silent.
    print(f"Skipping TF-ID extraction: {input_pdf} not found.")

Initializing TF-ID Detector (this will download models to local dir)...
Loading TF-ID (Florence-2) model: yifeihu/TF-ID-base on mps...


Florence2LanguageForConditionalGeneration has generative capabilities, as `prepare_inputs_for_generation` is explicitly defined. However, it doesn't directly inherit from `GenerationMixin`. From 👉v4.50👈 onwards, `PreTrainedModel` will NOT inherit from `GenerationMixin`, and this model will lose the ability to call `generate` and other related functions.
  - If you are the owner of the model architecture code, please modify your model class such that it inherits from `GenerationMixin` (after `PreTrainedModel`, otherwise you'll get an exception).
  - If you are not the owner of the model architecture class, please contact the model code owner to update it.


PATCHING: Adding GenerationMixin to Florence2LanguageForConditionalGeneration bases for compatibility.
PATCHING: Initializing missing generation_config for language_model.
{
  "page_1": [
    {
      "label": "table",
      "score": 1.0,
      "box": [
        95.88,
        59.33,
        1093.93,
        193.79
      ]
    }
  ],
  "page_2": [
    {
      "label": "figure",
      "score": 1.0,
      "box": [
        360.28,
        948.41,
        1095.12,
        1438.83
      ]
    }
  ],
  "page_3": [
    {
      "label": "figure",
      "score": 1.0,
      "box": [
        361.47,
        556.07,
        1096.32,
        1437.25
      ]
    }
  ],
  "page_4": [
    {
      "label": "table",
      "score": 1.0,
      "box": [
        603.24,
        847.16,
        1095.12,
        1441.99
      ]
    }
  ],
  "page_5": [
    {
      "label": "figure",
      "score": 1.0,
      "box": [
        605.62,
        611.44,
        1095.12,
        1109.77
      ]
    },
    {
      "la

In [4]:
# 4. Table Extraction (Step 3A)
from src.extraction.table_agent import extract_table
import os

# Collects one {"source", "data"} record for every crop the table agent accepts.
extracted_tables = []

# Guard first: the crop list may be missing or empty if detection produced nothing.
if 'table_images' not in locals() or not table_images:
    print("No table images found from Step 3.")
else:
    print(f"Processing {len(table_images)} Tables...")
    for img_path in table_images:
        print(f"--> Extracting: {os.path.basename(img_path)}")
        try:
            result = extract_table(img_path)
            if not result.get("is_valid"):
                print(f"    ❌ Rejected: {result.get('reason')}")
                continue
            print("    ✅ Valid Table Extracted")
            extracted_tables.append({"source": img_path, "data": result})
        except Exception as err:
            # A failure on one crop is reported and the loop moves on to the next.
            print(f"    ⚠️ Error: {err}")

print(f"\nTotal Valid Tables: {len(extracted_tables)}")

In [5]:
# 5. Figure Extraction (Step 3B)
from src.extraction.figure_agent import extract_figure
import os

# Collects one {"source", "data"} record for every crop the figure agent accepts.
extracted_figures = []

# Guard first: the crop list may be missing or empty if detection produced nothing.
if 'figure_images' not in locals() or not figure_images:
    print("No figure images found from Step 3.")
else:
    print(f"Processing {len(figure_images)} Figures...")
    for img_path in figure_images:
        print(f"--> Extracting: {os.path.basename(img_path)}")
        try:
            result = extract_figure(img_path)
            if not result.get("is_valid"):
                print(f"    ❌ Rejected: {result.get('reason')}")
                continue
            print("    ✅ Valid Figure Extracted")
            extracted_figures.append({"source": img_path, "data": result})
        except Exception as err:
            # A failure on one crop is reported and the loop moves on to the next.
            print(f"    ⚠️ Error: {err}")

print(f"\nTotal Valid Figures: {len(extracted_figures)}")

In [6]:
# 6. Fusion (Step 4)
# NOTE(review): this step is disabled. The call below refers to `table_data`,
# `figure_data` and `global_context`, none of which are assigned in the visible cells
# (those produce `extracted_tables` and `extracted_figures`) — confirm the intended
# inputs before re-enabling.
# from src.fusion.data_merger import fuse_data

# final_dataset = fuse_data(table_data, figure_data, global_context)
# print("Fusion Complete")