# Phase 2: Data Ingestion Pipeline

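This notebook orchestrates the ingestion of the BRSR (Business Responsibility and Sustainability Report) PDF: loading the document, splitting it into text chunks, and extracting structured Principle 6 data from the relevant chunks.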

In [None]:
import sys
import os

# Add project root to path so that the `src` package can be imported from
# this notebook's directory. The membership check keeps the cell idempotent:
# re-running it must not append a duplicate entry to sys.path each time.
project_root = os.path.abspath('..')
if project_root not in sys.path:
    sys.path.append(project_root)

from src.ingest import IngestionEngine
from dotenv import load_dotenv

# Load variables from a local .env file into the process environment.
# NOTE(review): presumably this supplies the API credentials the engine
# needs at call time -- confirm against src/ingest.py.
load_dotenv()

# Initialize Engine
engine = IngestionEngine(model_name="gpt-4o")

In [None]:
pdf_path = "../data/target_report.pdf"

# Character budget for the context sent to the extraction agent. This is a
# rough stand-in for a token limit in the demo, not an exact token count.
MAX_CONTEXT_CHARS = 50000

# Lower-case markers used to pick the chunks worth sending to the agent.
# Matching is case-insensitive so headings such as "PRINCIPLE 6" are kept too.
RELEVANT_KEYWORDS = ("principle 6", "emissions")

if os.path.exists(pdf_path):
    # 1. Load and Chunk (CalQuity Style)
    chunks = engine.load_and_chunk(pdf_path)
    print(f"Loaded {len(chunks)} text chunks.")

    # 2. Select context for Structured Extraction (DataWeave Style)
    # Sending the whole report would be the naive approach; to save tokens,
    # only chunks mentioning one of the keywords are forwarded to the agent.
    relevant_texts = []
    for c in chunks:
        lowered = c['text'].lower()
        if any(keyword in lowered for keyword in RELEVANT_KEYWORDS):
            relevant_texts.append(c['text'])
    sub_text = "\n".join(relevant_texts)

    print(f"Extracted Context Length: {len(sub_text)} chars")

    if not relevant_texts:
        # Nothing matched: skip the (paid) model call instead of sending an
        # empty prompt to the extraction agent.
        print("No chunks mention 'Principle 6' or 'emissions'; skipping extraction.")
    else:
        if len(sub_text) > MAX_CONTEXT_CHARS:
            # Make the truncation visible rather than silently dropping text.
            print(f"Context truncated to the first {MAX_CONTEXT_CHARS} chars.")

        # 3. Run Extraction Agent
        data = engine.extract_principle_6(sub_text[:MAX_CONTEXT_CHARS])  # Hard limit for tokens in demo
        print("\n--- Extracted Data ---\n")
        print(data.model_dump_json(indent=2))
else:
    print("PDF not found. Please place 'target_report.pdf' in the data/ directory.")