# Chunk Categorization

This notebook classifies text chunks into specific contract categories using LLMs.

**Workflow:**
1. **Load Chunks**: Reads chunked text files.
2. **Langfuse Integration**: Manages prompts and traces execution.
3. **Gemini Classification**: Uses Google Gemini 2.5 Pro to classify each chunk into:
   - **Contract Section Type** (e.g., COMPENSATION)
   - **Second Level** (e.g., PAYABLE_ENTITLEMENT)
   - **Third Level** (e.g., LABOUR)
4. **Output**: Saves the classification results to JSON for database loading.


In [1]:
# ==============================================================================
# STEP 3: Batch classify chunk files with Gemini 2.5 Pro via LangChain + Langfuse
# ==============================================================================

import json
import os
import re
import time
from pathlib import Path

# --- Langfuse Imports ---
from langfuse import Langfuse
from langfuse.langchain import CallbackHandler

from langchain_core.messages import HumanMessage, SystemMessage
from langchain_google_genai import ChatGoogleGenerativeAI

In [2]:
# Credentials are read from the environment so that no secret is stored in the
# notebook. Export LANGFUSE_SECRET_KEY, LANGFUSE_PUBLIC_KEY and GOOGLE_API_KEY
# (and optionally LANGFUSE_HOST) before starting the kernel.
# NOTE(review): keys that were previously committed in this cell should be
# treated as compromised and rotated.
LANGFUSE_SECRET_KEY = os.environ.get("LANGFUSE_SECRET_KEY", "")
LANGFUSE_PUBLIC_KEY = os.environ.get("LANGFUSE_PUBLIC_KEY", "")
LANGFUSE_BASE_URL = os.environ.get("LANGFUSE_HOST", "http://localhost:3000")
GOOGLE_API_KEY = os.environ.get("GOOGLE_API_KEY", "")

# Report missing credentials up front instead of failing later with an opaque
# authentication error. Only the variable names are printed, never the values.
_missing_credentials = [
    name
    for name, value in (
        ("LANGFUSE_SECRET_KEY", LANGFUSE_SECRET_KEY),
        ("LANGFUSE_PUBLIC_KEY", LANGFUSE_PUBLIC_KEY),
        ("GOOGLE_API_KEY", GOOGLE_API_KEY),
    )
    if not value
]
if _missing_credentials:
    print("WARNING: missing environment variables: " + ", ".join(_missing_credentials))

In [3]:
# Publish the settings through environment variables. The clients created
# later in the notebook are constructed without explicit credentials and
# presumably pick these up -- confirm against the SDK documentation.
os.environ["LANGFUSE_SECRET_KEY"] = LANGFUSE_SECRET_KEY
os.environ["LANGFUSE_PUBLIC_KEY"] = LANGFUSE_PUBLIC_KEY
# Reuse the configured URL instead of repeating the literal.
os.environ["LANGFUSE_HOST"] = LANGFUSE_BASE_URL
# Keep an already exported GOOGLE_API_KEY. Written as a statement rather than
# a trailing `setdefault(...)` expression so the key is not echoed as the
# cell's output.
if "GOOGLE_API_KEY" not in os.environ:
    os.environ["GOOGLE_API_KEY"] = GOOGLE_API_KEY

'<redacted>'

In [None]:

# Root folder of the project data. Set the CHUNK_PROJECT_ROOT environment
# variable to run the notebook on another machine; the default keeps the
# original location.
PROJECT_ROOT = Path(
    os.environ.get(
        "CHUNK_PROJECT_ROOT",
        "/Users/swathi.gnanasekar/Documents/Vista_Vu_Project/Phase 1/Docling_Tweak",
    )
)

# Folder that is searched recursively for the chunked *.txt files.
BASE_DIR = PROJECT_ROOT / "CHUNK_NEW"
# JSON file that receives the classification results.
OUTPUT_PATH = PROJECT_ROOT / "Categorized" / "result_version 0.json"

# Appended to the system prompt so the model replies with the bare JSON object.
SYSTEM_SUFFIX = "\n\nOnly return the JSON object described under 'Required Output Format'. Do not add code fences or extra text."

# Langfuse client: used below to fetch the system prompt and to flush traces.
# It is created without arguments, so its configuration presumably comes from
# the environment -- confirm against the SDK documentation.
langfuse = Langfuse()

# Settings for the Gemini chat model shared by every classification request.
_GEMINI_SETTINGS = {
    "model": "gemini-2.5-pro",
    "temperature": 0,
    "convert_system_message_to_human": True,
}
model = ChatGoogleGenerativeAI(**_GEMINI_SETTINGS)

def extract_json(content) -> dict:
    """Normalize model content to text and return the JSON object it contains.

    Args:
        content: The model reply. May be a string, a list of parts (dicts
            with a "text" key or arbitrary values), or any other object,
            which is converted with ``str``.

    Returns:
        The decoded JSON object. If the whole text is not a JSON object, the
        first decodable ``{...}`` object embedded in the text is returned.

    Raises:
        json.JSONDecodeError: If no JSON object can be decoded from the text.
    """
    if isinstance(content, list):
        # Join without a separator: a space between parts would corrupt a
        # JSON token that happens to be split across two parts.
        content = "".join(
            part.get("text", "") if isinstance(part, dict) else str(part)
            for part in content
        ).strip()
    if not isinstance(content, str):
        content = str(content)

    try:
        parsed = json.loads(content)
    except json.JSONDecodeError as exc:
        whole_text_error = exc
    else:
        if isinstance(parsed, dict):
            return parsed
        whole_text_error = json.JSONDecodeError("Expected a JSON object", content, 0)

    # Fallback: try every "{" as a possible start of an object. Unlike a
    # greedy regex this tolerates code fences, surrounding prose and stray
    # braces before or after the object.
    decoder = json.JSONDecoder()
    start = content.find("{")
    while start != -1:
        try:
            candidate, _ = decoder.raw_decode(content, start)
        except json.JSONDecodeError:
            candidate = None
        if isinstance(candidate, dict):
            return candidate
        start = content.find("{", start + 1)

    raise whole_text_error

def classify_chunk(chunk_text: str, chunk_name: str) -> dict:
    """Classify a single chunk with the Gemini model.

    The system prompt named "chunk-classification-system" is fetched from
    Langfuse on every call, compiled without variables and extended with
    SYSTEM_SUFFIX. The chunk name and text are sent as the human message.

    Args:
        chunk_text: Text of the chunk to classify.
        chunk_name: Name of the chunk, included in the human message.

    Returns:
        The model reply parsed by ``extract_json``.
    """
    # NOTE(review): which prompt version/label get_prompt returns by default
    # is not visible here -- confirm against the Langfuse documentation.
    prompt_template = langfuse.get_prompt("chunk-classification-system")
    system_instructions = prompt_template.compile() + SYSTEM_SUFFIX
    user_payload = f"Chunk name: {chunk_name}\n\nChunk text:\n{chunk_text}"

    # A fresh callback handler per call is passed to the model invocation so
    # the execution is reported to Langfuse.
    tracer = CallbackHandler()
    reply = model.invoke(
        [
            SystemMessage(content=system_instructions),
            HumanMessage(content=user_payload),
        ],
        config={"callbacks": [tracer]},
    )

    return extract_json(reply.content)

# Batch run: classify every chunk file under BASE_DIR and collect the results.

# Pause between two model calls (presumably to stay below API rate limits).
REQUEST_DELAY_SECONDS = 0.2

results = []
failed_count = 0
txt_files = sorted(BASE_DIR.glob("**/*.txt"))

# The loop runs inside try/finally so that results gathered so far are still
# written, and pending traces still flushed, when the run is interrupted or a
# file cannot be read.
try:
    for path in txt_files:
        chunk_text = path.read_text(encoding="utf-8", errors="ignore").strip()
        if not chunk_text:
            # Empty files are skipped and do not appear in the output.
            continue

        print(f"Processing: {path.name}")

        try:
            classification = classify_chunk(chunk_text, path.name)
        except Exception as exc:
            # Record the failure as a placeholder entry so that one bad chunk
            # does not abort the whole batch.
            failed_count += 1
            classification = {
                "Contract Section Type": "ERROR",
                "Second Level": "",
                "Third Level": "",
                "error": str(exc),
            }

        results.append(
            {
                "chunk_file": str(path.relative_to(BASE_DIR)),
                "classification": classification,
            }
        )

        time.sleep(REQUEST_DELAY_SECONDS)
finally:
    OUTPUT_PATH.parent.mkdir(parents=True, exist_ok=True)
    with OUTPUT_PATH.open("w", encoding="utf-8") as f:
        json.dump(results, f, indent=2)

    print(f"Classified {len(results)} chunks into {OUTPUT_PATH}")
    if failed_count:
        print(f"{failed_count} of them failed and are marked as ERROR")

    # Ensure all traces are sent before exiting
    langfuse.flush()

E0000 00:00:1767120014.726540  373686 alts_credentials.cc:93] ALTS creds ignored. Not running on GCP and untrusted ALTS is not enabled.


Processing: 01__ARTICLE_100_-PURPOSE.txt
Processing: 02__ARTICLE_200_-RECOGNITION_AND_CRAFT_JURISDICTION.txt
Processing: 03__ARTICLE_300_-MANAGEMENT_RIGHTS.txt
Processing: 04__ARTICLE_400_-UNION_SECURITY_AND_DUES_COLLECTION.txt
Processing: 05__ARTICLE_500_-NO_STRIKES_OR_LOCKOUTS.txt
Processing: 06__ARTICLE_600_-WORKING_CONDITIONS_SAFEFTY_MEASURES_HEALTH_AND_SANITATION.txt
Processing: 07__ARTICLE_700_-WELDING_TESTS.txt
Processing: 08__ARTICLE_800_-ACCESS_TO_JOBS.txt
Processing: 09__ARTICLE_900_-JOB_STEWARDS.txt
Processing: 10__ARTICLE_1000_-GRIEVANCE_PROCEDURE.txt
Processing: 11__ARTICLE_1100_-HOURS_OF_WORK.txt
Processing: 12__ARTICLE_1200_-SHIFT_WORK.txt
Processing: 13__ARTICLE_1300_-OVERTIMEOVERTIME_MEAL_BREAKS.txt
Processing: 14__ARTICLE_1400_-RECOGNIZED_HOLIDAYS.txt
Processing: 15__ARTICLE_1500_-WAITING_AND_REPORTING_TIME.txt
Processing: 16__ARTICLE_1600_-TRAVEL_AND_SUBSISTENCE.txt
Processing: 17__ARTICLE_1700_-VACATION_WITH_PAY.txt
Processing: 18__ARTICLE_1800_-PAY_DAY.txt
Processi