In [1]:
from dotenv import load_dotenv
import os

In [2]:

# Read the .env file one directory above the notebook into the process
# environment. Kept as the cell's last expression so the return value of
# load_dotenv() is echoed as the cell output.
load_dotenv(dotenv_path='../.env')

True

In [3]:
# Look the key up in the process environment; this is None when the variable is unset.
anthropic_api_key = os.environ.get("ANTHROPIC_API_KEY")

In [4]:
import json
from pathlib import Path

from src.services.llm import AnthropicBatchProvider, build_extraction_batch_request
from src.knowledge_graph.schemas import ExtractionResult, Ontology

In [5]:
import json
from pathlib import Path

from src.services.llm import AnthropicBatchProvider, build_extraction_batch_request
from src.knowledge_graph.schemas import ExtractionResult, Ontology

# ============================================================================
# Extraction prompt template (from src/knowledge_graph/extract_entities_batch.py)
# ============================================================================
# Filled in with str.format() further down in this cell; the placeholders it
# expects are {series_name}, {book_names} and {section_id}.
# NOTE(review): this is a copy of the template from the module named above, so
# the two can drift apart; consider importing it instead if the module exports it.
# NOTE(review): the prompt text contains typos ("EVOLOUTION", "strongly clearly").
# They are left untouched here because the string is sent verbatim to the model.
EXTRACTION_PROMPT_TEMPLATE = """
You are an expert knowledge graph extractor for the "{series_name}" series.
Your goal is to extract a comprehensive list of entities and relationships from the provided text chunk.

STRICT ADHERENCE & ADAPTIVE EVOLOUTION:
1. You must primarily use the provided Ontology Schema to categorize entities and relationships.
2. IF you encounter a significant entity or relationship that strongly clearly does NOT fit existing definitions:
   - DO NOT force it into an incorrect category.
   - DO NOT ignore it if it is important.
   - PROPOSE a schema update in the `schema_proposals` field (e.g., "new_entity_type", "new_relationship_type").
3. Use the Canonical Renaming Rules to normalize entity names.

INPUT CONTEXT:
This text is from: {book_names}
Section ID: {section_id}

TASK:
1. Identify all significant entities (Characters, Locations, Organizations, Artifacts, Events, etc.).
2. Extract detailed attributes for each entity.
3. Identify all relationships between these entities.
4. Extract evidence (quotes) for relationships.
5. Assign a CONFIDENCE score (0.0-1.0) to each extraction.
6. Identify gaps in the schema and propose updates in `schema_proposals`.

OUTPUT FORMAT:
Return a JSON object strictly matching the `ExtractionResult` Pydantic model.
"""

# ============================================================================
# Initialize provider
# ============================================================================
# The API key is the value read from the environment in an earlier cell;
# jobs_dir points the provider at ../data/batch_jobs.
provider = AnthropicBatchProvider(jobs_dir="../data/batch_jobs", api_key=anthropic_api_key)

# ============================================================================
# Load schema and build extraction prompt for one section
# ============================================================================
# Every file below is read as UTF-8 explicitly. Without an `encoding` argument,
# open() / Path.read_text() fall back to the platform locale encoding, which can
# mis-decode (or fail on) non-ASCII characters in the book text, e.g. on Windows.

# Load ontology schema for the series
with open("../data/schemas/harry_potter_schema.json", "r", encoding="utf-8") as f:
    schema_data = json.load(f)
ontology = Ontology(**schema_data)
schema_context = f"ONTOLOGY:\n{ontology.model_dump_json(indent=2)}"

# Load one section file
section_file = Path("../data/processed_books_claude_200k/harry_potter_section_01.txt")
text_content = section_file.read_text(encoding="utf-8")

# Load metadata from the sidecar file next to the section (<stem>.meta.json).
# The path is derived from `section_file` so the text and its metadata cannot
# get out of sync when the section is changed.
meta_file = section_file.with_suffix(".meta.json")
with open(meta_file, "r", encoding="utf-8") as f:
    meta = json.load(f)
book_names = ", ".join(meta.get("books", []))  # empty string when "books" is missing
section_id = str(meta.get("section", "1"))  # falls back to "1" when "section" is missing

# Build full prompt: instructions, then the ontology, then the raw section text
extraction_prompt = EXTRACTION_PROMPT_TEMPLATE.format(
    series_name="Harry Potter",
    book_names=book_names,
    section_id=section_id,
)
full_prompt = f"{extraction_prompt}\n\n{schema_context}\n\nTEXT CONTENT:\n{text_content}"

# Describe the single extraction request for this section. ExtractionResult
# (the Pydantic model from src/knowledge_graph/schemas.py) is passed as the schema.
section_request = build_extraction_batch_request(
    custom_id="harry_potter_section_01",
    prompt=full_prompt,
    schema=ExtractionResult,
    model="claude-opus-4-5-20251101",
    temperature=0.0,
    max_tokens=64000,
)

# The provider takes a list of requests; this batch contains just the one.
requests = [section_request]



[2m2026-01-12 13:27:26[0m [[32m[1minfo     [0m] [1mbatch_provider_initialized    [0m [36mapi_key[0m=[35m...ZQAA[0m [36mjobs_dir[0m=[35m../data/batch_jobs[0m [36mprovider[0m=[35mAnthropicBatchProvider[0m


In [None]:
# ============================================================================
# Submit batch
# ============================================================================
job = await provider.create_batch(
    requests=requests,
    output_dir="../data/extracted_graph_batch",
    model="claude-opus-4-5-20251101",
    series_name="Harry Potter"
)

print(f"Batch ID: {job.id}")

In [6]:
status = await provider.get_batch_status("msgbatch_01SM3TT5kGqVeajXysiPkj49")
print(f"Status: {status.processing_status}")
print(f"Succeeded: {status.request_counts.succeeded}")
print(f"Processing: {status.request_counts.processing}")

[2m2026-01-12 13:27:32[0m [[32m[1mdebug    [0m] [1mjob_saved                     [0m [36mbatch_id[0m=[35mmsgbatch_01SM3TT5kGqVeajXysiPkj49[0m [36mpath[0m=[35m../data/batch_jobs/msgbatch_01SM3TT5kGqVeajXysiPkj49.json[0m [36mprovider[0m=[35mAnthropicBatchProvider[0m
[2m2026-01-12 13:27:32[0m [[32m[1minfo     [0m] [1mbatch_status_retrieved        [0m [36mbatch_id[0m=[35mmsgbatch_01SM3TT5kGqVeajXysiPkj49[0m [36mendpoint[0m=[35mget_batch_status[0m [36merrored[0m=[35m0[0m [36mprocessing[0m=[35m0[0m [36mprovider[0m=[35mAnthropicBatchProvider[0m [36mstatus[0m=[35mended[0m [36msucceeded[0m=[35m1[0m
Status: ended
Succeeded: 1
Processing: 0


In [7]:
result = await provider.process_results_to_files(
    batch_id="msgbatch_01SM3TT5kGqVeajXysiPkj49",
    schema=ExtractionResult
)

[2m2026-01-12 13:27:34[0m [[32m[1mdebug    [0m] [1mjob_saved                     [0m [36mbatch_id[0m=[35mmsgbatch_01SM3TT5kGqVeajXysiPkj49[0m [36mpath[0m=[35m../data/batch_jobs/msgbatch_01SM3TT5kGqVeajXysiPkj49.json[0m [36mprovider[0m=[35mAnthropicBatchProvider[0m
[2m2026-01-12 13:27:34[0m [[32m[1minfo     [0m] [1mbatch_status_retrieved        [0m [36mbatch_id[0m=[35mmsgbatch_01SM3TT5kGqVeajXysiPkj49[0m [36mendpoint[0m=[35mget_batch_status[0m [36merrored[0m=[35m0[0m [36mprocessing[0m=[35m0[0m [36mprovider[0m=[35mAnthropicBatchProvider[0m [36mstatus[0m=[35mended[0m [36msucceeded[0m=[35m1[0m
[2m2026-01-12 13:27:34[0m [[32m[1minfo     [0m] [1mretrieving_batch_results      [0m [36mbatch_id[0m=[35mmsgbatch_01SM3TT5kGqVeajXysiPkj49[0m [36mendpoint[0m=[35mretrieve_results[0m [36mprovider[0m=[35mAnthropicBatchProvider[0m
[2m2026-01-12 13:27:35[0m [[32m[1minfo     [0m] [1mbatch_results_retrieved       [0m [36mbatch

In [8]:
# Display the summary returned by process_results_to_files
# (processed/failed counts, token usage, and the files that were written).
result

{'processed': 1,
 'failed': 0,
 'usage': UsageMetrics(input_tokens=123342, output_tokens=28540, total_tokens=151882, cached_tokens=0, reasoning_tokens=0, input_cost_usd=None, output_cost_usd=None, total_cost_usd=None, provider='anthropic', model='claude-opus-4-5-20251101', timestamp=1768204655.095932, api_key_last4='...ZQAA'),
 'files_written': ['../data/extracted_graph_batch/harry_potter_section_01_extracted.json']}