# Analyze Corpus

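This notebook surveys our story corpora (per-story and per-author statistics and plots) and then prototypes LLM-based scene/sequel extraction on an example story.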

## Imports

In [None]:
import datetime
import os
import pathlib

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns



Third-party modules

In [None]:
import dotenv
from openai import OpenAI
import tiktoken

Switch to the parent directory so paths can resolve and we write to the right directories.

In [None]:
# The kernel may start either in <project>/notebooks or in the project root.
# Treat the parent of a "notebooks" directory as the project root.
cwd = pathlib.Path.cwd().resolve()
if cwd.name == "notebooks":
    project_root = cwd.parent
else:
    project_root = cwd
scripts_dir = project_root / "scripts"

# Only move when the candidate root really looks like the project (it has a
# scripts/ directory) and we are not already there.
if scripts_dir.is_dir() and cwd != project_root:
    print(f"Changing working directory from {cwd} to {project_root}")
    os.chdir(project_root)
print("Working directory:", pathlib.Path.cwd())

Add imports from within the project (depends on prior cell)

In [None]:
from lcats import constants
from lcats import stories
from lcats import utils

from lcats.analysis import survey
from lcats.analysis import llm_extractor
from lcats.analysis import scenes
from lcats.analysis import text_indexing



In [None]:
from importlib import reload

# Project modules that reloader() re-imports, in this order.
RELOAD_MODULES = [
    constants,
    stories,
    llm_extractor,
    scenes,
    survey,
    text_indexing,
    utils,
]


def reloader():
    """Reload every module listed in RELOAD_MODULES, in list order.

    Prints each module before reloading it and a completion message at the
    end, so source edits can be picked up without restarting the kernel.
    """
    for mod in RELOAD_MODULES:
        print("Reloading", mod)
        reload(mod)
    print("Reloading complete.")


## Project Setup

### Path Setup

In [None]:
# Two candidate locations for the corpora:
#   working corpora:    project_root / "data"
#   checked-in corpora: a "corpora" directory next to the project root (used here)
# CORPORA_ROOT = project_root / "data"
CORPORA_ROOT = (project_root / ".." / "corpora").resolve()  # absolute path

print("Corpora root:", CORPORA_ROOT)
# The label stays on the same line; the listing itself is the cell's output value.
print("Corpora top-level directories:", end=" ")
os.listdir(CORPORA_ROOT)

In [None]:
# Discover the story files under the corpora root.
json_stories = survey.find_corpus_stories(CORPORA_ROOT)
# A bare `len(...)` in the middle of a cell is silently discarded (only the last
# expression of a cell is displayed), so print the count explicitly.
print("Stories found:", len(json_stories))
print(utils.sml(json_stories))

In [None]:
# Testing just with a sample of 10 stories for speed.
# NOTE: slice the discovered story list (`json_stories`); `stories` is the imported module.
# short_stories = json_stories[:10]
# story_stats, author_stats = survey.compute_corpus_stats(short_stories)
# Full run: compute the per-story and per-author statistics over every discovered story.
story_stats, author_stats = survey.compute_corpus_stats(json_stories)


In [None]:
story_stats.describe()

In [None]:
author_stats.describe()


In [None]:
author_stats

In [None]:
story_stats

In [None]:
fig, ax = survey.plot_author_stories_vs_tokens(author_stats, annotate_top=15)
plt.show()


In [None]:
fig, ax = survey.plot_author_stories_vs_tokens_sns(author_stats, annotate_top=10)
plt.show()

In [None]:
fig, ax = survey.plot_tokens_per_story_by_author(story_stats, top_n=24, min_stories=2, rotate_labels=45)
plt.show()


In [None]:
fig, ax = survey.plot_tokens_per_story_by_author_sns(story_stats, top_n=24, min_stories=2, rotate_labels=45)
plt.show()


In [None]:
fig, ax = survey.plot_tokens_per_story_vs_stories(
    author_stats, annotate_top=15, log_y=True, jitter=0.05, spread_step=4, x_spread=6)
plt.show()


## Scene-Sequel Extraction

### Path Setup

In [None]:
import pathlib

# Absolute, resolved directory the notebook is executing in.
CURRENT_PATH = pathlib.Path.cwd().resolve()

# The project root used to be the parent of notebooks/ (CURRENT_PATH.parent);
# it is now the current directory itself.
PROJECT_ROOT = CURRENT_PATH

# Local data/output directories inside the project.
DEV_CORPUS = PROJECT_ROOT.joinpath("data")
DEV_OUTPUT = PROJECT_ROOT.joinpath("output")

# Resources that live one level above the project root.
GIT_CORPUS = PROJECT_ROOT.parent.joinpath("corpora")
OPENIA_API_KEYS_ENV = PROJECT_ROOT.parent.joinpath(".secrets", "openai_api_keys.env")


def check_path(path: pathlib.Path, description: str) -> None:
    """Print whether ``path`` exists, labelled with ``description``.

    Args:
        path: Filesystem location to test.
        description: Human-readable label used in the printed message.
    """
    verb, preposition = ("Found", "at") if path.exists() else ("Missing", "from")
    print(f"{verb} {description} {preposition}: {path}")


check_path(DEV_CORPUS, "DEV_CORPUS")
check_path(DEV_OUTPUT, "DEV_OUTPUT")
check_path(GIT_CORPUS, "GIT_CORPUS")
check_path(OPENIA_API_KEYS_ENV, "OPENIA_API_KEYS_ENV")


## OpenAI Client

Get the OpenAI API key.

In [None]:
# Load the key file into the process environment and read the key back.
dotenv.load_dotenv(OPENIA_API_KEYS_ENV)
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
# Never echo the secret itself: cell outputs are saved with the notebook and
# are easily committed or shared. Report only whether a key was found.
print("OPENAI_API_KEY loaded:", bool(OPENAI_API_KEY))

Verify that we can get a client.

In [None]:
# The client is built without arguments, so the OPENAI_API_KEY variable from the
# previous cell is not passed in; presumably the SDK reads the key from the
# environment populated by load_dotenv — TODO confirm.
client = OpenAI()
# NOTE(review): `_version` is a private attribute of the client and may change between SDK releases.
print(f"Loaded OpenAI client: {client} with version: {client._version}")

Verify that the API is currently reachable and that the account still has credits.

In [None]:
# Smoke test: one small request through the Responses API.
response = client.responses.create(
    model="gpt-4o",
    input="Write a one-sentence bedtime story about a starship captain visiting a planet."
)

# `date` was never imported in this notebook, so use the datetime module
# (imported in the top import cell) to avoid a NameError on a fresh kernel.
print(f"Story generated on: {datetime.date.today()}:")
utils.pprint(response.output_text)

## Story Corpora

In [None]:
# If run from within a notebook, the corpora root is two paths up from the notebook's location.
# NOTE: this rebinds CORPORA_ROOT, which was already assigned in the earlier path-setup cell.
CORPORA_ROOT = GIT_CORPUS  # Checked-in corpora
# CORPORA_ROOT = DEV_CORPUS  # Command line working corpora

# Now load the corpora
corpora = stories.Corpora(CORPORA_ROOT)

# Summarize what was loaded: root path, number of corpora, number of stories.
print("Loaded corpora:")
print(f" - root: {corpora.corpora_root}")
print(f" - corpora: {len(corpora.corpora)}")
print(f" - stories: {len(corpora.stories)}")
print()

# Keep the first story as the running example for the extraction cells below.
# Indexing [0] raises IndexError if no stories were loaded.
print(f"Example story: corpora.stories[0]:")
example_story = corpora.stories[0]
print(f"Story type: {type(example_story)} with a body of {len(example_story.body)} characters.")
print(example_story)


## Scene and Sequel Extraction

In [None]:
scene_extractor = scenes.make_scene_sequel_extractor(client)

In [None]:
example_extraction = scene_extractor.extract(example_story.body)
example_extraction

In [None]:
example_result = example_extraction['extracted_output']
example_result

In [None]:
def display_scenes(scenes):
    """Print a short report for each extracted scene record.

    Args:
        scenes: Iterable of dicts; the keys read are 'event_type', 'reason'
            and 'event_text'. Missing keys fall back to 'unknown' (type and
            reason) or an empty string (text).

    For each record the type, the reason, the text length and a preview of
    the text shortened with ``utils.sm(..., limit=100)`` are printed,
    followed by a blank line.
    """
    for index, record in enumerate(scenes):
        kind = record.get('event_type', 'unknown')
        why = record.get('reason', 'unknown')
        text = record.get('event_text', '')
        report = [
            f"Scene {index}: Type {kind}",
            f" - Reason: {why}",
            f" - Text ({len(text)} characters): {utils.sm(text, limit=100)}",
            "",
        ]
        print("\n".join(report))


display_scenes(example_result)

In [None]:
reloader()

### Revised Prompts

In [None]:

# System prompt for the first revised extractor: asks for coarse segments,
# located by character offsets (start_char/end_char) into the story text.
# The prompt text is sent to the model, so any edit changes extraction behavior.
SCENE_SEQUEL_SYSTEM_PROMPT = """
You are a narrative segmentation assistant. Your job is to segment a story
into COARSE-GRAINED, contiguous narrative segments (“scenes” at the level
of time/place), then label each segment.

### Segment Types
- dramatic_scene: a narrative scene where a focal character with a Goal takes
  Action, encounters Conflict, and reaches a Disaster or Success (GACD).
- dramatic_sequel: a narrative scene (typically after a dramatic_scene) where
  a focal character experiences Emotion, reasons about Options, Anticipates
  outcomes, and Chooses a new goal (ERAC).
- narrative_scene: a narrative scene unified by time/place (and often
  character/action) but lacking clear GACD/ERAC structure.
- other: text that is not a narrative scene (e.g., front/back matter,
  epigraphs, meta-commentary, tables of contents, etc.).

### Granularity Rules (VERY IMPORTANT)
1) Coarse segmentation only. Prefer FEWER, LARGER segments over many small ones.
2) Split primarily on MEANINGFUL changes in TIME and/or PLACE (or explicit
   scene-break markers like “***”, chapter headers, clear time jumps).
3) Do NOT split simply because a paragraph or a couple of sentences shift topic.
   If time/place is stable, keep them in the same segment.
4) Merge tiny candidate segments (< ~3 sentences or ~100 characters) into
   adjacent segments unless there is an explicit time/place change.
5) Dialogue ping-pong alone is not a boundary; treat as one scene unless
   time/place changes.
6) A dramatic_sequel typically follows a dramatic_scene in the SAME time/place,
   unless the text clearly relocates the character in time/place.
7) If unsure between dramatic_scene vs narrative_scene, choose narrative_scene.
   If unsure between dramatic_sequel vs narrative_scene, choose narrative_scene.

### Output Requirements
- Output MUST be valid JSON only (no preface or commentary).
- Return a single object: { "segments": [ ... ] }.
- For each segment, include:
  - segment_id: integer index starting at 1.
  - segment_type: one of "dramatic_scene", "dramatic_sequel",
    "narrative_scene", "other".
  - start_char, end_char: 0-based character offsets into the provided story_text
    covering the ENTIRE contiguous segment (Python slicing semantics).
  - summary: a short summary (not the full text), ≤ 200 characters.
  - cohesion: brief notes identifying the unifying TIME/PLACE/CHARACTERS.
  - gacd: for dramatic_scene only (else null): { "goal": "...", "action": "...",
    "conflict": "...", "outcome": "Disaster|Success|Unclear" }.
  - erac: for dramatic_sequel only (else null): { "emotion": "...", "reason": "...",
    "anticipation": "...", "choice": "..." }.
  - reason: 1–3 sentences explaining the label and boundary choice (focus on
    time/place continuity and GACD/ERAC evidence).
  - confidence: float in [0,1].

Ensure segments are contiguous, non-overlapping, and collectively cover only
the parts of the text that are actual narrative (it is okay if front/back matter
is labeled as "other" and some gaps are unsegmented).
"""

# User prompt template for the first revised extractor. `{story_text}` is the
# only placeholder; literal JSON braces are doubled (`{{` / `}}`), presumably
# because the extractor fills the template with str.format — TODO confirm.
SCENE_SEQUEL_USER_PROMPT_TEMPLATE = """
You will receive a STORY in plain text. Segment it into COARSE narrative
segments and label each as dramatic_scene, dramatic_sequel, narrative_scene,
or other, following the system instructions.

Procedure you MUST follow (internally):
1) Skim the STORY to identify major time/place blocks and explicit scene-breaks.
2) Propose initial boundaries at major time/place changes or explicit markers.
3) Merge adjacent tiny spans (< ~3 sentences or ~100 chars) unless there is a
   real time/place shift.
4) Classify each final segment:
   - dramatic_scene → find GACD evidence (goal, action, conflict, outcome).
   - dramatic_sequel → find ERAC evidence (emotion, reason, anticipation, choice).
   - narrative_scene → time/place unified but no clear GACD/ERAC.
   - other → non-narrative material.
5) Produce the JSON described in the system prompt. Use the exact schema and keys.

Return ONLY JSON with this shape:
{{
  "segments": [
    {{
      "segment_id": 1,
      "segment_type": "dramatic_scene" | "dramatic_sequel" | "narrative_scene" | "other",
      "start_char": 0,
      "end_char": 1234,
      "summary": "<≤200-char summary of the segment>",
      "cohesion": {{
        "time": "<what time unifies this segment (if stated or implied)>",
        "place": "<what place unifies this segment (if stated or implied)>",
        "characters": ["<main character(s)>"]
      }},
      "gacd": {{
        "goal": "...",
        "action": "...",
        "conflict": "...",
        "outcome": "Disaster|Success|Unclear"
      }} | null,
      "erac": {{
        "emotion": "...",
        "reason": "...",
        "anticipation": "...",
        "choice": "..."
      }} | null,
      "reason": "<why this label and these boundaries>",
      "confidence": 0.0
    }}
  ]
}}

STORY:
\"\"\"{story_text}\"\"\"
"""

def make_scene_sequel_extractor(client):
    """Build a JSON prompt extractor configured with the scene/sequel prompts.

    Args:
        client: Client object handed to ``llm_extractor.JSONPromptExtractor``
            (an OpenAI client in this notebook).

    Returns:
        A ``JSONPromptExtractor`` that uses the notebook-level system prompt and
        user template, reads its result from the "segments" key, defaults to
        model "gpt-4o" at temperature 0.2, and is created with force_json=True.
    """
    extractor_options = dict(
        system_prompt=SCENE_SEQUEL_SYSTEM_PROMPT,
        user_prompt_template=SCENE_SEQUEL_USER_PROMPT_TEMPLATE,
        output_key="segments",
        default_model="gpt-4o",
        temperature=0.2,
        force_json=True,
    )
    return llm_extractor.JSONPromptExtractor(client, **extractor_options)

revised_extractor = make_scene_sequel_extractor(client)

In [None]:
revised_extraction = revised_extractor.extract(example_story.body)
revised_extraction

In [None]:
revised_result = revised_extraction['extracted_output']
revised_result

In [None]:
def display_revised(story_text, extracted_scenes):
    """Print a readable report for each extracted segment.

    Args:
        story_text: The story the segments refer to. It is only needed by the
            commented-out scene-text dump at the end of the loop.
        extracted_scenes: Iterable of segment dicts. The keys read are
            segment_id, segment_type, start_char, end_char, summary, cohesion,
            gacd, erac, reason and confidence; missing keys fall back to
            placeholder values.

    The segment length is reported as "unknown" when either offset is not an
    integer (for example when the model returned null), instead of raising a
    TypeError on the subtraction.
    """
    for i, scene in enumerate(extracted_scenes):
        segment_id = scene.get('segment_id', 'unknown')
        segment_type = scene.get('segment_type', 'unknown')
        start_char = scene.get('start_char', -1)
        end_char = scene.get('end_char', -1)
        summary = scene.get('summary', '')
        cohesion = scene.get('cohesion', {})
        gacd = scene.get('gacd', None)
        erac = scene.get('erac', None)
        reason = scene.get('reason', 'unknown')
        confidence = scene.get('confidence', -1.0)

        # Offsets come from model output and may be present but null.
        if isinstance(start_char, int) and isinstance(end_char, int):
            length_str = f"{end_char - start_char} chars"
        else:
            length_str = "unknown"

        print(f"Scene {i}: Type {segment_type} (Confidence: {confidence})")
        print(f" - Segmentation Rationale: {reason}")
        print(f" - Summary: {summary}")
        print(f" - Segment ID: {segment_id}, Chars: [{start_char}:{end_char}], Length: {length_str}")
        print(f" - Cohesion: {cohesion}")
        if gacd:
            print(f" - GACD: {gacd}")
        if erac:
            print(f" - ERAC: {erac}")
        # scene_text = story_text[start_char:end_char] if 0 <= start_char < end_char <= len(story_text) else ''
        # print(f" - Scene Text:")
        # print(scene_text)
        print()

display_revised(example_story.body, revised_result)


In [None]:
len(example_story.body)

In [None]:
utils.pprint(example_story.body)

In [None]:
# Second revision of the system prompt; this rebinds the constant defined in the
# earlier "Revised Prompts" cell. Segments are now located primarily by
# paragraph ids and verbatim text anchors, with character offsets optional.
# The prompt text is sent to the model, so any edit changes extraction behavior.
SCENE_SEQUEL_SYSTEM_PROMPT = """
You are a narrative segmentation assistant. Your job is to segment a story
into COARSE-GRAINED, contiguous narrative segments (“scenes” at the level
of time/place), then label each segment.

### Segment Types
- dramatic_scene: a narrative scene where a focal character with a Goal takes
  Action, encounters Conflict, and reaches a Disaster or Success (GACD).
- dramatic_sequel: a narrative scene (typically after a dramatic_scene) where
  a focal character experiences Emotion, reasons about Options, Anticipates
  outcomes, and Chooses a new goal (ERAC).
- narrative_scene: a narrative scene unified by time/place (and often
  character/action) but lacking clear GACD/ERAC structure.
- other: text that is not a narrative scene (e.g., front/back matter,
  epigraphs, meta-commentary, tables of contents, etc.).

### Granularity Rules (VERY IMPORTANT)
1) Coarse segmentation only. Prefer FEWER, LARGER segments over many small ones.
2) Split primarily on MEANINGFUL changes in TIME and/or PLACE (or explicit
   scene-break markers like “***”, chapter headers, clear time jumps).
3) Do NOT split simply because a paragraph or a couple of sentences shift topic.
   If time/place is stable, keep them in the same segment.
4) Merge tiny candidate segments (< ~3 sentences or ~100 characters) into
   adjacent segments unless there is an explicit time/place change.
5) Dialogue ping-pong alone is not a boundary; treat as one scene unless
   time/place changes.
6) A dramatic_sequel typically follows a dramatic_scene in the SAME time/place,
   unless the text clearly relocates the character in time/place.
7) If unsure between dramatic_scene vs narrative_scene, choose narrative_scene.
   If unsure between dramatic_sequel vs narrative_scene, choose narrative_scene.

### Coverage & Ordering Rules
- Ensure coverage across the entire STORY, not just the beginning. If later
  paragraphs are narrative but do not clearly fit GACD/ERAC, label them as
  narrative_scene (or other) rather than omitting them.
- Segments must be in ascending order, contiguous within their own boundaries,
  and non-overlapping. It is acceptable to include “other” segments for
  non-narrative material.

### Output Requirements (JSON ONLY)
Return exactly one JSON object: { "segments": [ ... ] }

For each segment include:
- segment_id: integer index starting at 1.
- segment_type: "dramatic_scene" | "dramatic_sequel" | "narrative_scene" | "other".

# --- Robust location selectors (PRIMARY) ---
- start_par_id: integer paragraph id where the segment begins (inclusive).
- end_par_id: integer paragraph id where the segment ends (inclusive).
- start_exact: the FIRST ≤120 characters of the segment, COPIED VERBATIM from the STORY text.
- end_exact: the LAST ≤120 characters of the segment, COPIED VERBATIM from the STORY text.
- start_prefix: ≤60 characters immediately BEFORE start_exact in the STORY ("" if none).
- end_suffix: ≤60 characters immediately AFTER end_exact in the STORY ("" if none).

Rules for anchors:
- Copy characters EXACTLY as they appear in the STORY (whitespace/punctuation included).
- Do NOT include paragraph id markers like [P0001] in start_exact/end_exact/prefix/suffix.
  These markers are scaffolding, not part of the narrative text.

# --- Advisory offsets (OPTIONAL) ---
- start_char: 0-based start index into the STORY string (Python slicing) or null if unsure.
- end_char: 0-based end index (exclusive) into the STORY or null if unsure.

# --- Descriptive fields ---
- summary: ≤200 characters summarizing the segment (not the full text).
- cohesion: brief notes identifying the unifying TIME/PLACE/CHARACTERS.
- gacd: for dramatic_scene only, else null:
  { "goal": "...", "action": "...", "conflict": "...", "outcome": "Disaster|Success|Unclear" }.
- erac: for dramatic_sequel only, else null:
  { "emotion": "...", "reason": "...", "anticipation": "...", "choice": "..." }.
- reason: 1–3 sentences justifying the label and boundary (refer to time/place continuity and GACD/ERAC cues).
- confidence: float in [0,1].
"""

# Second revision of the user template; this rebinds the constant defined in the
# earlier "Revised Prompts" cell. The placeholder is now `{indexed_story_text}`
# (story text carrying [P####] paragraph markers); literal JSON braces are
# doubled (`{{` / `}}`), presumably for str.format-style filling — TODO confirm.
SCENE_SEQUEL_USER_PROMPT_TEMPLATE = """
You will receive a STORY with paragraph ids embedded as markers like [P0001].
Use paragraph ids for boundaries and supply robust text anchors as described.

Procedure you MUST follow (internally):
1) Skim the STORY to identify major time/place blocks and explicit scene-breaks.
2) Propose initial boundaries at meaningful time/place changes or explicit markers.
3) Merge adjacent tiny spans (< ~3 sentences or ~100 chars) unless there is a real time/place shift.
4) Classify each final segment:
   - dramatic_scene → find GACD evidence (goal, action, conflict, outcome).
   - dramatic_sequel → find ERAC evidence (emotion, reason, anticipation, choice).
   - narrative_scene → time/place unified but no clear GACD/ERAC.
   - other → non-narrative material.
5) Ensure later paragraphs are not omitted: if unsure, label as narrative_scene or other.
6) Produce ONLY the JSON described in the system prompt, using the exact keys and schema.

Return ONLY JSON with this shape:
{{
  "segments": [
    {{
      "segment_id": 1,
      "segment_type": "dramatic_scene" | "dramatic_sequel" | "narrative_scene" | "other",
      "start_par_id": 1,
      "end_par_id": 3,
      "start_exact": "<first ≤120 chars of this segment, verbatim from STORY>",
      "end_exact": "<last ≤120 chars of this segment, verbatim from STORY>",
      "start_prefix": "<≤60 chars before start_exact or "">",
      "end_suffix": "<≤60 chars after end_exact or "">",
      "start_char": null,
      "end_char": null,
      "summary": "<≤200-char summary>",
      "cohesion": {{
        "time": "<unifying time (stated or implied)>",
        "place": "<unifying place (stated or implied)>",
        "characters": ["<main character(s)>"]
      }},
      "gacd": {{
        "goal": "...",
        "action": "...",
        "conflict": "...",
        "outcome": "Disaster|Success|Unclear"
      }} | null,
      "erac": {{
        "emotion": "...",
        "reason": "...",
        "anticipation": "...",
        "choice": "..."
      }} | null,
      "reason": "<why these boundaries/label>",
      "confidence": 0.0
    }}
  ]
}}

STORY (with paragraph ids; DO NOT include [P####] markers in anchors):
\"\"\"{indexed_story_text}\"\"\"
"""

# Rebuild the extractor with the paragraph-anchored prompts defined above in this
# cell. Compared with make_scene_sequel_extractor it passes two extra hooks:
#   text_indexer   - presumably inserts the [P####] paragraph markers the prompt
#                    refers to before the story is sent — TODO confirm.
#   result_aligner - presumably maps the returned anchors back to character
#                    offsets in the original text — TODO confirm.
# This rebinds `revised_extractor` from the earlier cell.
revised_extractor = llm_extractor.JSONPromptExtractor(
    client,
    system_prompt=SCENE_SEQUEL_SYSTEM_PROMPT,
    user_prompt_template=SCENE_SEQUEL_USER_PROMPT_TEMPLATE,  # expects {indexed_story_text} or {story_text}
    output_key="segments",
    default_model="gpt-4o",
    temperature=0.2,
    force_json=True,
    text_indexer=text_indexing.paragraph_text_indexer,
    result_aligner=text_indexing.segments_result_aligner,
)


In [None]:
revised_extraction = revised_extractor.extract(example_story.body)
revised_extraction

In [None]:
revised_result = revised_extraction['extracted_output']
revised_result

In [None]:
def display_revised(story_text, extracted_scenes):
    """
    Pretty-print segment results produced by the updated extractor.

    Args:
        story_text: Raw story text that the segments refer to.
        extracted_scenes: Iterable of segment dicts with the keys requested by
            the prompt (segment_id, segment_type, start_par_id, end_par_id,
            start_exact, end_exact, start_prefix, end_suffix, start_char,
            end_char, summary, cohesion, gacd, erac, reason, confidence).
            Missing keys fall back to placeholder values.

    Behavior:
    - Uses start_char/end_char when they form a valid span of story_text.
    - If missing/invalid, derives a best-effort span from start_exact/end_exact;
      when end_exact cannot be found, falls back to a window of at least 120
      characters after the start of start_exact (clipped to the story length).
    - Uses utils.sm for compact previews.
    - Normalizes preview text:
        * collapse runs of spaces to a single space
        * single newlines -> spaces
        * 2+ newlines -> single newline
    - Non-string field values are converted with str() before previewing, and a
      free-text (non-dict) cohesion value is printed as-is instead of crashing.
    """
    import re
    from lcats import utils

    def _normalize_preview(s: str) -> str:
        if not s:
            return ""
        # unify newlines
        s = s.replace("\r\n", "\n").replace("\r", "\n")
        # mark paragraph breaks (2+ newlines)
        s = re.sub(r"\n{2,}", "\u2029", s)
        # single newlines -> spaces
        s = s.replace("\n", " ")
        # collapse spaces/tabs
        s = re.sub(r"[ \t\u00A0]+", " ", s).strip()
        # restore paragraph breaks to single newline
        s = s.replace("\u2029", "\n")
        return s

    def _sm_norm(s, limit: int) -> str:
        # Model output is not guaranteed to be a string; coerce truthy
        # non-strings so the normalizer never fails on numbers/lists/dicts.
        if s and not isinstance(s, str):
            s = str(s)
        return utils.sm(_normalize_preview(s or ""), limit=limit)

    n_text = len(story_text)

    # Defined once (not per segment): a span is valid when both ends are ints
    # and it is a non-empty slice inside the story.
    def _valid_span(a, b):
        return isinstance(a, int) and isinstance(b, int) and 0 <= a < b <= n_text

    for i, seg in enumerate(extracted_scenes):
        segment_id   = seg.get("segment_id", "unknown")
        segment_type = seg.get("segment_type", "unknown")
        confidence   = seg.get("confidence", -1.0)
        reason       = seg.get("reason", "unknown")
        summary      = seg.get("summary", "")

        cohesion     = seg.get("cohesion", {}) or {}
        gacd         = seg.get("gacd", None)
        erac         = seg.get("erac", None)

        # Anchors & paragraph ids (new fields)
        start_par_id = seg.get("start_par_id", None)
        end_par_id   = seg.get("end_par_id", None)
        start_exact  = seg.get("start_exact", "") or ""
        end_exact    = seg.get("end_exact", "") or ""
        start_prefix = seg.get("start_prefix", "") or ""
        end_suffix   = seg.get("end_suffix", "") or ""

        # Offsets (may be missing/invalid)
        start_char = seg.get("start_char", None)
        end_char   = seg.get("end_char", None)

        span_note = ""
        if not _valid_span(start_char, end_char):
            # Derive from anchors if possible (raw text; no normalization here)
            s_idx = story_text.find(start_exact) if start_exact else -1
            if s_idx != -1:
                e_pos = story_text.find(end_exact, s_idx) if end_exact else -1
                start_char = s_idx
                if e_pos != -1:
                    end_char = e_pos + len(end_exact)
                    span_note = " (derived from anchors)"
                else:
                    # fallback: partial window from start_exact
                    end_char = min(n_text, s_idx + max(len(start_exact), 120))
                    span_note = " (partial span from start_exact)"
                if not _valid_span(start_char, end_char):
                    start_char = end_char = None
                    span_note = ""

        length_str = (
            f"{end_char - start_char} chars" if _valid_span(start_char, end_char) else "unknown"
        )

        print(f"Segment {i}: Type {segment_type} (Confidence: {confidence})")
        print(f" - Segmentation Rationale: {_sm_norm(reason, 200)}")
        print(f" - Summary: {_sm_norm(summary, 200)}")
        # span_note already carries its own leading space, so no separator here.
        print(
            f" - Segment ID: {segment_id}, Chars: [{start_char}:{end_char}]{span_note}, Length: {length_str}"
        )

        # Paragraph & anchors (normalized+sm for readability)
        print(f" - Paragraphs: start_par_id={start_par_id}, end_par_id={end_par_id}")
        print(
            " - Anchors:"
            f"\n     start_prefix='{_sm_norm(start_prefix, 80)}'"
            f"\n     start_exact ='{_sm_norm(start_exact, 120)}'"
            f"\n     end_exact   ='{_sm_norm(end_exact, 120)}'"
            f"\n     end_suffix  ='{_sm_norm(end_suffix, 80)}'"
        )

        # Cohesion pretty-print. The system prompt describes cohesion as
        # "brief notes", so the model may return free text instead of a dict.
        if isinstance(cohesion, dict):
            time_ = cohesion.get("time", "")
            place = cohesion.get("place", "")
            chars = cohesion.get("characters", [])
            print(f" - Cohesion: time='{_sm_norm(time_, 120)}', place='{_sm_norm(place, 120)}', characters={chars}")
        else:
            print(f" - Cohesion: {_sm_norm(cohesion, 240)}")

        if gacd:
            # Normalize each field of GACD for display
            g_goal = _sm_norm(gacd.get("goal", ""), 140)
            g_act  = _sm_norm(gacd.get("action", ""), 140)
            g_con  = _sm_norm(gacd.get("conflict", ""), 140)
            g_out  = gacd.get("outcome", "")
            print(f" - GACD: goal='{g_goal}', action='{g_act}', conflict='{g_con}', outcome='{g_out}'")
        if erac:
            e_emo = _sm_norm(erac.get("emotion", ""), 140)
            e_rea = _sm_norm(erac.get("reason", ""), 140)
            e_ant = _sm_norm(erac.get("anticipation", ""), 140)
            e_cho = _sm_norm(erac.get("choice", ""), 140)
            print(f" - ERAC: emotion='{e_emo}', reason='{e_rea}', anticipation='{e_ant}', choice='{e_cho}'")

        # Optional: show the first 200 characters of the normalized text if we have a valid span
        if _valid_span(start_char, end_char):
            snippet = story_text[start_char:end_char]
            print(f" - Preview: {_normalize_preview(snippet)[:200]}")

        print()

display_revised(example_story.body, revised_result)

In [None]:
reloader()

In [None]:
def validate_segments(story_text, extracted_scenes, *, preview_limit=160):
    """
    Validate coverage and overlaps for extracted segments.

    Only segments whose start_char/end_char are integers forming a non-empty
    slice of story_text are considered; all others are ignored.

    Args:
      story_text: the story the segment offsets refer to.
      extracted_scenes: iterable of segment dicts (start_char, end_char and
        optionally segment_id are read).
      preview_limit: maximum preview length passed to utils.sm for gap previews.

    Returns:
      missing_components: list of dicts for uncovered ranges:
        - type: 'start_gap' | 'gap' | 'end_gap'
        - start, end, length
        - preview
        - left_segment_id/right_segment_id (for 'gap'); the left segment is the
          one whose end is the start of the gap
      overlapping_components: list of dicts for overlaps:
        - type: 'partial_overlap' | 'contained' | 'duplicate'
        - a_index, b_index (indices in start-sorted order)
        - a_segment_id, b_segment_id
        - start, end, length

    Note: each span is compared only with its predecessor in start order and
    with the span that currently reaches farthest right, not with every other
    span.
    """
    import re
    from lcats import utils

    n = len(story_text)

    def _normalize_preview(s: str) -> str:
        if not s:
            return ""
        s = s.replace("\r\n", "\n").replace("\r", "\n")
        s = re.sub(r"\n{2,}", "\u2029", s)  # mark paragraph breaks
        s = s.replace("\n", " ")            # single newlines -> spaces
        s = re.sub(r"[ \t\u00A0]+", " ", s).strip()
        return s.replace("\u2029", "\n")    # restore paragraph breaks to single LF

    def _sm(text: str, limit: int = preview_limit) -> str:
        return utils.sm(_normalize_preview(text or ""), limit=limit)

    def _valid(a, b) -> bool:
        return isinstance(a, int) and isinstance(b, int) and 0 <= a < b <= n

    # Gather valid spans
    spans = []
    for idx, seg in enumerate(extracted_scenes):
        s = seg.get("start_char")
        e = seg.get("end_char")
        if _valid(s, e):
            spans.append({
                "i": idx,  # index in the input list
                "segment_id": seg.get("segment_id", f"#{idx}"),
                "start": int(s),
                "end": int(e),
            })

    # Sort by start (then end)
    spans.sort(key=lambda d: (d["start"], d["end"]))
    # Remember each span's position in the sorted order so every overlap record
    # reports a_index/b_index in the same (documented) index space.
    for pos, span in enumerate(spans):
        span["pos"] = pos

    missing_components = []
    overlapping_components = []

    if not spans:
        # Nothing valid; entire story is missing if non-empty
        if n > 0:
            missing_components.append({
                "type": "start_gap",
                "start": 0, "end": n, "length": n,
                "preview": _sm(story_text[0:n]),
            })
        return missing_components, overlapping_components

    # Coverage sweep: track the farthest right endpoint we've covered so far
    # as 'covered_end'. Also track last-by-start (prev_seg) for pairwise overlap
    # and 'max_seg' as the segment currently contributing the farthest end
    # (so covered_end == max_seg["end"] once max_seg is set).
    covered_end = 0
    prev_seg = None
    max_seg = None

    for cur in spans:
        s, e = cur["start"], cur["end"]

        # GAP relative to covered_end?
        # If the next segment starts after everything we've covered so far,
        # the uncovered chunk is [covered_end, s).
        if s > covered_end:
            gap_type = "start_gap" if covered_end == 0 else "gap"
            gap = {
                "type": gap_type,
                "start": covered_end,
                "end": s,
                "length": s - covered_end,
                "preview": _sm(story_text[covered_end:s]),
            }
            if gap_type == "gap" and max_seg is not None:
                # The gap starts where max_seg ends; prev_seg may be a shorter
                # span nested inside max_seg and would be the wrong neighbor.
                gap["left_segment_id"] = max_seg["segment_id"]
                gap["right_segment_id"] = cur["segment_id"]
            missing_components.append(gap)

        # OVERLAP checks against the immediate predecessor in start order
        if prev_seg is not None and s < prev_seg["end"]:
            ov_start = max(prev_seg["start"], s)
            ov_end = min(prev_seg["end"], e)
            if s == prev_seg["start"] and e == prev_seg["end"]:
                otype = "duplicate"
            elif s >= prev_seg["start"] and e <= prev_seg["end"]:
                otype = "contained"
            else:
                otype = "partial_overlap"
            overlapping_components.append({
                "type": otype,
                "a_index": prev_seg["pos"], "b_index": cur["pos"],
                "a_segment_id": prev_seg["segment_id"],
                "b_segment_id": cur["segment_id"],
                "start": ov_start, "end": ov_end, "length": max(0, ov_end - ov_start),
            })

        # Containment vs the current max-extent segment (if different from prev)
        if max_seg is not None and max_seg is not prev_seg:
            if s < max_seg["end"]:  # cur overlaps the max-extent segment
                if s >= max_seg["start"] and e <= max_seg["end"]:
                    # cur fully contained in max_seg
                    overlapping_components.append({
                        "type": "contained",
                        "a_index": max_seg["pos"], "b_index": cur["pos"],
                        "a_segment_id": max_seg["segment_id"],
                        "b_segment_id": cur["segment_id"],
                        "start": s, "end": e, "length": e - s,
                    })
                elif e > max_seg["end"]:
                    # partial overlap across max_seg right edge
                    overlapping_components.append({
                        "type": "partial_overlap",
                        "a_index": max_seg["pos"], "b_index": cur["pos"],
                        "a_segment_id": max_seg["segment_id"],
                        "b_segment_id": cur["segment_id"],
                        "start": max(s, max_seg["start"]),
                        "end": max_seg["end"],
                        "length": max_seg["end"] - max(s, max_seg["start"]),
                    })

        # Update coverage frontier: farthest end wins
        if e > covered_end:
            covered_end = e

        # Update max-extent segment
        if max_seg is None or e > max_seg["end"]:
            max_seg = cur

        # Advance 'prev-by-start'
        prev_seg = cur

    # END GAP after the farthest covered end
    if covered_end < n:
        missing_components.append({
            "type": "end_gap",
            "start": covered_end, "end": n, "length": n - covered_end,
            "preview": _sm(story_text[covered_end:n]),
        })

    return missing_components, overlapping_components



missing_components, overlapping_components = validate_segments(example_story.body, revised_result)


In [None]:
missing_components