# Analyze Corpus, Reloaded

This notebook analyzes our corpus with updated code moved into libraries.

## Imports

In [None]:
import datetime
import json
import os
import pathlib
import random
import re
import sys


from typing import Any, Dict, Iterable, List, Optional, Tuple, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm


Third-party modules

In [None]:
import dotenv
from openai import OpenAI
import tiktoken

Switch to the parent directory so paths can resolve and we write to the right directories.

In [None]:
# Work out the project root: a kernel started inside notebooks/ sits one level
# below it; otherwise the current directory is taken to be the root.
cwd = pathlib.Path.cwd().resolve()
if cwd.name == "notebooks":
    project_root = cwd.parent
else:
    project_root = cwd
scripts_dir = project_root / "scripts"

# Move only when the candidate root really contains scripts/ and we are not
# already standing in it.
if scripts_dir.is_dir() and cwd != project_root:
    print(f"Changing working directory from {cwd} to {project_root}")
    os.chdir(project_root)
print("Working directory:", pathlib.Path.cwd())

Import the project's own modules (this depends on the prior cell having run).

In [None]:
from lcats import constants
from lcats import stories
from lcats import utils

from lcats.analysis import corpus_surveyor
from lcats.analysis import graph_plotters
from lcats.analysis import llm_extractor
from lcats.analysis import scene_analysis
from lcats.analysis import story_analysis
from lcats.analysis import story_processors
from lcats.analysis import text_segmenter


In [None]:
from importlib import reload

# Project modules that reloader() re-imports, in the order listed here.
RELOAD_MODULES = [
    constants,
    stories,
    corpus_surveyor,
    graph_plotters,
    llm_extractor,
    scene_analysis,
    story_analysis,
    story_processors,
    text_segmenter,
    utils,
]
def reloader():
    """Reload every module in RELOAD_MODULES, announcing each one first."""
    for mod in RELOAD_MODULES:
        print("Reloading", mod)
        reload(mod)
    print("Reloading complete.")


## Project Setup

### Path Setup

In [None]:
# Absolute, resolved directory the kernel is currently running in.
CURRENT_PATH = pathlib.Path.cwd().resolve()

# The working directory itself serves as the project root (it used to be the
# parent of notebooks/).
PROJECT_ROOT = CURRENT_PATH

# Data and output directories that live inside the project.
DEV_CORPUS = PROJECT_ROOT.joinpath("data")
DEV_OUTPUT = PROJECT_ROOT.joinpath("output")

# Resources that sit beside the project, one level above its root.
GIT_CORPUS = PROJECT_ROOT.parent.joinpath("corpora")
OPENIA_API_KEYS_ENV = PROJECT_ROOT.parent.joinpath(".secrets", "openai_api_keys.env")

def check_path(path: pathlib.Path, description: str) -> None:
    """Print whether `path` exists, labelling the message with `description`."""
    if not path.exists():
        print(f"Missing {description} from: {path}")
        return
    print(f"Found {description} at: {path}")

# Report which of the configured locations are present on this machine.
for location, label in (
    (DEV_CORPUS, "DEV_CORPUS"),
    (DEV_OUTPUT, "DEV_OUTPUT"),
    (GIT_CORPUS, "GIT_CORPUS"),
    (OPENIA_API_KEYS_ENV, "OPENIA_API_KEYS_ENV"),
):
    check_path(location, label)


In [None]:
# Point CORPORA_ROOT at the checked-in corpora that sit beside the project,
# resolved to an absolute path.
# To use the working corpora instead, swap in: project_root / "data"
CORPORA_ROOT = (project_root / ".." / "corpora").resolve()

print("Corpora root:", CORPORA_ROOT)
# Print the listing together with its label: a bare expression is rendered as
# cell output rather than on stdout, which used to leave the label dangling.
print("Corpora top-level directories:", sorted(os.listdir(CORPORA_ROOT)))

### OpenAI Client

Get the OpenAI API key.

In [None]:
# Load the key into the process environment. Never echo the secret itself:
# cell outputs are saved inside the notebook file and travel with it.
dotenv.load_dotenv(OPENIA_API_KEYS_ENV)
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
if OPENAI_API_KEY:
    print(f"OPENAI_API_KEY loaded ({len(OPENAI_API_KEY)} characters).")
else:
    print(f"OPENAI_API_KEY not found; checked {OPENIA_API_KEYS_ENV}")

Verify that we can get a client.

In [None]:
# No key is passed explicitly when the client is constructed.
client = OpenAI()
print("Loaded OpenAI client: {} with version: {}".format(client, client._version))

Verify that the API is working (this week, at least) and that the account still has credits.

In [None]:
# One tiny request as an end-to-end smoke test of the client and the model.
response = client.responses.create(
    input="Write a one-sentence bedtime story about a starship captain visiting a planet.",
    model="gpt-4o",
)

generated_on = datetime.date.today()
print(f"Story generated on: {generated_on}:")
utils.pprint(response.output_text)

## Corpora-level Analysis

### Story Corpora

In [None]:
# Analyze the checked-in corpora. Assign DEV_CORPUS here instead to use the
# command-line working corpora.
CORPORA_ROOT = GIT_CORPUS

# Load every corpus found under the chosen root.
corpora = stories.Corpora(CORPORA_ROOT)

print("Loaded corpora:")
print(f" - root: {corpora.corpora_root}")
print(f" - corpora: {len(corpora.corpora)}")
print(f" - stories: {len(corpora.stories)}")
print()

# Keep the first story around as a running example for later cells.
print("Example story: corpora.stories[0]:")
example_story = corpora.stories[0]
body_chars = len(example_story.body)
print(f"Story type: {type(example_story)} with a body of {body_chars} characters.")
print(example_story)


### JSON Corpora Load

In [None]:
json_stories = corpus_surveyor.find_corpus_stories(CORPORA_ROOT)
# A bare `len(...)` in the middle of a cell displays nothing, so print it.
print("JSON stories found:", len(json_stories))
print(utils.sml(json_stories))
# Guard the element peek so an empty corpus reports cleanly instead of raising.
if json_stories:
    print("Type of path element:", type(json_stories[0]))
else:
    print("No JSON stories found under:", CORPORA_ROOT)

### Corpora Analysis

In [None]:
# For a quick test, run on a small sample instead of the whole corpus, e.g.:
# short_stories = json_stories[:10]
# story_stats, author_stats = corpus_surveyor.compute_corpus_stats(short_stories)
story_stats, author_stats = corpus_surveyor.compute_corpus_stats(json_stories)


In [None]:
story_stats.describe()

In [None]:
author_stats.describe()

In [None]:
author_stats

In [None]:
story_stats

### Corpora Graphs

In [None]:
fig, ax = graph_plotters.plot_author_stories_vs_tokens(author_stats, annotate_top=15)
plt.show()

In [None]:
fig, ax = graph_plotters.plot_author_stories_vs_tokens_sns(author_stats, annotate_top=10)
plt.show()


In [None]:
fig, ax = graph_plotters.plot_tokens_per_story_by_author(story_stats, top_n=25, min_stories=2, rotate_labels=45)
plt.show()


In [None]:
fig, ax = graph_plotters.plot_tokens_per_story_by_author_sns(story_stats, top_n=24, min_stories=2, rotate_labels=45)
plt.show()


In [None]:
fig, ax = graph_plotters.plot_tokens_per_story_vs_stories(
    author_stats, annotate_top=15, log_y=True, jitter=0.05, spread_step=4, x_spread=6)
plt.show()


## Scene-Sequel Extraction

In [None]:
segment_extractor = scene_analysis.make_segment_extractor(client)

In [None]:
example_extraction = segment_extractor.extract(example_story.body)
example_result = example_extraction['extracted_output']
example_result

In [None]:
scene_analysis.display_segments(example_story.body, example_result)

In [None]:
semantic_extractor = scene_analysis.make_semantics_extractor(client)
annotated_segments = scene_analysis.annotate_segments_with_semantics(
    example_story.body,
    example_result,
    semantic_extractor)

In [None]:
scene_analysis.display_annotated_segments(annotated_segments)


## Process Corpora

In [None]:
len(json_stories)


Acquire a sample of 10 randomly selected stories.

In [None]:
def pathify(paths: List[str]) -> List[pathlib.Path]:
    """Convert a list of path strings into a list of pathlib.Path objects."""
    return [pathlib.Path(entry) for entry in paths]

# print("[")
# for sample in random.sample(json_stories, 100):
#     print(f"    \"{sample}\",")
# print("]")

In [None]:
# Fixed sample of ten stories. Entries are stored relative to the checked-in
# corpora root (GIT_CORPUS) so they resolve on any checkout rather than only
# under one user's home directory.
SAMPLE_OF_10 = [
    GIT_CORPUS / relative_path
    for relative_path in (
        "massQuantities/the_crystal_ray.json",
        "grimm/old_man_and_his_grandson.json",
        "massQuantities/the_phantom_of_bogue_holauba_1911.json",
        "anderson/elderbush.json",
        "grimm/king_of_the_golden_mountain.json",
        "sherlock/noble_bachelor.json",
        "massQuantities/the_little_man_who_wasnt_quite.json",
        "massQuantities/monsoons_of_death.json",
        "massQuantities/shipwreck_in_the_sky.json",
        "massQuantities/the_voice_in_the_fog.json",
    )
]
SAMPLE_OF_10

In [None]:
# Fixed sample of one hundred stories. Entries are stored relative to the
# checked-in corpora root (GIT_CORPUS) so they resolve on any checkout rather
# than only under one user's home directory. File names are kept exactly as
# they were recorded, including unusual characters.
SAMPLE_OF_100 = [
    GIT_CORPUS / relative_path
    for relative_path in (
        "massQuantities/the_doors_of_death.json",
        "massQuantities/travelogue.json",
        "anderson/real_princess.json",
        "massQuantities/passion_fruit.json",
        "massQuantities/routine_for_a_hornet.json",
        "massQuantities/the_little_hunchback_zia.json",
        "massQuantities/the_soul_of_nicholas_snyders;_or,_the_miser_of_zandam.json",
        "massQuantities/the_disembodied_man.json",
        "massQuantities/the_queen_of_space.json",
        "massQuantities/halima_and_the_scorpions_1905.json",
        "massQuantities/point_of_departure.json",
        "massQuantities/far_enough_to_touch.json",
        "massQuantities/planet_of_creation.json",
        "massQuantities/the_ring_bonanza.json",
        "hemingway/alpine_idyll.json",
        "massQuantities/dig_me_no_grave.json",
        "massQuantities/doorstep.json",
        "massQuantities/milk_run.json",
        "massQuantities/the_ties_that_bind.json",
        "massQuantities/when_oscar_went_wild.json",
        "massQuantities/fifty_per_cent_prophet.json",
        "massQuantities/the_sweeper_of_loray.json",
        "massQuantities/the_marriage_of_william_durrant.json",
        "massQuantities/the_cobbler_in_the_devils_kitchen_from_\"_mackinac_and_lake_stories\"_1899.json",
        "ohenry/skylight_room.json",
        "massQuantities/the_worlds_of_joe_shannon.json",
        "grimm/twelve_huntsmen.json",
        "massQuantities/brknks_bounty.json",
        "massQuantities/welcome,_martians!.json",
        "massQuantities/the_rogue_waveform.json",
        "massQuantities/cum_grano_salis.json",
        "massQuantities/the_eyes_have_it.json",
        "massQuantities/when_the_sun_went_out.json",
        "massQuantities/greener_than_spruce.json",
        "massQuantities/a_chilhowee_lily_1911.json",
        "massQuantities/the_novel_and_the_common_school.json",
        "massQuantities/thirty_degrees_cattywonkus.json",
        "massQuantities/people_soup.json",
        "massQuantities/new_hire.json",
        "massQuantities/morgue_ship.json",
        "anderson/fir_tree.json",
        "massQuantities/the_spectre_in_the_cart_1908.json",
        "massQuantities/the_outer_quiet.json",
        "massQuantities/the_yellow_wallpaper.json",
        "massQuantities/has_anyone_here_seen_kelly?.json",
        "massQuantities/hagertys_enzymes.json",
        "massQuantities/marianson_from_\"_mackinac_and_lake_stories\"_1899.json",
        "massQuantities/destinationâ€”_death.json",
        "massQuantities/radio_v_rays.json",
        "massQuantities/x_marks_the_asteroid.json",
        "massQuantities/the_penultimate_trump.json",
        "massQuantities/the_good_work.json",
        "massQuantities/the_seven_missionaries.json",
        "massQuantities/george_loves_gistla.json",
        "massQuantities/luna_escapade.json",
        "massQuantities/menace_of_the_mists.json",
        "massQuantities/code.json",
        "massQuantities/\"_pig_headed\"_sailor_men_from_\"_the_strange_adventure_of_james_shervinton,_and_other_stories\"_1902.json",
        "massQuantities/in_the_bad_lands.json",
        "massQuantities/stories_of_christmas_and_the_bowie_knife.json",
        "massQuantities/among_the_scented_ones.json",
        "massQuantities/rat_race.json",
        "massQuantities/the_hunters_lodge_case.json",
        "lovecraft/at_the_mountains_of_madness.json",
        "massQuantities/the_un_reconstructed_woman.json",
        "massQuantities/trouble_on_sun_side.json",
        "massQuantities/competition.json",
        "ohenry/gift_of_the_magi.json",
        "massQuantities/stairway_to_the_stars.json",
        "massQuantities/the_waif_woman.json",
        "massQuantities/tongues_of_the_moon.json",
        "massQuantities/the_violators.json",
        "massQuantities/the_hunted_heroes.json",
        "massQuantities/the_hour_of_battle.json",
        "massQuantities/raiders_of_the_universes.json",
        "massQuantities/the_voyage_of_vanishing_men.json",
        "massQuantities/ride_the_crepe_ring.json",
        "massQuantities/a_madman_on_board.json",
        "massQuantities/the_valor_of_cappen_varra.json",
        "massQuantities/the_crystal_crypt.json",
        "massQuantities/the_sound_of_silence.json",
        "wodehouse/at_geisenheimer's.json",
        "massQuantities/the_story_of_gombi.json",
        "massQuantities/love_among_the_robots.json",
        "massQuantities/big_stupe.json",
        "anderson/shoes_of_fortune.json",
        "massQuantities/the_impossible_pirate.json",
        "massQuantities/the_flight_of_the_eagle.json",
        "massQuantities/pastoral_affair.json",
        "massQuantities/the_terrible_answer.json",
        "massQuantities/all_that_earthly_remains.json",
        "massQuantities/that_pup.json",
        "massQuantities/in_his_image.json",
        "massQuantities/derelict.json",
        "lovecraft/the_colour_out_of_space.json",
        "massQuantities/traders_risk.json",
        "massQuantities/picnic.json",
        "massQuantities/the_good_seed.json",
        "massQuantities/subversive.json",
        "massQuantities/the_vegans_were_curious.json",
    )
]
len(SAMPLE_OF_100)

In [None]:
summary_output = corpus_surveyor.process_files(
    SAMPLE_OF_10,
    processor_function=story_processors.story_summarizer,
    corpora_root=CORPORA_ROOT,
    output_root=DEV_OUTPUT,
    job_label="sample_summaries",
)
summary_output

In [None]:
annotated_segment_processor = story_processors.make_annotated_segment_extractor(
    client, segment_model="gpt-4o", semantic_model="gpt-4o")


In [None]:
summary = corpus_surveyor.process_files(
    SAMPLE_OF_10,
    corpora_root=CORPORA_ROOT,
    output_root=DEV_OUTPUT,
    processor_function=annotated_segment_processor,
    job_label="scene_semantics",
    verbose=True,
)


In [None]:
hundred_summary = corpus_surveyor.process_files(
    SAMPLE_OF_100,
    corpora_root=CORPORA_ROOT,
    output_root=DEV_OUTPUT,
    processor_function=annotated_segment_processor,
    job_label="scene_semantics_100",
    verbose=True,
)
hundred_summary

In [None]:
# Collect the annotated outputs written by the "scene_semantics" job.
sample_10_dir = DEV_OUTPUT / "scene_semantics"
sample_10_json = corpus_surveyor.find_corpus_stories(sample_10_dir)
# A bare `len(...)` in the middle of a cell displays nothing, so print it.
print("Annotated stories found:", len(sample_10_json))
print(utils.sml(sample_10_json))

# Keep the first output file as a sample to inspect in the next cell.
sample_json = sample_10_json[0]
sample_json

In [None]:
with sample_json.open("r", encoding="utf-8") as f:
    sample_data = json.load(f)
utils.sm(str(sample_data))

In [None]:
# Collect the annotated outputs written by the "scene_semantics_100" job.
sample_100_dir = DEV_OUTPUT / "scene_semantics_100"
sample_100_json = corpus_surveyor.find_corpus_stories(sample_100_dir)
# A bare `len(...)` in the middle of a cell displays nothing, so print it.
print("Annotated stories found:", len(sample_100_json))
print(utils.sml(sample_100_json))

In [None]:
def extract_story_metrics(data_root: Union[str, pathlib.Path],
                          json_path: Union[str, pathlib.Path]) -> Optional[Dict[str, Any]]:
    """Extract story-level and scene-type metrics from one annotated story file.

    Args:
        data_root (Union[str, pathlib.Path]): Directory the story files live
            under. The story id is the file's path relative to this directory,
            without its suffix.
        json_path (Union[str, pathlib.Path]): Path to the JSON file.

    Returns:
        Optional[Dict[str, Any]]: A flat dictionary of metrics (one row of the
        eventual DataFrame), or None when the file cannot be read or parsed.
        In the None case a warning is written to stderr.
    """
    # Make sure we have valid Path objects.
    data_root = pathlib.Path(data_root)
    json_path = pathlib.Path(json_path)
    try:
        with json_path.open("r", encoding="utf-8") as f:
            data = json.load(f)
    except (OSError, ValueError) as e:
        # OSError covers missing/unreadable files; ValueError covers malformed
        # JSON (json.JSONDecodeError) and bytes that are not valid UTF-8.
        print(
            f"warn: skipping unreadable JSON {json_path}: {e}", file=sys.stderr)
        return None

    # The uniqueness key is the path relative to the data root. A file outside
    # the root cannot be made relative, so fall back to the path as given
    # rather than aborting the whole survey.
    try:
        relative_path = json_path.resolve().relative_to(data_root.resolve())
    except ValueError:
        relative_path = json_path
    story_id = str(relative_path.with_suffix(""))

    title, authors, _ = story_analysis.extract_title_authors_body(data)
    segments = data.get('segments')
    scene_type_analysis = scene_analysis.summarize_type_agreement(data)

    metrics: Dict[str, Any] = {
        "path": str(json_path),
        "story_id": story_id,
        "title": title,
        "authors": authors,
        "n_authors": len(authors),

        "title_words": story_analysis.word_count(title),
        "title_chars": len(title),
        "title_tokens": story_analysis.token_count(title),

        # Body lengths are read from the file, not recomputed here.
        "body_words": data.get('body_length_words'),
        "body_chars": data.get('body_length_chars'),
        "body_tokens": data.get('body_length_tokens'),
        "body_paragraphs": data.get('body_length_paragraphs'),

        "segment_count": len(segments) if segments else 0,
    }

    # (key in the agreement summary, suffix of the output column)
    scene_type_columns = (
        ("dramatic_scene", "dramatic_scene"),
        ("dramatic_sequel", "dramatic_sequel"),
        ("narrative_scene", "narrative_scene"),
        ("other", "other_scene"),
        ("unknown", "unknown_scene"),
    )
    # Emit all "original_*" columns first, then all "auditor_*" columns, so
    # the column order of the resulting DataFrame stays stable.
    for column_prefix, summary_key in (("original", "by_extractor"),
                                       ("auditor", "by_auditor")):
        type_counts = scene_type_analysis[summary_key]
        for scene_type, column_suffix in scene_type_columns:
            metrics[f"{column_prefix}_{column_suffix}"] = type_counts[scene_type]

    metrics["scene_agreements"] = scene_type_analysis['agreements']
    metrics["scene_disagreements"] = scene_type_analysis['disagreements']
    metrics["scene_agreement_rate"] = scene_type_analysis['agreement_rate']
    metrics["scene_total"] = scene_type_analysis['segments_total']
    return metrics


def compute_semantic_stats(
        data_root: pathlib.Path,
        json_paths: Iterable[Union[str, pathlib.Path]]
) -> Optional[pd.DataFrame]:
    """Build a per-story metrics table from annotated story JSON files.

    Each path is passed to extract_story_metrics; files it cannot read are
    skipped. The number of stories kept is printed once all paths are done.

    Args:
        data_root: Directory the JSON files live under; forwarded to
            extract_story_metrics, which uses it to derive each story id.
        json_paths: Paths of the JSON files to process.

    Returns:
        A DataFrame with one row per readable file and one column per key
        returned by extract_story_metrics, or None when no file produced a row.
        Callers must handle the None case before calling DataFrame methods.
    """
    story_rows: List[Dict[str, Any]] = []
    for json_path in tqdm(json_paths):
        # extract_story_metrics returns None for files it cannot read.
        story_metrics = extract_story_metrics(data_root, json_path)
        if story_metrics is not None:
            story_rows.append(story_metrics)
    print(f"Processed {len(story_rows)} valid stories.")

    # Build the frame in one go from the collected row dictionaries.
    story_stats = pd.DataFrame(story_rows)
    return None if story_stats.empty else story_stats

In [None]:
sample_10_segments_df = compute_semantic_stats(data_root=sample_10_dir, json_paths=sample_10_json)
sample_10_segments_df.describe()

In [None]:
sample_100_segments_df = compute_semantic_stats(data_root=sample_100_dir, json_paths=sample_100_json)
sample_100_segments_df.describe()

In [None]:
joint_segments_df = pd.concat([sample_10_segments_df, sample_100_segments_df], ignore_index=True)
joint_segments_df.describe()

In [None]:
joint_segments_df.segment_count.hist(bins=20)

In [None]:
joint_segments_df.original_dramatic_scene.hist(bins=20)

In [None]:
joint_segments_df.original_dramatic_sequel.hist(bins=20)

In [None]:
joint_segments_df.scene_disagreements.hist(bins=20)

In [None]:
(joint_segments_df.segment_count == 0).sum()

In [None]:
(joint_segments_df.original_dramatic_scene == 0).sum()

In [None]:
(joint_segments_df.original_dramatic_sequel == 0).sum()

In [None]:
(joint_segments_df.scene_disagreements == 0).sum()

In [None]:
joint_segments_df.columns

In [None]:
def plot_segments_vs_tokens_matplotlib(
    df: pd.DataFrame,
    *,
    prefix: str = "auditor_",
    log_x: bool = False,
    figsize: tuple[int, int] = (9, 6),
) -> None:
    """Draw one scatter series per segment metric against body_tokens.

    Args:
        df: DataFrame holding body_tokens, segment_count, scene_disagreements
            and the prefixed scene-count columns.
        prefix: Column prefix ("auditor_" or "original_") selecting which set
            of scene counts is plotted.
        log_x: When True the x-axis uses a log scale.
        figsize: Figure size (width, height) in inches.

    Raises:
        KeyError: If any required column is absent from df.
    """
    series_by_label = {
        "Segments": "segment_count",
        "Narrative scenes": f"{prefix}narrative_scene",
        "Dramatic scenes": f"{prefix}dramatic_scene",
        "Dramatic sequels": f"{prefix}dramatic_sequel",
        "Scene disagreements": "scene_disagreements",
    }

    # Fail before any drawing happens if a column is not there.
    needed = ["body_tokens"] + list(series_by_label.values())
    missing = [name for name in needed if name not in df.columns]
    if missing:
        raise KeyError(f"Missing columns: {missing}")

    fig, ax = plt.subplots(figsize=figsize)
    tokens = df["body_tokens"]
    for legend_label, column in series_by_label.items():
        ax.scatter(
            tokens, df[column],
            s=24, alpha=0.75, linewidths=0.5, label=legend_label,
        )

    if log_x:
        ax.set_xscale("log")

    ax.set_xlabel("Body tokens")
    ax.set_ylabel("Count")
    ax.set_title("Segments vs. Body Tokens")
    # Park the legend outside the axes so it never covers the points.
    ax.legend(title="Metric", bbox_to_anchor=(1.02, 1), loc="upper left")
    fig.tight_layout()
    plt.show()

plot_segments_vs_tokens_matplotlib(joint_segments_df, prefix="original_", log_x=True)

In [None]:
def plot_segments_vs_tokens_seaborn(
    df: pd.DataFrame,
    *,
    prefix: str = "auditor_",
    log_x: bool = False,
    figsize: tuple[int, int] = (9, 6),
) -> None:
    """Draw a hue-separated Seaborn scatter of segment metrics vs body_tokens.

    Args:
        df: DataFrame holding body_tokens, segment_count, scene_disagreements
            and the prefixed scene-count columns.
        prefix: Column prefix ("auditor_" or "original_") selecting which set
            of scene counts is plotted.
        log_x: When True the x-axis uses a log scale.
        figsize: Figure size (width, height) in inches.

    Raises:
        KeyError: If any required column is absent from df.
    """
    series_by_label = {
        "Segments": "segment_count",
        "Narrative scenes": f"{prefix}narrative_scene",
        "Dramatic scenes": f"{prefix}dramatic_scene",
        "Dramatic sequels": f"{prefix}dramatic_sequel",
        "Scene disagreements": "scene_disagreements",
    }

    needed = ["body_tokens"] + list(series_by_label.values())
    missing = [name for name in needed if name not in df.columns]
    if missing:
        raise KeyError(f"Missing columns: {missing}")

    # Reshape to long form: one row per (story, metric), with the display
    # label as the metric name so it shows up directly in the legend.
    label_for_column = {column: label for label, column in series_by_label.items()}
    selected = df[needed].rename(columns=label_for_column)
    long_df = selected.melt(
        id_vars="body_tokens", var_name="metric", value_name="count")

    fig, ax = plt.subplots(figsize=figsize)
    sns.scatterplot(
        data=long_df, x="body_tokens", y="count", hue="metric",
        alpha=0.8, ax=ax,
    )

    if log_x:
        ax.set_xscale("log")

    ax.set_title("Segments vs. Body Tokens")
    ax.set_xlabel("Body tokens")
    ax.set_ylabel("Count")
    ax.legend(title="Metric", bbox_to_anchor=(1.02, 1), loc="upper left")
    fig.tight_layout()
    plt.show()

plot_segments_vs_tokens_seaborn(joint_segments_df, prefix="original_", log_x=True)

In [None]:
def plot_scene_composition_by_length_bins(
    df,
    *,
    prefix="auditor_",
    n_bins=8,
    binning="quantile",
    figsize=(10, 5),
    title="Scene composition by story length (100% per bin)"
):
    """Stacked bar chart of mean scene-type shares per story-length bin.

    Stories with no segments are left out. For each remaining story every
    scene-type count is divided by segment_count, stories are binned by
    body_tokens, and the per-bin mean of each share is stacked.

    Args:
        df: DataFrame with body_tokens, segment_count and the prefixed
            scene-count columns.
        prefix: Column prefix, "auditor_" or "original_".
        n_bins: Number of length bins requested.
        binning: "quantile" bins with pd.qcut (dropping duplicate edges);
            any other value bins with pd.cut.
        figsize: Figure size (width, height) in inches.
        title: Chart title.

    Raises:
        KeyError: If any required column is absent from df.
    """
    counts = {
        "narrative_scene": f"{prefix}narrative_scene",
        "dramatic_scene": f"{prefix}dramatic_scene",
        "dramatic_sequel": f"{prefix}dramatic_sequel",
        "other": f"{prefix}other_scene",
        "unknown": f"{prefix}unknown_scene",
    }
    needed = ["body_tokens", "segment_count"] + list(counts.values())
    missing = [name for name in needed if name not in df.columns]
    if missing:
        raise KeyError(f"Missing columns: {missing}")

    # Work on a copy restricted to stories that have at least one segment, so
    # the division below never sees a zero denominator.
    segmented = df.loc[df["segment_count"] > 0].copy()
    share_columns = []
    for scene_type, column in counts.items():
        share_name = f"p_{scene_type}"
        segmented[share_name] = segmented[column] / segmented["segment_count"]
        share_columns.append(share_name)

    tokens = segmented["body_tokens"]
    if binning == "quantile":
        length_bins = pd.qcut(tokens, q=n_bins, duplicates="drop")
    else:
        length_bins = pd.cut(tokens, bins=n_bins)
    segmented["len_bin"] = length_bins

    # Mean share per bin; the group key is already the index the bar plot uses.
    mean_shares = segmented.groupby("len_bin", observed=True)[share_columns].mean()

    ax = mean_shares.plot(
        kind="bar",
        stacked=True,
        figsize=figsize,
        width=0.9,
        ylabel="Proportion of segments",
        title=title
    )
    ax.set_xlabel("Body token bins")
    ax.legend(title="Type", bbox_to_anchor=(1.02, 1), loc="upper left")
    ax.margins(x=0.01)
    plt.tight_layout()
plot_scene_composition_by_length_bins(joint_segments_df, prefix="auditor_", n_bins=8)


In [None]:
def plot_scene_proportion_trends(
    df,
    *,
    prefix="auditor_",
    lowess=True,               # requires statsmodels via seaborn; if unavailable, set False
    figsize=(10, 6),
    title="Scene-type proportions vs. story length"
):
    """Plot one trend line per scene type: share of segments vs body_tokens.

    Stories with no segments are left out. For each remaining story every
    scene-type count is divided by segment_count, and a regression line
    (LOWESS by default) is drawn per scene type on shared axes.

    Args:
        df: DataFrame with body_tokens, segment_count and the prefixed
            scene-count columns.
        prefix: Column prefix, "auditor_" or "original_".
        lowess: Passed to seaborn.regplot; set False for a linear fit when
            statsmodels is not installed.
        figsize: Figure size (width, height) in inches.
        title: Chart title.

    Raises:
        KeyError: If any required column is absent from df.
    """
    counts = {
        "Narrative": f"{prefix}narrative_scene",
        "Dramatic scene": f"{prefix}dramatic_scene",
        "Dramatic sequel": f"{prefix}dramatic_sequel",
        "Other": f"{prefix}other_scene",
        "Unknown": f"{prefix}unknown_scene",
    }
    required = ["body_tokens", "segment_count", *counts.values()]
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise KeyError(f"Missing columns: {missing}")

    safe = df[df["segment_count"] > 0].copy()
    for label, col in counts.items():
        safe[f"p_{label}"] = safe[col] / safe["segment_count"]

    long_df = (
        safe[["body_tokens", *[f"p_{k}" for k in counts]]]
        .rename(columns={f"p_{k}": k for k in counts})
        .melt(id_vars="body_tokens", var_name="metric", value_name="prop")
    )

    # Create the axes explicitly. Previously they were obtained by fitting a
    # regression over the pooled long frame, which drew an extra, unlabeled
    # line mixing all scene types together.
    fig, ax = plt.subplots(figsize=figsize)
    for name, sub in long_df.groupby("metric"):
        sns.regplot(
            data=sub, x="body_tokens", y="prop",
            scatter=False, lowess=lowess, ci=None, label=name, ax=ax
        )

    ax.set_title(title)
    ax.set_xlabel("Body tokens")
    ax.set_ylabel("Proportion of segments")
    ax.legend(title="Type", bbox_to_anchor=(1.02, 1), loc="upper left")
    fig.tight_layout()

plot_scene_proportion_trends(joint_segments_df, prefix="auditor_", lowess=True)


In [None]:
def plot_hexbin_segments_vs_tokens(
    df,
    *,
    gridsize=30,
    figsize=(8, 6),
    title="Density of segment_count vs. body_tokens"
):
    """Hexbin density plot of segment_count against body_tokens.

    Args:
        df: DataFrame with body_tokens and segment_count columns.
        gridsize: Hexagon grid size passed to hexbin.
        figsize: Figure size (width, height) in inches.
        title: Chart title.

    Raises:
        KeyError: If body_tokens or segment_count is absent from df.
    """
    missing = [
        name for name in ("body_tokens", "segment_count")
        if name not in df.columns
    ]
    if missing:
        raise KeyError(f"Missing columns: {missing}")

    fig, ax = plt.subplots(figsize=figsize)
    # mincnt=1 leaves hexagons with no stories undrawn.
    hexes = ax.hexbin(
        df["body_tokens"], df["segment_count"],
        gridsize=gridsize, mincnt=1, cmap="viridis", linewidths=0,
    )
    fig.colorbar(hexes, ax=ax, label="Stories per bin")
    ax.set_xlabel("Body tokens")
    ax.set_ylabel("Segment count")
    ax.set_title(title)
    fig.tight_layout()

plot_hexbin_segments_vs_tokens(joint_segments_df)


In [None]:
def plot_disagreement_rate_by_bins(
    df,
    *,
    n_bins=8,
    binning="quantile",
    figsize=(9, 5),
    title="Scene disagreement rate by story length bin"
):
    """Plot the mean and median scene-disagreement rate per ``body_tokens`` bin.

    The per-row rate is ``scene_disagreements / scene_total``. Rows whose
    ``scene_total`` is not greater than zero are excluded so the ratio is
    always defined.

    Args:
        df: DataFrame with 'body_tokens', 'scene_disagreements' and
            'scene_total' columns.
        n_bins: Number of bins requested over 'body_tokens'. With quantile
            binning, duplicate edges are dropped, so fewer bins may result.
        binning: "quantile" for equal-population bins (``pd.qcut``) or
            "fixed" for equal-width bins (``pd.cut``).
        figsize: Matplotlib (width, height) in inches.
        title: Figure title.

    Raises:
        KeyError: If a required column is missing.
        ValueError: If ``binning`` is neither "quantile" nor "fixed".
    """
    required = ["body_tokens", "scene_disagreements", "scene_total"]
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise KeyError(f"Missing columns: {missing}")
    # Reject unknown modes up front; otherwise a typo such as "quantiles"
    # would silently produce a fixed-width plot.
    if binning not in {"quantile", "fixed"}:
        raise ValueError("binning must be 'quantile' or 'fixed'")

    safe = df[df["scene_total"] > 0].copy()
    safe["disagree_rate"] = safe["scene_disagreements"] / safe["scene_total"]

    if binning == "quantile":
        safe["len_bin"] = pd.qcut(safe["body_tokens"], q=n_bins, duplicates="drop")
    else:
        safe["len_bin"] = pd.cut(safe["body_tokens"], bins=n_bins)

    # observed=True keeps only the bins that actually contain rows.
    stats = (
        safe.groupby("len_bin", observed=True)["disagree_rate"]
        .agg(["mean", "median"])
        .reset_index()
    )

    ax = stats.plot(
        x="len_bin", y=["mean", "median"], marker="o", figsize=figsize,
        title=title
    )
    ax.set_xlabel("Body token bins")
    ax.set_ylabel("Disagreement rate")
    ax.legend(title="")
    plt.tight_layout()

# Plot disagreement rate by story length with n_bins=8 and the default
# quantile binning.
plot_disagreement_rate_by_bins(joint_segments_df, n_bins=8)


In [None]:
def plot_scene_count_trends(
    df,
    *,
    prefix="auditor_",                     # or "original_"
    features=None,                        # dict: display label -> column name
    smoother="binmean",                   # "binmean" or "lowess"
    n_bins=12,
    binning="quantile",                   # "quantile" or "fixed"
    scatter=True,                         # overlay raw scatter points
    log_x=False,
    max_body_tokens=None,                 # axis limit only (no filtering)
    max_scene_count=None,                 # axis limit only (no clipping)
    min_body_tokens=None,                 # axis limit only (no filtering)
    min_scene_count=None,                 # axis limit only (no clipping)
    markers=None,                         # dict label->marker or list of markers
    figsize=(10, 6),
    title="Scene counts vs. story length",
):
    """Plot scene-count metrics against ``body_tokens`` with a smoothed trend.

    The data is never filtered or clipped. The ``min_*``/``max_*`` arguments
    only set axis limits, so out-of-view rows still contribute to the bin
    means / LOWESS fit but are not displayed.

    Args:
        df: DataFrame containing 'body_tokens' and the feature columns.
        prefix: "auditor_" or "original_"; only used to build the default
            ``features`` mapping when ``features`` is None.
        features: Optional mapping of display label -> column name. If None:
            {
              "Segments": "segment_count",
              "Narrative scenes": f"{prefix}narrative_scene",
              "Dramatic scenes": f"{prefix}dramatic_scene",
              "Dramatic sequels": f"{prefix}dramatic_sequel",
              "Disagreements": "scene_disagreements",
            }
        smoother: "binmean" (mean per length bin, drawn at the bin midpoint)
            or "lowess" (seaborn ``regplot`` with ``lowess=True``).
        n_bins: Number of bins for the "binmean" smoother.
        binning: "quantile" for balanced bins, "fixed" for equal-width bins.
        scatter: If True, overlay the raw points of every feature.
        log_x: If True, use a log scale for the x-axis.
        max_body_tokens: If set, x-axis maximum. On a linear axis the lower
            bound is also set to -0.5 unless ``min_body_tokens`` is given.
        max_scene_count: If set, y-axis maximum; the lower bound is also set
            to -0.5 unless ``min_scene_count`` is given.
        min_body_tokens: If set, x-axis minimum.
        min_scene_count: If set, y-axis minimum.
        markers: Marker shapes for the "binmean" lines. Either a dict of
            label -> marker (labels not present fall back to a default
            cycle) or a sequence of markers assigned in feature order
            (cycled if shorter). None or an empty sequence uses the default
            cycle. Not used by the "lowess" smoother.
        figsize: Matplotlib (width, height) in inches.
        title: Figure title.

    Raises:
        KeyError: If required columns are missing.
        ValueError: If `smoother` or `binning` is invalid.
    """
    if features is None:
        features = {
            "Segments": "segment_count",
            "Narrative scenes": f"{prefix}narrative_scene",
            "Dramatic scenes": f"{prefix}dramatic_scene",
            "Dramatic sequels": f"{prefix}dramatic_sequel",
            "Disagreements": "scene_disagreements",
        }

    required = ["body_tokens", *features.values()]
    missing = [c for c in required if c not in df.columns]
    if missing:
        raise KeyError(f"Missing columns: {missing}")

    if smoother not in {"binmean", "lowess"}:
        raise ValueError("smoother must be 'binmean' or 'lowess'")
    if binning not in {"quantile", "fixed"}:
        raise ValueError("binning must be 'quantile' or 'fixed'")

    # Marker assignment: every label gets a default, which an explicit
    # dict entry or a user-supplied sequence may override.
    labels = list(features)
    default_cycle = ["o", "s", "D", "^", "v", "P", "X", "*", "h", "<", ">"]
    fallback = {
        lab: default_cycle[i % len(default_cycle)] for i, lab in enumerate(labels)
    }
    if markers is None:
        marker_map = fallback
    elif isinstance(markers, dict):
        marker_map = {lab: markers.get(lab, fallback[lab]) for lab in labels}
    else:
        # Treat as a sequence in feature order; an empty sequence would
        # otherwise divide by zero, so fall back to the defaults.
        marker_seq = list(markers)
        if marker_seq:
            marker_map = {
                lab: marker_seq[i % len(marker_seq)] for i, lab in enumerate(labels)
            }
        else:
            marker_map = fallback

    _, ax = plt.subplots(figsize=figsize)

    # Optional raw scatter overlay (all data shown; axis limits may hide some).
    if scatter:
        for col in features.values():
            sns.scatterplot(
                data=df, x="body_tokens", y=col,
                alpha=0.25, s=18, linewidth=0, label=None, ax=ax,
            )

    if smoother == "lowess":
        # LOWESS line per metric over ALL data (display cropped by axis limits).
        for label, col in features.items():
            sns.regplot(
                data=df, x="body_tokens", y=col,
                lowess=True, scatter=False, ci=None, label=label, ax=ax,
            )
    else:
        # BINMEAN smoother over ALL data (display cropped by axis limits).
        metric_cols = list(features.values())
        work = df[["body_tokens", *metric_cols]].copy()

        # Create bins over body_tokens.
        if binning == "quantile":
            work["len_bin"] = pd.qcut(work["body_tokens"], q=n_bins, duplicates="drop")
        else:
            work["len_bin"] = pd.cut(work["body_tokens"], bins=n_bins)

        # Non-numeric entries become NaN and are skipped by the mean.
        for c in metric_cols:
            work[c] = pd.to_numeric(work[c], errors="coerce")

        agg = (
            work.groupby("len_bin", observed=True)[metric_cols]
            .mean()
            .reset_index()
        )

        def _mid(iv):
            # Bin keys are pd.Interval objects; anything else maps to NaN.
            if isinstance(iv, pd.Interval):
                return (iv.left + iv.right) / 2
            return float("nan")

        # Build a plain float column: applying a function over the
        # categorical len_bin column may itself return a categorical, and
        # the x-axis must be numeric for the axis limits below to make sense.
        agg["x_mid"] = [float(_mid(iv)) for iv in agg["len_bin"]]

        long_df = agg.melt(
            id_vars=["len_bin", "x_mid"],
            value_vars=metric_cols,
            var_name="metric",
            value_name="count",
        )
        # Swap column names back to their display labels for the legend.
        label_for_column = {col: label for label, col in features.items()}
        long_df["metric"] = long_df["metric"].map(label_for_column)

        sns.lineplot(
            data=long_df, x="x_mid", y="count", hue="metric", style="metric",
            markers=marker_map, dashes=False,  # distinct shapes, solid lines
            ax=ax,
        )

    if log_x:
        ax.set_xscale("log")

    # Axis limits only (no data filtering/clipping)
    if max_body_tokens is not None:
        # A negative lower bound is invalid on a log axis (matplotlib warns
        # and ignores it), so only apply it on a linear axis.
        if not log_x:
            ax.set_xlim(left=-0.5)
        ax.set_xlim(right=max_body_tokens)
    if max_scene_count is not None:
        ax.set_ylim(bottom=-0.5)
        ax.set_ylim(top=max_scene_count)
    if min_body_tokens is not None:
        ax.set_xlim(left=min_body_tokens)
    if min_scene_count is not None:
        ax.set_ylim(bottom=min_scene_count)

    ax.set_title(title)
    ax.set_xlabel("Body tokens")
    ax.set_ylabel("Count")
    ax.legend(bbox_to_anchor=(0.75, 1), loc="upper left")
    plt.tight_layout()
    plt.show()


In [None]:
# Auditor-labelled counts with a custom feature subset and explicit per-series
# markers. The min/max arguments only crop the view; no rows are filtered.
plot_scene_count_trends(
    joint_segments_df,
    prefix="auditor_",
    smoother="binmean",
    n_bins=8,
    scatter=True,
    min_body_tokens=1900,
    max_body_tokens=12100,
    max_scene_count=12,
    features={
        "Extracted Segments": "segment_count",
        "Dramatic Scenes": "auditor_dramatic_scene",
        "Dramatic Sequels": "auditor_dramatic_sequel",
        "Disagreements": "scene_disagreements",
    },
    markers={
        "Extracted Segments": "o",
        "Dramatic Scenes": "D",
        "Dramatic Sequels": "^",
        "Disagreements": "X",
    },
)


In [None]:

# Alternative smoother: LOWESS instead of bin means (seaborn's lowess option
# requires statsmodels). Left disabled here; uncomment to run.
# plot_scene_count_trends(joint_segments_df, prefix="auditor_", smoother="lowess", scatter=False)

# Custom features: maps display label -> column name, so any subset of columns
# can be selected and relabelled. Plotted with bin means over n_bins=12 bins.
custom = {
    "Segments": "segment_count",
    "Dramatic scenes": "auditor_dramatic_scene",
    "Sequels": "auditor_dramatic_sequel",
}
plot_scene_count_trends(joint_segments_df, features=custom, smoother="binmean", n_bins=12)


In [None]:
# Re-import every project module listed in RELOAD_MODULES (via importlib.reload)
# so edits to the library code are picked up by subsequent cells.
reloader()