# Analyze Corpus, Reloaded

This notebook analyzes our corpus with updated code moved into libraries.

## Imports

In [None]:
import datetime
import os
import pathlib
import re


from typing import Any, Dict, List, Optional

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

Third-party modules

In [None]:
import dotenv
from openai import OpenAI
import tiktoken

Switch to the project root (the parent directory when this notebook runs from `notebooks/`) so that relative paths resolve and outputs are written to the right directories.

In [None]:
# Locate the project root: when the kernel starts inside notebooks/, the root
# is one level up; otherwise the current directory is taken to be the root.
cwd = pathlib.Path.cwd().resolve()
if cwd.name == "notebooks":
    project_root = cwd.parent
else:
    project_root = cwd
scripts_dir = project_root / "scripts"

# Only move when the candidate root really contains scripts/ and we are not
# already there; otherwise the working directory is left untouched.
if scripts_dir.is_dir() and cwd != project_root:
    print(f"Changing working directory from {cwd} to {project_root}")
    os.chdir(project_root)
print("Working directory:", pathlib.Path.cwd())

Add imports from within the project (depends on prior cell)

In [None]:
from lcats import constants
from lcats import stories
from lcats import utils

from lcats.analysis import corpus_surveyor
from lcats.analysis import graph_plotters
from lcats.analysis import llm_extractor
from lcats.analysis import scene_analysis
from lcats.analysis import text_segmenter


In [None]:
from importlib import reload

# Project modules that reloader() re-imports; they are reloaded in list order.
# NOTE(review): if these modules import one another, dependencies should be
# listed before the modules that use them — confirm the order (utils is last).
RELOAD_MODULES = [
    constants,
    stories,
    corpus_surveyor,
    graph_plotters,
    llm_extractor,
    scene_analysis,
    text_segmenter,
    utils,
]
def reloader(modules=None):
    """Reload already-imported modules so on-disk edits are picked up.

    Args:
        modules: Optional iterable of module objects to reload. When omitted
            (the default), every module in RELOAD_MODULES is reloaded.

    Modules are reloaded in the order given, and a progress line is printed
    before each one. Returns None.
    """
    # Look up RELOAD_MODULES at call time so a redefined list is honoured.
    targets = RELOAD_MODULES if modules is None else modules
    for module in targets:
        print("Reloading", module)
        reload(module)
    print("Reloading complete.")


## Project Setup

### Path Setup

In [None]:
# Pick ONE corpora root.
# Working corpora (inside the project):
# CORPORA_ROOT = project_root / "data"
# Checked-in corpora (a sibling of the project directory), as an absolute path:
CORPORA_ROOT = (project_root / ".." / "corpora").resolve()

print("Corpora root:", CORPORA_ROOT)
print("Corpora top-level directories:", end=" ")
# os.listdir() returns entries in arbitrary order; sort them so the cell
# output is the same on every run and every filesystem.
sorted(os.listdir(CORPORA_ROOT))

In [None]:
json_stories = corpus_surveyor.find_corpus_stories(CORPORA_ROOT)
# Only the last expression of a cell is displayed, so the count has to be
# printed explicitly to show up in the output.
print("Stories found:", len(json_stories))
print(utils.sml(json_stories))
# Guard the [0] lookup: an empty corpus should produce a readable message
# instead of an IndexError.
if len(json_stories) > 0:
    print("Type of path element:", type(json_stories[0]))
else:
    print(f"No stories found under {CORPORA_ROOT}")

In [None]:
# For a faster run, compute the stats on a sample of the first 10 stories instead:
# short_stories = json_stories[:10]
# story_stats, author_stats = corpus_surveyor.compute_corpus_stats(short_stories)
story_stats, author_stats = corpus_surveyor.compute_corpus_stats(json_stories)


In [None]:
story_stats.describe()

In [None]:
author_stats.describe()

In [None]:
author_stats

In [None]:
story_stats

In [None]:
# The plotter hands back the figure and axes; plt.show() renders them here.
fig, ax = graph_plotters.plot_author_stories_vs_tokens(
    author_stats,
    annotate_top=15,
)
plt.show()

In [None]:
# Same author_stats input as the previous plot, via the *_sns plotter.
fig, ax = graph_plotters.plot_author_stories_vs_tokens_sns(
    author_stats,
    annotate_top=10,
)
plt.show()


In [None]:
# This plot is driven by the per-story table rather than author_stats.
fig, ax = graph_plotters.plot_tokens_per_story_by_author(
    story_stats,
    top_n=24,
    min_stories=2,
    rotate_labels=45,
)
plt.show()


In [None]:
# Same arguments as the previous cell, passed to the *_sns plotter.
fig, ax = graph_plotters.plot_tokens_per_story_by_author_sns(
    story_stats,
    top_n=24,
    min_stories=2,
    rotate_labels=45,
)
plt.show()


In [None]:
# One keyword per line so the layout tuning knobs are easy to tweak.
fig, ax = graph_plotters.plot_tokens_per_story_vs_stories(
    author_stats,
    annotate_top=15,
    log_y=True,
    jitter=0.05,
    spread_step=4,
    x_spread=6,
)
plt.show()


## Scene-Sequel Extraction

### Path Setup

In [None]:

# Absolute, resolved directory the kernel is currently running in.
CURRENT_PATH = pathlib.Path.cwd().resolve()

# The project root is the working directory itself. (It used to be
# CURRENT_PATH.parent, when the kernel stayed inside notebooks/.)
# NOTE(review): this assumes the working directory is already the project
# root when this cell runs — confirm before running the cell on its own.
PROJECT_ROOT = CURRENT_PATH

# Inputs and outputs that live inside the project tree.
DEV_CORPUS = PROJECT_ROOT.joinpath("data")
DEV_OUTPUT = PROJECT_ROOT.joinpath("output")

# Resources kept beside the project, one directory above its root.
GIT_CORPUS = PROJECT_ROOT.parent.joinpath("corpora")
# NOTE(review): "OPENIA" looks like a typo for "OPENAI"; the name is kept
# because later cells refer to it.
OPENIA_API_KEYS_ENV = PROJECT_ROOT.parent.joinpath(".secrets", "openai_api_keys.env")

def check_path(path: pathlib.Path, description: str) -> None:
    """Print whether `path` exists, labelling the line with `description`.

    Nothing is created or raised; the function only reports.
    """
    if not path.exists():
        print(f"Missing {description} from: {path}")
        return
    print(f"Found {description} at: {path}")

# Report which of the expected locations exist. check_path only prints, so a
# missing path is not an error at this point and nothing gets created.
check_path(DEV_CORPUS, "DEV_CORPUS")
check_path(DEV_OUTPUT, "DEV_OUTPUT")
check_path(GIT_CORPUS, "GIT_CORPUS")
check_path(OPENIA_API_KEYS_ENV, "OPENIA_API_KEYS_ENV")


## OpenAI Client

Get the OpenAI API key.

In [None]:
# Load the variables from the .env file into the process environment.
dotenv.load_dotenv(OPENIA_API_KEYS_ENV)
OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
# Never echo the key itself: cell output is stored in the notebook file and is
# easily committed or shared. Report only whether a key was found.
if OPENAI_API_KEY:
    print("OPENAI_API_KEY is set.")
else:
    print(f"OPENAI_API_KEY is NOT set; expected it in {OPENIA_API_KEYS_ENV}")

Verify that we can get a client.

In [None]:
# No api_key is passed here, so the client presumably picks up OPENAI_API_KEY
# from the environment populated by the previous cell — confirm against the SDK.
client = OpenAI()
# NOTE(review): `_version` is a private attribute of the client object and may
# change between SDK releases.
print(f"Loaded OpenAI client: {client} with version: {client._version}")

Verify the API is working. This week. And that you have credits.

In [None]:
# Smoke test: send one small request through client.responses.create with the
# "gpt-4o" model, then print the text of the reply together with today's date.
response = client.responses.create(
    model="gpt-4o",
    input="Write a one-sentence bedtime story about a starship captain visiting a planet."
)

print(f"Story generated on: {datetime.date.today()}:")
utils.pprint(response.output_text)

## Story Corpora

In [None]:
# Choose which corpora tree to load. GIT_CORPUS sits beside the project
# directory; DEV_CORPUS is the data/ directory inside it.
CORPORA_ROOT = GIT_CORPUS  # Checked-in corpora
# CORPORA_ROOT = DEV_CORPUS  # Command line working corpora

# Now load the corpora
corpora = stories.Corpora(CORPORA_ROOT)

print("Loaded corpora:")
print(f" - root: {corpora.corpora_root}")
print(f" - corpora: {len(corpora.corpora)}")
print(f" - stories: {len(corpora.stories)}")
print()

# Later cells all depend on example_story, so stop here with a readable
# message (rather than an IndexError) if nothing was loaded.
assert len(corpora.stories) > 0, f"No stories found under {CORPORA_ROOT}"

print("Example story: corpora.stories[0]:")
example_story = corpora.stories[0]
print(f"Story type: {type(example_story)} with a body of {len(example_story.body)} characters.")
print(example_story)


## Scene and Sequel Extraction

In [None]:
segment_extractor = scene_analysis.make_segment_extractor(client)

In [None]:
# Run the segment extractor on the example story's text. The result is indexed
# by key; its 'extracted_output' entry is what the display and annotation
# cells that follow consume.
example_extraction = segment_extractor.extract(example_story.body)
example_result = example_extraction['extracted_output']
# Trailing expression: shown as this cell's output.
example_result

In [None]:
scene_analysis.display_segments(example_story.body, example_result)

In [None]:
# Build the semantics extractor on the same client, then annotate the segments
# found earlier using the story text they came from.
semantic_extractor = scene_analysis.make_semantics_extractor(client)
annotated_segments = scene_analysis.annotate_segments_with_semantics(
    example_story.body, example_result, semantic_extractor
)

In [None]:
scene_analysis.display_annotated_segments(annotated_segments)


In [None]:
reloader()