# Debug Gettenberg Issues

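Investigate problems with the `lcats.gettenberg` API, in particular book titles that the Gutenberg metadata cache appears to associate with the wrong book ID.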

## Imports

In [None]:
from __future__ import annotations

from dataclasses import dataclass, asdict
import datetime
import fnmatch
import json
import os
import pathlib
import random
import re
import sys

from pathlib import Path

from typing import Any, Dict, Iterable, List, Optional, Set, Tuple, Union

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm


Third-party modules

In [None]:
import dotenv
from openai import OpenAI
import tiktoken

Switch to the parent directory so paths can resolve and we write to the right directories.

In [None]:
# Work out the project root: when launched from notebooks/, it is the parent directory.
cwd = pathlib.Path.cwd().resolve()
if cwd.name == "notebooks":
    project_root = cwd.parent
else:
    project_root = cwd
scripts_dir = project_root / "scripts"

# Only move when we are not already at the root and the root contains scripts/.
if cwd != project_root and scripts_dir.is_dir():
    print(f"Changing working directory from {cwd} to {project_root}")
    os.chdir(project_root)
print("Working directory:", pathlib.Path.cwd())

Import modules from within the project. This depends on the prior cell having switched to the project root.

In [None]:
from lcats import constants
from lcats import stories

from lcats import utils
from lcats.utils import names
from lcats.utils import values

from lcats.gettenberg import api
from lcats.gettenberg import cache
from lcats.gettenberg import metadata
from lcats.gettenberg import headers

from lcats.gatherers import downloaders
from lcats.gatherers.mass_quantities import storymap
from lcats.gatherers.mass_quantities import parser

from lcats.analysis import corpus_surveyor


In [None]:
from importlib import reload

# Project modules that reloader() refreshes after their source is edited on disk.
# Keep this in sync with the project imports above; `values` was previously
# missing even though it is imported and used by the cells below.
RELOAD_MODULES = [
    api,
    cache,
    constants,
    corpus_surveyor,
    downloaders,
    headers,
    metadata,
    names,
    parser,
    stories,
    storymap,
    utils,
    values,
]
def reloader():
    """Reload every module listed in RELOAD_MODULES, printing progress as it goes."""
    for mod in RELOAD_MODULES:
        print("Reloading", mod)
        reload(mod)
    print("Reloading complete.")


In [None]:
import sys, importlib

# Print the file each gutenbergpy module was imported from, plus the first
# sys.path entry, to confirm which installed copy is actually in use.
import gutenbergpy, gutenbergpy.parse.rdfparser, gutenbergpy.caches.sqlitecache
print("gutenbergpy     :", gutenbergpy.__file__)
print("rdfparser       :", gutenbergpy.parse.rdfparser.__file__)
print("sqlitecache     :", gutenbergpy.caches.sqlitecache.__file__)
print("sys.path[0]     :", sys.path[0])

## Project Setup

### Path Setup

In [None]:
# Where the notebook is executing (absolute, resolved).
CURRENT_PATH = pathlib.Path.cwd().resolve()

# The project root is now the current directory itself; it used to be the
# parent of notebooks/ (the previous definition is kept below for reference).
# PROJECT_ROOT = CURRENT_PATH.parent
PROJECT_ROOT = CURRENT_PATH

# Local data/output directories inside the project.
DEV_CORPUS = (PROJECT_ROOT / "data")
DEV_OUTPUT = (PROJECT_ROOT / "output")

# Sibling-level resources (one level up from the project root).
# NOTE(review): "OPENIA" looks like a transposition of "OPENAI"; the name is
# left unchanged because other cells reference it.
GIT_CORPUS = (PROJECT_ROOT.parent / "corpora")
OPENIA_API_KEYS_ENV = (PROJECT_ROOT.parent / ".secrets" / "openai_api_keys.env")

def check_path(path: pathlib.Path, description: str) -> None:
    """Print whether ``path`` exists, labelling the report with ``description``.

    Args:
        path: Filesystem location to test.
        description: Human-readable label used in the printed message.
    """
    if not path.exists():
        print(f"Missing {description} from: {path}")
        return
    print(f"Found {description} at: {path}")

check_path(DEV_CORPUS, "DEV_CORPUS")
check_path(DEV_OUTPUT, "DEV_OUTPUT")
check_path(GIT_CORPUS, "GIT_CORPUS")
check_path(OPENIA_API_KEYS_ENV, "OPENIA_API_KEYS_ENV")


In [None]:
# Survey the checked-in corpora that live beside the project directory.
# (To survey the working corpora inside the project instead, use
#  project_root / "data".)
CORPORA_ROOT = (project_root / ".." / "corpora").resolve()

print("Corpora root:", CORPORA_ROOT)
print("Corpora top-level directories:", end=" ")
os.listdir(CORPORA_ROOT)  # Last expression: rendered as the cell's output.

In [None]:
# Find every story file under the checked-in corpora and summarize the result.
json_stories = corpus_surveyor.find_corpus_stories(CORPORA_ROOT)
# A bare `len(...)` in the middle of a cell is evaluated and discarded, so print it.
print("Number of stories found:", len(json_stories))
print(utils.sml(json_stories))
# Guard the element lookup so an empty corpus reports instead of raising IndexError.
if json_stories:
    print("Type of path element:", type(json_stories[0]))
else:
    print("No stories found under:", CORPORA_ROOT)

## Gathering Mass Quantities

In [None]:
single_stories = storymap.SINGLE_STORIES
len(single_stories)

In [None]:
short_stories = single_stories[:10]  # For testing purposes.

In [None]:
gatherer = downloaders.DataGatherer(
    storymap.TARGET_DIRECTORY,
    description="Single stories from Gutenberg",
    license="Public domain, from Project Gutenberg.")


In [None]:
# Fixed Gutenberg ID used as the running example in the cells below.
example_story = 24927
print("Example story:", example_story)

story, filename, error = parser.gather_story(gatherer, example_story)
print("Filename:", filename)
print("Error:", error)


In [None]:
random_story = random.choice(single_stories)
print("Random story:", random_story)

story, filename, error = parser.gather_story(gatherer, random_story)
print("Filename:", filename)
print("Error:", error)

In [None]:
def story_analyzer(story_id: int) -> None:
    """Print a step-by-step diagnosis of how one Gutenberg story is parsed.

    Fetches the subject, language, title and author metadata for ``story_id``
    and reports whether each passes its ``parser`` check. It then loads the
    e-text, strips its headers, decodes it, and prints the extracted title,
    a body preview and the generated filename. This function only prints
    its findings; it returns nothing and does not itself save the story.

    Args:
        story_id: Gutenberg book ID to analyze.
    """
    print(f"Analyzing story ID: {story_id}")

    subject = api.get_metadata('subject', story_id)
    is_subject_ok = parser.subject_ok(subject)
    print("Subject:", subject)
    print(" - Subject OK?", is_subject_ok)

    language = api.get_metadata('language', story_id)
    is_language_ok = parser.only_english(language)
    print("Language:", language)
    print(" - Is language OK?", is_language_ok)

    title = api.get_metadata('title', story_id)
    is_title_ok = parser.title_ok(title)
    print("Title:", title)
    print(" - Is title OK?", is_title_ok)

    author = list(api.get_metadata('author', story_id))
    is_author_ok = parser.author_ok(author)
    print("Author:", author)
    print(" - Is author OK?", is_author_ok)

    print("Overall, is story metadata OK?",
          all([is_subject_ok, is_language_ok, is_title_ok, is_author_ok]))

    # Show the text at each stage: raw download, header-stripped, then decoded.
    etext = api.load_etext(story_id)
    print(etext[:500])
    stripped_text = headers.strip_headers(etext.strip()).strip()
    print(stripped_text[:500])
    cleaned_text = stripped_text.decode('utf-8', errors='ignore').strip()
    print(cleaned_text[:500])

    text_length = len(cleaned_text)
    num_words = len(cleaned_text.split())
    print("Length of story text (characters):", text_length)
    print("Number of words in story text:", num_words)

    # Extract the title and body of the story. Guard against missing title
    # metadata so the diagnosis reports the problem instead of raising IndexError.
    titles = list(title)
    if not titles:
        print("No title metadata found, skipping: " + str(story_id))
        return
    extracted_title = titles[0]
    print("Title extracted:", extracted_title)
    number_of_titles = parser.how_many_titles(cleaned_text, extracted_title)
    print("Number of titles found in text:", number_of_titles)

    body = parser.body_of_text(cleaned_text, author, extracted_title, True)
    print("Extracted body length:", len(body))
    print("Extracted body preview:", body[:200])

    if len(body) < 10:
        # Report the ID being analyzed (not the unrelated notebook-level
        # `story` variable) and stop, as the message says.
        print("Story is too short, skipping: " + str(story_id))
        return

    # If we get here, we have the pieces of the story; show the filename it
    # would be saved under.
    file_name = names.title_to_filename(
        extracted_title, ext=constants.FILE_SUFFIX, max_len=50)
    print("Generated filename:", file_name)
    

story_analyzer(example_story)

In [None]:
story_analyzer(23920)

In [None]:
# Inline walk-through of the title/author/body extraction for example_story.
# The globals defined here (notably cleaned_text) are reused by later cells.
title = api.get_metadata('title', example_story)
is_title_ok = parser.title_ok(title)
print("Title:", title)
print(" - Is title OK?", is_title_ok)

author = list(api.get_metadata('author', example_story))
is_author_ok = parser.author_ok(author)
print("Author:", author)
print(" - Is author OK?", is_author_ok)

# Unlike story_analyzer, this cell decodes the raw e-text directly and does
# not strip the headers first (the header-stripping variant is commented out below).
etext = api.load_etext(example_story)
print(etext[:500])
cleaned_text = etext.decode('utf-8', errors='ignore').strip()
print(cleaned_text[:500])

# cleaned_text = str(headers.strip_headers(api.load_etext(example_story).strip()))
text_length = len(cleaned_text)
num_words = len(cleaned_text.split())
print("Length of story text (characters):", text_length)
print("Number of words in story text:", num_words)

# Extract the title and body of the story.
# list(title)[0] raises IndexError if no title metadata came back.
extracted_title = list(title)[0]
print("Title extracted:", extracted_title)
number_of_titles = parser.how_many_titles(cleaned_text, extracted_title)
print("Number of titles found in text:", number_of_titles)

# Paragraphs are approximated by splitting on blank lines.
paragraph_array = cleaned_text.split("\n\n")
print("Number of paragraphs found in text:", len(paragraph_array))

body = parser.body_of_text(cleaned_text, author, extracted_title, True)
print("Extracted body length:", len(body))
print("Extracted body preview:", body[:200])


In [None]:
print(cleaned_text[:500])

In [None]:
random_story = random.choice(single_stories)
print("Random story ID:", random_story)

api.get_metadata('subject', random_story)

In [None]:
cleaned_text.split('\n')[:10]

In [None]:
reloader()

In [None]:
# Find every story file under the project-local development corpus and summarize it.
dev_stories = corpus_surveyor.find_corpus_stories(DEV_CORPUS)
# A bare `len(...)` in the middle of a cell is evaluated and discarded, so print it.
print("Number of stories found:", len(dev_stories))
print(utils.sml(dev_stories))
# Guard the element lookup so an empty corpus reports instead of raising IndexError.
if dev_stories:
    print("Type of path element:", type(dev_stories[0]))
else:
    print("No stories found under:", DEV_CORPUS)

In [None]:

def titles_for(cache, book_id: int) -> Set[str]:
    """Look up the title strings the cache database holds for one Gutenberg book.

    Args:
        cache: Object exposing ``native_query(sql)``; note this parameter
            shadows the ``cache`` module inside the function.
        book_id: Gutenberg book ID; coerced with ``int()`` before being
            interpolated into the SQL text.

    Returns:
        Whatever ``values.strings_from_sql`` produces from the query rows.
    """
    sql = f"""
        SELECT t.name AS v
        FROM titles t
        JOIN books b ON t.bookid = b.id
        WHERE b.gutenbergbookid = {int(book_id)}
        """
    title_rows = cache.native_query(sql)
    return values.strings_from_sql(title_rows)

def text_for(book_id: int):
    """Load the e-text for ``book_id`` and return it decoded and stripped.

    Decoding is UTF-8 with undecodable bytes ignored; the headers are not removed.
    """
    raw = api.load_etext(book_id)
    return raw.decode('utf-8', errors='ignore').strip()


def summary_of(book_id):
    """Print the cached titles and the first 500 characters of text for a book.

    Returns:
        A ``(book_text, book_titles)`` tuple for further inspection.
    """
    book_titles = titles_for(cache.ensure_gutenberg_cache(), book_id)
    book_text = text_for(book_id)

    print("Book ID: ", book_id)
    print(" - Book Titles:", book_titles)
    print(" - Book Text:", book_text[:500])
    return book_text, book_titles


text, titles = summary_of(example_story)

In [None]:
random_story = random.choice(single_stories)
random_text, random_titles = summary_of(random_story)

In [None]:
from lcats.gettenberg import cache

print("DB path:", cache.gutenberg_cache_path())
print("DB size:", cache.gutenberg_cache_path().stat().st_size if cache.gutenberg_cache_path().exists() else 0)
print("Texts dir:", cache.GUTENBERG_TEXTS.resolve())


In [None]:
cache.gutenberg_cache_path().stat()

In [None]:
import sqlite3, pathlib
# Open the cache database through a SQLite URI with mode=ro (read-only).
# `con` is left open on purpose: the following cells query through it.
db = pathlib.Path(cache.gutenberg_cache_path())
con = sqlite3.connect(f"file:{db}?mode=ro", uri=True)

# Dump the column definitions of the two tables joined in the title lookups.
print("titles:", list(con.execute("PRAGMA table_info(titles);")))
print("books :", list(con.execute("PRAGMA table_info(books);")))

In [None]:
gid = 3115

# All queries use `?` placeholders rather than f-string interpolation, matching
# the parameterized style of the later lookup cells.

# What is the internal PK for this Gutenberg ID?
# The destructuring deliberately requires exactly one matching row and raises
# ValueError otherwise.
[(book_pk,)] = list(con.execute("SELECT id FROM books WHERE gutenbergbookid=?", (gid,)))
print("Internal PK for Gutenberg ID", gid, "is", book_pk)

# What title rows link to that PK?
print("Titles for book PK", book_pk, ":")
print(list(con.execute("SELECT name FROM titles WHERE bookid=?", (book_pk,))))
# Cross-check via join (should match):
print("Titles via join:")
print(list(con.execute("""
    SELECT t.name
    FROM titles t JOIN books b ON t.bookid=b.id
    WHERE b.gutenbergbookid=?
""", (gid,))))

In [None]:
text, titles = summary_of(3115)

In [None]:
from lcats.gettenberg import cache
from gutenbergpy import gutenbergcache as gc
import pathlib, sqlite3

# Show the paths gutenbergpy itself is configured to use.
print("Settings:")
print("  CACHE_FILENAME:",           gc.GutenbergCacheSettings.CACHE_FILENAME)
print("  CACHE_RDF_ARCHIVE_NAME:",   gc.GutenbergCacheSettings.CACHE_RDF_ARCHIVE_NAME)
print("  CACHE_RDF_UNPACK_DIRECTORY:", gc.GutenbergCacheSettings.CACHE_RDF_UNPACK_DIRECTORY)
print("  TEXT_FILES_CACHE_FOLDER:",  gc.GutenbergCacheSettings.TEXT_FILES_CACHE_FOLDER)

# This rebinds `db` to the path from gutenbergpy's settings rather than from
# cache.gutenberg_cache_path().
db = pathlib.Path(gc.GutenbergCacheSettings.CACHE_FILENAME)
print("\nDB path exists/size:", db, db.exists(), (db.stat().st_size if db.exists() else 0))

# Verify the schema columns you actually have.
# This rebinds `con` to a new read-only connection without closing any earlier one;
# the following cells query through this connection.
con = sqlite3.connect(f"file:{db}?mode=ro", uri=True)
print("PRAGMA titles:", list(con.execute("PRAGMA table_info(titles)")))
print("PRAGMA books :", list(con.execute("PRAGMA table_info(books)")))

In [None]:
gid = 24927
# Internal PK for this Gutenberg ID (parameterized; an empty list means no such book).
row = list(con.execute("SELECT id FROM books WHERE gutenbergbookid=?", (gid,)))
print("book PK:", row)
if row:
    # Only the first matching row is used.
    (book_pk,) = row[0]
    # Titles attached directly to that PK, then the same lookup expressed as a join.
    print("titles for PK:", list(con.execute("SELECT name FROM titles WHERE bookid=?", (book_pk,))))
    print("join check:", list(con.execute("""
        SELECT t.name
        FROM titles t JOIN books b ON t.bookid=b.id
        WHERE b.gutenbergbookid=?""", (gid,))))

In [None]:
from pathlib import Path
unpack = Path(gc.GutenbergCacheSettings.CACHE_RDF_UNPACK_DIRECTORY)
rdf = unpack / str(gid) / f"pg{gid}.rdf"
print("RDF exists:", rdf, rdf.exists(), rdf.stat().st_size if rdf.exists() else 0)
if rdf.exists():
    print("first 200 bytes:", rdf.read_bytes()[:200])

In [None]:
gid = 24927

# A) Does titles(bookid) already store the Gutenberg ID?
print(list(con.execute("SELECT name FROM titles WHERE bookid=?", (gid,))))

# B) What do we get if we join titles.bookid to books.gutenbergbookid?
print(list(con.execute("""
    SELECT t.name
    FROM titles t
    JOIN books b ON t.bookid = b.gutenbergbookid
    WHERE b.gutenbergbookid=?""", (gid,))))

# C) Your current join (bookid -> books.id), which yields the *wrong* title:
print(list(con.execute("""
    SELECT t.name
    FROM titles t
    JOIN books b ON t.bookid = b.id
    WHERE b.gutenbergbookid=?""", (gid,))))



In [None]:
import random
from lcats.gettenberg import api, headers

def header_titles_for(gid: int) -> set[str]:
    """Collect the value of every ``Title:`` line in the e-text header of ``gid``.

    The prefix match is case-insensitive; each value is the text after the
    first colon, stripped of surrounding whitespace.
    """
    header_lines = headers.get_text_header_lines(api.load_etext(gid))
    found = set()
    for line in header_lines:
        if line.lower().startswith("title:"):
            found.add(line.split(":", 1)[1].strip())
    return found

# Compare the two candidate joins against the titles found in each e-text
# header, over 50 randomly chosen books.
gids = [r[0] for r in con.execute("SELECT gutenbergbookid FROM books ORDER BY RANDOM() LIMIT 50")]

mismatch_id_join = 0
mismatch_gid_join = 0

# Note: this loop rebinds the notebook-level `gid` variable.
for gid in gids:
    # Titles when titles.bookid is joined to the internal primary key books.id.
    t_id_join  = {r[0] for r in con.execute(
        """SELECT t.name FROM titles t
           JOIN books b ON t.bookid=b.id
           WHERE b.gutenbergbookid=?""", (gid,))}
    # Titles when titles.bookid is joined to books.gutenbergbookid instead.
    t_gid_join = {r[0] for r in con.execute(
        """SELECT t.name FROM titles t
           JOIN books b ON t.bookid=b.gutenbergbookid
           WHERE b.gutenbergbookid=?""", (gid,))}
    # Best effort: a book whose header cannot be read is reported and then
    # treated as having no header titles, which excludes it from both counts.
    try:
        t_header   = header_titles_for(gid)
    except Exception as e:
        print(f"Error getting header titles for {gid}: {e}")
        t_header = set()

    # A mismatch is counted only when both sides are non-empty and share no
    # title at all, so books with an empty set on either side are never counted.
    if t_header and t_id_join and t_header.isdisjoint(t_id_join):
        mismatch_id_join += 1
    if t_header and t_gid_join and t_header.isdisjoint(t_gid_join):
        mismatch_gid_join += 1

print("mismatch w/ bookid→books.id join:", mismatch_id_join)
print("mismatch w/ bookid→books.gutenbergbookid join:", mismatch_gid_join)

In [None]:
gutenberg_cache = cache.ensure_gutenberg_cache()


In [None]:

# 1) What title does your DB think belongs to 24927?
list(gutenberg_cache.native_query("""
SELECT t.name
FROM titles t
JOIN books b ON t.bookid=b.id
WHERE b.gutenbergbookid=24927
"""))
# -> [('The Red Cross Girls with Pershing to Victory',)]


In [None]:

# 2) Which Gutenberg ID does that wrong title actually belong to?
list(gutenberg_cache.native_query("""
SELECT b.gutenbergbookid
FROM titles t
JOIN books b ON t.bookid=b.id
WHERE t.name='The Red Cross Girls with Pershing to Victory'
"""))
# Expect a different gid (not 24927). If you see a plausible but wrong gid,
# we have clear evidence of mis-association during cache creation.


In [None]:
# 3) Sanity: count how widespread this is on a sample
import random
sample = [row[0] for row in gutenberg_cache.native_query("SELECT gutenbergbookid FROM books LIMIT 500")]
def db_title(gid):
    """Return the first title the cache database joins to ``gid``, or None if there is none."""
    rows = gutenberg_cache.native_query(f"""
        SELECT t.name FROM titles t JOIN books b ON t.bookid=b.id
        WHERE b.gutenbergbookid={gid}""")
    # Only the first row matters; fall through to None when the query is empty.
    for row in rows:
        return row[0]
    return None

def header_title(gid):
    """Return the value of the first ``Title:`` header line of ``gid``, or None if absent."""
    header_lines = headers.get_text_header_lines(api.load_etext(gid))
    candidates = (
        line.split(":", 1)[1].strip()
        for line in header_lines
        if line.lower().startswith("title:")
    )
    return next(candidates, None)

# Despite its name, `mismatches` holds a (gid, db_title, header_title) triple for
# every one of the 50 sampled books; random.sample raises ValueError if
# `sample` has fewer than 50 entries.
mismatches = [(gid, db_title(gid), header_title(gid)) for gid in random.sample(sample, 50)]
# Cell output: only the triples where both titles are present and differ.
[m for m in mismatches if m[1] and m[2] and m[1] != m[2]]


In [None]:
len(mismatches)

In [None]:
significant_mismatch = [m for m in mismatches if m[1] and m[2] and not m[1].lower().startswith(m[2].lower())]
significant_mismatch, len(significant_mismatch)

In [None]:
# Only the last expression of a cell is displayed, so the results of steps 1
# and 2 are printed explicitly.

# 1) Find internal PK for the Gutenberg ID
pk = list(gutenberg_cache.native_query("SELECT id FROM books WHERE gutenbergbookid=24927"))[0][0]
print("Internal PK:", pk)

# 2) Titles attached to that PK:
print("Titles attached to PK:",
      list(gutenberg_cache.native_query(f"SELECT name FROM titles WHERE bookid={pk}")))

# 3) Cross-check: the one line join you’re using (shown as the cell output)
list(gutenberg_cache.native_query("""
  SELECT t.name
  FROM titles t JOIN books b ON t.bookid=b.id
  WHERE b.gutenbergbookid=24927
"""))

In [None]:
import inspect, gutenbergpy.parse.parseitemtitles as pit
import gutenbergpy.parse.rdfparser as rp
import gutenbergpy.caches.sqlitecache as sc

print(inspect.getsource(pit))  # how titles are extracted/applied
print(inspect.getsource(rp))   # how book key is computed and passed around
print(inspect.getsource(sc))   # where INSERTs into titles happen

In [None]:
import tempfile
from pathlib import Path
import shutil
from pprint import pprint

# library imports
from gutenbergpy import gutenbergcache as gc
from gutenbergpy.parse.rdfparser import RdfParser

def run_rdfparser_on_single_rdf(rdf_file: Path):
    """Run gutenbergpy's RDF parser against a single ``pg<gid>.rdf`` file.

    The file is copied into a fresh temporary ``epub/<gid>/`` tree, the
    gutenbergpy unpack-directory setting is pointed at that tree for the
    duration of the parse, and the previous setting is restored afterwards.

    Args:
        rdf_file: Path to an RDF file named ``pg<gid>.rdf``.

    Returns:
        A ``(parser, result, tmp)`` tuple: the ``RdfParser`` instance, the value
        returned by its ``do()`` call, and the temporary root directory (which
        is not deleted here).
    """
    rdf_file = Path(rdf_file)
    looks_like_rdf = rdf_file.name.startswith("pg") and rdf_file.suffix == ".rdf"
    assert looks_like_rdf, "Expect a filename like pg<gid>.rdf"

    gid = int(rdf_file.stem[2:])

    # Build the unpack tree the parser expects: <unpack_dir>/<gid>/pg<gid>.rdf
    tmp = Path(tempfile.mkdtemp(prefix="gpy_rdf_"))
    epub_root = tmp / "epub"
    book_dir = epub_root / str(gid)
    book_dir.mkdir(parents=True, exist_ok=True)
    shutil.copy2(rdf_file, book_dir / rdf_file.name)

    # Redirect only the unpack directory, remembering the old value for restoration.
    settings = gc.GutenbergCacheSettings
    previous_unpack_dir = settings.CACHE_RDF_UNPACK_DIRECTORY
    try:
        settings.set(CacheUnpackDir=str(epub_root))
        rdf_parser = RdfParser()
        return rdf_parser, rdf_parser.do(), tmp
    finally:
        # Restore the setting so other code is not surprised.
        settings.set(CacheUnpackDir=previous_unpack_dir)

# ---- usage ----
# Point to a *known good* RDF you already have on disk (path is relative to
# the current working directory).
rdf = Path("cache/epub/24927/pg24927.rdf")
# Bind the RdfParser instance to `rdf_parser`, not `parser`: a notebook-level
# `parser` would overwrite the lcats `parser` module that earlier cells and
# story_analyzer() rely on.
rdf_parser, parsed, tmp_root = run_rdfparser_on_single_rdf(rdf)

print()
print("Parsed items:", parsed)
# Peek at what the parser produced (shape is implementation-specific; print to inspect)
print("Parsed book title id:", parsed.books[0].titles_id)


In [None]:
dir(parsed)

In [None]:
parsed.books[0].titles_id

In [None]:
import tempfile
from pathlib import Path
import shutil
from unittest import mock
from pprint import pprint

from gutenbergpy import gutenbergcache as gc
from gutenbergpy.parse.rdfparser import RdfParser
import gutenbergpy.caches.sqlitecache as sqlitecache  # the DB writer used by the parser

def capture_parser_writes_for_single_rdf(rdf_file: Path):
    """Run RdfParser on one RDF and capture what it tries to write to the cache.

    The RDF file is copied into a temporary ``epub/<gid>/`` tree, the
    ``insert_*`` methods on ``sqlitecache.SQLiteCache`` are replaced with
    recorders, and the parser is run with the unpack directory pointed at the
    temporary tree. The unpack-directory setting is restored afterwards.

    Args:
        rdf_file: Path to an RDF file named ``pg<gid>.rdf``.

    Returns:
        A ``(captured, tmp)`` tuple: a dict mapping each bucket name
        ("books", "titles", "authors", "languages", "subjects") to the list of
        rows passed to the matching ``insert_*`` method, and the temporary
        root directory (which is not deleted here).

    Raises:
        ValueError: If the digits after ``pg`` in the file stem are not an integer.
        AttributeError: If ``SQLiteCache`` lacks ``insert_books`` or
            ``insert_titles`` (these two are patched without ``create=True``).
    """
    rdf_file = Path(rdf_file)
    gid = int(rdf_file.stem[2:])

    # Build the unpack tree the parser expects: <unpack_dir>/<gid>/pg<gid>.rdf
    tmp = Path(tempfile.mkdtemp(prefix="gpy_cap_"))
    unpack_dir = tmp / "epub" / str(gid)
    unpack_dir.mkdir(parents=True, exist_ok=True)
    shutil.copy2(rdf_file, unpack_dir / rdf_file.name)

    old_unpack = gc.GutenbergCacheSettings.CACHE_RDF_UNPACK_DIRECTORY

    captured = {
        "books": [],
        "titles": [],
        "authors": [],
        "languages": [],
        "subjects": [],
        # add more buckets if you want to observe other tables
    }

    # Discover insert-like methods to patch (names can vary a bit between versions).
    # Inspect the module or print(dir(sqlitecache.SQLiteCache)) if unsure.
    # NOTE(review): whether RdfParser.do() calls any of these is not confirmed
    # here; if it does not, every bucket simply stays empty.
    def cap(name):
        def _cap(self, rows):
            captured[name].extend(list(rows))
        return _cap

    # Install the recorders with `new=` so each one is a plain function on the
    # class and is bound like a method (receiving `self`). With `side_effect=`
    # on an auto-created MagicMock, the instance is not passed, so a
    # `(self, rows)` recorder would fail with TypeError when called.
    cache_cls = sqlitecache.SQLiteCache
    with \
        mock.patch.object(cache_cls, "insert_books", new=cap("books")), \
        mock.patch.object(cache_cls, "insert_titles", new=cap("titles")), \
        mock.patch.object(cache_cls, "insert_authors", new=cap("authors"), create=True), \
        mock.patch.object(cache_cls, "insert_languages", new=cap("languages"), create=True), \
        mock.patch.object(cache_cls, "insert_subjects", new=cap("subjects"), create=True):

        try:
            gc.GutenbergCacheSettings.set(CacheUnpackDir=str(tmp / "epub"))

            parser = RdfParser()
            parser.do()  # any patched insert_* calls are recorded instead of touching a DB
        finally:
            # Restore the setting so other code is not surprised.
            gc.GutenbergCacheSettings.set(CacheUnpackDir=old_unpack)

    return captured, tmp

# ---- usage ----
captured, tmp_root = capture_parser_writes_for_single_rdf(Path("cache/epub/24927/pg24927.rdf"))

print("BOOK rows:", captured["books"])
print("TITLE rows:", captured["titles"])
pprint(captured["titles"][:3])
