# Debug Gutenberg Extraction

This notebook explores the setup of our Gutenberg(py) interface.

## Imports

In [None]:
from datetime import date
import os
import pathlib
import sys

from typing import Any, Dict, Iterable, List, Mapping, Set

import numpy as np
import pandas as pd
import sqlite3

Third-party modules

In [None]:
import dotenv
from openai import OpenAI
import tiktoken

from gutenbergpy import gutenbergcache as gc

Switch to the parent directory so paths can resolve and we write to the right directories.

In [None]:
# Work out the project root: when launched from inside "notebooks/", it is the
# parent directory; otherwise the current directory is assumed to be the root.
cwd = pathlib.Path.cwd().resolve()
if cwd.name == "notebooks":
    project_root = cwd.parent
else:
    project_root = cwd
scripts_dir = project_root / "scripts"

# Only move when the candidate root really contains a "scripts" directory and
# we are not already there.
if scripts_dir.is_dir() and cwd != project_root:
    print(f"Changing working directory from {cwd} to {project_root}")
    os.chdir(project_root)  # Change to the project root directory.
print("Working directory:", pathlib.Path.cwd())

Import project-local modules. This depends on the previous cell, which switches the working directory to the project root.

In [None]:
from lcats import constants
from lcats.gatherers import gutenberg


## Project Setup

### Path Setup

Configure the Gutenberg cache.

In [None]:
# All Gutenberg artifacts live under a single relative "cache" directory.
_GUTENBERG_ROOT = pathlib.Path("cache")
_GUTENBERG_TEXTS = _GUTENBERG_ROOT / "texts"
_GUTENBERG_TMP = _GUTENBERG_ROOT / "tmp"

# parents=True creates the shared root as well; exist_ok=True makes re-runs safe.
for _cache_dir in (_GUTENBERG_TEXTS, _GUTENBERG_TMP):
    _cache_dir.mkdir(parents=True, exist_ok=True)


In [None]:
# Point every gutenbergpy cache artifact at a location under the local cache root.
_cache_locations = {
    "CacheFilename": str(_GUTENBERG_ROOT / "gutenbergindex.db"),
    "CacheUnpackDir": str(_GUTENBERG_ROOT / "epub"),  # Can't change this default.
    "CacheArchiveName": str(_GUTENBERG_ROOT / "rdf-files.tar.bz2"),
    "TextFilesCacheFolder": str(_GUTENBERG_TEXTS),
}
gc.GutenbergCacheSettings.set(**_cache_locations)

# Echo the values the settings class now reports, one per line.
for _setting_name in (
    "CACHE_FILENAME",
    "CACHE_RDF_UNPACK_DIRECTORY",
    "CACHE_RDF_ARCHIVE_NAME",
    "TEXT_FILES_CACHE_FOLDER",
):
    print(f"{_setting_name}:", getattr(gc.GutenbergCacheSettings, _setting_name))

Create the cache, but don't delete temp files.

In [None]:
# Make sure both working directories exist before the cache build starts.
for _cache_dir in (_GUTENBERG_TMP, _GUTENBERG_TEXTS):
    _cache_dir.mkdir(parents=True, exist_ok=True)

# Run every stage of the cache build; deleteTemp=False keeps the intermediate
# files around afterwards.
_create_options = dict(
    refresh=True,
    download=True,
    unpack=True,
    parse=True,
    cache=True,
    deleteTemp=False,
)
gc.GutenbergCache.create(**_create_options)


In [None]:
# Obtain the cache handle; the query cells below all go through `cache`.
cache = gc.GutenbergCache.get_cache()

In [None]:
# Display the cache's table_map attribute (left as the bare last expression so
# the notebook renders it).
cache.table_map

In [None]:
def show_settings():
    """Print the four gutenbergpy cache settings, one aligned line each."""
    settings = gc.GutenbergCacheSettings
    labelled_values = (
        ("CACHE_FILENAME       =", settings.CACHE_FILENAME),
        ("CACHE_ARCHIVE_NAME   =", settings.CACHE_RDF_ARCHIVE_NAME),
        ("CACHE_UNPACK_DIR     =", settings.CACHE_RDF_UNPACK_DIRECTORY),
        ("TEXT_FILES_CACHE_DIR =", settings.TEXT_FILES_CACHE_FOLDER),
    )
    for label, value in labelled_values:
        print(label, value)

def file_info(p):
    """Summarize a filesystem path.

    Args:
        p: A path (str or path-like); a leading ``~`` is expanded.

    Returns:
        A dict with keys ``path`` (the expanded path as a string), ``exists``
        (bool) and ``size`` (``st_size`` in bytes, or 0 when the path is absent).
    """
    target = pathlib.Path(p).expanduser()
    present = target.exists()
    size = target.stat().st_size if present else 0
    return {"path": str(target), "exists": present, "size": size}

def list_tables(db_path):
    """Return the names of all tables in a SQLite database, sorted by name.

    The database is opened read-only, so inspecting it can never create or
    modify the file.

    Args:
        db_path: Path (str or path-like) to the SQLite file; a leading ``~``
            is expanded.

    Returns:
        A list of table names in ascending order, or an empty list when the
        file does not exist or is zero bytes long.
    """
    db = pathlib.Path(db_path).expanduser()
    if not db.exists() or db.stat().st_size == 0:
        return []
    # as_uri() percent-encodes characters such as '#', '?', '%' and spaces that
    # would otherwise be misread as URI syntax; it requires an absolute path.
    uri = f"{db.resolve().as_uri()}?mode=ro"
    con = sqlite3.connect(uri, uri=True)
    try:
        query = "SELECT name FROM sqlite_master WHERE type='table' ORDER BY name"
        return [row[0] for row in con.execute(query)]
    finally:
        # Using the connection as a context manager only commits/rolls back;
        # it does not close the handle, so close it explicitly.
        con.close()

# Report the active settings, then what is actually on disk for each artifact.
show_settings()
for _label, _location in (
    ("DB:", gc.GutenbergCacheSettings.CACHE_FILENAME),
    ("RDF:", gc.GutenbergCacheSettings.CACHE_RDF_ARCHIVE_NAME),
    ("UNPACK:", gc.GutenbergCacheSettings.CACHE_RDF_UNPACK_DIRECTORY),
):
    print(_label, file_info(_location))
print("TABLES:", list_tables(gc.GutenbergCacheSettings.CACHE_FILENAME))

In [None]:
# Count rows in the three core tables; each query yields a single (count,) row.
count_books, count_authors, count_titles = (
    list(cache.native_query(f"SELECT COUNT(*) FROM {table};"))[0][0]
    for table in ("books", "authors", "titles")
)
print(count_books, count_authors, count_titles)  # should be non-zero

In [None]:
# Collect the column descriptions SQLite reports for each table of interest.
subjects_pragma, books_pragma, book_subjects_pragma = (
    list(cache.native_query(f"PRAGMA table_info({table});"))
    for table in ("subjects", "books", "book_subjects")
)
for _label, _pragma_rows in (
    ("SUBJECTS PRAGMA:", subjects_pragma),
    ("BOOKS PRAGMA:", books_pragma),
    ("BOOK_SUBJECTS PRAGMA:", book_subjects_pragma),
):
    print(_label, _pragma_rows)

In [None]:
# `info` was never assigned anywhere in this notebook, so this cell raised
# NameError on a fresh kernel. Bind it here so the cell is self-contained.
# NOTE(review): the `titles` table is a guess at what was being inspected —
# confirm which table was intended.
info = cache.native_query("PRAGMA table_info(titles);")
list(info)

In [None]:
def _vals(rows: Iterable[Mapping[str, Any] | tuple]) -> Set[str]:
    """Normalize sqlite rows (tuple or dict) to a set of string values."""
    out: Set[str] = set()
    for r in rows:
        if isinstance(r, (tuple, list)):
            if r and r[0] is not None:
                out.add(str(r[0]))
        else:
            v = r.get("v")
            if v is not None:
                out.add(str(v))
    return out

def titles_for(cache, bid: int) -> Set[str]:
    """Return the set of title names linked to one Gutenberg book id.

    Args:
        cache: Object exposing ``native_query(sql)``.
        bid: Value matched against ``books.gutenbergbookid``; coerced with
            ``int()`` before being placed in the SQL text.

    Returns:
        The query rows normalized to a set of strings by ``_vals``.
    """
    book_id = int(bid)  # only an integer ever reaches the SQL text
    query = f"""
        SELECT t.name AS v
        FROM titles t
        JOIN books b ON t.bookid = b.id
        WHERE b.gutenbergbookid = {book_id}
        """
    return _vals(cache.native_query(query))

def languages_for(cache, bid: int) -> Set[str]:
    """Return the set of language names linked to one Gutenberg book id.

    Args:
        cache: Object exposing ``native_query(sql)``.
        bid: Value matched against ``books.gutenbergbookid``; coerced with
            ``int()`` before being placed in the SQL text.

    Returns:
        The query rows normalized to a set of strings by ``_vals``.
    """
    book_id = int(bid)  # only an integer ever reaches the SQL text
    query = f"""
        SELECT l.name AS v
        FROM languages l
        JOIN books b ON l.id = b.languageid
        WHERE b.gutenbergbookid = {book_id}
        """
    return _vals(cache.native_query(query))

def authors_for(cache, bid: int) -> Set[str]:
    """Return the set of author names linked to one Gutenberg book id.

    Args:
        cache: Object exposing ``native_query(sql)``.
        bid: Value matched against ``books.gutenbergbookid``; coerced with
            ``int()`` before being placed in the SQL text.

    Returns:
        The query rows normalized to a set of strings by ``_vals``.
    """
    book_id = int(bid)  # only an integer ever reaches the SQL text
    query = f"""
        SELECT a.name AS v
        FROM authors a
        JOIN book_authors ba ON a.id = ba.authorid
        JOIN books b         ON ba.bookid = b.id
        WHERE b.gutenbergbookid = {book_id}
        """
    return _vals(cache.native_query(query))

def subjects_for(cache, bid: int) -> Set[str]:
    """Return the set of subject names linked to one Gutenberg book id.

    Args:
        cache: Object exposing ``native_query(sql)``.
        bid: Value matched against ``books.gutenbergbookid``; coerced with
            ``int()`` before being placed in the SQL text.

    Returns:
        The query rows normalized to a set of strings by ``_vals``.
    """
    book_id = int(bid)  # only an integer ever reaches the SQL text
    query = f"""
        SELECT s.name AS v
        FROM subjects s
        JOIN book_subjects bs ON s.id = bs.subjectid
        JOIN books b          ON bs.bookid = b.id
        WHERE b.gutenbergbookid = {book_id}
        """
    return _vals(cache.native_query(query))


In [None]:
# Spot-check: subject names recorded for Gutenberg book id 1342.
subjects_for(cache, 1342)

In [None]:
# Spot-check: title names recorded for Gutenberg book id 1342.
titles_for(cache, 1342)

In [None]:
# Spot-check: author names recorded for Gutenberg book id 1342.
authors_for(cache, 1342)

In [None]:
# Spot-check: language names recorded for Gutenberg book id 1342.
languages_for(cache, 1342)