# Dragon's Codex - Data Exploration Notebook

**Week 1, Session 3**: Testing markdown parsing and data extraction

This notebook tests:
1. Book markdown parsing
2. Chapter extraction
3. Wiki temporal section parsing
4. Glossary extraction

## Setup

In [None]:
# Test imports
import sys
from pathlib import Path
import json
import re

# Add src to path
sys.path.append('../src')

# Test our utilities
from utils.config import Config
from utils.logger import setup_logging, get_logger
from utils.wot_constants import BOOK_TITLES, get_book_number

# Build the shared project configuration and a logger for this notebook.
config = Config()
logger = get_logger(__name__)

# Confirm the imports worked, then echo the resolved project locations so a
# wrong checkout or data directory is obvious before any parsing starts.
print("✓ All imports successful!")
for path_banner in (
    f"✓ Project root: {config.PROJECT_ROOT}",
    f"✓ Books path: {config.BOOKS_PATH}",
    f"✓ Wiki path: {config.WIKI_PATH}",
):
    print(path_banner)

✓ All imports successful!
✓ Project root: C:\Users\victor.diaz\Documents\_AI\dragon-codex
✓ Books path: C:\Users\victor.diaz\Documents\_AI\dragon-codex\data\raw\books
✓ Wiki path: C:\Users\victor.diaz\Documents\_AI\dragon-codex\data\raw\wiki


## 1. Book Parsing Test

In [3]:
# Load Book 1 (Eye of the World)
books_dir = Path(config.BOOKS_PATH)

# Prefer a file whose name mentions "Eye", trying .md before .txt. Each glob is
# sorted on its own so the pick is reproducible: glob results are not
# guaranteed to come back in any particular order.
book_files = sorted(books_dir.glob('*Eye*.md')) + sorted(books_dir.glob('*Eye*.txt'))
if not book_files:
    # No title match: fall back to the first book file in name order.
    book_files = sorted(list(books_dir.glob('*.md')) + list(books_dir.glob('*.txt')))

# Fail with an actionable message instead of an IndexError on book_files[0].
if not book_files:
    raise FileNotFoundError(f"No .md or .txt book files found in {books_dir}")

book_file = book_files[0]
print(f"Loading: {book_file.name}")

# Read content
with open(book_file, 'r', encoding='utf-8') as f:
    book_content = f.read()

# Show the size and a short preview only; the full text is far too large to print.
print(f"✓ Loaded {len(book_content):,} characters")
print(f"\nFirst 500 characters:")
print("-" * 60)
print(book_content[:500])

Loading: 01-The_Eye_of_the_World.txt
✓ Loaded 1,677,243 characters

First 500 characters:
------------------------------------------------------------
PROLOGUE

 

*Dragonmount*

 

 

The palace still shook occasionally as the earth rumbled in memory, groaned as if it would deny what had happened. Bars of sunlight cast through rents in the walls made motes of dust glitter where they yet hung in the air. Scorch-marks marred the walls, the floors, the ceilings. Broad black smears crossed the blistered paints and gilt of once-bright murals, soot overlaying crumbling friezes of men and animals which seemed to have attempted to walk before the mad


In [None]:
import json
import re
import sys

# Book text -> JSON converter.
#
# Reads a book text file laid out as PROLOGUE / CHAPTER / GLOSSARY sections and
# writes a JSON file next to it holding the chapters and glossary entries.
# The parsing lives in functions so it can be called and tested on its own; the
# driver at the bottom performs the file I/O.

# Input used when no usable filename is supplied on the command line.
DEFAULT_BOOK_FILENAME = '01-The Eye of the World - Robert Jordan-sample.txt'

# "Term (pronunciation): description". The term may not contain ':' so that
# parentheses appearing later in a description are never mistaken for a
# pronunciation.
_TERM_WITH_PRONUNCIATION = re.compile(r'^([^:]+?)\s*\(([^)]+)\)\s*:\s*(.*)$')
# "Term: description" (no pronunciation).
_TERM_PLAIN = re.compile(r'^([^:]+):\s*(.*)$')


def resolve_filename(argv, default=DEFAULT_BOOK_FILENAME):
    """Return the input filename from ``argv``, or ``default``.

    Args:
        argv: Argument list in ``sys.argv`` form (``argv[0]`` is the program).
        default: Filename returned when ``argv`` carries no usable argument.

    Returns:
        ``argv[1]`` when it is present and is not an option flag, else ``default``.
    """
    # Inside a Jupyter kernel sys.argv holds launcher options such as
    # "-f <connection file>", so argv[1] is a flag rather than a book file.
    if len(argv) > 1 and not argv[1].startswith('-'):
        return argv[1]
    return default


def _strip_txt_suffix(name):
    """Remove one trailing ".txt" (any letter case) from ``name`` if present."""
    # str.rstrip('.txt') would strip any trailing run of '.', 't' and 'x'
    # characters ("Hunt.txt" -> "Hun"), so the suffix is removed explicitly.
    if name.lower().endswith('.txt'):
        return name[:-len('.txt')]
    return name


def split_book_filename(filename):
    """Split a "<number>-<name>.txt" filename into ``(book_number, book_name)``.

    Only the final path component is inspected, so a directory prefix does not
    end up in the book number. When the name contains no '-', the whole name is
    returned as the number and the book name is ''.
    """
    base_name = Path(filename).name
    number, dash, remainder = base_name.partition('-')
    if not dash:
        return number.strip(), ''
    return number.strip(), _strip_txt_suffix(remainder).strip()


def json_output_path(filename):
    """Return the path of the JSON file to write for ``filename``.

    A trailing ".txt" is replaced by ".json". Any other name gets ".json"
    appended, so the result can never be the input file itself.
    """
    return _strip_txt_suffix(filename) + '.json'


def _skip_blank_lines(lines, index):
    """Return the index of the first non-blank line at or after ``index``."""
    while index < len(lines) and not lines[index].strip():
        index += 1
    return index


def _read_heading(lines, index, numbered):
    """Read the heading that follows a PROLOGUE or CHAPTER marker line.

    Args:
        lines: All lines of the book.
        index: Index of the line immediately after the marker.
        numbered: True for CHAPTER (a number line precedes the title), False
            for PROLOGUE (number is 0).

    Returns:
        ``(chapter, next_index)`` where ``chapter`` is a new chapter dict with
        empty content and ``next_index`` is the first line of its body.
    """
    number = 0
    if numbered:
        index = _skip_blank_lines(lines, index)
        if index < len(lines):
            try:
                number = int(lines[index].strip())
            except ValueError:
                # Non-numeric number line: keep 0. The line is still consumed.
                pass
        index += 1

    index = _skip_blank_lines(lines, index)
    title = lines[index].strip().strip('*') if index < len(lines) else ""
    index = _skip_blank_lines(lines, index + 1)
    return {"number": number, "title": title, "content": ""}, index


def _parse_term_line(clean_line):
    """Parse a cleaned glossary heading into ``(term, pronunciation, description)``.

    Returns None when the line is not of the form "term: description" or
    "term (pronunciation): description".
    """
    match = _TERM_WITH_PRONUNCIATION.match(clean_line)
    if match:
        term, pronunciation, description = (part.strip() for part in match.groups())
    else:
        # Parentheses may occur only inside the description; fall back to the
        # plain form instead of dropping the entry.
        match = _TERM_PLAIN.match(clean_line)
        if not match:
            return None
        term = match.group(1).strip()
        pronunciation = ""
        description = match.group(2).strip()
    if not term:
        return None
    return term, pronunciation, description


def _finish_term(entries, term, description_parts):
    """Store the accumulated description on ``term`` and append it to ``entries``."""
    if term is not None:
        term["description"] = ''.join(description_parts).strip()
        entries.append(term)


def parse_glossary(glossary_lines):
    """Parse glossary lines into a list of entry dicts.

    A line starting with "> " that parses as a term heading opens a new entry.
    Every other line is appended to the description of the entry currently
    open; lines before the first entry are ignored.

    Returns:
        A list of dicts with keys "term", "pronunciation" and "description".
    """
    entries = []
    current_term = None
    term_description = []

    for line in glossary_lines:
        stripped_line = line.strip()
        if not stripped_line.startswith('> '):
            if current_term is not None:
                term_description.append(line)
            continue

        # Drop the "> " marker plus markdown emphasis and escape characters.
        clean_line = stripped_line[2:].replace('*', '').replace('\\', '').strip()
        parsed = _parse_term_line(clean_line)
        if parsed is None:
            # Quoted line that is not a term heading: keep its text with the
            # open entry so nothing is lost and no finished entry is touched.
            if current_term is not None:
                term_description.append(clean_line + '\n')
            continue

        _finish_term(entries, current_term, term_description)
        term, pronunciation, desc_start = parsed
        current_term = {"term": term, "pronunciation": pronunciation}
        term_description = [desc_start + '\n'] if desc_start else []

    _finish_term(entries, current_term, term_description)
    return entries


def parse_book(lines, book_number="", book_name=""):
    """Parse the lines of a book into chapters and glossary entries.

    Section markers are lines whose stripped text is exactly "PROLOGUE",
    "CHAPTER" or "GLOSSARY". The prologue is stored as chapter number 0.

    Args:
        lines: Lines of the book text, as returned by ``readlines()``.
        book_number: Value stored under "book_number".
        book_name: Value stored under "book_name".

    Returns:
        A dict with keys "book_number", "book_name", "chapters" (dicts with
        "number", "title", "content") and "glossary" (see ``parse_glossary``).
    """
    data = {
        "book_number": book_number,
        "book_name": book_name,
        "chapters": [],
        "glossary": []
    }

    current_section = None
    current_chapter = None  # chapter being collected, None when none is open
    chapter_content = []
    glossary_lines = []

    def close_chapter():
        """Finish the open chapter, if any, and append it to the result."""
        if current_chapter is not None:
            current_chapter["content"] = ''.join(chapter_content).strip()
            data["chapters"].append(current_chapter)

    i = 0
    while i < len(lines):
        line = lines[i]
        marker = line.strip()

        if marker in ("PROLOGUE", "CHAPTER"):
            # Close the previous chapter before starting a new one; this also
            # covers a PROLOGUE marker that follows an open chapter.
            close_chapter()
            chapter_content = []
            current_chapter, i = _read_heading(lines, i + 1, numbered=(marker == "CHAPTER"))
            current_section = "chapter"
            continue

        if marker == "GLOSSARY":
            close_chapter()
            current_chapter = None
            chapter_content = []
            current_section = "glossary"
            i += 1
            continue

        if current_section == "chapter":
            chapter_content.append(line)
        elif current_section == "glossary":
            glossary_lines.append(line)
        i += 1

    close_chapter()
    # Parsed regardless of which section came last, so a glossary that is
    # followed by further chapters is not silently dropped.
    data["glossary"] = parse_glossary(glossary_lines)
    return data


# Driver. __name__ is "__main__" both in a notebook kernel and when run as a
# script, so this executes in either; the guard only keeps a plain import of
# the functions above free of file I/O.
if __name__ == "__main__":
    filename = resolve_filename(sys.argv)

    # Parse book number and name from filename
    book_number, book_name = split_book_filename(filename)

    # Read the file
    with open(filename, 'r', encoding='utf-8') as f:
        lines = f.readlines()

    data = parse_book(lines, book_number, book_name)

    # Write to JSON
    output_filename = json_output_path(filename)
    with open(output_filename, 'w', encoding='utf-8') as f:
        json.dump(data, f, indent=4, ensure_ascii=False)

    print(f"JSON file created: {output_filename}")

Prologue found: 0

Chapters found: 0

Epilogue found: 0

✓ Total sections: 0
