In [1]:
from pathlib import Path
import re
from collections import Counter
import math

In [2]:
def load_texts(local_pride: str = "./data/pride.txt",
               local_sense: str = "./data/sense.txt"):
    """Read the two Austen novels from local text files.

    Parameters
    ----------
    local_pride : str
        Location of the 'Pride and Prejudice' text. Default: './data/pride.txt'.
    local_sense : str
        Location of the 'Sense and Sensibility' text. Default: './data/sense.txt'.

    Returns
    -------
    tuple[str, str]
        File contents in the order (pride_text, sense_text).

    Raises
    ------
    FileNotFoundError
        When either path does not exist. The Pride path is checked first.
    """
    # Each source pairs a path with the hint shown when that path is missing.
    sources = [
        (
            Path(local_pride),
            "→ Please place 'pride.txt' at this path or pass the correct path to load_texts(...).",
        ),
        (
            Path(local_sense),
            "→ Please place 'sense.txt' at this path or pass the correct path to load_texts(...).",
        ),
    ]

    # Validate both paths before reading anything.
    for path, hint in sources:
        if path.exists():
            continue
        raise FileNotFoundError(f"Missing file: {path}\n" + hint)

    # Decode as UTF-8; undecodable bytes are dropped rather than raising.
    contents = [
        path.read_text(encoding="utf-8", errors="ignore")
        for path, _hint in sources
    ]
    return contents[0], contents[1]
          

In [3]:
def normalize(text: str) -> str:
    """Normalize a Gutenberg-like text for tokenization.

    Steps
    -----
    1) Strip the Project Gutenberg header (everything up to and including
       the ``*** START ... ***`` marker) and footer (everything from the
       ``*** END ... ***`` marker onward). Each marker is handled on its
       own, so a file that has only one of them is still trimmed.
    2) Normalize Windows ('\\r\\n') and bare carriage-return ('\\r') line
       endings to '\\n'.
    3) Strip leading and trailing whitespace.

    Parameters
    ----------
    text : str
        Raw text, possibly wrapped in Project Gutenberg boilerplate.

    Returns
    -------
    str
        The cleaned body text; '' when ``text`` is empty or None.
    """
    if not text:
        return ""

    flags = re.IGNORECASE | re.DOTALL

    # Drop the header: keep only what follows the START marker.
    start_match = re.search(
        r"\*\*\*\s*START OF (?:THIS|THE) PROJECT GUTENBERG EBOOK.*?\*\*\*",
        text,
        flags=flags,
    )
    if start_match:
        text = text[start_match.end():]

    # Drop the footer. Searching the already-clipped text guarantees the END
    # marker we cut at lies after the START marker.
    end_match = re.search(
        r"\*\*\*\s*END OF (?:THIS|THE) PROJECT GUTENBERG EBOOK.*?\*\*\*",
        text,
        flags=flags,
    )
    if end_match:
        text = text[:end_match.start()]

    # Replace '\r\n' first so a Windows line ending does not become two newlines.
    return text.replace("\r\n", "\n").replace("\r", "\n").strip()


# -------- Tokenization helpers (simple) --------

# Token pattern: runs of ASCII letters and apostrophes, so contractions
# such as "don't" stay in one piece.
WORD_RE = re.compile(r"[A-Za-z']+")

def words(text: str):
    """Tokenize text into lowercase words.

    The text is lowercased first; every run of ASCII letters and
    apostrophes is then returned as one token, in order of appearance.
    """
    lowered = text.lower()
    return [match.group() for match in WORD_RE.finditer(lowered)]


def sentences(text: str):
    """Split text into sentences with a naive punctuation rule.

    A boundary is any whitespace run that directly follows '.', '!' or '?'.
    Each piece is trimmed, and pieces that end up empty are discarded.
    """
    result = []
    for piece in re.split(r"(?<=[.!?])\s+", text):
        trimmed = piece.strip()
        if trimmed:
            result.append(trimmed)
    return result


# -------- Frequency & keyness utilities --------

def top_words(word_list, min_len=4, extra_stop=None, n=30):
    """Return the top-N most frequent words after lightweight filtering.

    Parameters
    ----------
    word_list : iterable of str
        Tokens to count. They are compared with the stopwords as-is, and the
        built-in stopwords are all lowercase.
    min_len : int
        Words shorter than this are ignored.
    extra_stop : str or iterable of str, optional
        Additional stopwords. A single string is treated as one word, not as
        a collection of characters.
    n : int or None
        Number of entries to return; passed straight to ``Counter.most_common``.

    Returns
    -------
    list[tuple[str, int]]
        (word, count) pairs, most frequent first.
    """
    stopwords = {
        "the","and","to","of","a","i","it","in","that","was","he","you","is","for","on","as",
        "with","his","her","at","be","she","had","not","but","said","they","them","this","so","all","one","very",
        "there","what","were","from","have","would","could","when","been","their","we","my","me","or","by","up","no","out","if",
        # name stopwords (tweak as you like)
        "elizabeth","darcy","bennet","jane","willoughby","marianne","dashwood"
    }
    if extra_stop:
        if isinstance(extra_stop, str):
            # set("elinor") would explode into letters and filter nothing useful.
            stopwords.add(extra_stop)
        else:
            stopwords.update(extra_stop)

    counts = Counter(
        w for w in word_list if len(w) >= min_len and w not in stopwords
    )
    return counts.most_common(n)

In [4]:
# ---- Read both novels, clean them, and report their sizes ----
pride_raw, sense_raw = load_texts()
pride, sense = map(normalize, (pride_raw, sense_raw))

print(f"Pride chars: {len(pride):,} | Sense chars: {len(sense):,}")


Pride chars: 728,713 | Sense chars: 670,674


In [5]:
# Word-level tokens for each novel
pride_words, sense_words = words(pride), words(sense)

# Sentence-level segments for each novel
pride_sentences, sense_sentences = sentences(pride), sentences(sense)

# Summarize the corpus sizes
print(f"Pride words: {len(pride_words):,} | Sense words: {len(sense_words):,}")
print(f"Pride sentences: {len(pride_sentences):,} | Sense sentences: {len(sense_sentences):,}")


Pride words: 128,565 | Sense words: 120,869
Pride sentences: 5,942 | Sense sentences: 4,663


In [6]:
# Display the 15 most frequent content words of each novel
print("Top Pride:", top_words(pride_words, n=15))
print("Top Sense:", top_words(sense_words, n=15))

Top Pride: [('which', 568), ('your', 455), ('will', 428), ('such', 397), ('much', 337), ('more', 337), ('miss', 315), ('must', 315), ('bingley', 310), ('than', 300), ('should', 258), ('know', 244), ('though', 238), ('herself', 236), ('well', 230)]
Top Sense: [('elinor', 685), ('which', 593), ('more', 408), ('your', 386), ('every', 376), ('will', 363), ('than', 362), ('such', 359), ('much', 290), ('only', 287), ('must', 283), ('sister', 282), ('edward', 263), ('mother', 259), ('herself', 255)]
