
# 1) Frequent Words = Literary Fingerprints

This notebook compares **word frequency** between our two toy texts:
- *Alice's Adventures in Wonderland* (here referenced as **Wonderland**)
- *Through the Looking-Glass* (here referenced as **Looking-Glass**)

We practice simple tokenization and frequency analysis, then discuss
what's **meaningful signal** vs. **noise** in the results, and how to
improve the method (normalization, keyness, etc).


# Setup: Load Texts

This notebook needs **Alice in Wonderland** and **Through the Looking-Glass** as input texts.

**How to provide the texts:**
1. Download both books from Project Gutenberg (IDs 11 and 12) as plain-text (UTF-8) files: https://www.gutenberg.org/ebooks/11 and https://www.gutenberg.org/ebooks/12

2. Place two text files in the "data" folder with names:
   - `Wonderland.txt`  (Alice's Adventures in Wonderland)
   - `Looking-Glass.txt` (Through the Looking-Glass)





In [None]:
from pathlib import Path
import re
from collections import Counter
import math

In [None]:
def load_texts(local_alice: str = '../data/Wonderland.txt',
               local_glass: str = '../data/Looking-Glass.txt'):
    """Read the two book files from disk and return their contents.

    Parameters
    ----------
    local_alice : str
        Location of the Wonderland text file
        (default '../data/Wonderland.txt').
    local_glass : str
        Location of the Looking-Glass text file
        (default '../data/Looking-Glass.txt').

    Returns
    -------
    tuple[str, str]
        (wonderland_text, lookingglass_text), in that order.

    Raises
    ------
    FileNotFoundError
        If either path does not exist. The Wonderland path is checked
        first, and both paths are checked before anything is read.

    Extra Notes
    -----------
    - Files are decoded as UTF-8 with `errors='ignore'`, so bytes that
      are not valid UTF-8 are silently dropped instead of raising.
    """
    # Pair each path with the file name quoted in its error message.
    sources = (
        (Path(local_alice), 'Wonderland.txt'),
        (Path(local_glass), 'Looking-Glass.txt'),
    )

    # Validate every path up front so the user gets a clear message
    # before any file is opened.
    for path, expected_name in sources:
        if path.exists():
            continue
        raise FileNotFoundError(
            f"Missing file: {path}\n"
            f"→ Please place '{expected_name}' at this path or update load_texts(...)."
        )

    # Decode leniently: undecodable bytes are skipped rather than fatal.
    wonderland, lookingglass = (
        path.read_text(encoding='utf-8', errors='ignore') for path, _ in sources
    )
    return wonderland, lookingglass

In [None]:
def normalize(text: str) -> str:
    """Normalize a Gutenberg-like text for tokenization.

    Steps
    -----
    1) Convert Windows (CR LF) and lone CR line endings to a single
       line feed.
    2) If both a '*** START' and a later '*** END' marker are present,
       keep only the text between them. The START marker line itself is
       dropped so that licence words ("project", "gutenberg", "ebook",
       ...) do not leak into the word counts.

    Parameters
    ----------
    text : str
        Raw text as loaded from disk (can be empty).

    Returns
    -------
    str
        Cleaned text suitable for tokenization and counting. If the
        markers are missing (or no END marker follows the START marker)
        the whole text is returned with only the newlines normalized.
    """
    if not text:
        return ''

    # Normalize line endings first so the marker-line search below only
    # has to look for '\n'.
    text = text.replace('\r\n', '\n').replace('\r', '\n')

    start = text.find('*** START')
    if start == -1:
        return text

    # Only an END marker located after the START marker delimits the body.
    end = text.find('*** END', start)
    if end == -1:
        return text

    # Skip the remainder of the START marker line; if the marker is not
    # followed by a newline before END, fall back to clipping at the marker.
    marker_eol = text.find('\n', start, end)
    body_start = start if marker_eol == -1 else marker_eol + 1
    return text[body_start:end]


# -------- Tokenization helpers (simple) --------

# A word is a run of ASCII letters, optionally followed by one or more
# "apostrophe + letters" groups (don't, rabbit's). Apostrophes are only kept
# *inside* a word, so quotation marks around dialogue ('Oh dear!') no longer
# produce tokens such as "'oh" or a bare "'".
WORD_RE = re.compile(r"[A-Za-z]+(?:'[A-Za-z]+)*")

def words(text: str):
    """Simple word tokenizer (lowercased, ASCII letters + inner apostrophes).

    The text is lowercased, typographic apostrophes (U+2019) are mapped to
    the ASCII apostrophe so that "don’t" and "don't" yield the same token,
    and every match of `WORD_RE` is returned.

    Pros
    ----
    - Very fast and dependency-free.
    - Good enough for frequency/keyness demonstrations.

    Cons
    ----
    - No punctuation tokens, no sentence boundaries, no POS tags.
    - Possessives stay attached ("alice's" is distinct from "alice").
    - Non-ASCII letters are treated as separators.

    Parameters
    ----------
    text : str
        Text to tokenize (may be empty).

    Returns
    -------
    list[str]
        Lowercased word tokens in order of appearance.
    """
    # Fold curly apostrophes into straight ones before matching so that
    # contractions survive regardless of the edition's typography.
    lowered = text.lower().replace("\u2019", "'")
    return WORD_RE.findall(lowered)


def sentences(text: str):
    """Split text into sentence-like chunks with a punctuation heuristic.

    A boundary is any run of whitespace that directly follows '.', '!'
    or '?'. Each chunk is stripped of surrounding whitespace and empty
    chunks are discarded. The rule is naive (abbreviations and closing
    quotes after the punctuation are not handled), so treat the output
    as approximate.

    Returns
    -------
    list[str]
        Sentence-like strings, in document order.
    """
    boundary = r'(?<=[.!?])\s+'
    found = []
    for chunk in re.split(boundary, text):
        trimmed = chunk.strip()
        if trimmed:
            found.append(trimmed)
    return found


# -------- Frequency & keyness utilities --------

def top_words(words, min_len=4, extra_stop=None, n=30):
    """Count the most frequent words that survive a light filter.

    A word is counted only if it has at least `min_len` characters and
    is not in the built-in stopword list (optionally extended with
    `extra_stop`).

    Parameters
    ----------
    words : list[str]
        Word tokens (e.g., output of `words()`).
    min_len : int
        Minimum length to keep (filters very short function words).
    extra_stop : Iterable[str] or None
        Additional stopwords to exclude (e.g., {'just','only'}).
    n : int
        Number of items to return.

    Returns
    -------
    list[tuple[str,int]]
        Up to `n` (word, count) pairs, most frequent first.
    """
    # Built-in stoplist; rebuilt on every call, so extending it below
    # never affects later calls.
    excluded = {
        'the', 'and', 'to', 'of', 'a', 'i', 'it', 'in',
        'that', 'was', 'he', 'you', 'is', 'for', 'on', 'as',
        'with', 'his', 'her', 'at', 'be', 'she', 'had', 'not',
        'but', 'said', 'they', 'them', 'this', 'so', 'all', 'one', 'very',
        'there', 'what', 'were', 'from', 'have', 'would', 'could', 'when',
        'been', 'their', 'we', 'my', 'me', 'or', 'by', 'up', 'no', 'out', 'if',
        # book-specific names are often noise for stylistics; tweak as needed
        'alice',
    }
    if extra_stop:
        excluded = excluded.union(extra_stop)

    tally = Counter()
    for token in words:
        if len(token) < min_len or token in excluded:
            continue
        tally[token] += 1
    return tally.most_common(n)


## Load & Normalize
We load both texts using **inline path checks** and then apply a simple normalization.


In [None]:

# Read both books from disk. load_texts() raises FileNotFoundError with a
# hint if either file is missing, so this cell stops early on a bad setup.
wonderland_raw, lookingglass_raw = load_texts()

# Clean each raw text (Gutenberg marker clipping + newline normalization)
# before tokenizing.
wonderland, lookingglass = (
    normalize(raw) for raw in (wonderland_raw, lookingglass_raw)
)

# Quick sanity check on the size of each cleaned text.
print(f"Wonderland chars: {len(wonderland):,} | Looking-Glass chars: {len(lookingglass):,}")



## Tokenize
We use a simple regex tokenizer (letters + apostrophes). For more serious work,
consider spaCy or stanza for tagging and lemmatization.


In [None]:

# Word tokens for each book (lowercased by words()).
wonderland_words, lookingglass_words = words(wonderland), words(lookingglass)

# Approximate sentence splits for each book.
wonderland_sentences, lookingglass_sentences = (
    sentences(wonderland),
    sentences(lookingglass),
)

# Report corpus sizes, one line for words and one for sentences.
print(
    f"Wonderland words: {len(wonderland_words):,} | Looking-Glass words: {len(lookingglass_words):,}",
    f"Wonderland sentences: {len(wonderland_sentences):,} | Looking Glass sentences: {len(lookingglass_sentences):,}",
    sep="\n",
)



## Top Words (after basic stopwords)
The list is **partly signal, partly noise**—use it to start discussion.


In [None]:

# Show the 15 most frequent filtered words per book. Asking top_words() for
# n=15 yields the same pairs as slicing the first 15 from its default top 30.
print("Top Wonderland:", top_words(wonderland_words, n=15))
print("Top Looking-Glass:", top_words(lookingglass_words, n=15))



## Discussion
- Which frequent words are **thematically meaningful** vs. artifacts of stopwording?
- Do **chess terms** (e.g., *queen*, *white*, *red*) show higher distinctiveness in *Looking-Glass*?
- Do **spatial/falling terms** (e.g., *down*, *rabbit*) show higher distinctiveness in *Wonderland*?
- How would **lemmatization** (e.g., *think/thinks/thought*) change results?
- Implement **per_10k(count,total_words)** and **lolookingglass_likelihood(k1,n1,k2,n2) (Dunning’s G²)**, then list the 20 most distinctive words between Wonderland and Looking-Glass with per-10k rates and briefly argue which are meaningful vs. artifacts.


## Optional continuation:


## Distinctiveness via Log-Likelihood (Keyness)
Raw frequency is not enough. Compute **G²** to find words that are *distinctive* of each book.


In [None]:
def per_10k(count: int, total_words: int) -> float:
    """Express a raw count as a rate per 10,000 words.

    The denominator is clamped to at least 1, so an empty corpus
    (`total_words` of 0) does not raise ZeroDivisionError.
    """
    denominator = total_words if total_words >= 1 else 1
    rate = count / denominator
    return rate * 10000.0


def lolookingglass_likelihood(k1: int, n1: int, k2: int, n2: int) -> float:
    """Log-likelihood (G^2) keyness score for one word across two corpora.

    Computes ``2 * (k1 * ln(k1 / E1) + k2 * ln(k2 / E2))`` where the
    expected counts ``E1`` and ``E2`` split the pooled frequency
    ``k1 + k2`` in proportion to the corpus sizes. A term contributes 0
    when its observed or expected count is 0.

    Parameters
    ----------
    k1 : int  Frequency in corpus A
    n1 : int  Total words in corpus A
    k2 : int  Frequency in corpus B
    n2 : int  Total words in corpus B

    Returns
    -------
    float
        G^2 value; larger values indicate stronger distinctiveness.
        The score carries no direction: compare rates (per_10k) or raw
        counts to see which corpus favours the word.

    Notes
    -----
    - Swapping the two corpora gives the same score.
    - The pooled size is clamped to at least 1 so two empty corpora
      yield 0.0 instead of a division error.
    """
    pooled_hits = k1 + k2
    pooled_size = max(1, (n1 + n2))

    contributions = []
    for observed, corpus_size in ((k1, n1), (k2, n2)):
        expected = corpus_size * pooled_hits / pooled_size
        if observed == 0 or expected == 0:
            # ln(0) is undefined; such cells add nothing to the score.
            contributions.append(0.0)
        else:
            contributions.append(observed * math.log(observed / expected))

    first, second = contributions
    return 2.0 * (first + second)


In [None]:

# Word -> count tables for each book, plus the total token count of each.
cw, cg = Counter(wonderland_words), Counter(lookingglass_words)
nW = sum(cw.values())
nG = sum(cg.values())

# Candidate vocabulary: the 500 most frequent words of either book
# (a union, so at most 1,000 words), which keeps the loop fast.
candidates = (
    {w for w, _ in cw.most_common(500)}
    | {w for w, _ in cg.most_common(500)}
)

# One (G2, word, Wonderland rate, Looking-Glass rate) tuple per candidate,
# ordered from most to least distinctive. Counter returns 0 for a word that
# is absent from one book.
rows = sorted(
    (
        (
            lolookingglass_likelihood(cw[word], nW, cg[word], nG),
            word,
            per_10k(cw[word], nW),
            per_10k(cg[word], nG),
        )
        for word in candidates
    ),
    reverse=True,
)

print("Most distinctive (either direction):")
for g2, w, a10, b10 in rows[:20]:
    print(f"{w:>12}  G2={g2:7.1f}  W:{a10:6.2f}/10k  LG:{b10:6.2f}/10k")
