# PTB Tokenizer Analysis

This notebook explores the Penn Treebank (PTB) tokenizer FST and its quotient/remainder decomposition.

In [None]:
from transduction.benchmarking.fsts.ptb_pynini import (
    build_ptb_fst_pynini,
    string_to_byte_strs,
    decode_ptb_output,
    SEP,
    MARKER,
)
from transduction.rust_bridge import RustDecomp
from transduction.fsa import EPSILON
from IPython.display import display, HTML

# Build the PTB FST once up front; later cells read the module-level `fst`.
print("Building PTB FST...")
fst = build_ptb_fst_pynini()
# Report the number of states as a quick sanity check on the build.
print(f"FST: {len(fst.states)} states")

## 1. Transduction Examples

See how the PTB tokenizer transforms input text.

In [None]:
def transduce(text):
    """Transduce ``text`` through the PTB FST and return one output sequence.

    Args:
        text: Input string; converted to the FST's input symbols with
            ``string_to_byte_strs``.

    Returns:
        The first sequence yielded by ``language(tuple=True)`` on the
        automaton obtained by applying ``fst`` to the input.

    Raises:
        ValueError: If the FST yields no output at all for ``text``.
    """
    byte_strs = string_to_byte_strs(text)
    output_fsa = fst(byte_strs, None)
    # A private sentinel distinguishes "no output at all" from a legitimately
    # empty output sequence. Raising ValueError instead of letting
    # StopIteration escape matters because a leaked StopIteration can be
    # mistaken for normal iterator exhaustion by code that calls this from
    # inside an iterator or generator.
    missing = object()
    output = next(output_fsa.language(tuple=True), missing)
    if output is missing:
        raise ValueError(f"PTB FST produced no output for input {text!r}")
    return output

def decode_with_markers(output, sep_char='|', marker_char=''):
    """Decode an output symbol sequence into text with visible token boundaries.

    Symbols other than ``SEP``, ``MARKER`` and ``EPSILON`` are converted with
    ``int()`` and treated as byte values; each run of them is decoded as
    UTF-8, with undecodable bytes replaced rather than raising.

    Args:
        output: Iterable of output symbols.
        sep_char: Text emitted for every ``SEP`` symbol.
        marker_char: Text emitted for every ``MARKER`` symbol. With the
            default empty string, markers are dropped and do not split the
            surrounding byte run.

    Returns:
        The decoded string.
    """
    pieces = []
    pending = []

    def flush():
        # Decode the buffered bytes as one unit so a multi-byte UTF-8
        # character is never split across two decode calls.
        if pending:
            pieces.append(bytes(int(b) for b in pending).decode('utf-8', errors='replace'))
            pending.clear()

    for sym in output:
        if sym == SEP:
            flush()
            pieces.append(sep_char)
        elif sym == MARKER:
            # `marker_char` used to be accepted but ignored. It is honoured
            # only when non-empty, so the default output is unchanged.
            if marker_char:
                flush()
                pieces.append(marker_char)
        elif sym != EPSILON:
            pending.append(sym)

    flush()
    return ''.join(pieces)

def show_transduction(text):
    """Print a short report of how the PTB FST transduces ``text``.

    The report lists the input, the decoded output, the output with token
    boundaries made visible, and the symbol and token counts.

    Returns:
        The raw output symbol sequence returned by ``transduce``.
    """
    symbols = transduce(text)
    plain = decode_ptb_output(symbols)
    bounded = decode_with_markers(symbols)

    print(f"Input:  {text}")
    print(f"Output: {plain}")
    print(f"Tokens: {bounded}")
    # The token count is reported as the number of SEP symbols plus one.
    n_tokens = len([s for s in symbols if s == SEP]) + 1
    print(f"        ({len(symbols)} symbols, {n_tokens} tokens)")
    print()
    return symbols

In [None]:
# Sample sentences exercising punctuation, contractions, quotation marks,
# currency and parentheses, abbreviations/possessives, and ellipses.
test_cases = [
    "Hello, world!",
    "I can't believe it's not butter.",
    "\"To be or not to be,\" he said.",
    "The price is $19.99 (tax included).",
    "Dr. Smith's patients aren't feeling well.",
    "What?! That's impossible...",
]

# Print the transduction report for each sample in turn.
# Note: this leaves the notebook-global `text` bound to the last sample.
for text in test_cases:
    show_transduction(text)

## 2. Interactive Transduction

Enter your own text to see how it's tokenized.

In [None]:
# Try your own text: edit the string below and re-run this cell.
your_text = "Enter your text here!"

# Prints the transduction report for `your_text`.
show_transduction(your_text)

## 3. Quotient/Remainder Decomposition

The decomposition splits inputs based on whether they have universal or constrained continuations:

$$\text{Precover}(y) = Q(y) \cdot \Sigma^* \sqcup R(y)$$

- **Q**: Inputs that can continue with ANY suffix (universal)
- **R**: Inputs with constrained continuations

In [None]:
def analyze_decomposition(text, target_len):
    """Decompose the first ``target_len`` output symbols of ``text`` and print a summary.

    Args:
        text: Input string to transduce through the PTB FST.
        target_len: Number of leading output symbols to use as the target.

    Returns:
        A ``(Q, R, target)`` tuple: the quotient and remainder from
        ``RustDecomp`` and the target symbol prefix. If ``target_len``
        exceeds the output length, a message is printed and
        ``(None, None, None)`` is returned.
    """
    output = transduce(text)

    if target_len > len(output):
        print(f"Target length {target_len} exceeds output length {len(output)}")
        # Always return three values: every caller unpacks
        # `Q, R, target = analyze_decomposition(...)`, so a 2-tuple here
        # would raise ValueError at the call site.
        return None, None, None

    target = output[:target_len]
    result = RustDecomp(fst, target)
    Q = result.quotient
    R = result.remainder

    # Decode target
    target_decoded = decode_with_markers(target)

    print("=== Decomposition Analysis ===")
    print(f"Input text: {text}")
    print(f"Full output: {decode_with_markers(output)}")
    print(f"Target (first {target_len} symbols): {target_decoded}")
    print()
    print(f"Quotient Q: {len(Q.states)} states, {len(Q.stop)} final")
    print(f"Remainder R: {len(R.states)} states, {len(R.stop)} final")

    return Q, R, target

In [None]:
# Analyze a specific example
text = "Hello, world!"
# Expected to cover "Hello" + SEP + "," + SEP — confirm against the printed "Target" line.
target_len = 8  # "Hello" + SEP + "," + SEP

# Q, R and target become notebook globals used by the following cells.
Q, R, target = analyze_decomposition(text, target_len)

In [None]:
def show_q_language(Q, max_words=20):
    """Print sample words from Q alongside a preview of their FST output.

    For each word, symbols whose integer value is below 256 are decoded as
    UTF-8 bytes, the decoded text is transduced again, and the first 15
    output symbols are shown. A word that fails at any step is printed with
    its error instead. Enumeration stops once ``max_words`` words have been
    shown.
    """
    print("\n=== Q Language (inputs producing exactly the target) ===")
    print(f"Q has {len(Q.stop)} final states")
    print()

    for shown, word in enumerate(Q.language(tuple=True), start=1):
        try:
            raw = bytes(int(b) for b in word if int(b) < 256)
            decoded = raw.decode('utf-8', errors='replace')
            # Re-transduce the decoded word to see what the FST emits for it.
            out = transduce(decoded)
            preview = decode_with_markers(out[:15])
            if len(out) > 15:
                preview += '...'
            print(f"  {decoded!r:20} -> {preview}")
        except Exception as e:
            print(f"  {word} (error: {e})")

        # The language may be large, so cap the listing.
        if shown >= max_words:
            print(f"  ... ({shown}+ words)")
            break

show_q_language(Q)

In [None]:
def show_r_language(R, max_words=20):
    """Print sample words from R alongside their full FST output.

    If R has no final states, a short notice is printed and nothing else.
    Otherwise, for each word, symbols whose integer value is below 256 are
    decoded as UTF-8 bytes and the decoded text is transduced again. A word
    that fails at any step is printed with its error instead. Enumeration
    stops once ``max_words`` words have been shown.
    """
    print("\n=== R Language (constrained continuations) ===")
    n_final = len(R.stop)
    print(f"R has {n_final} final states")

    if n_final == 0:
        print("R is empty - all continuations are universal!")
        return

    print()
    for shown, word in enumerate(R.language(tuple=True), start=1):
        try:
            raw = bytes(int(b) for b in word if int(b) < 256)
            decoded = raw.decode('utf-8', errors='replace')
            rendered = decode_with_markers(transduce(decoded))
            print(f"  {decoded!r:20} -> {rendered}")
        except Exception as e:
            print(f"  {word} (error: {e})")

        # Cap the listing at max_words entries.
        if shown >= max_words:
            print(f"  ... ({shown}+ words)")
            break

show_r_language(R)

## 4. Exploring Contraction Handling

PTB has special rules for contractions, which create non-universal (R) states.

In [None]:
# Contractions create interesting Q/R splits
text = "I can't do it"

# Transduce once here so the loop below can bound target lengths by the
# output length; show_transduction transduces the text again for its report.
output = transduce(text)
print(f"Full transduction:")
show_transduction(text)

# Analyze at different target lengths, skipping any that exceed the output.
# Each iteration rebinds the notebook globals Q, R and target.
for tlen in [2, 4, 6, 8, 10]:
    if tlen <= len(output):
        Q, R, target = analyze_decomposition(text, tlen)
        print()

In [None]:
# Look at a case with non-empty R
# NOTE(review): whether R is actually non-empty here depends on the FST —
# confirm from the printed "Remainder R" line.
text = "In 2006"
Q, R, target = analyze_decomposition(text, 2)  # Just "In"

# List up to 10 sample words from each side of the decomposition.
print("\n--- Q accepts (universal continuations): ---")
show_q_language(Q, max_words=10)

print("\n--- R accepts (constrained continuations): ---")
show_r_language(R, max_words=10)

## 5. Visualize Q and R Structure

In [None]:
def show_automaton_structure(fsa, name, max_arcs=50):
    """Print a compact summary of an automaton's states and transitions.

    Args:
        fsa: Automaton exposing ``states``, ``start``, ``stop`` and an
            ``arcs()`` method yielding ``(source, label, destination)``
            triples.
        name: Title used in the section header.
        max_arcs: The per-state transition listing is printed only when the
            automaton has at most this many arcs.
    """
    def format_label(label):
        # Show a label as a character only when it is printable ASCII.
        # Non-numeric labels, control codes and larger values use the
        # bracketed raw form, so the listing neither raises nor emits
        # invisible characters.
        try:
            code = int(label)
        except (TypeError, ValueError):
            return f'[{label}]'
        return chr(code) if 32 <= code < 127 else f'[{label}]'

    print(f"\n=== {name} Structure ===")
    print(f"States: {len(fsa.states)}")
    print(f"Start: {fsa.start}")
    print(f"Final: {fsa.stop}")

    arcs = list(fsa.arcs())
    print(f"Arcs: {len(arcs)} total")

    if len(arcs) > max_arcs:
        print("  (too many arcs to display)")
        return

    print("\nTransitions:")
    # Group outgoing transitions by source state, preserving arc order.
    by_src = {}
    for src, lbl, dst in arcs:
        by_src.setdefault(src, []).append((lbl, dst))

    try:
        ordered_sources = sorted(by_src)
    except TypeError:
        # States of mixed or unorderable types: fall back to a
        # deterministic textual ordering instead of failing.
        ordered_sources = sorted(by_src, key=repr)

    for src in ordered_sources:
        markers = []
        if src in fsa.start:
            markers.append('start')
        if src in fsa.stop:
            markers.append('final')
        marker_str = f" ({', '.join(markers)})" if markers else ""

        trans = by_src[src]
        # Show at most five transitions per state to keep lines readable.
        trans_str = ', '.join(f"{format_label(l)}:{d}" for l, d in trans[:5])
        if len(trans) > 5:
            trans_str += f", ... (+{len(trans)-5} more)"
        print(f"  State {src}{marker_str}: {trans_str}")

# Show Q structure for a small example
text = "Hi!"
# Use the first 3 output symbols as the target; rebinds the globals Q, R, target.
Q, R, target = analyze_decomposition(text, 3)
show_automaton_structure(Q, "Quotient Q")
show_automaton_structure(R, "Remainder R")

## 6. Benchmark Different Target Lengths

In [None]:
import time

def benchmark_decomposition(text, step=5, max_target_len=100):
    """Time ``RustDecomp`` on growing prefixes of the FST output for ``text``.

    Target lengths are ``step, 2*step, ...`` and stay strictly below
    ``min(len(output), max_target_len)``. One table row is printed per
    target length.

    Args:
        text: Input string to transduce.
        step: Increment between successive target lengths.
        max_target_len: Exclusive upper bound on the target length. The
            default of 100 matches the previously hard-coded cap.

    Returns:
        A list of dicts with keys ``target_len``, ``q_states``, ``q_final``,
        ``r_final`` and ``time_ms``, one per benchmarked target length.
    """
    output = transduce(text)

    print(f"Text: {text[:50]}..." if len(text) > 50 else f"Text: {text}")
    print(f"Output length: {len(output)} symbols")
    print()
    print(f"{'Target Len':<12} {'Q States':<12} {'Q Final':<10} {'R Final':<10} {'Time (ms)':<10}")
    print("-" * 60)

    results = []
    for tlen in range(step, min(len(output), max_target_len), step):
        target = output[:tlen]

        # Only the RustDecomp(...) call is timed.
        # NOTE(review): if quotient/remainder are computed lazily on
        # attribute access, that work is not included — confirm.
        t0 = time.perf_counter()
        result = RustDecomp(fst, target)
        t1 = time.perf_counter()

        Q = result.quotient
        R = result.remainder
        time_ms = (t1 - t0) * 1000

        print(f"{tlen:<12} {len(Q.states):<12} {len(Q.stop):<10} {len(R.stop):<10} {time_ms:<10.1f}")
        results.append({
            'target_len': tlen,
            'q_states': len(Q.states),
            'q_final': len(Q.stop),
            'r_final': len(R.stop),
            'time_ms': time_ms,
        })

    return results

# Benchmark on a longer text
long_text = "The quick brown fox jumps over the lazy dog. This is a test sentence with various punctuation marks, contractions like don't and won't, and quoted text like \"hello world\"."
results = benchmark_decomposition(long_text, step=10)

## 7. Check Correctness Property

Verify that `Precover(y) = Q(y) · Σ* ⊔ R(y)`

In [None]:
def verify_correctness(text, target_len, test_inputs):
    """Check the decomposition property on a set of test inputs and print a table.

    The target is the first ``target_len`` output symbols of ``text``. For
    each test input, the function compares "the input's FST output starts
    with the target" against "the input is in Q·Σ* or in R" and reports
    OK or FAIL.

    Args:
        text: Input string whose output prefix defines the target.
        target_len: Number of leading output symbols used as the target.
        test_inputs: Iterable of input strings to check.
    """
    output = transduce(text)
    target = output[:target_len]

    result = RustDecomp(fst, target)
    Q = result.quotient
    R = result.remainder

    print(f"Target: {decode_with_markers(target)}")
    print(f"Q: {len(Q.stop)} final, R: {len(R.stop)} final")
    print()

    def has_q_prefix(byte_strs):
        # Membership in Q·Σ* means some prefix of the input is in Q. The
        # empty prefix (i == 0) must be tested too, otherwise inputs are
        # misclassified whenever the empty string is in Q.
        return any(byte_strs[:i] in Q for i in range(len(byte_strs) + 1))

    print(f"{'Input':<20} {'Starts w/ target':<18} {'In Q·Σ*':<10} {'In R':<8} {'Correct':<8}")
    print("-" * 70)

    all_correct = True
    for inp in test_inputs:
        byte_strs = string_to_byte_strs(inp)
        try:
            out = transduce(inp)
            starts_with_target = out[:len(target)] == target
        except Exception:
            # An input the FST cannot transduce does not produce the target.
            # Catch Exception only, so KeyboardInterrupt and SystemExit
            # still propagate.
            starts_with_target = False

        in_q_sigma = has_q_prefix(byte_strs)
        in_r = byte_strs in R
        in_precover = in_q_sigma or in_r

        correct = (starts_with_target == in_precover)
        if not correct:
            all_correct = False

        status = "OK" if correct else "FAIL"
        print(f"{inp!r:<20} {str(starts_with_target):<18} {str(in_q_sigma):<10} {str(in_r):<8} {status:<8}")

    print()
    if all_correct:
        print("All tests passed!")
    else:
        print("Some tests FAILED!")

# Verify on the "In 2006" example, probing apostrophe and word-continuation inputs
verify_correctness(
    "In 2006", 
    target_len=2, 
    test_inputs=["In", "In ", "In'", "In't", "Inc", "Ink", "Index"]
)

## 8. Load Wikitext Data

In [None]:
from transduction.benchmarking.data import load_wikitext, wikitext_detokenize

# Load some real text
dataset = load_wikitext("test")

# Collect the first 5 paragraphs that the FST can transduce, each truncated
# to 500 characters after detokenization.
paragraphs = []
for item in dataset:
    text = item["text"].strip()
    # Skip empty entries and lines starting with "=".
    if not text or text.startswith("="):
        continue
    detokenized = wikitext_detokenize(text)[:500]
    try:
        output = transduce(detokenized)
    except Exception:
        # Best-effort: skip paragraphs that fail to transduce. Catch
        # Exception only, so KeyboardInterrupt and SystemExit can still
        # stop the cell.
        continue
    paragraphs.append((detokenized, output))
    if len(paragraphs) >= 5:
        break

print(f"Loaded {len(paragraphs)} paragraphs")

In [None]:
# Display loaded paragraphs: a 100-character preview of the text and of the
# tokenized form, plus symbol and token counts (tokens = SEP count + 1).
# Note: the loop rebinds the notebook globals `text` and `output`.
for i, (text, output) in enumerate(paragraphs):
    print(f"\n=== Paragraph {i+1} ===")
    print(f"Text ({len(text)} chars): {text[:100]}...")
    print(f"Tokenized: {decode_with_markers(output)[:100]}...")
    print(f"Output: {len(output)} symbols, {sum(1 for s in output if s == SEP)+1} tokens")

In [None]:
# Analyze decomposition for a real paragraph
# Guarded so the cell is a no-op when no paragraph was loaded.
if paragraphs:
    text, output = paragraphs[0]
    print(f"Analyzing: {text[:80]}...")
    print()
    
    # Only target lengths strictly shorter than the output are analyzed.
    # NOTE(review): the contraction cell uses `<=` for the same guard —
    # confirm whether the full-length target is meant to be skipped here.
    for tlen in [5, 10, 20, 30, 50]:
        if tlen < len(output):
            Q, R, target = analyze_decomposition(text, tlen)
            print()