# Experiment 11: The Tokenization Topology Law

This notebook visualizes the discovery that Spectral Topology is governed by **Tokenization Density (Script)**, not Semantics.

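**Data Source**: Pre-computed results from `results/phi4_topology.json` (opened below as `../results/phi4_topology.json`, i.e. relative to the notebook's working directory).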

**To generate fresh results**, run:
```bash
python scripts/reproduce.py --experiment topology --model Phi-4 --quant-4bit
```

In [None]:
import json
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Global figure styling: serif typeface everywhere, on seaborn's plain "white" theme.
plt.rcParams.update({'font.family': 'serif'})
sns.set_theme(font="serif", style="white")

## Load Pre-Computed Results

In [None]:
# Read the pre-computed topology results. The encoding is pinned to UTF-8
# (the JSON interchange encoding) instead of relying on the platform default,
# which differs between operating systems.
with open('../results/phi4_topology.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

# Group the per-step Fiedler values of each record by its language tag.
# Records whose 'lang' is not one of the keys below are ignored.
groups = {'ja': [], 'ja_romaji': [], 'en_trans': []}
for item in data:
    lang = item.get('lang')
    if lang not in groups:
        continue
    # `or []` also covers a record whose trajectory is an explicit null.
    steps = item.get('trajectory') or []
    # A step without a Fiedler value is stored as NaN rather than None, so the
    # NaN-aware averaging used when plotting can skip it instead of failing.
    traj = [
        float('nan') if step.get('fiedler_value') is None else step['fiedler_value']
        for step in steps
    ]
    groups[lang].append(traj)

print(f"Japanese (Kana): {len(groups['ja'])} samples")
print(f"Japanese (Romaji): {len(groups['ja_romaji'])} samples")
print(f"English: {len(groups['en_trans'])} samples")

## Visualize the Phase Transition

In [None]:
# Per-language line colours and legend labels, keyed by the language tags in `groups`.
colors = {
    'ja': '#e74c3c',       # Red (Asian/Dense)
    'ja_romaji': '#9b59b6',# Purple (Conflict Zone)
    'en_trans': '#3498db'  # Blue (Western/Sparse)
}

labels = {
    'ja': 'Japanese (Kana)',
    'ja_romaji': 'Japanese (Romaji)',
    'en_trans': 'English (Translation)'
}

fig, ax = plt.subplots(figsize=(12, 8))

for lang, trajectories in groups.items():
    if not trajectories:
        continue

    # Build a (samples x layers) float matrix. Trajectories shorter than the
    # longest one are right-padded with NaN, and dtype=float turns a missing
    # (None) value into NaN, so ragged or incomplete data cannot break
    # np.array() or the NaN-aware mean below.
    n_layers = max(len(traj) for traj in trajectories)
    if n_layers == 0:
        continue
    traj_matrix = np.full((len(trajectories), n_layers), np.nan)
    for row, traj in zip(traj_matrix, trajectories):
        row[:len(traj)] = np.asarray(traj, dtype=float)
    layers = np.arange(n_layers)

    # Faint individual traces, one per sample
    for trace in traj_matrix:
        ax.plot(layers, trace, color=colors[lang], alpha=0.2, linewidth=1.0)

    # Bold per-layer mean across samples, ignoring NaN entries
    mean_traj = np.nanmean(traj_matrix, axis=0)
    ax.plot(layers, mean_traj, color=colors[lang], linewidth=4.0, label=labels[lang])

ax.set_xlabel('Layer', fontweight='bold', fontsize=14)
ax.set_ylabel(r'Fiedler Value ($\lambda_2$)', fontweight='bold', fontsize=14)
ax.set_title('Tokenization Topology Law: Phi-4', fontweight='bold', fontsize=16)

ax.grid(True, linestyle='--', alpha=0.7)
# Legend sits below the axes so it does not cover the traces.
ax.legend(loc='upper center', bbox_to_anchor=(0.5, -0.1), ncol=3, frameon=False, fontsize=12)
ax.set_ylim(0, 1.0)

# Annotate Regimes
# NOTE(review): positions are fixed data coordinates; x=39/35 presumably assume
# a roughly 40-layer x-axis -- confirm if the model or layer count changes.
ax.text(39, 0.95, 'Dense Regime', color='#e74c3c', ha='right', fontweight='bold', fontsize=12)
ax.text(35, 0.08, 'Sparse Regime', color='#3498db', ha='right', fontweight='bold', fontsize=12)

fig.tight_layout()
plt.show()

## Quantitative Summary

In [None]:
# Compute summary stats at Layer 20 (mid-network)
layer_idx = 20
for lang, trajectories in groups.items():
    if not trajectories:
        continue
    # Only trajectories long enough to contain `layer_idx` contribute.
    # dtype=float turns a missing (None) value into NaN.
    vals = np.array(
        [t[layer_idx] for t in trajectories if len(t) > layer_idx],
        dtype=float,
    )
    # Drop NaN entries so one missing value does not turn the whole statistic
    # into NaN; this matches the NaN-aware mean used for the plotted curves.
    vals = vals[~np.isnan(vals)]
    name = labels.get(lang, lang)
    if vals.size == 0:
        # Nothing to average: report it instead of printing nan with a warning.
        print(f"{name}: no valid samples at layer {layer_idx}")
        continue
    print(f"{name}: Mean={np.mean(vals):.3f}, Std={np.std(vals):.3f}")