# Chapter 2: Theoretical Foundation

## Phonetic Features: From Articulation to Computation

### The Articulatory Basis

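Human speech sounds are produced by coordinated movements of the vocal tract. These movements can be decomposed into separately describable components, such as voicing, place of articulation (labial, coronal, dorsal), and degree of constriction (continuant). The cell below tabulates these components for the stop consonants: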

In [None]:
import pandas as pd
from distfeat import phoneme_to_features

# Build a small feature table for the six stops, grouped by place.
# Each triple is (voiceless phoneme, voiced phoneme, place label).
stop_triples = [('p', 'b', 'Bilabial'), ('t', 'd', 'Alveolar'), ('k', 'g', 'Velar')]

# Table column header -> key looked up in the feature mapping.
feature_columns = {
    'Voice': 'voice',
    'Labial': 'labial',
    'Coronal': 'coronal',
    'Dorsal': 'dorsal',
    'Continuant': 'continuant',
}

stops_data = []
for voiceless, voiced, place_name in stop_triples:
    # Look up both members of the pair before emitting any row.
    vl_features = phoneme_to_features(voiceless)
    vd_features = phoneme_to_features(voiced)

    # Voiceless row first, then its voiced counterpart.
    for phoneme, features in ((voiceless, vl_features), (voiced, vd_features)):
        row = {'Phoneme': phoneme, 'Place': place_name}
        for column, key in feature_columns.items():
            # A feature absent from the mapping is shown as 0.
            row[column] = features.get(key, 0)
        stops_data.append(row)

df = pd.DataFrame(stops_data)
print("Feature Matrix for Stops:")
print(df.to_string(index=False))
print("\nKey observations:")
for observation in (
    "1. Voice is the only difference within each place",
    "2. Place features (labial/coronal/dorsal) distinguish columns",
    "3. All stops share continuant=0 (complete closure)",
):
    print(observation)

### Feature Geometry and Natural Classes

Features are not independent but organized hierarchically. This organization predicts which sounds pattern together in phonological processes:

In [None]:
from distfeat import get_feature_system
import numpy as np

# Find natural classes
def find_natural_class(phonemes, feature_system):
    """Return the feature values shared by every known phoneme in a group.

    Args:
        phonemes: Phoneme keys to look up. A falsy value yields ``{}``.
        feature_system: Mapping from phoneme to a record whose
            ``'features'`` entry maps feature names to values.

    Returns:
        A dict of ``feature -> value`` for each feature of the first known
        phoneme whose value is equal across all known phonemes. Phonemes
        missing from ``feature_system`` are ignored; if none are known the
        result is ``{}``.
    """
    if not phonemes:
        return {}

    # Keep only the phonemes the feature system actually describes.
    known = [
        feature_system[symbol]['features']
        for symbol in phonemes
        if symbol in feature_system
    ]
    if not known:
        return {}

    # The first known phoneme supplies the candidate features; a feature
    # survives only if every known phoneme reports the same value for it
    # (a phoneme lacking the feature reports None).
    reference = known[0]
    return {
        name: reference.get(name)
        for name in reference
        if all(entry.get(name) == reference.get(name) for entry in known)
    }

# Probe several traditional sound classes for their shared feature values.
feature_system = get_feature_system()

natural_classes = {
    'Voiceless stops': ['p', 't', 'k'],
    'Voiced stops': ['b', 'd', 'g'],
    'Nasals': ['m', 'n', 'ŋ'],
    'Fricatives': ['f', 's', 'ʃ', 'x'],
    'High vowels': ['i', 'u'],
    'Low vowels': ['a', 'ɑ']
}

# Only these features are reported, to keep the printout short.
reported_features = ('voice', 'nasal', 'continuant', 'high', 'low', 'consonantal')

for class_name, phonemes in natural_classes.items():
    common = find_natural_class(phonemes, feature_system)

    # Filter the shared features down to the reported subset, keeping
    # the order in which find_natural_class returned them.
    defining_features = {}
    for name, value in common.items():
        if name in reported_features:
            defining_features[name] = value

    print(f"\n{class_name}: {phonemes}")
    print(f"  Defining features: {defining_features}")

## Distance Metrics: From Features to Similarity

### Hamming Distance

The simplest metric counts feature differences:

In [None]:
from distfeat import calculate_distance
import matplotlib.pyplot as plt

# Demonstrate Hamming distance calculation
def show_hamming_calculation(p1, p2):
    """Print a step-by-step Hamming comparison of two phonemes.

    The comparison walks the features of ``p1``; a feature that ``p2``
    lacks is treated as 0. Features present only in ``p2`` are not counted.

    Args:
        p1: First phoneme, passed to ``phoneme_to_features``.
        p2: Second phoneme, passed to ``phoneme_to_features``.

    Returns:
        The fraction of ``p1``'s features whose values differ, or ``0.0``
        when ``p1`` has no features at all.
    """
    max_shown = 5  # cap on how many differing feature names are listed

    f1 = phoneme_to_features(p1)
    f2 = phoneme_to_features(p2)

    diff_features = [name for name in f1 if f1[name] != f2.get(name, 0)]
    differences = len(diff_features)

    total = len(f1)
    # Guard the division: an empty feature mapping has nothing to differ on.
    normalized = differences / total if total else 0.0

    # Only add an ellipsis when the list was really cut short, and say so
    # explicitly when nothing differs.
    if not diff_features:
        shown = '(none)'
    else:
        shown = ', '.join(diff_features[:max_shown])
        if differences > max_shown:
            shown += '...'

    print(f"Hamming distance between [{p1}] and [{p2}]:")
    print(f"  Different features: {differences}/{total}")
    print(f"  Normalized distance: {normalized:.3f}")
    print(f"  Differing in: {shown}")

    return normalized

# Run the walkthrough on four pairs that all start from [p],
# collecting the normalized distance of each.
pairs = [('p', 'b'), ('p', 'f'), ('p', 't'), ('p', 'a')]
distances = []

for first, second in pairs:
    distances.append(show_hamming_calculation(first, second))
    print()  # blank line between reports

### Jaccard Distance

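Jaccard distance considers only active features (those with value 1), so a feature that is inactive in both sounds does not count as agreement. This makes it less sensitive than Hamming distance to how many features the coding scheme includes: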

In [None]:
# Compare Hamming vs Jaccard
def compare_metrics(p1, p2):
    """Print Jaccard and Hamming distances for one phoneme pair.

    Jaccard distance is computed here from the sets of features whose
    value equals 1; Hamming distance comes from ``calculate_distance``.

    Args:
        p1: First phoneme.
        p2: Second phoneme.

    Returns:
        None. All results are printed.
    """
    f1 = phoneme_to_features(p1)
    f2 = phoneme_to_features(p2)

    # Jaccard works on the sets of active (value == 1) features only.
    active1 = {name for name, value in f1.items() if value == 1}
    active2 = {name for name, value in f2.items() if value == 1}

    intersection = active1 & active2
    union = active1 | active2

    # When neither phoneme has an active feature the two active sets are
    # identical (both empty), so similarity is 1 and distance is 0.
    jaccard_sim = len(intersection) / len(union) if union else 1.0
    jaccard_dist = 1 - jaccard_sim

    hamming = calculate_distance(p1, p2, method='hamming')

    print(f"[{p1}] vs [{p2}]:")
    print(f"  Active features in [{p1}]: {len(active1)}")
    print(f"  Active features in [{p2}]: {len(active2)}")
    print(f"  Shared active features: {len(intersection)}")
    print(f"  Jaccard distance: {jaccard_dist:.3f}")
    if hamming is None:
        # The result of calculate_distance is checked for None elsewhere
        # in this file, so avoid formatting None as a float here.
        print("  Hamming distance: n/a")
        print("  Difference: n/a")
    else:
        print(f"  Hamming distance: {hamming:.3f}")
        print(f"  Difference: {abs(jaccard_dist - hamming):.3f}")

# Contrast the two metrics on consonant/vowel pairs, one report per pair.
cv_pairs = [('p', 'a'), ('t', 'i')]
for position, (consonant, vowel) in enumerate(cv_pairs):
    if position:
        print()  # blank separator between reports, none before the first
    compare_metrics(consonant, vowel)

### Clustering-Based Distance

K-means clustering groups similar sounds, creating a coarser but more robust distance:

In [None]:
from distfeat import build_distance_matrix
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Contrast plain Hamming distances with the k-means variant on ten consonants.
phonemes = ['p', 'b', 't', 'd', 'k', 'g', 'm', 'n', 'f', 's']

# Only the first return value (the matrix) is used; the second is discarded.
hamming_matrix, _ = build_distance_matrix(phonemes, method='hamming')
kmeans_matrix, _ = build_distance_matrix(phonemes, method='kmeans', n_clusters=5)

# Draw the two matrices side by side with a shared colour scale.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 6))

heatmap_style = dict(
    xticklabels=phonemes,
    yticklabels=phonemes,
    annot=True,
    fmt='.2f',
    cmap='YlOrRd',
    vmin=0,
    vmax=1,
)
panels = [
    (ax1, hamming_matrix, 'Hamming Distance'),
    (ax2, kmeans_matrix, 'K-means Distance (5 clusters)'),
]
for axis, matrix, title in panels:
    sns.heatmap(matrix, ax=axis, **heatmap_style)
    axis.set_title(title)

plt.tight_layout()
plt.show()

# Report how many distinct values the k-means matrix contains.
unique_distances = np.unique(kmeans_matrix)
print(f"\nK-means creates {len(unique_distances)} unique distance values")
print(f"Distance values: {unique_distances}")
print("\nThis discretization can improve robustness for cognate detection")

## Information Theory and Feature Importance

Not all features contribute equally to phonetic distance. We can quantify feature importance using information theory:

In [None]:
from distfeat import get_feature_system
import numpy as np
from scipy.stats import entropy

# Measure how evenly each feature splits the inventory into 0s and 1s.
feature_system = get_feature_system()
first_entry = next(iter(feature_system.values()))
feature_names = list(first_entry['features'].keys())

# Tally 0/1 occurrences, restricted to the first 20 features.
feature_counts = {name: {'0': 0, '1': 0} for name in feature_names[:20]}

for phoneme_data in feature_system.values():
    features = phoneme_data['features']
    for name, tally in feature_counts.items():
        # A missing feature counts as 0. Any value whose str() is not
        # '0' or '1' raises KeyError here.
        tally[str(features.get(name, 0))] += 1

# Shannon entropy (base 2) of each feature's 0/1 distribution.
feature_entropies = {}
for feature, counts in feature_counts.items():
    zeros, ones = counts['0'], counts['1']
    total = zeros + ones
    if total <= 0:
        continue
    feature_entropies[feature] = entropy([zeros / total, ones / total], base=2)

# Highest entropy first.
sorted_features = sorted(
    feature_entropies.items(),
    key=lambda item: item[1],
    reverse=True,
)

print("Feature Information Content (bits):")
report_sections = (
    ("\nHigh entropy (most informative):", sorted_features[:5]),
    ("\nLow entropy (least informative):", sorted_features[-5:]),
)
for heading, section in report_sections:
    print(heading)
    for feature, ent in section:
        print(f"  {feature:20} {ent:.3f} bits")

print("\nInterpretation:")
print("- High entropy features distinguish many phoneme pairs")
print("- Low entropy features are redundant or rarely used")

## Optimality Theory and Distance

Sound changes tend to minimize articulatory effort while maintaining perceptual distinctiveness. This predicts that:

1. Common sound changes involve small distances
2. Sounds that rarely interchange have large distances
3. Distance correlates with markedness

In [None]:
from distfeat import calculate_distance

# Sound-change types, each illustrated by (before, after) phoneme pairs.
sound_changes = {
    'Voicing': [('p', 'b'), ('t', 'd'), ('k', 'g'), ('f', 'v'), ('s', 'z')],
    'Lenition': [('p', 'f'), ('t', 'θ'), ('k', 'x'), ('b', 'v'), ('g', 'ɣ')],
    'Palatalization': [('k', 'tʃ'), ('g', 'dʒ'), ('t', 'tʃ'), ('d', 'dʒ')],
    'Nasalization': [('b', 'm'), ('d', 'n'), ('g', 'ŋ')],
    'Vowel raising': [('e', 'i'), ('o', 'u'), ('ɛ', 'e'), ('ɔ', 'o')]
}

# Mean Hamming distance per change type; pairs with no distance are dropped,
# and a change type with no usable pair is left out entirely.
change_distances = {}
for change_type, pairs in sound_changes.items():
    measured = (
        calculate_distance(before, after, method='hamming')
        for before, after in pairs
    )
    distances = [value for value in measured if value is not None]

    if distances:
        change_distances[change_type] = np.mean(distances)

# Smallest average distance first.
sorted_changes = sorted(change_distances.items(), key=lambda item: item[1])

print("Common Sound Changes Ranked by Distance:")
print("(Lower distance = more natural change)\n")
for change_type, avg_dist in sorted_changes:
    print(f"{change_type:15} avg distance: {avg_dist:.3f}")
    # Show at most three example pairs per change type.
    arrows = [f"{p1}→{p2}" for p1, p2 in sound_changes[change_type][:3]]
    ex_str = ', '.join(arrows)
    print(f"  Examples: {ex_str}\n")

## Validation Framework

### The Cognate Hypothesis

Our fundamental hypothesis is:

> **Words that are cognates (derived from a common ancestor) should have lower average phonetic distance than random word pairs**

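This can be tested by comparing alignment distances within cognate sets against distances between words drawn from different sets: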

In [None]:
from distfeat.alignment import align_sequences
import numpy as np

# Simulated cognate data: three sets, each with one segmented word per language.
cognate_sets = [
    # Cognate set 1: "mother"
    [['m', 'ʌ', 'ð', 'ər'],      # English
     ['m', 'u', 't', 'ər'],       # German
     ['m', 'a', 't', 'ər']],      # Latin

    # Cognate set 2: "night"
    [['n', 'aɪ', 't'],            # English
     ['n', 'a', 'x', 't'],        # German
     ['n', 'ɔ', 'k', 's']],       # Latin

    # Cognate set 3: "heart"
    [['h', 'ɑː', 't'],            # English
     ['h', 'ɛ', 'r', 'ts'],       # German
     ['k', 'ɔ', 'r']]             # Latin
]

# Within-set distances: every unordered pair of words inside each set.
intra_distances = [
    align_sequences(cognate_set[i], cognate_set[j]).normalized_distance
    for cognate_set in cognate_sets
    for i in range(len(cognate_set))
    for j in range(i + 1, len(cognate_set))
]

# Between-set distances: only the first word of each set is compared.
inter_distances = [
    align_sequences(cognate_sets[i][0], cognate_sets[j][0]).normalized_distance
    for i in range(len(cognate_sets))
    for j in range(i + 1, len(cognate_sets))
]

# Summary statistics for the two samples.
mean_intra, std_intra = np.mean(intra_distances), np.std(intra_distances)
mean_inter, std_inter = np.mean(inter_distances), np.std(inter_distances)

print("Cognate Distance Hypothesis Test:")
print(f"\nIntra-cognate distances:")
print(f"  Mean: {mean_intra:.3f}")
print(f"  Std:  {std_intra:.3f}")
print(f"  N:    {len(intra_distances)}")

print(f"\nInter-cognate distances:")
print(f"  Mean: {mean_inter:.3f}")
print(f"  Std:  {std_inter:.3f}")
print(f"  N:    {len(inter_distances)}")

print(f"\nDifference: {mean_inter - mean_intra:.3f}")

# Effect size: mean difference over the root of the averaged variances.
pooled_std = np.sqrt((std_intra**2 + std_inter**2) / 2)
cohens_d = (mean_inter - mean_intra) / pooled_std
print(f"Effect size (Cohen's d): {cohens_d:.2f}")

# The verdict rests solely on which mean is smaller.
verdict = (
    "\n✓ Hypothesis confirmed: Cognates have lower distances"
    if mean_intra < mean_inter
    else "\n✗ Hypothesis rejected: Check data or method"
)
print(verdict)

## Summary

The theoretical foundation of distfeat rests on:

1. **Phonetic Features**: Binary representations based on articulatory and acoustic properties
2. **Distance Metrics**: Multiple ways to quantify similarity, each with different properties
3. **Information Theory**: Feature importance and entropy for optimization
4. **Linguistic Validation**: Using cognate data to validate and optimize distances
5. **Optimality Theory**: Predicting natural sound changes through distance minimization

This foundation enables principled, reproducible measurement of phonetic similarity for computational historical linguistics.

## Next Steps

- Continue to [Chapter 3: Implementation Details](03_implementation.ipynb) to see how these concepts are implemented
- Jump to [Tutorials](../tutorials/00_quickstart.ipynb) to start using distfeat
- Explore [Case Studies](../case_studies/indo_european.ipynb) to see applications