In [10]:
# Python implementation of the canonical identifier generation logic

import random
import string

def generate_canonical_id(length: int = 8) -> str:
    """
    Generate a random canonical identifier compatible with XML rules.

    - Is ``length`` characters long (8 by default)
    - Uses lowercase letters a-z and digits 2-9
    - Excludes confusing characters: 'o', 'i', 'l', '1' ('0' is never part of the pool)
    - First character is always a letter (XML identifiers cannot start with numbers)

    Args:
        length: Total number of characters in the identifier; must be >= 1.

    Returns:
        The generated identifier.

    Raises:
        ValueError: If ``length`` is smaller than 1.

    Note:
        Characters are drawn from the module-level ``random`` generator, so
        output is repeatable only after an explicit ``random.seed(...)``.
    """
    if length < 1:
        raise ValueError(f"length must be at least 1, got {length}")

    forbidden = {'o', 'i', 'l', '1'}
    numbers = {str(n) for n in range(1, 10)}
    letters = set(string.ascii_lowercase)

    # Sort the pools: iterating a set of strings has no stable order from one
    # interpreter run to the next (string hashing is randomized), so an unsorted
    # pool would make seeded output change after every kernel restart.
    allowed_chars = sorted((numbers | letters) - forbidden)
    first_chars = sorted(letters - forbidden)

    first = random.choice(first_chars)
    rest = ''.join(random.choice(allowed_chars) for _ in range(length - 1))

    return first + rest

# Example usage: print one freshly generated identifier.
# No seed is set in this cell, so the printed value is expected to differ
# between runs (unless the RNG was seeded elsewhere).
print(generate_canonical_id())


ebmvsyya


In [11]:
def calculate_id_space(length: int = 8) -> int:
    """Calculate the total number of possible unique IDs.

    The first character is drawn from the lowercase letters minus the
    forbidden ones; every later character is drawn from those letters plus the
    digits 1-9, again minus the forbidden characters.

    Args:
        length: Number of characters in each ID; must be >= 1.

    Returns:
        ``first_options * other_options ** (length - 1)`` as an exact integer.

    Raises:
        ValueError: If ``length`` is smaller than 1. Without this guard the
            negative exponent would silently turn the result into a float.
    """
    if length < 1:
        raise ValueError(f"length must be at least 1, got {length}")

    forbidden = {'o', 'i', 'l', '1'}
    numbers = {str(n) for n in range(1, 10)}
    letters = set(string.ascii_lowercase)

    allowed_chars = (numbers | letters) - forbidden
    first_chars = letters - forbidden

    # First char options × (remaining char options)^(length-1)
    return len(first_chars) * len(allowed_chars) ** (length - 1)

id_space = calculate_id_space()

# Build the four summary lines first and emit them in a single write.
# The literal arithmetic restates the pool sizes by hand:
# 26 letters - 3 forbidden letters, and (9 digits + 26 letters) - 4 forbidden characters.
print("\n".join([
    f"Total ID space: {id_space:,}",
    f"First character options: {26 - 3} (letters minus o, i, l)",
    f"Other character options: {35 - 4} (digits 2-9 + letters minus forbidden)",
    f"Formula: 23 × 31^7 = {23 * 31**7:,}",
]))

Total ID space: 632,790,124,553
First character options: 23 (letters minus o, i, l)
Other character options: 31 (digits 2-9 + letters minus forbidden)
Formula: 23 × 31^7 = 632,790,124,553


In [12]:
import math

def collision_probability(n_items: int, id_space: int) -> float:
    """
    Estimate the probability that at least two of ``n_items`` random IDs collide.

    Uses the birthday problem approximation
    P(collision) ≈ 1 - e^(-n²/2k) where n = items, k = ID space.

    Args:
        n_items: Number of IDs drawn; must be >= 0.
        id_space: Number of distinct IDs that can be generated; must be > 0.

    Returns:
        The approximate collision probability, between 0.0 and 1.0.

    Raises:
        ValueError: If ``n_items`` is negative or ``id_space`` is not positive.
    """
    if id_space <= 0:
        raise ValueError(f"id_space must be positive, got {id_space}")
    if n_items < 0:
        raise ValueError(f"n_items must be non-negative, got {n_items}")

    exponent = -(n_items ** 2) / (2 * id_space)
    # expm1(x) computes e^x - 1 without the cancellation that makes
    # 1 - exp(x) collapse to exactly 0.0 when x is very close to zero.
    # Subtracting from 0.0 (rather than negating) keeps the n_items == 0
    # result at +0.0 instead of -0.0.
    return 0.0 - math.expm1(exponent)

def items_for_collision_probability(target_prob: float, id_space: int) -> int:
    """Calculate how many items are needed to reach a target collision probability.

    Inverts the birthday approximation P = 1 - e^(-n²/2k), giving
    n = sqrt(-2k * ln(1-P)).

    Args:
        target_prob: Desired collision probability; must satisfy 0 <= P < 1.
        id_space: Number of distinct IDs that can be generated; must be >= 0.

    Returns:
        The solved item count, truncated toward zero to a whole number.

    Raises:
        ValueError: If ``target_prob`` is outside [0, 1) or ``id_space`` is negative.
    """
    # Written as a negated chained comparison so that NaN is rejected as well.
    if not 0.0 <= target_prob < 1.0:
        raise ValueError(f"target_prob must be in [0, 1), got {target_prob}")
    if id_space < 0:
        raise ValueError(f"id_space must be non-negative, got {id_space}")

    # log1p(-P) is ln(1-P) without the rounding of 1-P, which would otherwise
    # turn very small target probabilities into ln(1.0) == 0.
    return int(math.sqrt(-2 * id_space * math.log1p(-target_prob)))

id_space = calculate_id_space()

# Collision probabilities at different scales
print("Collision Probability (Birthday Problem):\n")
for n in [1_000, 10_000, 100_000, 1_000_000, 10_000_000]:
    prob = collision_probability(n, id_space)
    print(f"  {n:>12,} items: {prob:.2e} ({prob*100:.6f}%)")

# Inverse view: how many items it takes to reach each target probability
print("\nItems needed for given collision probability:")
for target in [0.01, 0.1, 0.5]:
    n = items_for_collision_probability(target, id_space)
    print(f"  {target*100:>5.0f}% chance: {n:,} items")

# sqrt(pi * k / 2) approximates the EXPECTED number of items drawn before the
# first collision. It is not the 50% point: that one is sqrt(2k * ln 2) and is
# already reported by the loop above, so the label must not claim "50%".
print(f"\nExpected items until first collision (birthday bound): ~{int(math.sqrt(id_space * math.pi / 2)):,}")

Collision Probability (Birthday Problem):

         1,000 items: 7.90e-07 (0.000079%)
        10,000 items: 7.90e-05 (0.007901%)
       100,000 items: 7.87e-03 (0.787038%)
     1,000,000 items: 5.46e-01 (54.622391%)
    10,000,000 items: 1.00e+00 (100.000000%)

Items needed for given collision probability:
      1% chance: 112,780 items
     10% chance: 365,160 items
     50% chance: 936,607 items

50% collision threshold (birthday bound): ~996,987
