In [1]:
import pandas as pd

from tqdm.notebook import tqdm

from lsh_utils import load_phoible
from lsh_utils import Token
from lsh_utils import hashes
from lsh_utils import ranked_pairs

In [2]:
# Toy dataset: English given names, each paired with a hand-written phonemic
# transcription. Every list element is ONE phoneme, so diphthongs and
# affricates ('aɪ', 'eɪ', 'oʊ', 't̠ʃ', 'd̠ʒ') are written as single elements.
# 'Catherine' appears twice on purpose, with two different transcriptions.
# NOTE(review): the rhotic is written 'ɹ' in some entries (Brad, Carl, Drew)
# and 'r' in others (Andrew, Catherine, Tyler, Zachary) — confirm whether the
# distinction is intended.
# NOTE(review): 'Alexander' has no velar stop before 'z' — confirm the
# transcription.
items = [
    Token(language='eng', graphemes='Alex', phonemes=['æ','l','ə','k','s']),
    Token(language='eng', graphemes='Alexander', phonemes=['æ','l','ə','z','æ','n','d','ɚ']),
    Token(language='eng', graphemes='Alexi', phonemes=['ə','l','ɛ','k','s','i']),
    Token(language='eng', graphemes='Alexis', phonemes=['ə','l','ɛ','k','s','ɪ','s']),
    Token(language='eng', graphemes='Andrew', phonemes=['æ','n','d','r','u']),
    Token(language='eng', graphemes='Brad', phonemes=['b','ɹ','æ','d']),
    Token(language='eng', graphemes='Bradley', phonemes=['b','ɹ','æ','d','l','i']),
    Token(language='eng', graphemes='Brett', phonemes=['b','ɹ','ɛ','t']),
    Token(language='eng', graphemes='Carl', phonemes=['k','ɑ','ɹ','l']),
    Token(language='eng', graphemes='Carlos', phonemes=['k','ɑ','ɹ','l','oʊ','s']),
    Token(language='eng', graphemes='Catherine', phonemes=['k','æ','θ','ə','r','ə','n']),
    Token(language='eng', graphemes='Catherine', phonemes=['k','æ','θ','ə','r','ɪ','n']),
    Token(language='eng', graphemes='Charles', phonemes=['t̠ʃ','ɑ','ɹ','l','z']),
    Token(language='eng', graphemes='Drew', phonemes=['d','ɹ','u']),
    Token(language='eng', graphemes='Jennifer', phonemes=['d̠ʒ','ɛ','n','ə','f','ɚ']),
    Token(language='eng', graphemes='Jenny', phonemes=['d̠ʒ','ɛ','n','i']),
    Token(language='eng', graphemes='John', phonemes=['d̠ʒ','ɑ','n']),
    Token(language='eng', graphemes='Johnny', phonemes=['d̠ʒ','ɑ','n','i']),
    Token(language='eng', graphemes='Jonathan', phonemes=['d̠ʒ','ɑ','n','ə','θ','ə','n']),
    Token(language='eng', graphemes='Kat', phonemes=['k','æ','t']),
    Token(language='eng', graphemes='Kathy', phonemes=['k','æ','θ','i']),
    Token(language='eng', graphemes='Matt', phonemes=['m','æ','t']),
    Token(language='eng', graphemes='Matthew', phonemes=['m','æ','θ','j','u']),
    # The diphthong is one segment ('aɪ'), matching Mike, Ty and Tyler below.
    Token(language='eng', graphemes='Michael', phonemes=['m','aɪ','k','ə','l']),
    Token(language='eng', graphemes='Mike', phonemes=['m','aɪ','k']),
    Token(language='eng', graphemes='Nate', phonemes=['n','eɪ','t']),
    Token(language='eng', graphemes='Nathan', phonemes=['n','eɪ','θ','ə','n']),
    Token(language='eng', graphemes='Nathaniel', phonemes=['n','ə','θ','æ','n','j','ə','l']),
    Token(language='eng', graphemes='Nichole', phonemes=['n','ɪ','k','oʊ','l']),
    Token(language='eng', graphemes='Nick', phonemes=['n','ɪ','k']),
    Token(language='eng', graphemes='Phil', phonemes=['f','ɪ','l']),
    Token(language='eng', graphemes='Philip', phonemes=['f','ɪ','l','ɪ','p']),
    Token(language='eng', graphemes='Ty', phonemes=['t','aɪ']),
    Token(language='eng', graphemes='Tyler', phonemes=['t','aɪ','l','ə','r']),
    Token(language='eng', graphemes='Xander', phonemes=['z','æ','n','d','ɚ']),
    Token(language='eng', graphemes='Zach', phonemes=['z','æ','k']),
    Token(language='eng', graphemes='Zak', phonemes=['z','æ','k']),
    Token(language='eng', graphemes='Zachary', phonemes=['z','æ','k','ə','r','i']),
]

In [3]:
def compare(
        items,
        n=3, # defines size of n-grams and n x n sliding-window features
        bits=128, # defines underlying LSH bit size (actual size will be 2*bits)
        window=10 # defines size of window in which to consider pairs for computing pair-wise distances
):
    """Tabulate the pairs produced by ``ranked_pairs`` as a DataFrame.

    Parameters
    ----------
    items
        Collection of tokens to compare; forwarded unchanged to
        ``ranked_pairs``.
    n, bits, window
        Forwarded unchanged to ``ranked_pairs`` (see the inline notes on the
        signature).

    Returns
    -------
    pandas.DataFrame
        One row per ``((a, b), difference)`` tuple yielded by
        ``ranked_pairs``, with columns ``a``, ``b``,
        ``simhash difference (in bits)`` and ``$$\\sigma_{phonemic}$$``.
        The last column is ``1 - difference / (2 * bits)`` rendered as a
        *string* with three significant digits.
    """
    # Total number of hash bits; hoisted because it is constant for every row.
    total_bits = 2 * bits
    # The pairs are fully materialised before tqdm wraps them, so the bar has
    # a known total and advances while the rows below are being built.
    pairs = tqdm(
        list(ranked_pairs(items, n=n, bits=bits, window=window)),
        desc='ranking pairs by bitwise distances',
        unit='pair'
    )
    return pd.DataFrame(
        {
            'a': a,
            'b': b,
            'simhash difference (in bits)': difference,
            # Raw string: '\s' is not a valid escape sequence, so the non-raw
            # spelling triggers a Deprecation/SyntaxWarning on modern Python.
            r'$$\sigma_{phonemic}$$': f'{1.0 - (difference/total_bits):0.3}'
        } for ((a, b), difference) in pairs
    )

# Benchmark compare() on the toy dataset: one run (-r 1) of ten loops (-n 10).
# Every loop calls compare() afresh, so each one renders its own progress bar.
%timeit -r 1 -n 10 compare(items)

ranking pairs by bitwise distances:   0%|          | 0/703 [00:00<?, ?pair/s]

ranking pairs by bitwise distances:   0%|          | 0/703 [00:00<?, ?pair/s]

ranking pairs by bitwise distances:   0%|          | 0/703 [00:00<?, ?pair/s]

ranking pairs by bitwise distances:   0%|          | 0/703 [00:00<?, ?pair/s]

ranking pairs by bitwise distances:   0%|          | 0/703 [00:00<?, ?pair/s]

ranking pairs by bitwise distances:   0%|          | 0/703 [00:00<?, ?pair/s]

ranking pairs by bitwise distances:   0%|          | 0/703 [00:00<?, ?pair/s]

ranking pairs by bitwise distances:   0%|          | 0/703 [00:00<?, ?pair/s]

ranking pairs by bitwise distances:   0%|          | 0/703 [00:00<?, ?pair/s]

ranking pairs by bitwise distances:   0%|          | 0/703 [00:00<?, ?pair/s]

293 ms ± 0 ns per loop (mean ± std. dev. of 1 run, 10 loops each)


In [4]:
# Lift pandas' cap on displayed rows so the whole ranking table is rendered,
# then show the table as the cell's output.
pd.set_option('display.max_rows', None)
compare(items)

ranking pairs by bitwise distances:   0%|          | 0/703 [00:00<?, ?pair/s]

Unnamed: 0,a,b,simhash difference (in bits),$$\sigma_{phonemic}$$
0,(eng) Zach /z æ k/,(eng) Zak /z æ k/,0,1.0
1,(eng) Catherine /k æ θ ə r ə n/,(eng) Catherine /k æ θ ə r ɪ n/,59,0.77
2,(eng) Alexi /ə l ɛ k s i/,(eng) Alexis /ə l ɛ k s ɪ s/,79,0.691
3,(eng) Brad /b ɹ æ d/,(eng) Brett /b ɹ ɛ t/,79,0.691
4,(eng) Kathy /k æ θ i/,(eng) Matthew /m æ θ j u/,83,0.676
5,(eng) Jenny /d̠ʒ ɛ n i/,(eng) Johnny /d̠ʒ ɑ n i/,83,0.676
6,(eng) Jennifer /d̠ʒ ɛ n ə f ɚ/,(eng) Jenny /d̠ʒ ɛ n i/,88,0.656
7,(eng) Brad /b ɹ æ d/,(eng) Bradley /b ɹ æ d l i/,89,0.652
8,(eng) Catherine /k æ θ ə r ɪ n/,(eng) Nathan /n eɪ θ ə n/,90,0.648
9,(eng) Jennifer /d̠ʒ ɛ n ə f ɚ/,(eng) Jonathan /d̠ʒ ɑ n ə θ ə n/,90,0.648
