In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from Levenshtein import ratio
import eng_to_ipa
import json

Load/create dictionary to translate identifiers from English to IPA

In [2]:
# check for existing translation dictionary
try:
    dict_file = open('data/ipa_translation_dict.json', 'r')
    dict = dict_file.read()
    dict_file.close()

    translate = json.loads(dict)

    print('Loaded ipa_translation_dict.json')

# if pre-translated dictionary is missing
except FileNotFoundError:

    id_freq = pd.read_csv('identifier_frequency.csv')
    top_ids = id_freq['identifier'].astype(str)

    translate = {}

    for id in tqdm(top_ids):

        parts = id.split('_')

        for part in parts:
            try:
                translate[part]
            except KeyError:
                id_ipa = eng_to_ipa.convert(part)

                # if 'id' is not translatable, convert returns 'id*'
                # filter these out
                if part not in id_ipa:
                    translate[part] = id_ipa
    
    percent = str(np.round(100*len(translate)/len(top_ids)))
    print(str(len(translate)) + ' identifiers are translatable (' + percent + '%)')

    f = open('data/ipa_translation_dict.json', 'w')
    f.write(json.dumps(translate))
    f.close()

Loaded ipa_translation_dict.json


Find similarity of every pair of identifiers

In [3]:
words = list(translate)
size = len(words)

ipa_sim = np.zeros((size, size), dtype=np.uint8)
eng_sim = np.zeros((size, size), dtype=np.uint8)

for i in tqdm(range(size)):
    for j in range(size):
        
        if i < j: # only compare each pair once

            ipa_sim[i, j] = np.round(100*ratio(translate[words[i]], translate[words[j]]), 2)
            eng_sim[i, j] = np.round(100*ratio(words[i], words[j]), 2)

# get indices from largest to smallest similarity
sorted = np.flip(np.argsort(ipa_sim.flatten()))

100%|██████████| 5821/5821 [02:19<00:00, 41.64it/s] 


Save most similar pairs

In [10]:
f = open('data/phonological_similarity.csv', 'w')

f.write('word_1,word_2,IPA_similarity,english_similarity\n')

for n in tqdm(range(5821**2)):
    i, j = np.unravel_index(sorted[n], (size, size))
    if words[i] not in words[j] and words[j] not in words[i]:
        f.write(words[i] + ',' + words[j] + ',' + str(ipa_sim[i, j])
                + ',' + str(eng_sim[i, j]) + '\n')

f.close()

100%|██████████| 33884041/33884041 [01:46<00:00, 317866.38it/s]


In [11]:
sim_df = pd.read_csv('data/phonological_similarity.csv')

In [13]:
selected_ids = pd.DataFrame(sim_df[sim_df['english_similarity'] < 35][sim_df['IPA_similarity'] > 70])

selected_ids.reset_index(inplace=True, drop=True)
selected_ids.to_csv('data/lex_diff_homophones.csv', index=False)

  selected_ids = pd.DataFrame(sim_df[sim_df['english_similarity'] < 35][sim_df['IPA_similarity'] > 70])


In [3]:
print(eng_to_ipa.convert('pickle'))

ˈpɪkəl


In [4]:
print(eng_to_ipa.convert('pixel'))

ˈpɪksəl


In [24]:
homophones = [('label', 'table'),
              ('new', 'nu'),
              ('rho', 'row'),
              ('picker', 'picture'), 
              ('pickle', 'pixel'),
              ('x', 'checks'),
              ('err', 'pair'),
              ('queue', 'skew'),
              ('y', 'i')]

In [25]:
for w1, w2 in homophones:
    eng = ratio(w1, w2)
    ipa = ratio(eng_to_ipa.convert(w1), eng_to_ipa.convert(w2))
    print(f'{w1:{8}} {w2:{8}} {eng:{6}.{2}} {ipa:{6}.{2}}')

label    table       0.6   0.86
new      nu          0.4    1.0
rho      row        0.67    1.0
picker   picture    0.62   0.92
pickle   pixel      0.55   0.92
x        checks      0.0   0.86
yerr     xypair      0.4    0.5
queue    skew       0.22   0.86
y        i           0.0    0.8
