In [1]:
import pandas as pd
import numpy as np
from tqdm import tqdm
from Levenshtein import ratio
import eng_to_ipa
import json

Load (or create and cache) the dictionary that translates identifiers from English spelling to IPA (International Phonetic Alphabet)

In [2]:
# Build `translate`: a dict mapping each identifier (English spelling) to its
# IPA transcription. A cached JSON copy is reused when it exists; otherwise
# the identifiers are transcribed one by one and the result is cached.
try:
    # `with` closes the handle even if reading or JSON parsing raises
    with open('ipa_translation_dict.json', 'r') as dict_file:
        translate = json.load(dict_file)

    print('Loaded ipa_translation_dict.json')

# if pre-translated dictionary is missing
except FileNotFoundError:

    id_freq = pd.read_csv('identifier_frequency.csv')
    top_ids = id_freq['identifier'].astype(str)

    translate = {}

    # `identifier` rather than `id`, so the builtin is not shadowed for the
    # rest of the kernel session
    for identifier in tqdm(top_ids):

        identifier_ipa = eng_to_ipa.convert(identifier)

        # if a word is not translatable, convert returns it with '*' appended
        # (e.g. 'id*'); filter these out by looking for the marker itself.
        # The previous substring test (`identifier not in identifier_ipa`)
        # also dropped any word whose transcription happens to contain its
        # own spelling, and it kept untranslatable identifiers whose echo is
        # not a verbatim copy of the input.
        # NOTE(review): assumes convert() lower-cases its input and marks each
        # unknown word separately - confirm against the eng_to_ipa docs.
        if '*' in identifier_ipa:
            continue

        # an output identical to the (lower-cased) input carries no phonetic
        # information (e.g. empty or echoed-unchanged input), so drop it too
        if identifier_ipa == identifier.lower():
            continue

        translate[identifier] = identifier_ipa

    # guard the percentage against an empty identifier list
    if len(top_ids) > 0:
        percent = str(np.round(100*len(translate)/len(top_ids)))
    else:
        percent = '0.0'
    print(str(len(translate)) + ' identifiers are translatable (' + percent + '%)')

    # cache the result so later runs take the fast path above
    with open('ipa_translation_dict.json', 'w') as f:
        json.dump(translate, f)

Loaded ipa_translation_dict.json


Compute the similarity (Levenshtein ratio of the IPA transcriptions) of every pair of identifiers

In [3]:
# Pairwise similarity of the IPA transcriptions.
# result[i][j] holds ratio(IPA of words[i], IPA of words[j]) for i < j only;
# the diagonal and the lower triangle are left at 0.
words = list(translate)
size = len(words)

# look each transcription up once instead of on every inner iteration
ipa_forms = [translate[word] for word in words]

result = np.zeros((size, size))

for i in tqdm(range(size)):
    ipa_i = ipa_forms[i]

    # start at i + 1: only compare each pair once, without iterating over
    # (and discarding) the j <= i half of the row
    for j in range(i + 1, size):
        result[i][j] = ratio(ipa_i, ipa_forms[j])

# get indices from largest to smallest similarity
# (flat indices into the size x size matrix; decode with np.unravel_index)
# NOTE: the name `sorted` shadows the builtin, but it is kept because the
# cell that saves the pairs reads this variable.
sorted = np.flip(np.argsort(result.flatten()))

100%|██████████| 5311/5311 [00:07<00:00, 716.24it/s] 


Save most similar pairs

In [4]:
# Write the most similar identifier pairs to a CSV file, one
# `first,second,similarity` row per pair, in descending order of similarity.
# At most the 10000 highest-ranked matrix cells are examined, so fewer rows
# are written when some of them are filtered out below.

# never index past the end of the ranking when the matrix is small
top_n = min(10000, len(sorted))

# `with` guarantees the file is flushed and closed even if a write raises
with open('phonological_similarity.csv', 'w') as f:

    for n in range(top_n):
        i, j = np.unravel_index(sorted[n], (size, size))

        # only cells with i < j were ever computed; the rest are placeholder
        # zeros and must not be reported as real pairs
        if i >= j:
            continue

        # skip pairs where one identifier is contained in the other
        if words[i] not in words[j] and words[j] not in words[i]:
            f.write(words[i] + ',' + words[j] + ',' + str(result[i][j]) + '\n')