# Checking whether the phonological similarity is the same for phon_matrix

In [1]:
import numpy as np
import scipy
import pandas as pd
import nltk
from functools import lru_cache
from itertools import product as iterprod
import re
from tqdm import tqdm

In [2]:
''' Phonological Func'''

class phonology_funcs:
    '''
        Description: 
            This class contains functions to generate phonemes from a list of words and create a phonological similarity matrix.
            Code has been adapted from the following link: https://stackoverflow.com/questions/33666557/get-phonemes-from-any-word-in-python-nltk-or-other-modules
            All functions are static: call them on the class (phonology_funcs.wordbreak("cat")) or on an instance.
        Functions:
            (1) load_arpabet(): loads and returns the arpabet dictionary from the NLTK CMU dictionary
            (2) wordbreak(s): takes in a word (str) and returns its candidate pronunciations (lists of phonemes)
            (3) normalized_edit_distance(w1, w2): takes in two sequences (w1, w2) and returns 1 - (edit distance / longer length)
            (4) create_phonological_matrix(labels, path_for_lexical_data): takes in a list of labels (size N) and returns a phonological similarity matrix (NxN np.array)
    '''
    @staticmethod
    @lru_cache(maxsize=1)
    def load_arpabet():
        '''
            Description:
                Loads the CMU pronouncing dictionary through NLTK, downloading the 'cmudict' corpus first if it
                is not installed. The result is cached so the dictionary is only built once per process.
            Returns:
                (1) arpabet (dict): mapping returned by nltk.corpus.cmudict.dict() (word -> list of pronunciations)
        '''
        try:
            return nltk.corpus.cmudict.dict()
        except LookupError:
            # Corpus not installed locally: fetch it once and retry.
            nltk.download('cmudict')
            return nltk.corpus.cmudict.dict()

    @staticmethod
    @lru_cache()
    def wordbreak(s):
        '''
            Description:
                Takes in a word (str) and returns its pronunciations from the arpabet dictionary. A word missing from the
                dictionary is split into a known prefix plus a suffix that is broken down recursively.
            Args:
                (1) s (str): string to be broken into phonemes
            Returns:
                (1) phonemes (list, size: variable): list of candidate pronunciations of s, each a list of phonemes;
                    None if s cannot be decomposed into dictionary words (this includes the empty string)
        '''
        arpabet = phonology_funcs.load_arpabet()

        s = s.lower()
        if s in arpabet:
            return arpabet[s]

        middle = len(s)/2
        # Try split points closest to the middle first; the "-x" term biases the order towards longer prefixes.
        split_points = sorted(range(len(s)), key=lambda x: (x-middle)**2-x)
        for i in split_points:
            pre, suf = s[:i], s[i:]
            if pre not in arpabet:
                continue
            suf_phonemes = phonology_funcs.wordbreak(suf)
            if suf_phonemes is not None:
                # Every pronunciation of the prefix combined with every pronunciation of the suffix.
                return [x+y for x, y in iterprod(arpabet[pre], suf_phonemes)]
        return None

    @staticmethod
    def normalized_edit_distance(w1, w2):
        '''
            Description: 
                Takes in two sequences (w1, w2) and returns their normalized edit-distance similarity:
                1 - edit_distance(w1, w2) / max(len(w1), len(w2)), rounded to 4 decimals.
                Identical inputs give 1.0; higher values mean more similar.
            Args:
                (1) w1 (str or list): first word (or its list of phonemes)
                (2) w2 (str or list): second word (or its list of phonemes)
            Returns:
                (1) normalized_edit_distance (float): similarity between w1 and w2 (1.0 when both are empty)
        '''
        longest = max(len(w1), len(w2))
        if longest == 0:
            # Two empty sequences are identical; avoid dividing by zero.
            return 1.0
        return round(1 - nltk.edit_distance(w1, w2) / longest, 4)

    @staticmethod
    def create_phonological_matrix(labels, path_for_lexical_data):
        '''
            Description:
                Takes in a list of labels (size N) and returns a phonological similarity matrix (NxN np.array).
                Non-alphabetic characters are stripped from each label and the first pronunciation returned by
                wordbreak() is used for the comparison.
                Side effect: the matrix is also written (no header, no index) to
                'data processing/Lexical Data/speech2vec/phon_matrix.csv' and
                'data processing/Lexical Data/word2vec/phon_matrix.csv', relative to the working directory.
            Args:
                (1) labels: a list of words matching the size of your search space (list of length N)
                (2) path_for_lexical_data: currently unused; the output paths above are fixed
            Returns: 
                (1) phonological_matrix: phonological similarity matrix (NxN np.array)
            Raises:
                TypeError: if two or more labels are given and one of them has no pronunciation
        '''
        labels = [re.sub('[^a-zA-Z]+', '', str(v)) for v in labels]
        n_labels = len(labels)

        # Look every label up exactly once instead of once per pair. A single label has no pairs to
        # compare, so (as before) it needs no lookup at all.
        phonemes = []
        if n_labels > 1:
            for word in labels:
                pronunciations = phonology_funcs.wordbreak(word)
                if pronunciations is None:
                    # TypeError matches the failure raised previously (indexing None) but names the label.
                    raise TypeError('no pronunciation found for label ' + repr(word))
                phonemes.append(pronunciations[0])

        sim = np.zeros((n_labels, n_labels))
        for i in tqdm(range(n_labels)):
            for j in range(i):
                sim[i, j] = phonology_funcs.normalized_edit_distance(phonemes[i], phonemes[j])
        # Only the lower triangle was filled; mirror it and set self-similarity to 1.
        sim = sim + sim.T
        np.fill_diagonal(sim, 1)
        # convert to dataframe without header or index
        phon_matrix_df = pd.DataFrame(sim)
        phon_matrix_df.to_csv('data processing/Lexical Data/speech2vec/phon_matrix.csv', header=False, index=False)
        phon_matrix_df.to_csv('data processing/Lexical Data/word2vec/phon_matrix.csv', header = False, index = False)
        return sim


In [3]:
# Similarity between the first pronunciations of the two test words.
words = ["adder", "albatross"]

num = phonology_funcs.normalized_edit_distance(
    *(phonology_funcs.wordbreak(word)[0] for word in words)
)
print(num)

0.125


In [4]:
# Same comparison for "alligator" against the first word of the list above.
num = phonology_funcs.normalized_edit_distance(
    *(phonology_funcs.wordbreak(word)[0] for word in ("alligator", words[0]))
)
print(num)

0.2857
