### Creating semantic and phonological similarity for CBM data with pets


In [1]:
import numpy as np 
import pandas as pd 
import nltk
import scipy
from functools import lru_cache
from itertools import product as iterprod
import re
from tqdm import tqdm


class phonology_funcs:
    '''
        Description: 
            This class contains functions to generate phonemes from words and to score the phonological similarity of two words.
            All functions are static: call them on the class, e.g. phonology_funcs.wordbreak("lion").
            Code has been adapted from the following link: https://stackoverflow.com/questions/33666557/get-phonemes-from-any-word-in-python-nltk-or-other-modules
        Functions:
            (1) load_arpabet(): loads (once) and returns the arpabet dictionary from the NLTK CMU dictionary
            (2) wordbreak(s): takes in a word (str) and returns its candidate pronunciations, or None
            (3) normalized_edit_distance(w1, w2): takes in two sequences (w1, w2) and returns 1 - normalized edit distance between them
            (4) phonological_similarity(word1, word2): takes in two words and returns the similarity of their first pronunciations
    '''
    @staticmethod
    @lru_cache(maxsize=1)
    def load_arpabet():
        '''
            Description:
                Loads the CMU pronouncing dictionary through NLTK, downloading the corpus if it is not installed.
                The result is cached so the dictionary is only built once per session.
            Returns:
                (1) arpabet (dict): maps a lowercase word to its list of pronunciations
        '''
        try:
            return nltk.corpus.cmudict.dict()
        except LookupError:
            # corpus not available locally: fetch it and retry
            nltk.download('cmudict')
            return nltk.corpus.cmudict.dict()

    @staticmethod
    @lru_cache()
    def wordbreak(s):
        '''
            Description:
                Takes in a word (str) and returns its pronunciations. Words missing from the dictionary are
                split into a dictionary prefix plus a recursively resolved suffix.
            Args:
                (1) s (str): string to be broken into phonemes
            Returns:
                (1) phonemes (list, size: variable): list of candidate pronunciations of s (each a list of phonemes),
                    or None when s cannot be segmented into dictionary words
        '''
        arpabet = phonology_funcs.load_arpabet()

        s = s.lower()
        if s in arpabet:
            return arpabet[s]
        middle = len(s)/2
        # try split points ordered by squared distance from the middle; the "-x" term favours larger indices
        partition = sorted(range(len(s)), key=lambda x: (x-middle)**2-x)
        for i in partition:
            pre, suf = (s[:i], s[i:])
            if pre not in arpabet:
                continue
            suf_phonemes = phonology_funcs.wordbreak(suf)
            if suf_phonemes is not None:
                # combine every prefix pronunciation with every suffix pronunciation
                return [x+y for x,y in iterprod(arpabet[pre], suf_phonemes)]
        return None

    @staticmethod
    def normalized_edit_distance(w1, w2):
        '''
            Description: 
                Takes in two sequences (w1, w2) and returns one minus their edit distance normalized by the
                longer length, i.e. 1.0 for identical inputs and 0.0 for completely different ones.
            Args:
                (1) w1 (str or list): first word / phoneme sequence
                (2) w2 (str or list): second word / phoneme sequence
            Returns:
                (1) normalized_edit_distance (float): similarity score rounded to 4 decimals
        '''
        longest = max(len(w1), len(w2))
        if longest == 0:
            # two empty sequences are identical; avoids dividing by zero
            return 1.0
        return round(1-nltk.edit_distance(w1,w2)/longest,4)

    @staticmethod
    def phonological_similarity(word1, word2): 
        '''
            Description:
                Compares the first pronunciation of each word using normalized_edit_distance.
            Args:
                (1) word1 (str): first word
                (2) word2 (str): second word
            Returns:
                (1) phon_sim (float): phonological similarity between word1 and word2
            Raises:
                ValueError: if either word cannot be converted to phonemes
        '''
        first_pronunciations = []
        for word in (word1, word2):
            phonemes = phonology_funcs.wordbreak(word)
            if phonemes is None:
                raise ValueError(f"could not derive phonemes for word: {word!r}")
            first_pronunciations.append(phonemes[0])
        phon_sim = phonology_funcs.normalized_edit_distance(*first_pronunciations)
        return phon_sim


@lru_cache(maxsize=8)
def _load_embeddings(path_to_embeddings):
    '''
        Description:
            Reads an embeddings CSV (one column per word) and caches it per path so repeated
            similarity queries do not re-parse the file. Call _load_embeddings.cache_clear()
            if the file changes on disk during a session.
        Args:
            (1) path_to_embeddings (str): path to the embeddings CSV
        Returns:
            (1) embeddings (pd.DataFrame): table whose columns are words
    '''
    return pd.read_csv(path_to_embeddings)


def semantic_similarity(word1, word2, path_to_embeddings): 
    '''
        Description:
            Returns the cosine similarity between the embedding columns of two words.
        Args:
            (1) word1 (str): first word (must be a column of the embeddings file)
            (2) word2 (str): second word (must be a column of the embeddings file)
            (3) path_to_embeddings (str): path to the embeddings CSV
        Returns:
            (1) similarity (float): 1 - cosine distance between the two embeddings
        Raises:
            KeyError: if either word has no column in the embeddings file
    '''
    # imported explicitly: a bare "import scipy" does not guarantee scipy.spatial is loaded
    from scipy.spatial.distance import cosine

    embeddings = _load_embeddings(path_to_embeddings)

    missing = [word for word in (word1, word2) if word not in embeddings.columns]
    if missing:
        raise KeyError(f"no embedding found for word(s) {missing} in {path_to_embeddings}")

    similarity = 1 - cosine(embeddings[word1], embeddings[word2])
    return similarity

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
''' Tests for the function created above'''
# Smoke-test the phonological similarity helper on one word pair and print the score.
b = phonology_funcs.phonological_similarity("lion", "penguin")
print(b)
# Smoke-test the semantic similarity helper against the alpha_0.3 embeddings file and print the score.
a = semantic_similarity("dolphin", "tiger", "../forager/data/lexical_data/100_dim_lexical_data/alpha_0.3_w2v/embeddings.csv")
print(a)

0.2857
0.38040077253817817


In [3]:
def create_similarity_table(main_word, path_to_words, path_to_embeddings, path_save_file): 
    '''
        Description:
            Scores every word of a tab-delimited word list against main_word, both semantically and
            phonologically, and writes the resulting table to a CSV file.
        Args:
            (1) main_word (str): word every list entry is compared to
            (2) path_to_words (str): path to a tab-delimited file with a "Word" column
            (3) path_to_embeddings (str): path to the embeddings CSV passed to semantic_similarity
            (4) path_save_file (str): destination CSV path
        Returns:
            None (the table is written to path_save_file without an index column)
    '''
    vocabulary = pd.read_csv(path_to_words, delimiter="\t")["Word"].values

    # for each entry: semantic score first, then phonological score
    score_pairs = [
        (
            semantic_similarity(main_word, entry, path_to_embeddings),
            phonology_funcs.phonological_similarity(main_word, entry),
        )
        for entry in vocabulary
    ]

    table = pd.DataFrame({
        'Word': vocabulary,
        'semantic_similarity': [semantic for semantic, _ in score_pairs],
        'phonological_similarity': [phonological for _, phonological in score_pairs],
    })
    table.to_csv(path_save_file, index=False)
    

In [4]:
"../forager/output/CBM with pet/CBM_100_dim_alpha_0.3_w2v_similarities.csv"
"../forager/output/CBM with pet/CBM_100_dim_alpha_0.2_w2v_similarities.csv"


"../forager/data/lexical_data/100_dim_lexical_data/alpha_0.2_w2v/embeddings.csv"
"../forager/data/lexical_data/100_dim_lexical_data/alpha_0.3_w2v/embeddings.csv"


# Word list file passed as path_to_words to create_similarity_table in a later cell.
word_path = "../forager/data/fluency_lists/participant_data/individual participants/CBM_original.txt"


In [6]:
# (embeddings file, output file) pairs: one similarity table per alpha setting, all scored against "pets"
similarity_jobs = [
    (
        "../forager/data/lexical_data/100_dim_lexical_data/alpha_0.3_w2v/embeddings.csv",
        "../forager/output/CBM with pets/CBM_100_dim_alpha_0.3_w2v_partial_sim.csv",
    ),
    (
        "../forager/data/lexical_data/100_dim_lexical_data/alpha_0.2_w2v/embeddings.csv",
        "../forager/output/CBM with pets/CBM_100_dim_alpha_0.2_w2v_partial_sim.csv",
    ),
    (
        "../forager/data/lexical_data/100_dim_lexical_data/alpha_0.5_w2v/embeddings.csv",
        "../forager/output/CBM with pets/CBM_100_dim_alpha_0.5_w2v_partial_sim.csv",
    ),
]
for embeddings_path, output_path in similarity_jobs:
    create_similarity_table("pets", word_path, embeddings_path, output_path)

In [47]:
# Load the tab-delimited CBM_modified.txt file and print the contents of its "Word" column.
CBM_no_rep = pd.read_csv("../forager/data/fluency_lists/participant_data/individual participants/CBM_modified.txt", delimiter="\t")
word_list = CBM_no_rep["Word"].values
print(word_list)

['tiger' 'dolphin' 'lion' 'penguin' 'bird' 'pigeon' 'woodpecker' 'reptile'
 'snake' 'snail' 'goldfish' 'fish' 'dog' 'cat' 'rodents' 'snake' 'pets'
 'rabbit' 'reptile' 'polar' 'grizzly' 'bear' 'hedgehog' 'porcupine'
 'reptile']


### Getting the list of words without "pets"


In [23]:
# Load the transformed data and keep only the rows whose "Word" value is not "pets".
transformed_words = pd.read_csv("../forager/data/fluency_lists/participant_data/with rep/transformed-data.csv")
rep_no_pet = transformed_words[transformed_words["Word"] != "pets"]


In [25]:
# Write the filtered rows to CSV without the index column.
rep_no_pet.to_csv("../forager/data/fluency_lists/participant_data/no pets with rep/no-pets.csv", index=False)