## I. Getting the full similarities dataframe

In [69]:
import numpy as np 
import pandas as pd 
import nltk
import scipy
from functools import lru_cache
from itertools import product as iterprod
import re
from tqdm import tqdm


class phonology_funcs:
    '''
        Description: 
            This class contains functions to generate phonemes from words (using the CMU pronouncing
            dictionary that ships with NLTK) and to score the phonological similarity of two words.
            Code has been adapted from the following link: https://stackoverflow.com/questions/33666557/get-phonemes-from-any-word-in-python-nltk-or-other-modules
        Functions:
            (1) load_arpabet(): loads (once) and returns the arpabet dictionary from the NLTK CMU dictionary
            (2) wordbreak(s): takes in a word (str) and returns its candidate phoneme transcriptions
            (3) normalized_edit_distance(w1, w2): takes in two sequences (w1, w2) and returns 1 minus their normalized edit distance
            (4) phonological_similarity(word1, word2): takes in two words and returns the similarity of their first transcriptions
        Notes:
            All functions are static: call them on the class, e.g. phonology_funcs.wordbreak("cat").
    '''
    @staticmethod
    @lru_cache(maxsize=1)
    def load_arpabet():
        '''
            Description:
                Loads the arpabet dictionary from the NLTK CMU dictionary, downloading the corpus
                if it is not installed. The result is cached so the corpus is only parsed once
                per session instead of once per wordbreak call.
            Returns:
                (1) arpabet (dict): maps a lowercase word to its list of transcriptions
        '''
        try:
            return nltk.corpus.cmudict.dict()
        except LookupError:
            # The corpus is missing locally: fetch it, then retry the load.
            nltk.download('cmudict')
            return nltk.corpus.cmudict.dict()

    @staticmethod
    @lru_cache()
    def wordbreak(s):
        '''
            Description:
                Takes in a word (str) and returns its phoneme transcriptions. Words missing from the
                dictionary are split into a known prefix plus a suffix that is broken down recursively.
            Args:
                (1) s (str): string to be broken into phonemes
            Returns:
                (1) phonemes (list, size: variable): list of candidate transcriptions, each one a list
                    of phonemes, or None when no split of s can be transcribed
        '''
        arpabet = phonology_funcs.load_arpabet()

        s = s.lower()
        if s in arpabet:
            return arpabet[s]
        middle = len(s)/2
        # Try split points closest to the middle first; the "-x" term breaks ties
        # in favour of the later split point (i.e. the longer prefix).
        partition = sorted(range(len(s)), key=lambda x: (x-middle)**2-x)
        for i in partition:
            pre, suf = (s[:i], s[i:])
            if pre not in arpabet:
                continue
            suffix_phonemes = phonology_funcs.wordbreak(suf)
            if suffix_phonemes is not None:
                # Every prefix transcription is combined with every suffix transcription.
                return [x+y for x,y in iterprod(arpabet[pre], suffix_phonemes)]
        return None

    @staticmethod
    def normalized_edit_distance(w1, w2):
        '''
            Description: 
                Takes in two sequences (w1, w2) and returns 1 minus their edit distance divided by
                the length of the longer one, so identical inputs score 1.
            Args:
                (1) w1 (str or list): first word (or its phonemes)
                (2) w2 (str or list): second word (or its phonemes)
            Returns:
                (1) normalized_edit_distance (float): score rounded to 4 decimals; 1.0 when both
                    inputs are empty (nothing to edit, and no division by zero)
        '''
        longest = max(len(w1), len(w2))
        if longest == 0:
            return 1.0
        return round(1-nltk.edit_distance(w1,w2)/longest,4)

    @staticmethod
    def phonological_similarity(word1, word2): 
        '''
            Description:
                Compares the first phoneme transcription of each word using normalized_edit_distance.
            Args:
                (1) word1 (str): first word
                (2) word2 (str): second word
            Returns:
                (1) phon_sim (float): phonological similarity between word1 and word2
            Raises:
                ValueError: if either word cannot be broken into phonemes
        '''
        first_transcriptions = []
        for word in (word1, word2):
            transcriptions = phonology_funcs.wordbreak(word)
            if transcriptions is None:
                raise ValueError("could not find phonemes for word: {!r}".format(word))
            first_transcriptions.append(transcriptions[0])
        phon_sim = phonology_funcs.normalized_edit_distance(*first_transcriptions)
        return phon_sim


@lru_cache(maxsize=8)
def _load_embeddings(path_to_embeddings):
    '''
        Description:
            Reads an embeddings CSV and caches the result per path, so repeated similarity
            lookups against the same file do not re-parse it.
            Call _load_embeddings.cache_clear() if a file changes on disk during a session.
        Args:
            (1) path_to_embeddings (str): path to the embeddings CSV
        Returns:
            (1) embeddings (pd.DataFrame): the parsed CSV (treat as read-only, it is shared)
    '''
    return pd.read_csv(path_to_embeddings)


def semantic_similarity(word1, word2, path_to_embeddings): 
    '''
        Description:
            Returns the cosine similarity between the embedding columns of two words.
        Args:
            (1) word1 (str): first word; must be a column name in the embeddings CSV
            (2) word2 (str): second word; must be a column name in the embeddings CSV
            (3) path_to_embeddings (str): path to a CSV read with pd.read_csv, one column per word
        Returns:
            (1) similarity (float): 1 - cosine distance between the two columns
        Raises:
            KeyError: if either word is not a column of the embeddings file
    '''
    # Imported explicitly: a bare "import scipy" is not guaranteed to load the
    # scipy.spatial subpackage on older SciPy releases.
    from scipy.spatial.distance import cosine

    embeddings = _load_embeddings(path_to_embeddings)

    word1_embedding = embeddings[word1]
    word2_embedding = embeddings[word2]

    similarity = 1 - cosine(word1_embedding, word2_embedding)
    return similarity

In [70]:
def create_similarity_table(main_word, word_list, path_to_embeddings, path_save_file): 
    '''
        Description:
            Scores every word in word_list against main_word, semantically and phonologically,
            and writes the scores to a CSV file.
        Args:
            (1) main_word (str): the word every other word is compared with
            (2) word_list (list, size: N): the words to score
            (3) path_to_embeddings (str): embeddings CSV handed to semantic_similarity
            (4) path_save_file (str): destination CSV path
        Returns:
            None. The CSV has the columns Word, semantic_similarity, phonological_similarity
            and no index column.
    '''
    # One (semantic, phonological) pair per word; the tuple keeps both scores for a
    # word being computed before moving on to the next word.
    score_pairs = [
        (
            semantic_similarity(main_word, candidate, path_to_embeddings),
            phonology_funcs.phonological_similarity(main_word, candidate),
        )
        for candidate in word_list
    ]

    table = pd.DataFrame()
    table['Word'] = word_list
    table['semantic_similarity'] = [semantic for semantic, _ in score_pairs]
    table['phonological_similarity'] = [phonological for _, phonological in score_pairs]
    table.to_csv(path_save_file, index=False)
    

In [72]:
# Vocabulary to score: the first column of the header-less frequencies file.
word_list = (
    pd.read_csv(
        "../forager/data/lexical_data/50_dim_lexical_data/alpha_0.0_s2v/frequencies.csv",
        header=None,
    )[0]
    .values
    .tolist()
)

# One similarity table per embedding variant: (embeddings file, output file),
# processed in the order listed.
for embeddings_csv, output_csv in [
    (
        "../forager/data/lexical_data/100_dim_lexical_data/alpha_0.3_w2v/embeddings.csv",
        "../forager/output/CBM with pets/CBM_100_dim_alpha_0.3_w2v_full_sim.csv",
    ),
    (
        "../forager/data/lexical_data/100_dim_lexical_data/alpha_0.2_w2v/embeddings.csv",
        "../forager/output/CBM with pets/CBM_100_dim_alpha_0.2_w2v_full_sim.csv",
    ),
    (
        "../forager/data/lexical_data/100_dim_lexical_data/alpha_0.5_w2v/embeddings.csv",
        "../forager/output/CBM with pets/CBM_100_dim_alpha_0.5_w2v_full_sim.csv",
    ),
]:
    create_similarity_table("pets", word_list, embeddings_csv, output_csv)

### i. checking if sim history and phon history are the same

In [47]:
def create_history_variables(fluency_list, labels, sim_matrix, freq_matrix, phon_matrix = None):
    '''
        Description:
            Builds, for every item in fluency_list, its similarity to the preceding item and the
            full row of similarities/frequencies over all labels. Non-positive similarity values
            are replaced by .0001 in the returned data; the input matrices are left untouched.

        Args:
            (1) fluency_list: items produced by a participant (list of size L)
            (2) labels: the space of words (list of length N)
            (3) sim_matrix: semantic similarity matrix (NxN np.array)
            (4) freq_matrix: frequencies array (Nx1 array)
            (5) phon_matrix: phonological similarity matrix (NxN np.array), optional

        Returns (in this order): 
            (1) sim_list (list, size: L): semantic similarity of each item with the item before it (.0001 for the first item)
            (2) sim_history (list, size: L arrays of size N): semantic similarities of the preceding item
                (the item itself for the first item) with all items in labels
            (3) freq_list (list, size: L): frequencies of each item in fluency_list
            (4) freq_history (list, size: L arrays of size N): frequencies of all words in labels repeated L times
            (5) phon_list (list, size: L): phonological counterpart of sim_list; empty if phon_matrix is None
            (6) phon_history (list, size: L arrays of size N): phonological counterpart of sim_history; empty if phon_matrix is None

        Raises:
            ValueError: if an item of fluency_list is not in labels
    '''
    floor = .0001

    # np.where allocates new arrays, so the caller's matrices are not modified.
    sim_matrix = np.asarray(sim_matrix)
    sim_matrix = np.where(sim_matrix <= 0, floor, sim_matrix)
    if phon_matrix is not None:
        phon_matrix = np.asarray(phon_matrix)
        phon_matrix = np.where(phon_matrix <= 0, floor, phon_matrix)

    # One pass over labels instead of a linear scan per item; setdefault keeps the
    # first occurrence of a duplicated label, matching list.index.
    label_to_index = {}
    for position, label in enumerate(labels):
        label_to_index.setdefault(label, position)

    freq_list = []
    freq_history = []

    sim_list = []
    sim_history = []

    phon_list = []
    phon_history = []

    prevwordindex = None
    for word in fluency_list:
        try:
            currentwordindex = label_to_index[word]
        except KeyError:
            raise ValueError("{!r} is not in labels".format(word)) from None

        freq_list.append(freq_matrix[currentwordindex])
        freq_history.append(freq_matrix)

        if prevwordindex is None: # first word: no preceding item to compare with
            sim_list.append(floor)
            sim_history.append(sim_matrix[currentwordindex,:])
            if phon_matrix is not None:
                phon_list.append(floor)
                phon_history.append(phon_matrix[currentwordindex,:])
        else: # get similarity between this word and preceding word
            sim_list.append(sim_matrix[prevwordindex, currentwordindex])
            sim_history.append(sim_matrix[prevwordindex,:])
            if phon_matrix is not None:
                phon_list.append(phon_matrix[prevwordindex, currentwordindex])
                phon_history.append(phon_matrix[prevwordindex,:])

        prevwordindex = currentwordindex

    return sim_list, sim_history, freq_list, freq_history,phon_list, phon_history

In [53]:
# Which lexical data set to load: embedding dimension and embedding variant.
dimension = '100'
# Named embedding_type rather than `type` so the builtin type() stays usable
# in the rest of the session.
embedding_type = 'alpha_0.3_w2v'

# Folder holding the three lexical files for the chosen dimension/variant.
lexical_dir = '../forager/data/lexical_data/' + dimension + '_dim_lexical_data/' + embedding_type + '/'

similaritypath = lexical_dir + 'semantic_matrix.csv'
frequencypath = lexical_dir + 'frequencies.csv'
phonpath = lexical_dir + 'phonological_matrix.csv'

In [65]:
# Read the tab-separated participant file and keep its "Word" column as a plain list.
CBM_words = pd.read_csv(
    "../forager/data/fluency_lists/participant_data/individual participants/CBM_original.txt",
    delimiter="\t",
)["Word"].tolist()

# Position of "pets" in the list, followed by the number of items.
print(CBM_words.index("pets"))
print(len(CBM_words))


18
27


In [55]:
# Load the lexical data from the comma-separated files at similaritypath, frequencypath and phonpath.
similarity_matrix = np.loadtxt(similaritypath,delimiter=',')
# Column 1 of the header-less frequencies file is used as the frequency array.
frequency_list = np.array(pd.read_csv(frequencypath,header=None,encoding="unicode-escape")[1])
phon_matrix = np.loadtxt(phonpath,delimiter=',')
# Column 0 of the same file is used as the labels.
# NOTE(review): this second read omits encoding="unicode-escape" used two lines up -- confirm that is intentional.
labels = pd.read_csv(frequencypath,header=None)[0].values.tolist()

In [58]:
# Build the per-item similarity/frequency lists and histories for the CBM fluency list,
# unpacked in the order create_history_variables returns them.
sim_list, sim_history, freq_list, freq_history,phon_list, phon_history = create_history_variables(CBM_words, labels, similarity_matrix, frequency_list, phon_matrix)


In [68]:
# Inspect the semantic-similarity row stored for the fluency item at index 16.
print(sim_history[16])

[0.42200345 0.3977776  0.54317008 0.32422199 0.34682831 0.37701656
 0.49864981 0.47863588 0.38275885 0.24945657 0.47634934 0.45838574
 0.49009323 0.43078561 0.31440982 0.30736378 0.38087381 0.38051933
 0.47590154 0.34616522 0.36365902 0.4548852  0.6049997  0.52614617
 0.3974489  0.39782991 0.43889562 0.53719206 0.44047277 0.43518821
 0.43629137 0.44907354 0.39160627 0.23604724 0.31933634 0.42299988
 0.52672368 0.45149767 0.40753074 0.34309711 0.35732356 0.39918935
 0.34744203 0.38301354 0.37173505 0.48718404 0.45050806 0.31540856
 0.29797146 0.5228793  0.49793375 0.2868362  0.34567857 0.43278755
 0.40596105 1.         0.39615038 0.25851114 0.36664081 0.39790752
 0.30991125 0.38098716 0.41059097 0.53358238 0.443584   0.38997211
 0.35236891 0.31587222 0.45666435 0.30808408 0.41514922 0.41023338
 0.57760077 0.37376474 0.36500429 0.39370847 0.44849987 0.33231089
 0.18573687 0.30043806 0.34675023 0.36640524 0.39215048 0.33124591
 0.47818152 0.46526002 0.4044614  0.41318129 0.48784176 0.3941

In [62]:
# Stack the per-item rows into one array to check its overall shape.
print(np.array(sim_history).shape)

(27, 463)


## II. Creating CSV File for error_results.csv


In [None]:
import pandas as pd 
import numpy as np

In [20]:
# Output folders of the error-test runs, one per repetition/substitution condition.
# errors, rep_sub and forager_paths are parallel lists: the loop that builds
# error_results indexes all three with the same position.
errors = ['../forager/output/Test Error/With Rep With Sub/', 
          '../forager/output/Test Error/With Rep Without Sub/', 
          '../forager/output/Test Error/Without Rep With Sub/', 
          '../forager/output/Test Error/Without Rep Without Sub/']

# (repetition, substitution) labels written to the results table for each condition.
rep_sub = [('with_rep', 'with_sub'), ('with_rep', 'without_sub'), ('without_rep', 'with_sub'), ('without_rep', 'without_sub')]


# data_path = ['../forager/data/fluency_lists/participant_data/Error Testing/error_with_rep_with_sub.txt', 
#              '../forager/data/fluency_lists/participant_data/Error Testing/error_with_rep_without_sub.txt', 
#              '../forager/data/fluency_lists/participant_data/Error Testing/error_without_rep_with_sub.txt', 
#              '../forager/data/fluency_lists/participant_data/Error Testing/error_without_rep_without_sub.txt' ]

# Embedding dimensions, kept as strings because they are concatenated into folder names.
dimensions = ['50', '100', '200', '300']
# Embedding variants; each name is part of a results folder name.
# NOTE(review): `type` shadows the Python builtin. The loop that builds error_results
# reads this name, so a rename has to be made in both places at once.
type = [
    'alpha_0.0_s2v', # = alpha_1_w2v
    'alpha_0.0_w2v', # = alpha_1_s2v 
    'alpha_0.1_s2v', # = alpha_0.9_w2v
    'alpha_0.1_w2v', # = alpha_0.9_s2v
    'alpha_0.2_s2v', # = alpha_0.8_w2v
    'alpha_0.2_w2v', # = alpha_0.8_s2v
    'alpha_0.3_s2v', # = alpha_0.7_w2v
    'alpha_0.3_w2v', # = alpha_0.7_s2v 
    'alpha_0.4_s2v', # = alpha_0.6_w2v
    'alpha_0.4_w2v', # = alpha_0.6_s2v
    'alpha_0.5_s2v', # = alpha_0.5_w2v
    'average',
    'only_w2v', 
    'only_s2v'
]

# forager results 
# One model_results.csv per condition, in the same order as rep_sub.
forager_paths = ['../forager/output/Test Error/Forager Errors/error_with_rep_with_sub_forager_results/model_results.csv', 
         '../forager/output/Test Error/Forager Errors/error_with_rep_without_sub_forager_results/model_results.csv',
         '../forager/output/Test Error/Forager Errors/error_without_rep_with_sub_forager_results/model_results.csv',
         '../forager/output/Test Error/Forager Errors/error_without_rep_without_sub_forager_results/model_results.csv'
         ]




224


In [33]:

def _has_missing_nll(results_csv):
    '''Return whether any Negative_Log_Likelihood_Optimized value in the CSV is missing.'''
    results = pd.read_csv(results_csv)
    return results["Negative_Log_Likelihood_Optimized"].isnull().any()


# Rows are collected in a list and turned into a DataFrame once, instead of
# enlarging the DataFrame one row at a time.
error_rows = []

# Forager baseline: one result file per repetition/substitution condition.
for (repetition, substitution), forager_csv in zip(rep_sub, forager_paths):
    error_rows.append(["Forager", repetition, substitution, "N/A", "N/A", _has_missing_nll(forager_csv)])

# Cochlear project: one result file per condition x dimension x embedding variant.
for (repetition, substitution), error_dir in zip(rep_sub, errors):
    for dim in dimensions:
        for t in type:
            path = error_dir + dim + "_dim_results/" + t + "_results" + "/model_results.csv"
            error_rows.append(["Cochlear Project", repetition, substitution, dim, t, _has_missing_nll(path)])

error_results = pd.DataFrame(
    error_rows,
    columns=["Project", "Repetition", "Substitution", "Dimension", "Alpha", "Missing NLL"],
)

error_results.to_csv("../error analysis/error_results.csv", index=False)


In [31]:
# Show the assembled table as the cell output.
error_results

Unnamed: 0,Project,Repetition,Substitution,Dimension,Alpha,Missing NLL
0,Forager,with_rep,with_sub,,,False
1,Forager,with_rep,without_sub,,,False
2,Forager,without_rep,with_sub,,,False
3,Forager,without_rep,without_sub,,,False
4,Cochlear Project,with_rep,with_sub,50,alpha_0.0_s2v,False
...,...,...,...,...,...,...
223,Cochlear Project,without_rep,without_sub,300,alpha_0.4_w2v,False
224,Cochlear Project,without_rep,without_sub,300,alpha_0.5_s2v,False
225,Cochlear Project,without_rep,without_sub,300,average,False
226,Cochlear Project,without_rep,without_sub,300,only_w2v,False
