# Building averaged and alpha-weighted combinations of the speech2vec and word2vec embeddings

In [19]:
import pandas as pd 
import numpy as np 
import os 
import shutil 
import matplotlib as mpl


### Average embeddings

In [22]:
# For every embedding size, take the element-wise mean of the speech2vec and
# word2vec embedding tables and save it in that size's "average" directory.
dimensions = ["50", "100", "200", "300"]

lexical_data_path = "../forager/data/lexical_data/"
s2v_path = "_dim_lexical_data/only_s2v/embeddings.csv"
w2v_path = "_dim_lexical_data/only_w2v/embeddings.csv"
folder = "_dim_lexical_data/average"
embeddings = "/embeddings.csv"

for dim in dimensions:
    dim_root = lexical_data_path + dim
    average_dir = dim_root + folder
    os.makedirs(average_dir, exist_ok=True)

    s2v = pd.read_csv(dim_root + s2v_path)
    w2v = pd.read_csv(dim_root + w2v_path)

    # Element-wise mean of the two tables.
    df_mean = s2v.add(w2v).div(2)
    df_mean.to_csv(average_dir + embeddings, index=False)





### Alpha embeddings

In [23]:
# Build alpha-weighted combinations of the speech2vec and word2vec embeddings.
#
# For every embedding size, every "primary" model and every alpha, the primary
# table is scaled by alpha, the other table by (1 - alpha), and the two scaled
# tables are written to
#   <lexical_data_path><dim>_dim_lexical_data/alpha_<alpha>_<model>/embeddings.csv
dimensions = ["50", '100', '200', '300']
alphas = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
models = ["s2v", "w2v"]


# paths
lexical_data_path = "../forager/data/lexical_data/"
s2v_path = "_dim_lexical_data/only_s2v/embeddings.csv"
w2v_path = "_dim_lexical_data/only_w2v/embeddings.csv"
folder = "_dim_lexical_data/alpha_"
embeddings = "/embeddings.csv"

count = 0
for dim in dimensions:
    # The source tables depend only on the dimension, so read each one once
    # here instead of once per (model, alpha) combination.
    s2v = pd.read_csv(lexical_data_path + dim + s2v_path)
    w2v = pd.read_csv(lexical_data_path + dim + w2v_path)

    # model name -> (table weighted by alpha, table weighted by 1 - alpha)
    weighted_pairs = {"s2v": (s2v, w2v), "w2v": (w2v, s2v)}

    for model in models:
        primary, secondary = weighted_pairs[model]

        for alp in alphas:
            path = lexical_data_path + dim + folder + str(alp) + "_" + model
            os.makedirs(path, exist_ok=True)

            # NOTE(review): pd.concat stacks the two scaled tables row-wise
            # (the output has the rows of both inputs) rather than adding
            # them element-wise -- confirm this is the intended combination.
            df_combined = pd.concat(
                [primary * alp, secondary * (1 - alp)],
                ignore_index=True,
            )
            df_combined.to_csv(path + embeddings, index=False)

            count += 1

print(count)


88


In [25]:
"""All paths for embeddings"""

# Three independent, initially empty lists: directories of the alpha-weighted
# embeddings, of the averaged embeddings, and of the single-model embeddings.
alpha_path, ave_path, only_path = [], [], []


In [26]:
# Collect the directory (with trailing slash) of every alpha-weighted and
# every averaged embedding set into alpha_path / ave_path.
dimensions = ["50", '100', '200', '300']
alphas = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
models = ["s2v", "w2v"]


# paths
lexical_data_path = "../forager/data/lexical_data/"
s2v_path = "_dim_lexical_data/only_s2v/embeddings.csv"
w2v_path = "_dim_lexical_data/only_w2v/embeddings.csv"
folder = "_dim_lexical_data/alpha_"
embeddings = "/embeddings.csv"

count = 0
for dim in dimensions:
    # Part of the path shared by every alpha directory of this dimension.
    alpha_prefix = lexical_data_path + dim + folder
    for model in models:
        for alp in alphas:
            path = f"{alpha_prefix}{alp}_{model}/"
            alpha_path.append(path)

print("alpha path num:", len(alpha_path))
print(alpha_path)
print()


# One "average" directory per dimension.
ave_path.extend(
    lexical_data_path + dim + "_dim_lexical_data/average/"
    for dim in dimensions
)

print("average path num:", len(ave_path))
print(ave_path)




alpha path num: 88
['../forager/data/lexical_data/50_dim_lexical_data/alpha_0_s2v/', '../forager/data/lexical_data/50_dim_lexical_data/alpha_0.1_s2v/', '../forager/data/lexical_data/50_dim_lexical_data/alpha_0.2_s2v/', '../forager/data/lexical_data/50_dim_lexical_data/alpha_0.3_s2v/', '../forager/data/lexical_data/50_dim_lexical_data/alpha_0.4_s2v/', '../forager/data/lexical_data/50_dim_lexical_data/alpha_0.5_s2v/', '../forager/data/lexical_data/50_dim_lexical_data/alpha_0.6_s2v/', '../forager/data/lexical_data/50_dim_lexical_data/alpha_0.7_s2v/', '../forager/data/lexical_data/50_dim_lexical_data/alpha_0.8_s2v/', '../forager/data/lexical_data/50_dim_lexical_data/alpha_0.9_s2v/', '../forager/data/lexical_data/50_dim_lexical_data/alpha_1.0_s2v/', '../forager/data/lexical_data/50_dim_lexical_data/alpha_0_w2v/', '../forager/data/lexical_data/50_dim_lexical_data/alpha_0.1_w2v/', '../forager/data/lexical_data/50_dim_lexical_data/alpha_0.2_w2v/', '../forager/data/lexical_data/50_dim_lexical_d

### Copy Phonological Matrix and Frequency into Alpha/Average directories

In [27]:
# Every averaged / alpha-weighted embedding directory receives a copy of the
# frequency table and the phonological matrix taken from the 50-dimension
# word2vec-only directory.
total_path = ave_path + alpha_path
print(len(total_path))

frequency_file = "../forager/data/lexical_data/50_dim_lexical_data/only_w2v/frequencies.csv"
phon_matrix_file = "../forager/data/lexical_data/50_dim_lexical_data/only_w2v/phon_matrix.csv"

counter = 0
for path in total_path:
    for shared_file in (frequency_file, phon_matrix_file):
        shutil.copy(shared_file, path)
    counter = counter + 1

print(counter)





92
92


### Semantic Matrix Creation

In [30]:
import numpy as np
import scipy
import pandas as pd
import nltk
from functools import lru_cache
from itertools import product as iterprod
import re
from tqdm import tqdm

def create_semantic_matrix(path_to_embeddings, path_for_lexical_data=None):
    '''
        Description:
            Takes in N word embeddings and returns a semantic similarity matrix (NxN np.array)
            holding the cosine similarity of every pair of embeddings. When
            path_for_lexical_data is given, the matrix is also saved as
            'semantic_matrix.csv' (no header, no index).
        Args:
            (1) path_to_embeddings (str): path to a .csv file containing N word embeddings of size D each (DxN array),
                i.e. one embedding per column
            (2) path_for_lexical_data (str, optional): prefix to which 'semantic_matrix.csv' is appended
                verbatim, so a directory must end with a path separator. If None (default),
                nothing is written to disk.
        Returns: 
            (1) semantic_matrix: semantic similarity matrix (NxN np.array)
    '''
    # Imported here explicitly: relying on a bare `import scipy` to expose
    # `scipy.spatial` is fragile, as older SciPy releases do not load
    # subpackages automatically.
    from scipy.spatial.distance import cdist

    # The file holds one embedding per column; transpose so each row is a word.
    embeddings = pd.read_csv(path_to_embeddings, encoding="unicode-escape").transpose().values

    # cdist yields the NxN cosine *distance* matrix; similarity is 1 - distance.
    semantic_matrix = 1 - cdist(embeddings, embeddings, 'cosine')

    # Only persist when a destination was supplied; the default of None used
    # to fail on the string concatenation below.
    if path_for_lexical_data is not None:
        # convert to dataframe without header or index
        semantic_matrix_df = pd.DataFrame(semantic_matrix)
        semantic_matrix_df.to_csv(path_for_lexical_data + 'semantic_matrix.csv', header=False, index=False)

    return semantic_matrix

In [32]:
# Build and save a semantic matrix next to the embeddings.csv of every
# directory in total_path, counting how many directories were processed.
counter = 0

for path in total_path:
    embeddings_file = path + 'embeddings.csv'
    create_semantic_matrix(embeddings_file, path)
    counter = counter + 1

print(counter)



0
