In [1]:
import pandas as pd
from collections import Counter
import spacy
# Concept-pair table; later cells read its `concept_1` / `concept_2` columns.
df = pd.read_csv(filepath_or_buffer='data/aoa_colex.csv')
# Association table; later cells read its `cue`, `R1`, `R2` and `R3` columns.
adf = pd.read_csv(filepath_or_buffer="data/swow_assoc.csv")

# spaCy English pipeline used when normalising concept strings.
nlp = spacy.load("en_core_web_sm")

In [2]:
def find_assoc(assoc, word):
    """Look up the count recorded for ``word`` in a response-count table.

    Parameters
    ----------
    assoc : pandas.DataFrame
        Table indexed by response value with a ``counts`` column.
    word : str
        Response to look up in the index.

    Returns
    -------
    The ``counts`` entry for ``word``, or ``0`` when ``word`` is not in the
    index.
    """
    # Guard first: a word that was never given as a response counts as zero.
    if word not in assoc.index:
        return 0
    return assoc.loc[word, 'counts']

In [3]:
import numpy as np

# Columns of the association table that hold the responses given to a cue.
RESPONSE_COLUMNS = ['R1', 'R2', 'R3']

# Set membership is O(1); testing against `adf['cue'].values` rescans the
# whole association table for every concept pair.
known_cues = set(adf['cue'])
# Count tables already built, keyed by cue. Reset whenever this cell re-runs.
cue_counts_cache = {}


def cue_response_counts(cue):
    """Return the response-count table for ``cue``, building it at most once.

    The table is indexed by response value and has a single ``counts`` column
    holding how often that value occurs across R1-R3 in the rows of ``adf``
    whose ``cue`` equals ``cue``. Missing responses (NaN) are counted as well,
    but a string lookup never matches them.
    """
    if cue not in cue_counts_cache:
        responses = adf.loc[adf['cue'] == cue, RESPONSE_COLUMNS].values.flatten()
        cue_counts_cache[cue] = (
            pd.DataFrame.from_dict(dict(Counter(responses)), orient="index")
            .rename(columns={0: "counts"})
        )
    return cue_counts_cache[cue]


assoc_values = []
for concept_1, concept_2 in zip(df['concept_1'], df['concept_2']):
    # Plain lower-casing only. The previous `str(nlp(text))` round trip ran the
    # full spaCy pipeline and then returned the input text unchanged.
    # NOTE(review): if lemmatisation was intended, it was never applied.
    concept_1_lower = concept_1.lower()
    concept_2_lower = concept_2.lower()

    # NOTE(review): the reverse direction (concept_2 as cue) is consulted only
    # when concept_1 is not a cue at all, not when concept_1 is a cue that
    # never elicited concept_2. Confirm this asymmetry is intended.
    if concept_1_lower in known_cues:
        assoc_values.append(find_assoc(cue_response_counts(concept_1_lower), concept_2_lower))
    elif concept_2_lower in known_cues:
        assoc_values.append(find_assoc(cue_response_counts(concept_2_lower), concept_1_lower))
    else:
        # Neither concept is a cue: the association strength is unknown.
        assoc_values.append(np.nan)

# Single positional assignment; float dtype so "unknown" can be stored as NaN.
df['assoc'] = np.asarray(assoc_values, dtype=float)

print("Number of rows is NaN:")
print(int(df['assoc'].isnull().sum()))
print("Number of rows is not NaN:")
print(int(df['assoc'].notnull().sum()))

Number of rows is NaN:
75
Number of rows is not NaN:
3587


In [4]:
# Keep only the pairs for which an association count could be determined.
# `.copy()` makes the result an independent frame: a later cell adds a column
# to it, and writing into a boolean-mask slice raises SettingWithCopyWarning.
df = df[df['assoc'].notnull()].copy()
df.head()

Unnamed: 0.1,Unnamed: 0,concept_1,concept_2,colexification_count,age_acquisition_diff,assoc
0,0,ADULTERY,DECEIT,1,64.0,2.0
1,1,ADULTERY,WALK,1,377.0,0.0
2,2,AFTERNOON,AUTUMN,1,3.0,0.0
3,3,AFTERNOON,COLD,1,98.0,0.0
4,4,AFTERNOON,EVENING,17,22.0,15.0


In [5]:
from gensim.models import KeyedVectors

# Forward slash keeps the path portable (it also resolves on Windows) and
# matches the other `data/...` paths in this notebook; the previous
# backslash-separated path only resolved on Windows.
word2vec_model = KeyedVectors.load_word2vec_format('data/GoogleNews-vectors-negative300.bin.gz', binary=True)

In [6]:
# NOTE(review): concepts are looked up exactly as stored, whereas the
# association step lower-cases them first. Embedding vocabularies are
# presumably case-sensitive -- confirm that looking up the stored casing is
# intended, since it may account for part of the missing similarities.
similarities = []
for concept_1, concept_2 in zip(df['concept_1'], df['concept_2']):
    try:
        similarities.append(word2vec_model.similarity(concept_1, concept_2))
    except KeyError:
        # At least one concept is not in the embedding vocabulary.
        similarities.append(np.nan)

# Write the column once (float64) instead of one `.loc` write per row.
df['Word2Vec_similarity'] = np.asarray(similarities, dtype=float)

print("Number of rows with NaN values:")
print(int(df['Word2Vec_similarity'].isnull().sum()))
print("Number of rows with values:")
print(int(df['Word2Vec_similarity'].notnull().sum()))

Number of rows with NaN values:
795
Number of rows with values:
2792


In [7]:
# Keep only the pairs for which a similarity score was obtained.
has_similarity = df['Word2Vec_similarity'].notnull()
df = df.loc[has_similarity]

# Persist the filtered table without writing the index as an extra column.
df.to_csv(path_or_buf='data/aoa_assoc_word2vec_colex.csv', index=False)