In [2]:
import pandas as pd
import numpy as np
from gensim.models.keyedvectors import Word2VecKeyedVectors
from gensim.models import KeyedVectors
from scipy.stats.mstats import spearmanr
import csv

In [9]:
# Path to the text-format embedding file that the next cell loads with
# KeyedVectors.load_word2vec_format.
glove_orig = '../data/embeddings/glove.42B.300d.txt'

In [13]:
# Load the embeddings and keep three handles: the KeyedVectors object, its
# vector matrix, and the list of vocabulary keys.
# NOTE(review): load_word2vec_format normally expects a word2vec-style header
# line; presumably this GloVe file was converted beforehand -- confirm.
model = KeyedVectors.load_word2vec_format(glove_orig, binary=False)
vecs = model.vectors
# `words` is what later cells pass as `all_emb_words` to measure_emb_correlation.
words = list(model.key_to_index.keys())

In [17]:
# Matrix passed as `P` to measure_emb_correlation, which applies it to each
# word vector with P.dot(v). Presumably a projection matrix -- confirm how
# P_means.npy was produced.
P_mean = np.load('../P_means.npy')

In [22]:
def vec(w):
    """Return the embedding stored for word ``w`` in the module-level ``model``.

    The lookup is delegated to ``model[w]``; any error raised by that lookup
    propagates to the caller unchanged.
    """
    embedding = model[w]
    return embedding

def cos_sim(a, b):
    """Cosine similarity between ``a`` and ``b``.

    Computed as ``a.dot(b)`` divided by the product of the two
    ``np.linalg.norm`` values. There is no guard for zero-norm inputs: the
    division is performed as-is.
    """
    dot_product = a.dot(b)
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    return dot_product / norm_product

def load_word_vectors(fname):
    """Load text-format (``binary=False``) word vectors from ``fname``.

    Returns a ``(model, vectors, words)`` tuple: the loaded ``KeyedVectors``
    object, its ``.vectors`` attribute, and a list of the keys of its
    ``key_to_index`` mapping.
    """
    # Local names deliberately differ from the notebook-level model/vecs/words.
    keyed_vectors = KeyedVectors.load_word2vec_format(fname, binary=False)
    vocabulary = list(keyed_vectors.key_to_index.keys())
    return keyed_vectors, keyed_vectors.vectors, vocabulary

def measure_emb_correlation(df, all_emb_words, P):
    """Correlate human similarity ratings with embedding cosine similarities.

    Rows of ``df`` whose ``word1`` or ``word2`` is not in ``all_emb_words`` are
    dropped. For each remaining pair the cosine similarity is computed twice:
    on the vectors returned by ``vec`` and on those same vectors after
    multiplication by ``P`` (``P.dot(v)``). The Spearman correlation of each
    against ``df['similarity']`` is printed.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``word1``, ``word2`` and ``similarity`` columns.
    all_emb_words : list-like of str
        Words for which an embedding is available.
    P : object supporting ``.dot``
        Matrix applied to every word vector before the second comparison.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the filtered rows with ``glove_sim`` and
        ``proj_glove_sim`` columns added. The caller's ``df`` is not modified.
    """
    in_vocab = df['word1'].isin(all_emb_words) & df['word2'].isin(all_emb_words)
    # .copy() yields an independent frame, so the column assignments below do
    # not write into a slice of the caller's DataFrame (SettingWithCopyWarning).
    df = df.loc[in_vocab].copy()

    rel_words = set(df['word1']) | set(df['word2'])
    words_glove_vec = {word: vec(word) for word in rel_words}
    # Project each distinct word once instead of once per pair it appears in.
    words_proj_vec = {word: P.dot(v) for word, v in words_glove_vec.items()}

    # Column-wise zip keeps row order and avoids the per-row overhead of iterrows.
    pairs = list(zip(df['word1'], df['word2']))
    df['glove_sim'] = np.array(
        [cos_sim(words_glove_vec[w1], words_glove_vec[w2]) for w1, w2 in pairs]
    )
    df['proj_glove_sim'] = np.array(
        [cos_sim(words_proj_vec[w1], words_proj_vec[w2]) for w1, w2 in pairs]
    )

    print('glove:', spearmanr(df['similarity'], df['glove_sim']))
    print('glove -P:', spearmanr(df['similarity'], df['proj_glove_sim']))
    return df
    
# def measure_emb_correlation(df, model_cleaned):
#     all_emb_words = model_cleaned.key_to_index.keys()
#     df = df.loc[df['word1'].isin(all_emb_words)]
#     df = df.loc[df['word2'].isin(all_emb_words)]
    
#     rel_words = list(set(df['word1'].tolist() + df['word2'].tolist()))
    
#     words_glove_vec = {}
#     for word in rel_words:
#         words_glove_vec[word] = vec(word)
        
#     glove_sim = []
#     proj_glove_sim = []
#     for _, row in df.iterrows():
#         w1, w2 = row['word1'], row['word2']
#         sim = cos_sim(words_glove_vec[w1], words_glove_vec[w2])
#         glove_sim.append(sim)

#         p_sim = model_cleaned.cosine_similarities(model_cleaned[w1], [model_cleaned[w2]])
#         proj_glove_sim.append(p_sim)
    
#     df['glove_sim'] = np.array(glove_sim)
#     df['proj_glove_sim'] = np.array(proj_glove_sim)
    
#     print('glove:', spearmanr(df['similarity'], df['glove_sim']))
#     print('glove -P:', spearmanr(df['similarity'], df['proj_glove_sim']))
#     return df

In [7]:
# Load the "cleaned" embeddings; only the KeyedVectors object is kept.
# NOTE(review): no active code in the cells visible here reads `model_cleaned`
# (only the commented-out measure_emb_correlation variant does) -- confirm it
# is still needed.
model_cleaned, _, _ = load_word_vectors(fname = "../better_proj_data/embeddings/vecs_mean.150k.cleaned.txt")


### SimLex-999

Data from: https://fh295.github.io/simlex.html

In [20]:
# Read the tab-separated SimLex file and expose its rating column under the
# name 'similarity', which is the column measure_emb_correlation reads.
df_simlex = (
    pd.read_csv('../SimLex-999/SimLex-999.txt', sep='\t')
    .rename(columns={"SimLex999": "similarity"})
)
df_simlex.head()

Unnamed: 0,word1,word2,POS,similarity,conc(w1),conc(w2),concQ,Assoc(USF),SimAssoc333,SD(SimLex)
0,old,new,A,1.58,2.72,2.81,2,7.25,1,0.41
1,smart,intelligent,A,9.2,1.75,2.46,1,7.11,1,0.67
2,hard,difficult,A,8.77,3.76,2.21,2,5.94,1,1.19
3,happy,cheerful,A,9.55,2.56,2.34,1,5.85,1,2.18
4,hard,easy,A,0.95,3.76,2.07,2,5.82,1,0.93


In [23]:
# Rebinds df_simlex to the vocabulary-filtered frame with glove_sim and
# proj_glove_sim columns added; the two Spearman correlations are printed.
df_simlex = measure_emb_correlation(df_simlex, words, P_mean)

glove: SpearmanrResult(correlation=0.3738001400000711, pvalue=1.7519187885575554e-34)
glove -P: SpearmanrResult(correlation=0.36815044618095205, pvalue=2.0062117415448982e-33)


### WordSim 353

source: http://alfonseca.org/eng/research/wordsim353.html

#### Similarity

In [27]:
# Column labels are supplied through names=; the rating column is called
# 'similarity' because that is the column measure_emb_correlation reads.
df_353_sim = pd.read_csv('../WordSim353/wordsim_similarity_goldstandard.txt', sep='\t', names=['word1', 'word2', 'similarity'])
df_353_sim.head()

Unnamed: 0,word1,word2,similarity
0,tiger,cat,7.35
1,tiger,tiger,10.0
2,plane,car,5.77
3,train,car,6.31
4,television,radio,6.77


In [30]:
# Rebinds df_353_sim to the vocabulary-filtered frame with glove_sim and
# proj_glove_sim columns added; the two Spearman correlations are printed.
df_353_sim = measure_emb_correlation(df_353_sim, words, P_mean)

glove: SpearmanrResult(correlation=0.6953847581116936, pvalue=1.1908298243958436e-29)
glove -P: SpearmanrResult(correlation=0.6963115680215075, pvalue=9.331212305133868e-30)


In [38]:
# Spot-check one pair: show its rating next to the original and projected
# cosine similarities computed in the previous cell.
df_353_sim[df_353_sim["word1"] == "fuck"]

Unnamed: 0,word1,word2,similarity,glove_sim,proj_glove_sim
17,fuck,sex,9.44,0.670561,0.665727


#### Relatedness

In [31]:
# The relatedness ratings are loaded under the same 'similarity' column name so
# that measure_emb_correlation can be reused unchanged.
# NOTE(review): capitalized entries (e.g. 'Jerusalem', 'OPEC') are kept as-is;
# if the embedding vocabulary is lowercase, these pairs are dropped by the
# vocabulary filter in measure_emb_correlation -- confirm.
df_353_rel = pd.read_csv('../WordSim353/wordsim_relatedness_goldstandard.txt', sep='\t', names=['word1', 'word2', 'similarity'])
df_353_rel.head()

Unnamed: 0,word1,word2,similarity
0,computer,keyboard,7.62
1,Jerusalem,Israel,8.46
2,planet,galaxy,8.11
3,canyon,landscape,7.53
4,OPEC,country,5.63


In [32]:
# Rebinds df_353_rel to the vocabulary-filtered frame with glove_sim and
# proj_glove_sim columns added; the two Spearman correlations are printed.
df_353_rel = measure_emb_correlation(df_353_rel, words, P_mean)

glove: SpearmanrResult(correlation=0.5999016814083603, pvalue=1.1815135039496508e-24)
glove -P: SpearmanrResult(correlation=0.5984413637919774, pvalue=1.6344853277812252e-24)


### MTurk-771

source: http://www2.mta.ac.il/~gideon/mturk771.html

In [36]:
# Comma-separated file; column labels are supplied through names=, with the
# rating column called 'similarity' as measure_emb_correlation expects.
df_mturk = pd.read_csv('../Mturk771/MTURK-771.csv', names=['word1', 'word2', 'similarity'], sep=",")
df_mturk.head()

Unnamed: 0,word1,word2,similarity
0,access,gateway,3.791667
1,account,explanation,2.0
2,account,invoice,3.75
3,account,statement,3.681818
4,acoustic,remedy,1.227273


In [37]:
# Rebinds df_mturk to the vocabulary-filtered frame with glove_sim and
# proj_glove_sim columns added; the two Spearman correlations are printed.
df_mturk = measure_emb_correlation(df_mturk, words, P_mean)

glove: SpearmanrResult(correlation=0.684252286603461, pvalue=1.4888186861090302e-107)
glove -P: SpearmanrResult(correlation=0.6841577911953189, pvalue=1.634932119527619e-107)
