In [1]:
import pandas as pd
import numpy as np

from scipy.stats import spearmanr, pearsonr

from gensim.models import Word2Vec

In [2]:
# Locations of the evaluation word pairs and of the saved word2vec model.
wordsim_csv = '../data/evaluation/wordsim353crowd.csv'
# NOTE(review): 300 / 5 / 15 presumably encode training hyperparameters in the
# file name - confirm against the training notebook.
skipgram_model_path = f"./models/word2vec/skipgram_{300}_{5}_{15}.model"

# Read the word pairs with their human ratings, then restore the saved model.
df = pd.read_csv(wordsim_csv)
model = Word2Vec.load(skipgram_model_path)

In [3]:
# Score every word pair with the model; a pair with at least one word missing
# from the model's vocabulary gets NaN, so `preds` keeps one entry per row of df.
vectors = model.wv
preds = [
    vectors.similarity(first, second)
    if (first in vectors and second in vectors)
    else np.nan  # result not available
    for first, second in zip(df['Word 1'], df['Word 2'])
]

In [4]:
# Attach the model's similarity scores as a new 'preds' column (one per row).
df = df.assign(preds=preds)

In [5]:
# Remove the pairs that cannot be evaluated: those containing a word outside
# our word2vec vocabulary (NaN in 'preds') or lacking a human rating.
# The check is limited to the two columns the correlations use, so a missing
# value in any other column does not silently discard a usable pair.
df = df.dropna(subset=['preds', 'Human (Mean)'])

In [6]:
# Compare the model's similarity scores with the mean human ratings.
# Each scipy result unpacks as (coefficient, p-value); only the coefficient is printed.
model_scores = df['preds']
human_scores = df['Human (Mean)']

spearman_rho, spearman_pvalue = spearmanr(model_scores, human_scores)
print("Spearman correlation:", spearman_rho)

pearson_r, pearson_pvalue = pearsonr(model_scores, human_scores)
print("Pearson correlation:", pearson_r)

Spearman correlation: 0.008304687634239496
Pearson correlation: 0.07061691532536118


Both coefficients are close to zero (Spearman ≈ 0.008, Pearson ≈ 0.07), so there is little to no correlation between the similarity scores from the word2vec vectors and the human scores, indicating poor performance of our word2vec model.

This is expected, as the 10k tweets we used for training are probably too small a corpus to learn reliable word embeddings.