In [43]:
import numpy as np
from gensim.models import Word2Vec, KeyedVectors
from sklearn.metrics import mean_squared_error
from scipy.stats import spearmanr
import os

In [44]:
# Load the two saved gensim Word2Vec models that are compared below.
# Later cells only query their `.wv` keyed vectors.
# NOTE(review): the file names suggest skip-gram with softmax vs. negative
# sampling, both with window size 2 — confirm against the training notebook.
skipgram_model = Word2Vec.load("../word2vec_skipgram_window2.model")
skipgram_neg_model = Word2Vec.load("../word2vec_ns_window2.model")

# Load the GloVe embeddings in two forms: a raw NumPy array that later cells
# index by row number, and a gensim KeyedVectors object queried by word.
# NOTE(review): the row numbers used with `glove_embeddings` are taken from
# the line order of glove.6B.100d.txt in a later cell; nothing here shows the
# array was built from that file in that order — confirm the alignment.
glove_embeddings = np.load("../glove_embeddings.npy")
glove_gensim_model = KeyedVectors.load("../glove_gensim.kv")

In [45]:
# Load the GloVe Vocabulary into a Dictionary
# The first whitespace-separated token of every line in the GloVe text file is
# the word; it is written to `vocab_file` (one word per line) and mapped to its
# zero-based position in that file.
glove_file = "../glove.6B/glove.6B.100d.txt"
vocab_file = "glove_vocab.txt"

if not os.path.exists(glove_file):
    raise FileNotFoundError(f"GloVe file not found at: {glove_file}")

# Build the dictionary in the same pass that writes the vocabulary file rather
# than re-reading the file that was just written.
glove_vocab = {}
next_idx = 0
with open(glove_file, 'r', encoding='utf-8') as infile, open(vocab_file, 'w', encoding='utf-8') as outfile:
    for line in infile:
        fields = line.split()
        if not fields:
            # A blank line carries no word; skip it instead of failing on [0].
            continue
        word = fields[0]
        outfile.write(word + "\n")
        # Index is the word's line number in `vocab_file`. If a word repeats,
        # the last occurrence wins, as it did with the dict comprehension.
        glove_vocab[word] = next_idx
        next_idx += 1

# Load WordSim353 Dataset
def load_wordsim353(file_path):
    """Read a WordSim353-style gold-standard file into scored word pairs.

    Every data line must contain exactly three whitespace-separated fields,
    ``word1 word2 score``. Blank lines and lines whose first non-blank
    character is ``#`` are ignored.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list of ``(word1, word2, score)`` tuples in file order, with
        ``score`` converted to ``float``.

    Raises:
        ValueError: If a data line does not have exactly three fields, or its
            third field cannot be parsed as a float.
    """
    word_pairs = []
    # Explicit encoding so parsing does not depend on the platform default,
    # matching how the other files in this notebook are opened.
    with open(file_path, 'r', encoding='utf-8') as f:
        for raw_line in f:
            line = raw_line.strip()
            # Test the stripped text so an indented comment line is skipped
            # instead of reaching the three-field unpack below.
            if not line or line.startswith("#"):
                continue
            word1, word2, score = line.split()
            word_pairs.append((word1, word2, float(score)))
    return word_pairs

# Location of the gold-standard file to evaluate against. The path is relative,
# so it is resolved against the notebook's current working directory.
file_path = 'wordsim353_sim_rel/wordsim_relatedness_goldstandard.txt'
# Fail early with a dataset-specific message instead of the generic error that
# open() would raise inside load_wordsim353.
if not os.path.exists(file_path):
    raise FileNotFoundError(f"WordSim353 dataset not found at: {file_path}")

# List of (word1, word2, human_score) tuples used by every evaluation below.
word_sim_data = load_wordsim353(file_path)

# Spearman Correlation Function
def calculate_spearman_correlation(word_sim_data, model, glove_vocab=None, glove_array=None):
    """Spearman rank correlation between model similarities and human scores.

    For every ``(word1, word2, human_score)`` triple, both word vectors are
    looked up and their dot product is used as the model's similarity. Pairs
    for which either word cannot be resolved are skipped.

    Args:
        word_sim_data: Iterable of ``(word1, word2, human_score)`` triples.
        model: Object supporting ``model[word]`` lookup that raises
            ``KeyError`` for unknown words. Ignored when both ``glove_vocab``
            and ``glove_array`` are given.
        glove_vocab: Optional mapping from word to row index in
            ``glove_array``.
        glove_array: Optional array whose rows are word vectors.

    Returns:
        The Spearman correlation coefficient, or ``None`` when fewer than two
        pairs could be scored (the rank correlation is undefined there).
    """
    use_array = glove_vocab is not None and glove_array is not None
    model_similarities = []
    human_similarities = []

    for word1, word2, human_score in word_sim_data:
        try:
            if use_array:
                vec1 = glove_array[glove_vocab[word1]]
                vec2 = glove_array[glove_vocab[word2]]
            else:
                vec1 = model[word1]
                vec2 = model[word2]
        except (KeyError, IndexError):
            # KeyError: word not in the vocabulary.
            # IndexError: the vocabulary points past the end of glove_array.
            continue
        # NOTE(review): this is an unnormalised dot product, so vector norms
        # influence the ranking; confirm cosine similarity is not intended.
        model_similarities.append(np.dot(vec1, vec2))
        human_similarities.append(human_score)

    # With zero or one scored pair spearmanr cannot produce a meaningful value
    # (it yields NaN for a single pair), so report "no result" explicitly.
    if len(model_similarities) < 2:
        return None
    correlation, _ = spearmanr(model_similarities, human_similarities)
    return correlation

In [46]:
# Define Models
# Each entry is either an object that supports `obj[word]` lookup, or — for the
# raw NumPy GloVe matrix — a (vocab, array) pair that is indexed manually.
models = {
    "Skip-gram": skipgram_model.wv,
    "Skip-gram Negative Sampling": skipgram_neg_model.wv,
    "GloVe": (glove_vocab, glove_embeddings),
    "GloVe Gensim": glove_gensim_model
}

# Calculate Metrics
# For every model: Spearman correlation via calculate_spearman_correlation,
# plus the MSE between human scores and the model's dot-product similarities.
results = []
for model_name, model in models.items():
    model_similarities = []
    human_similarities = []

    if model_name == "GloVe":
        vocab, array = model
        # Keep only words whose recorded row index exists in the embedding
        # matrix. The index stored in `vocab` is used as-is: re-enumerating the
        # dict's keys would shift every later row if the vocabulary source
        # contained a repeated token.
        filtered_vocab = {word: idx for word, idx in vocab.items() if idx < len(array)}
        correlation = calculate_spearman_correlation(word_sim_data, None, filtered_vocab, array)
    else:
        correlation = calculate_spearman_correlation(word_sim_data, model)

    for word1, word2, human_score in word_sim_data:
        try:
            if model_name == "GloVe":
                vec1 = array[filtered_vocab[word1]]
                vec2 = array[filtered_vocab[word2]]
            else:
                vec1 = model[word1]
                vec2 = model[word2]
        except KeyError:
            # Skip pairs with a word the model does not know.
            continue
        model_similarities.append(np.dot(vec1, vec2))
        human_similarities.append(human_score)

    # NOTE(review): human scores and raw dot products are presumably on
    # different scales, so this MSE largely reflects vector magnitude —
    # confirm this is the intended metric.
    mse = mean_squared_error(human_similarities, model_similarities) if model_similarities else None
    results.append({"Model": model_name, "Spearman Correlation": correlation, "MSE": mse})

# Print Results
# Fixed-width table: one row per entry of `results`.
print(f"{'Model':<30}{'Spearman Correlation':<25}{'MSE':<10}")
print("-" * 65)
for result in results:
    # Compare with None explicitly: a genuine value of exactly 0.0 is falsy
    # and would otherwise be reported as "N/A".
    corr = f"{result['Spearman Correlation']:.4f}" if result['Spearman Correlation'] is not None else "N/A"
    mse = f"{result['MSE']:.4f}" if result['MSE'] is not None else "N/A"
    print(f"{result['Model']:<30}{corr:<25}{mse:<10}")

Model                         Spearman Correlation     MSE       
-----------------------------------------------------------------
Skip-gram                     0.0098                   9.0389    
Skip-gram Negative Sampling   0.0148                   8.9478    
GloVe                         -0.0681                  258.7266  
GloVe Gensim                  0.4802                   128.1582  
