In [16]:
from gensim.models import Word2Vec, KeyedVectors
import numpy as np

In [17]:
# Load the Skip-gram model (a full Word2Vec object; the evaluation cell reads
# its vectors through `.wv`).
skipgram_model = Word2Vec.load("../word2vec_skipgram_window2.model")

# Load the Skip-gram Negative Sampling model (also a full Word2Vec object).
skipgram_neg_model = Word2Vec.load("../word2vec_ns_window2.model")

# Load the GloVe embeddings as a NumPy array.
# NOTE(review): `glove_embeddings` is rebound later in this notebook by the
# call to load_glove_embeddings(...), so this array looks unused -- confirm
# whether this load is still needed.
glove_embeddings = np.load("../glove_embeddings.npy")

# Load the GloVe embeddings as a Gensim KeyedVectors model (used directly,
# without `.wv`, by the evaluation cell).
glove_gensim_model = KeyedVectors.load("../glove_gensim.kv")

In [18]:
def load_word_test(filepath):
    """Read an analogy test file and split its questions by category.

    Any line containing ":" is treated as a section header. A header whose
    text contains "semantic" (case-insensitive) opens a semantic section;
    every other header opens a syntactic section. Each remaining line is
    split on whitespace into a list of tokens and appended to the list of
    the section currently open.

    Lines that appear before the first header are ignored. Blank or
    whitespace-only lines are skipped so that they never become empty
    questions, which would otherwise make downstream indexing (q[0]..q[3])
    fail.

    Args:
        filepath: Path of the test file to read.

    Returns:
        A tuple ``(syntactic_questions, semantic_questions)``; each element
        is a list of token lists.
    """
    syntactic_questions = []
    semantic_questions = []
    current_section = None

    # Explicit encoding keeps parsing independent of the platform default;
    # iterating the handle streams the file instead of materialising it.
    with open(filepath, "r", encoding="utf-8") as f:
        for line in f:
            if ":" in line:  # New section
                current_section = "semantic" if "semantic" in line.lower() else "syntactic"
                continue

            tokens = line.split()
            if not tokens:
                # Blank separator line: not a question.
                continue

            if current_section == "syntactic":
                syntactic_questions.append(tokens)
            elif current_section == "semantic":
                semantic_questions.append(tokens)

    return syntactic_questions, semantic_questions

In [19]:
# Load word-test data: parse the analogy file into its syntactic and semantic
# question lists. The path is relative to the notebook's working directory.
syntactic_questions, semantic_questions = load_word_test("word-test.txt")

In [20]:
# Accuracy evaluation function
def evaluate_accuracy(model, questions):
    """Compute analogy accuracy using a model's ``most_similar`` method.

    Each question is a sequence ``[a, b, c, expected]``. The prediction is
    the top result of ``model.most_similar(positive=[b, c], negative=[a])``
    and counts as correct when it equals ``expected``.

    Args:
        model: Object exposing ``most_similar(positive=..., negative=...,
            topn=...)`` that returns ``(word, score)`` pairs and raises
            ``KeyError`` for unknown words.
        questions: Iterable of token sequences.

    Returns:
        ``correct / total`` over the questions that could be evaluated, or
        ``0`` when none could. Questions with fewer than four tokens and
        questions whose lookup raises ``KeyError`` are skipped and do not
        count toward ``total``.
    """
    correct = 0
    total = 0

    for q in questions:
        if len(q) < 4:
            # Malformed or empty line: skip it instead of letting an
            # IndexError abort the whole evaluation.
            continue

        try:
            predicted = model.most_similar(positive=[q[1], q[2]], negative=[q[0]], topn=1)[0][0]
        except KeyError:
            # Skip if a word is not in vocabulary
            continue

        if predicted == q[3]:
            correct += 1
        total += 1

    return correct / total if total > 0 else 0

# GloVe-specific accuracy evaluation (NumPy)
def evaluate_glove_accuracy(glove_embeddings, vocab, questions):
    """Compute analogy accuracy directly on a NumPy embedding matrix.

    Each question is a sequence ``[a, b, c, expected]``. The query vector is
    ``unit(b) + unit(c) - unit(a)`` and the prediction is the vocabulary word
    whose vector has the highest cosine similarity to it, excluding the three
    query words themselves. This is the same scheme gensim's ``most_similar``
    applies, so the result is comparable with ``evaluate_accuracy``.

    Without the exclusion the nearest neighbour is usually one of the query
    words, and without length normalisation words with large-norm vectors
    win regardless of direction.

    Args:
        glove_embeddings: 2-D array-like with one embedding per row.
        vocab: Mapping from word to row index in ``glove_embeddings``. Only
            rows referenced by ``vocab`` are considered as candidates.
        questions: Sequence of token sequences.

    Returns:
        ``correct / total`` over the questions that could be evaluated, or
        ``0`` when none could. Questions with fewer than four tokens, or
        whose ``a``, ``b`` or ``c`` is missing from ``vocab``, are skipped.
        A question whose ``expected`` word is missing still counts (as
        wrong).
    """
    if not questions or not vocab:
        return 0

    # Candidate words in vocab iteration order, so ties resolve to the
    # earliest vocab entry.
    words = list(vocab)
    rows = np.fromiter((vocab[w] for w in words), dtype=np.intp, count=len(words))
    position = {word: i for i, word in enumerate(words)}

    # Normalise every candidate once so each question costs a single
    # matrix-vector product instead of a Python loop over the vocabulary.
    candidates = np.asarray(glove_embeddings)[rows]
    norms = np.linalg.norm(candidates, axis=1, keepdims=True)
    norms[norms == 0] = 1  # leave all-zero rows as zero instead of NaN
    unit = candidates / norms

    correct = 0
    total = 0

    for q in questions:
        if len(q) < 4:
            # Malformed or empty line: nothing to evaluate.
            continue

        try:
            ia, ib, ic = position[q[0]], position[q[1]], position[q[2]]
        except KeyError:
            # Skip if any query word of the analogy is not in vocabulary
            continue

        target = unit[ib] + unit[ic] - unit[ia]
        scores = unit @ target
        # The query words may not be returned as the answer.
        scores[[ia, ib, ic]] = -np.inf

        predicted = words[int(np.argmax(scores))]
        if predicted == q[3]:
            correct += 1
        total += 1

    return correct / total if total > 0 else 0


In [21]:
def load_glove_embeddings(glove_path):
    """Parse a whitespace-separated text embedding file.

    Each non-blank line is expected to hold a word followed by its vector
    components. Blank or whitespace-only lines are skipped; the index stored
    for a word is the row it occupies in the returned array, so skipped
    lines never shift the mapping.

    Args:
        glove_path: Path of the UTF-8 text file to read.

    Returns:
        A tuple ``(glove_embeddings, vocab)`` where ``glove_embeddings`` is
        the array built from the per-line float32 vectors and ``vocab`` maps
        each word to its row index. If a word occurs more than once, the
        last occurrence wins.
    """
    vocab = {}
    vectors = []

    with open(glove_path, "r", encoding="utf-8") as f:
        for line in f:
            parts = line.split()
            if not parts:
                # Blank line (e.g. trailing newline padding): no word here.
                continue
            # Use the row count, not the file line number, as the index so
            # vocab always points at the matching row.
            vocab[parts[0]] = len(vectors)
            vectors.append(np.array(parts[1:], dtype=np.float32))

    glove_embeddings = np.array(vectors)
    return glove_embeddings, vocab

# Load GloVe embeddings and create word-to-index mapping.
# NOTE: this rebinds `glove_embeddings`, replacing the array loaded from
# "../glove_embeddings.npy" earlier in the notebook; the NumPy-based GloVe
# evaluation therefore uses the vectors parsed from this text file.
glove_embeddings, glove_vocab = load_glove_embeddings("../glove.6B/glove.6B.100d.txt")

In [None]:
# Analogy accuracy for every embedding. Each assignment yields a
# (syntactic, semantic) pair, evaluated in that order.

# Skip-gram
syntactic_accuracy_skipgram, semantic_accuracy_skipgram = (
    evaluate_accuracy(skipgram_model.wv, question_set)
    for question_set in (syntactic_questions, semantic_questions)
)

# Skip-gram with negative sampling
syntactic_accuracy_skipgram_neg, semantic_accuracy_skipgram_neg = (
    evaluate_accuracy(skipgram_neg_model.wv, question_set)
    for question_set in (syntactic_questions, semantic_questions)
)

# GloVe, scored directly on the NumPy matrix
syntactic_accuracy_glove, semantic_accuracy_glove = (
    evaluate_glove_accuracy(glove_embeddings, glove_vocab, question_set)
    for question_set in (syntactic_questions, semantic_questions)
)

# GloVe, scored through the Gensim KeyedVectors interface
syntactic_accuracy_glove_gensim, semantic_accuracy_glove_gensim = (
    evaluate_accuracy(glove_gensim_model, question_set)
    for question_set in (syntactic_questions, semantic_questions)
)

In [None]:
# Report every accuracy, one "label value" line per metric.
for _label, _accuracy in (
    ("Skip-gram - Syntactic Accuracy:", syntactic_accuracy_skipgram),
    ("Skip-gram - Semantic Accuracy:", semantic_accuracy_skipgram),
    ("Skip-gram (NEG) - Syntactic Accuracy:", syntactic_accuracy_skipgram_neg),
    ("Skip-gram (NEG) - Semantic Accuracy:", semantic_accuracy_skipgram_neg),
    ("GloVe - Syntactic Accuracy:", syntactic_accuracy_glove),
    ("GloVe - Semantic Accuracy:", semantic_accuracy_glove),
    ("GloVe (Gensim) - Syntactic Accuracy:", syntactic_accuracy_glove_gensim),
    ("GloVe (Gensim) - Semantic Accuracy:", semantic_accuracy_glove_gensim),
):
    print(_label, _accuracy)