In [13]:
from util_funs import Skipgram, SkipgramNeg, Glove

In [14]:
from gensim.test.utils import datapath
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from scipy.stats import spearmanr

In [15]:
# !pip install gensim
# !pip install nltk
# !pip install numpy

In [16]:
# !pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124

In [17]:
import torch
import torch.nn.functional as F
import pickle
import numpy as np

### Skipgram

In [18]:
#Args

skipgram_data = pickle.load(open(r'model/skipgram_metadata.pkl', 'rb'))

cleaned_corpus = skipgram_data['corpus']
vocabs = skipgram_data['vocab']
word2index = skipgram_data['word2index']
voc_size = skipgram_data['voc_size']
emb_size = skipgram_data['embedding_size']

In [19]:
# Instantiate the model and load saved parameters
skipgram = Skipgram(voc_size, emb_size)
skipgram.load_state_dict(torch.load('model/word2vec_skipgram.pt'))
skipgram.eval()

  skipgram.load_state_dict(torch.load('model/word2vec_skipgram.pt'))


Skipgram(
  (embedding_center): Embedding(6966, 2)
  (embedding_outside): Embedding(6966, 2)
)

### Negative Sampling

In [20]:
#Args

neg_skipgram_data = pickle.load(open(r'model/neg_sampling_metadata.pkl', 'rb'))

cleaned_corpus = neg_skipgram_data['corpus']
vocabs = neg_skipgram_data['vocab']
word2index = neg_skipgram_data['word2index']
voc_size = neg_skipgram_data['voc_size']
emb_size = neg_skipgram_data['embedding_size']

In [21]:
# Instantiate the model and load saved parameters
skipgramNeg = SkipgramNeg(voc_size, emb_size)
skipgramNeg.load_state_dict(torch.load('model/word2vec_neg_sampling.pt'))
skipgramNeg.eval()

  skipgramNeg.load_state_dict(torch.load('model/word2vec_neg_sampling.pt'))


SkipgramNeg(
  (embedding_center): Embedding(6966, 2)
  (embedding_outside): Embedding(6966, 2)
  (logsigmoid): LogSigmoid()
)

### Glove

In [22]:
#Args

glove_data = pickle.load(open(r'model/glove_scratch_metadata.pkl', 'rb'))

cleaned_corpus = neg_skipgram_data['corpus']
vocabs = neg_skipgram_data['vocab']
word2index = neg_skipgram_data['word2index']
voc_size = neg_skipgram_data['voc_size']
emb_size = neg_skipgram_data['embedding_size']

In [23]:
# Instantiate the model and load saved parameters
glove = Glove(voc_size, emb_size)
glove.load_state_dict(torch.load('model/glove_scratch.pt'))
glove.eval()

  glove.load_state_dict(torch.load('model/glove_scratch.pt'))


Glove(
  (center_embedding): Embedding(6966, 2)
  (outside_embedding): Embedding(6966, 2)
  (center_bias): Embedding(6966, 1)
  (outside_bias): Embedding(6966, 1)
)

### Gensim

In [25]:
#you have to put this file in some python/gensim directory; just run it and it will inform where to put....
glove_file = datapath('glove.6B.100d.txt')  #search on the google
gensim = KeyedVectors.load_word2vec_format(glove_file, binary=False, no_header=True)

## Similarity Accuracy Analysis

In [26]:
file_path = "test_text/wordsim_similarity_goldstandard.txt"

content = open(file_path, 'r').readlines()

sim_data = []

for sent in content:
    sim_data.append(sent.strip())

In [27]:
sim_data[:5]

['tiger\tcat\t7.35',
 'tiger\ttiger\t10.00',
 'plane\tcar\t5.77',
 'train\tcar\t6.31',
 'television\tradio\t6.77']

In [28]:
file_path = "test_text/wordsim_similarity_goldstandard.txt"

content = open(file_path, 'r').readlines()

sim_data = []

for sent in content:
    sim_data.append(sent.strip())

In [29]:
def compute_vectors(vocab, model):
    """Compute and stack vectors for all words in the vocabulary."""
    return torch.stack([model.get_embed(word) for word in vocab])

In [30]:
import torch
import torch.nn.functional as F

def sim_analysis(lines, model):
    # Perform similarity analysis on word analogies.
    # Compute all word vectors once
    all_word_vectors = compute_vectors(vocabs, model)

    correct = 0
    skipped = 0

    for line in lines:
        words = line.split()

        # Ensure the line has exactly four words
        if len(words) != 4:
            skipped += 1
            continue

        # Skip lines with unknown words
        if any(word not in vocabs for word in words):
            skipped += 1
            continue

        # Retrieve vectors for analogy words
        try:
            vectors = [model.get_embed(word.lower()) for word in words]
            vectors = [vec if isinstance(vec, torch.Tensor) else torch.tensor(vec) for vec in vectors]
        except KeyError:
            skipped += 1
            continue

        # Perform vector manipulation
        result_vector = vectors[1] - vectors[0] + vectors[2]
        result_vector = result_vector.unsqueeze(0)  # Add batch dimension

        # Calculate cosine similarities
        similarities = F.cosine_similarity(result_vector, all_word_vectors)

        # Get the closest word
        closest_word_index = torch.argmax(similarities).item()
        closest_word = vocabs[closest_word_index]

        # Check if the predicted word matches the target
        if closest_word == words[3]:
            correct += 1

    # Calculate accuracy
    total = len(lines) - skipped
    accuracy = (correct / total) * 100 if total > 0 else 0

    print(f'Accuracy: {accuracy:.2f}%')
    print(f'Skipped: {skipped} invalid words')
    print(f'------------------------------------------------------------------------------')


### Gensim

In [31]:
def evaluate_glove(lines, model):

    correct = 0
    total = 0
    skipped = 0  # Counter for skipped lines

    for line in lines:
        words = line.lower().strip().split()

        # Skip invalid or OOV lines
        if len(words) != 4:
            skipped += 1
            continue
        if any(word not in model for word in words):
            skipped += 1
            continue

        # Perform analogy
        try:
            result = model.most_similar(positive=[words[2], words[1]], negative=[words[0]], topn=1)
            predicted_word = result[0][0]
            total += 1

            if predicted_word == words[3]:
                correct += 1
        except Exception as e:
            skipped += 1
            continue

    # Calculate accuracy
    accuracy = (correct / total) * 100 if total > 0 else 0

    # # Print results
    print(f'Accuracy: {accuracy:.2f}%')
    print(f'Skipped: {skipped} invalid words')

# 1. Semantic and Syntatic Analysis

### Loading text files

In [32]:
# Read the text file and create a list of tuples
with open('test_text/word-test.v1.txt', 'r') as file:
    sem_data = file.readlines()
    
with open('test_text/past_tense_syntatic.txt', 'r') as file:
    syn_data = file.readlines()

# Work2Vec(Skipgram)

In [33]:
# Define the models and their names
models = {
    'Word2Vec (Skipgram)': skipgram,
    'Word2Vec (Neg Sampling)': skipgramNeg,
    'GloVe from Scratch': glove,
    'GloVe (Gensim)': gensim
}

### Semantic Similarity Analysis

In [34]:
print("Semantic Analysis:")
for model_name, model in models.items():
    print(f"\n{model_name} on semantic similarity:")
    if model_name == 'GloVe (Gensim)':
        # Use evaluate_glove for gensim model
        evaluate_glove(sem_data, model)
    else:
        # Use similarities for other models
        sim_analysis(sem_data, model)

Semantic Analysis:

Word2Vec (Skipgram) on semantic similarity:


Accuracy: 0.00%
Skipped: 18427 invalid words
------------------------------------------------------------------------------

Word2Vec (Neg Sampling) on semantic similarity:
Accuracy: 0.00%
Skipped: 18427 invalid words
------------------------------------------------------------------------------

GloVe from Scratch on semantic similarity:
Accuracy: 0.00%
Skipped: 18427 invalid words
------------------------------------------------------------------------------

GloVe (Gensim) on semantic similarity:
Accuracy: 63.11%
Skipped: 13 invalid words


### Syntatic Similarrity Analysis

In [35]:
print("Syntatic Analysis:")
for model_name, model in models.items():
    print(f"\n{model_name} on syntatic similarity:")
    if model_name == 'GloVe (Gensim)':
        # Use evaluate_glove for gensim model
        evaluate_glove(syn_data, model)
    else:
        # Use similarities for other models
        sim_analysis(syn_data, model)

Syntatic Analysis:

Word2Vec (Skipgram) on syntatic similarity:
Accuracy: 0.00%
Skipped: 1288 invalid words
------------------------------------------------------------------------------

Word2Vec (Neg Sampling) on syntatic similarity:
Accuracy: 0.00%
Skipped: 1288 invalid words
------------------------------------------------------------------------------

GloVe from Scratch on syntatic similarity:
Accuracy: 0.00%
Skipped: 1288 invalid words
------------------------------------------------------------------------------

GloVe (Gensim) on syntatic similarity:
Accuracy: 55.45%
Skipped: 0 invalid words


# 2. Similarity Analysis

In [43]:
import numpy as np

def cosine_similarity(A, B):

    dot_product = np.dot(A.flatten(), B.flatten())
    norm_a = np.linalg.norm(A)
    norm_b = np.linalg.norm(B)

    similarity = dot_product / (norm_a * norm_b)
    return similarity

In [37]:
def similar(lines, model):
    scores_real = []  # To store actual similarity scores (from the dataset)
    scores_pred = []  # To store predicted similarity scores (using cosine similarity)

    for line in lines:
        words = line.split()  # Split line into words
        vec = []  # List to store word vectors

        # Assuming the first two words need to be compared
        for word in words[:2]: 
            try:
                # Attempt to get the vector for the word
                vec.append(model.get_embed(word).detach().numpy())
            except:
                # If the word is not in the vocabulary, use the <UNK> token
                vec.append(model.get_embed('<UNK>').detach().numpy())

        # Store the actual similarity score from the dataset (third word)
        scores_real.append(float(words[2]))  
        
        # Calculate the cosine similarity between the two words and store the predicted score
        scores_pred.append(cosine_similarity(np.array(vec[0]), np.array(vec[1])))

    # Calculate and return Spearman's rank correlation between actual and predicted scores
    return spearmanr(scores_real, scores_pred)

In [52]:
# Custom Function for Gensim

def similar_gensim(lines, model):
    scores_real = []
    scores_pred = [] 

    for line in lines:
        words = line.split()
        vec = []
        
        for word in words[:2]:
            try:
                vec.append(model[word])
            except KeyError:
                vec.append(model['UNK'])
        
        # Store the actual similarity score from the dataset (third word)
        scores_real.append(float(words[2]))

        similarity_score = cosine_similarity(np.array(vec[0]), np.array(vec[1]))
        scores_pred.append(similarity_score)

    correlation, p_value = spearmanr(scores_real, scores_pred)
    
    return correlation, p_value

In [50]:
# Define the models and their evaluation functions
models = {
    'Word2Vec (Skipgram)': (skipgram, similar),
    'Word2Vec (Neg Sampling)': (skipgramNeg, similar),
    'GloVe from Scratch': (glove, similar),
    'GloVe (Gensim)': (gensim, similar_gensim)
}

In [51]:
# Evaluate each model
print("Similarity Analysis:")
for model_name, (model, eval_func) in models.items():
    if model_name == 'gensim':
        correlation_score = eval_func(sim_data, model)[0]
    else:
        correlation_score = eval_func(sim_data, model)[0]
    
    print(f'\n{model_name} correlation score: {correlation_score:.4f}')

Similarity Analysis:

Word2Vec (Skipgram) correlation score: 0.2254

Word2Vec (Neg Sampling) correlation score: 0.2899

GloVe from Scratch correlation score: 0.1609

GloVe (Gensim) correlation score: 0.6038


# Human Scoring

In [4]:
file_path = "test_text/wordsim_similarity_goldstandard_w_human.txt"

content = open(file_path, 'r').readlines()

h_sim_data = []

for sent in content:
    h_sim_data.append(sent.strip())

In [6]:
from scipy.stats import spearmanr

def similar_human(lines):
    scores_real = []
    scores_pred = []

    for line in lines:
        words = line.split()
        scores_real.append(float(words[2]))  # Real score from the dataset (3rd column)
        scores_pred.append(float(words[3]))  # Human score from the dataset (4th column)

    # Calculate Spearman correlation
    correlation, _ = spearmanr(scores_real, scores_pred)
    return correlation

# Calculate Spearman correlation
correlation = similar_human(h_sim_data)
print(f"Spearman correlation between real scores and human scores: {correlation:.4f}")

Spearman correlation between real scores and human scores: 0.9677
