# Compare performance of CBOW and Skipgram

In [1]:
# Import necessary libraries
import pandas as pd
import os
import numpy as np
from scipy.stats import spearmanr
from sklearn.metrics.pairwise import cosine_similarity
import spacy
from scipy.spatial.distance import cosine

# Load the spaCy pipeline 'en_core_web_sm' once at module level;
# lemmatize_word() further down calls this shared `nlp` object.
nlp = spacy.load('en_core_web_sm')

# Detect whether the notebook runs on Google Colab and set the project root accordingly.
# `drive` has to be imported explicitly: importing only `google.colab` does not bind
# the name `drive`, so the mount call would otherwise fail with a NameError.
try:
    from google.colab import drive
except ImportError:
    in_colab = False
    # Local run: the project root is the parent of the current working directory
    current_wd = os.getcwd()
    local_path = os.path.dirname(current_wd) + "/"
else:
    in_colab = True
    local_path = "/content/drive/MyDrive/DLSS/"
    # Make the Drive folder that holds the project available under /content/drive
    drive.mount('/content/drive')

print("CWD: ", local_path)


CWD:  d:\dlss-project24/


## Functions

In [None]:
def df_to_embeddings_dict(df, word_col='word', vector_start=3):
    """
    Convert a DataFrame of word embeddings into a dictionary.

    Args:
        df (pd.DataFrame): DataFrame containing word embeddings. The words are read from
                           the column named `word_col`; every column from position
                           `vector_start` onwards is treated as one embedding dimension.
        word_col (str): Name of the column holding the words. Defaults to 'word'.
        vector_start (int): Positional index of the first embedding column. Defaults to 3,
                            i.e. the first three columns are not part of the vector.

    Returns:
        dict: A dictionary where keys are words and values are their corresponding
              embedding vectors (1-D float32 numpy arrays). If a word occurs in several
              rows, the last row wins. An empty DataFrame yields an empty dictionary.
    """
    # Convert all embedding columns in one go instead of row by row
    vectors = df.iloc[:, vector_start:].to_numpy(dtype=np.float32)
    # Iterating the 2-D array yields one row (one word vector) per word
    return dict(zip(df[word_col], vectors))

In [None]:
def compute_similarities(word_pairs, embeddings):
    """
    Score each word pair by the cosine similarity of its two embedding vectors.

    Args:
        word_pairs (list of tuple): Pairs of words (word1, word2) to score.
        embeddings (dict): Maps each known word to its embedding vector.

    Returns:
        list: One score per pair, in input order. A pair with at least one word missing
              from `embeddings` gets the score 0.
    """
    scores = []
    for first, second in word_pairs:
        # Out-of-vocabulary pairs cannot be scored; they are recorded as 0
        if first not in embeddings or second not in embeddings:
            scores.append(0)
            continue
        # cosine_similarity works on 2-D inputs, so each vector becomes a single-row matrix
        row_first = embeddings[first].reshape(1, -1)
        row_second = embeddings[second].reshape(1, -1)
        scores.append(cosine_similarity(row_first, row_second)[0][0])
    return scores

In [None]:
def lemmatize_word(word):
    """
    Lemmatize a given word.

    Args:
        word (str): The word to lemmatize.

    Returns:
        str: The lemma of the first token of `word`. If the pipeline splits the input
             into several tokens, only the first token's lemma is returned. If the
             pipeline produces no tokens at all (e.g. for an empty string), the input
             is returned unchanged.
    """
    doc = nlp(word)
    # Guard against an empty Doc, where doc[0] would raise an IndexError
    if len(doc) == 0:
        return word
    return doc[0].lemma_

In [None]:
def predict_analogy_word(word_a, word_b, word_c, word_to_vec):
    """
    Complete the analogy "word_a is to word_b as word_c is to ?" using cosine distance.

    NOTE(review): a later cell defines a function with the same name; once that cell
    has run, calls resolve to the later definition rather than this one.

    Args:
        word_a (str): The first word in the analogy.
        word_b (str): The second word in the analogy.
        word_c (str): The third word in the analogy.
        word_to_vec (dict): Maps each known word to its embedding vector.

    Returns:
        str or None: The vocabulary word (other than the three lemmatized inputs) whose
                     vector has the smallest cosine distance to vec_b - vec_a + vec_c.
                     None if any lemmatized input word is missing from `word_to_vec`.
    """
    # The inputs are lemmatized before the vocabulary lookup
    lemma_a, lemma_b, lemma_c = (
        lemmatize_word(w) for w in (word_a, word_b, word_c)
    )

    looked_up = [word_to_vec.get(lemma) for lemma in (lemma_a, lemma_b, lemma_c)]
    if any(vector is None for vector in looked_up):
        return None
    vec_a, vec_b, vec_c = looked_up

    # Point in embedding space where the answer is expected
    query = vec_b - vec_a + vec_c
    excluded = {lemma_a, lemma_b, lemma_c}

    winner = None
    smallest_distance = float('inf')
    for candidate, candidate_vec in word_to_vec.items():
        # The question words themselves are never valid answers
        if candidate in excluded:
            continue
        # scipy's `cosine` is a distance, so smaller means more similar
        distance = cosine(query, candidate_vec)
        if distance < smallest_distance:
            smallest_distance = distance
            winner = candidate

    return winner

In [None]:
def predict_analogy_word(word_a, word_b, word_c, word_to_vec):
    """
    Predict the fourth word in an analogy of the form "word_a is to word_b as word_c is to ?".

    The words are looked up exactly as given (no lemmatization is applied here), and the
    answer is the vocabulary word closest in Euclidean distance to vec_b - vec_a + vec_c.
    The three input words are excluded from the candidates; otherwise the nearest
    neighbour of the target vector is frequently one of the inputs themselves.

    Args:
        word_a (str): The first word in the analogy.
        word_b (str): The second word in the analogy.
        word_c (str): The third word in the analogy.
        word_to_vec (dict): Dictionary of word embeddings where keys are words and values are their embedding vectors.

    Returns:
        str or None: The predicted fourth word in the analogy. Returns None if any of the
                     words are not in the dictionary, or if the dictionary contains no
                     word other than the three inputs.
    """
    # Look the words up as given
    vec_a = word_to_vec.get(word_a)
    vec_b = word_to_vec.get(word_b)
    vec_c = word_to_vec.get(word_c)

    if vec_a is None or vec_b is None or vec_c is None:
        return None

    # Calculate the analogy vector: vec_b - vec_a + vec_c
    vec_d = vec_b - vec_a + vec_c

    # Find the closest word to vec_d among all words that are not part of the question
    input_words = {word_a, word_b, word_c}
    closest_word = None
    min_distance = float('inf')

    for word, vec in word_to_vec.items():
        if word in input_words:
            continue
        distance = np.linalg.norm(vec - vec_d)
        if distance < min_distance:
            min_distance = distance
            closest_word = word

    return closest_word


In [None]:
def evaluate_analogy_dataset(analogy_df, word_to_vec):
    """
    Evaluate the accuracy of analogy predictions against a dataset.

    Args:
        analogy_df (pd.DataFrame): DataFrame containing analogy questions. 
                                   Assumes the columns are 'Word 1', 'Word 2', 'Word 3', and 'Expected Word'.
        word_to_vec (dict): Dictionary of word embeddings where keys are words and values are their embedding vectors.

    Returns:
        float: The accuracy of analogy predictions, calculated as the proportion of correct
               predictions. An empty dataset yields 0.0.
    """
    total = len(analogy_df)
    # Without any question there is nothing to divide by; report 0.0 instead of failing
    if total == 0:
        return 0.0

    correct = 0
    for _, row in analogy_df.iterrows():
        # Extract the words from the dataset
        word_a = row['Word 1']
        word_b = row['Word 2']
        word_c = row['Word 3']
        expected_word_d = row['Expected Word']

        # Predict the fourth word in the analogy
        predicted_word_d = predict_analogy_word(word_a, word_b, word_c, word_to_vec)

        # Compare the predicted word with the expected word (lemmatized);
        # a None prediction never matches and therefore counts as wrong
        if predicted_word_d == lemmatize_word(expected_word_d):
            correct += 1

    # Calculate accuracy as the proportion of correct predictions
    return correct / total

In [None]:
def evaluate_analogy_dataset(analogy_df, word_to_vec):
    """
    Evaluate the accuracy of analogy predictions against a dataset.

    Args:
        analogy_df (pd.DataFrame): DataFrame containing analogy questions. 
                                   Assumes the columns are 'Word 1', 'Word 2', 'Word 3', and 'Expected Word'.
        word_to_vec (dict): Dictionary of word embeddings where keys are words and values are their embedding vectors.

    Returns:
        float: The accuracy of analogy predictions, calculated as the proportion of correct
               predictions. Returns 0.0 when the dataset contains no rows.
    """
    total = len(analogy_df)
    if total == 0:
        # Avoid a ZeroDivisionError on an empty dataset
        return 0.0

    correct = 0
    for _, row in analogy_df.iterrows():
        predicted_word_d = predict_analogy_word(
            row['Word 1'], row['Word 2'], row['Word 3'], word_to_vec
        )

        # The expected word is lemmatized before the comparison; a prediction of
        # None (missing vocabulary) can never match and counts as incorrect
        if predicted_word_d == lemmatize_word(row['Expected Word']):
            correct += 1

    # Calculate accuracy
    return correct / total

## Data Prep

### prepare embeddings

In [None]:
# Read the embedding tables that were exported as CSV files
# NOTE(review): the "skipgram" table is read from a file whose name contains "CBOW"
# (embeddings_CBOW_posts_genz.csv) - confirm that this is the intended Skip-gram export.
embeddings_cbow_df = pd.read_csv(local_path + "output/embeddings/embeddings_CBOW_total_posts.csv")
embeddings_skipgram_df = pd.read_csv(local_path + "output/embeddings/embeddings_CBOW_posts_genz.csv")
glove_cbow_df = pd.read_csv(local_path + "output/embeddings/glove_embeddings.csv")
bge_cbow_df = pd.read_csv(local_path + "output/embeddings/bge_embeddings.csv")


def _word_as_second_column(frame):
    """Return `frame` with the 'word' column placed directly after its first column."""
    leading = frame.columns[:1].tolist()
    trailing = frame.columns[1:].drop('word').tolist()
    return frame[leading + ['word'] + trailing]


## Reorder the GloVe and BGE columns so that df_to_embeddings_dict works for all dfs
glove_cbow_df = _word_as_second_column(glove_cbow_df)
bge_cbow_df = _word_as_second_column(bge_cbow_df)

## Build one word -> vector dictionary per model
cbow_embeddings = df_to_embeddings_dict(embeddings_cbow_df)
skipgram_embeddings = df_to_embeddings_dict(embeddings_skipgram_df)
glove_embeddings = df_to_embeddings_dict(glove_cbow_df)
bge_embeddings = df_to_embeddings_dict(bge_cbow_df)

### prepare wordsim

In [None]:
# Read the WordSim-353 crowd ratings from the external data folder
wordsim_path = local_path + "data/external_data/wordsim353crowd.csv"
wordsim353_df = pd.read_csv(wordsim_path)

# Keep the word pairs (as a list of [word1, word2] lists) and the mean human ratings separately
pair_columns = ['Word 1', 'Word 2']
word_pairs = wordsim353_df[pair_columns].values.tolist()
human_scores = wordsim353_df['Human (Mean)'].values

# Last expression of the cell, so the notebook renders the DataFrame
wordsim353_df

## Word Similarity 

### CBOW

In [None]:
# Score every WordSim pair with the CBOW embeddings
cbow_similarities = compute_similarities(word_pairs, cbow_embeddings)

# Rank-correlate the model scores with the human ratings
cbow_spearman = spearmanr(cbow_similarities, human_scores)
correlation_cbow = cbow_spearman.correlation
print(f"Spearman Correlation (CBOW): {correlation_cbow:.2f}")

### Skipgram

In [None]:
# Score every WordSim pair with the Skip-gram embeddings
skipgram_similarities = compute_similarities(word_pairs, skipgram_embeddings)

# Rank-correlate the model scores with the human ratings
skipgram_spearman = spearmanr(skipgram_similarities, human_scores)
correlation_skipgram = skipgram_spearman.correlation
print(f"Spearman Correlation (Skip-gram): {correlation_skipgram:.2f}")

### GloVe

In [None]:
# Score every WordSim pair with the GloVe embeddings
glove_similarities = compute_similarities(word_pairs, glove_embeddings)

# Rank-correlate the model scores with the human ratings
glove_spearman = spearmanr(glove_similarities, human_scores)
correlation_glove = glove_spearman.correlation
print(f"Spearman Correlation (GloVe): {correlation_glove:.2f}")

### BGE

In [None]:
# Score every WordSim pair with the BGE embeddings
bge_similarities = compute_similarities(word_pairs, bge_embeddings)

# Rank-correlate the model scores with the human ratings
bge_spearman = spearmanr(bge_similarities, human_scores)
correlation_bge = bge_spearman.correlation
print(f"Spearman Correlation (BGE): {correlation_bge:.2f}")

## Word Analogy

In [None]:
# Define a dictionary containing word analogy tasks, with three words and the expected fourth word.
# Row i reads: 'Word 1'[i] is to 'Word 2'[i] as 'Word 3'[i] is to 'Expected Word'[i].
# NOTE(review): multi-word entries such as 'bike lane' and 'solar system' can only be
# answered if the embedding vocabulary contains them as single keys - confirm.
data_word2vec_analogies = {
    'Word 1': ['sustainable', 'park', 'pollution', 'scientist', 'carbon', 
               'leader', 'earth', 'wealth', 'fire', 'drought',
               'global', 'emission', 'research', 'climate', 'city',
               'temperature', 'development', 'tree', 'planet'],
    'Word 2': ['development', 'bike lane', 'environment', 'experiment', 'dioxide', 
               'country', 'orbit', 'privilege', 'forest', 'water',
               'warming', 'atmosphere', 'discovery', 'change', 'capital',
               'weather', 'progress', 'bicycle', 'solar system'],
    'Word 3': ['renewable', 'tree', 'waste', 'researcher', 'oxygen', 
               'nation', 'rotation', 'poverty', 'fuel', 'famine',
               'cooling', 'pollutant', 'innovation', 'adaptation', 'village',
               'precipitation', 'growth', 'shade', 'sun'],
    'Expected Word': ['energy', 'bicycle', 'disposal', 'analysis', 'air', 
                      'state', 'spin', 'disadvantaged', 'firewood', 'food',
                      'heating', 'contaminant', 'invention', 'mitigation', 'town',
                      'rain', 'advancement', 'canopy', 'orbit']
}

# Convert the dictionary into a DataFrame for processing
data_word2vec_analogies_df = pd.DataFrame(data_word2vec_analogies)

# Evaluate analogy task accuracy for different word embedding models.
# Each line of output names its model so the four results can be told apart.
# CBOW model
accuracy_cbow_data_word2vec_analogies = evaluate_analogy_dataset(data_word2vec_analogies_df, cbow_embeddings)
print(f"\nAnalogy Task Accuracy (CBOW): {accuracy_cbow_data_word2vec_analogies:.2%}")

# Skipgram model
accuracy_skipgram_data_word2vec_analogies = evaluate_analogy_dataset(data_word2vec_analogies_df, skipgram_embeddings)
print(f"\nAnalogy Task Accuracy (Skip-gram): {accuracy_skipgram_data_word2vec_analogies:.2%}")

# GloVe model
accuracy_glove_data_word2vec_analogies = evaluate_analogy_dataset(data_word2vec_analogies_df, glove_embeddings)
print(f"\nAnalogy Task Accuracy (GloVe): {accuracy_glove_data_word2vec_analogies:.2%}")

# BGE model
accuracy_bge_data_word2vec_analogies = evaluate_analogy_dataset(data_word2vec_analogies_df, bge_embeddings)
print(f"\nAnalogy Task Accuracy (BGE): {accuracy_bge_data_word2vec_analogies:.2%}")