# Compare performance of CBOW, Skip-gram, GloVe and BGE embeddings

In [None]:
import pandas as pd
import os
import numpy as np
from scipy.stats import spearmanr
from sklearn.metrics.pairwise import cosine_similarity
import spacy
from scipy.spatial.distance import cosine

# Load the small English spaCy pipeline once at module level; lemmatize_word()
# reuses this shared `nlp` object for every word it lemmatizes.
nlp = spacy.load('en_core_web_sm')

# Resolve the project root directory (`local_path`, always ending in "/"):
# the mounted Google Drive folder when running in Colab, otherwise the parent
# of the current working directory.
try:
    # `drive` must be imported explicitly; a bare `import google.colab` does
    # not put `drive` in scope, so `drive.mount` would raise NameError (which
    # the ImportError handler below would not catch).
    from google.colab import drive
except ImportError:
    in_colab = False
    # Current directory of the running notebook
    current_wd = os.getcwd()
    # Move one level up to reach the main project directory
    local_path = os.path.dirname(current_wd) + "/"
else:
    in_colab = True
    local_path = "/content/drive/MyDrive/DLSS/"
    drive.mount('/content/drive')

print("CWD: ", local_path)


## Functions

In [None]:
def df_to_embeddings_dict(df):
    """Build a ``word -> vector`` lookup from an embeddings table.

    Args:
        df: DataFrame with a ``word`` column. Every column from position 3
            onward is treated as one embedding dimension; the first three
            columns are skipped (presumably index/metadata columns -- confirm
            against the exported CSV layout).

    Returns:
        dict mapping each word to a 1-D ``np.float32`` array. When a word
        appears in several rows, the last row wins.
    """
    # Convert the whole numeric block in one call instead of converting each
    # row separately through iterrows(), which is far slower on large tables.
    vectors = df.iloc[:, 3:].to_numpy(dtype=np.float32)
    return dict(zip(df['word'], vectors))

def compute_similarities(word_pairs, embeddings):
    """Score each word pair by the cosine similarity of its two embeddings.

    Args:
        word_pairs: iterable of two-element ``(word1, word2)`` items.
        embeddings: mapping from word to a numpy vector.

    Returns:
        list with one score per pair, in input order. A pair with at least
        one out-of-vocabulary word is scored ``0``.
    """
    scores = []
    for first, second in word_pairs:
        if first not in embeddings or second not in embeddings:
            # Out-of-vocabulary pair: fall back to a neutral score of 0.
            scores.append(0)
            continue
        # cosine_similarity expects 2-D inputs, hence the (1, dim) reshape.
        left = embeddings[first].reshape(1, -1)
        right = embeddings[second].reshape(1, -1)
        scores.append(cosine_similarity(left, right)[0][0])
    return scores


def lemmatize_word(word):
    """Return the spaCy lemma of the first token of *word*.

    Args:
        word: text to lemmatize, run through the module-level ``nlp`` pipeline.

    Returns:
        The ``lemma_`` of the first token. If the pipeline produces no tokens
        (e.g. for an empty string), *word* is returned unchanged.

    NOTE(review): only the first token is used, so an input that spaCy splits
    into several tokens is reduced to the lemma of its first token -- confirm
    this is intended for multi-word inputs.
    """
    doc = nlp(word)
    if len(doc) == 0:
        # An empty Doc has no doc[0]; avoid an IndexError.
        return word
    return doc[0].lemma_

def predict_analogy_word(word_a, word_b, word_c, word_to_vec):
    """Predict the fourth word of the analogy "a is to b as c is to ?".

    The three input words are lemmatized, the target vector ``b - a + c`` is
    formed, and the vocabulary word with the smallest cosine distance to that
    target is returned. The three (lemmatized) input words are never returned.

    Args:
        word_a, word_b, word_c: the three known words of the analogy.
        word_to_vec: mapping from word to embedding vector.

    Returns:
        The closest vocabulary word, or ``None`` when any input word has no
        vector or no candidate qualifies.
    """
    # Lemmatize first so the lookups match the vocabulary.
    lemma_a, lemma_b, lemma_c = (
        lemmatize_word(raw) for raw in (word_a, word_b, word_c)
    )

    vectors = [word_to_vec.get(lemma) for lemma in (lemma_a, lemma_b, lemma_c)]
    if any(vector is None for vector in vectors):
        return None
    vec_a, vec_b, vec_c = vectors

    target = vec_b - vec_a + vec_c
    excluded = {lemma_a, lemma_b, lemma_c}

    # scipy's `cosine` is a distance (smaller means closer), so minimise it.
    closest_word = None
    closest_distance = float('inf')
    for candidate, candidate_vec in word_to_vec.items():
        if candidate in excluded:
            continue
        distance = cosine(target, candidate_vec)
        if distance < closest_distance:
            closest_word, closest_distance = candidate, distance

    return closest_word

def evaluate_analogy_dataset(analogy_df, word_to_vec):
    """Return the share of analogies whose predicted word matches the expected one.

    Args:
        analogy_df: DataFrame with the columns ``'Word 1'``, ``'Word 2'``,
            ``'Word 3'`` and ``'Expected Word'``; each row reads
            "Word 1 is to Word 2 as Word 3 is to Expected Word".
        word_to_vec: mapping from word to embedding vector, passed on to
            ``predict_analogy_word``.

    Returns:
        Accuracy as a float in ``[0, 1]``. An empty dataset yields ``0.0``
        instead of raising ``ZeroDivisionError``.
    """
    total = len(analogy_df)
    if total == 0:
        # Nothing to score; dividing by zero below would crash.
        return 0.0

    rows = zip(
        analogy_df['Word 1'],
        analogy_df['Word 2'],
        analogy_df['Word 3'],
        analogy_df['Expected Word'],
    )

    correct = 0
    for word_a, word_b, word_c, expected_word_d in rows:
        predicted_word_d = predict_analogy_word(word_a, word_b, word_c, word_to_vec)
        # A missing prediction (None) can never match, so skip lemmatizing
        # the expected word in that case.
        if predicted_word_d is None:
            continue
        # The prediction comes from the lemmatized vocabulary, so the
        # expected word is lemmatized before comparing.
        if predicted_word_d == lemmatize_word(expected_word_d):
            correct += 1

    return correct / total


## Data Prep

### prepare embeddings

In [None]:
# Read each embedding table from <local_path>/output/embeddings/ and convert
# it to a word -> vector dict right away.
embeddings_dir = local_path + "output/embeddings/"

embeddings_cbow_df = pd.read_csv(embeddings_dir + "embeddings_CBOW_total_posts.csv")
cbow_embeddings = df_to_embeddings_dict(embeddings_cbow_df)

# NOTE(review): the "skipgram" table is read from a file whose name says CBOW
# (embeddings_CBOW_posts_genz.csv) -- confirm this is the intended Skip-gram export.
embeddings_skipgram_df = pd.read_csv(embeddings_dir + "embeddings_CBOW_posts_genz.csv")
skipgram_embeddings = df_to_embeddings_dict(embeddings_skipgram_df)

glove_cbow_df = pd.read_csv(embeddings_dir + "glove_embeddings.csv")
glove_embeddings = df_to_embeddings_dict(glove_cbow_df)

bge_cbow_df = pd.read_csv(embeddings_dir + "bge_embeddings.csv")
bge_embeddings = df_to_embeddings_dict(bge_cbow_df)

### prepare wordsim

In [None]:
# Load the WordSim-353 (crowd) table of word pairs with human ratings.
wordsim353_df = pd.read_csv(local_path + "data/external_data/wordsim353crowd.csv")

# Separate the table into the mean human ratings and the matching
# [word1, word2] pairs (same row order in both).
human_scores = wordsim353_df['Human (Mean)'].to_numpy()
word_pairs = wordsim353_df.loc[:, ['Word 1', 'Word 2']].to_numpy().tolist()

# Display the table as the cell output.
wordsim353_df

## Word Similarity 

### cbow

In [None]:
# Spearman rank correlation between CBOW cosine similarities and human ratings.
cbow_similarities = compute_similarities(word_pairs, cbow_embeddings)
correlation_cbow = spearmanr(cbow_similarities, human_scores)[0]  # first field is the coefficient
print(f"Spearman Correlation (CBOW): {correlation_cbow:.2f}")

### skipgram

In [None]:
# Spearman rank correlation between Skip-gram cosine similarities and human ratings.
skipgram_similarities = compute_similarities(word_pairs, skipgram_embeddings)
correlation_skipgram = spearmanr(skipgram_similarities, human_scores)[0]  # first field is the coefficient
print(f"Spearman Correlation (Skip-gram): {correlation_skipgram:.2f}")


### glove

In [None]:
# Spearman rank correlation between GloVe cosine similarities and human ratings.
glove_similarities = compute_similarities(word_pairs, glove_embeddings)
correlation_glove = spearmanr(glove_similarities, human_scores)[0]  # first field is the coefficient
print(f"Spearman Correlation (GloVe): {correlation_glove:.2f}")


### bge

In [None]:
# Spearman rank correlation between BGE cosine similarities and human ratings.
bge_similarities = compute_similarities(word_pairs, bge_embeddings)
correlation_bge = spearmanr(bge_similarities, human_scores)[0]  # first field is the coefficient
print(f"Spearman Correlation (BGE): {correlation_bge:.2f}")


## Word Analogy

In [None]:
# Hand-written analogy datasets, one DataFrame per topic. Row i of each table
# reads "Word 1 is to Word 2 as Word 3 is to Expected Word"; the four lists of
# a dict must therefore have the same length (20 entries each here).
#
# NOTE(review): predict_analogy_word never returns one of its three input
# words, so a row whose Expected Word repeats one of its inputs (e.g.
# 'global' / 'warming' / 'cooling' -> 'cooling') looks like it can never be
# scored as correct -- confirm whether such rows are intended.
# NOTE(review): lemmatize_word keeps only the first spaCy token, so multi-word
# entries such as 'Paris Agreement' are presumably reduced to a single token
# before lookup -- verify against the embedding vocabulary.

# Climate-change vocabulary
data_climate_change = {
    'Word 1': ['coal', 'fossil', 'emission', 'climate', 'carbon', 'global', 'electric', 'renewable', 'methane', 'solar',
               'carbon', 'temperature', 'ice', 'deforestation', 'dioxide', 'sea', 'wind', 'greenhouse', 'sustainability', 'pollution'],
    'Word 2': ['fossil', 'energy', 'reduction', 'warming', 'dioxide', 'warming', 'vehicle', 'energy', 'gas', 'power',
               'dioxide', 'rise', 'melt', 'reduction', 'gas', 'level', 'turbine', 'emissions', 'policy', 'impact'],
    'Word 3': ['solar', 'renewable', 'pollution', 'environment', 'methane', 'cooling', 'car', 'wind', 'CO2', 'wind',
               'carbon', 'impact', 'melting', 'forestation', 'gas', 'carbon', 'turbine', 'mitigation', 'climate', 'change'],
    'Expected Word': ['renewable', 'resource', 'control', 'sustainability', 'gas', 'cooling', 'car', 'turbine', 'greenhouse', 'energy',
                      'emission', 'rise', 'melt', 'deforestation', 'gas', 'level', 'wind', 'policy', 'impact', 'policy']}

analogy_df_climate_change = pd.DataFrame(data_climate_change)

# Reddit platform vocabulary
data_reddit = {
    'Word 1': ['OP', 'thread', 'TL;DR', 'upvote', 'troll', 'mod', 'AMA', 'lurker', 'NSFW', 'flair',
               'comment', 'karma', 'subreddit', 'post', 'reply', 'ban', 'meme', 'user', 'admin', 'tag'],
    'Word 2': ['post', 'discussion', 'summary', 'downvote', 'bait', 'admin', 'Q&A', 'reader', 'SFW', 'label',
               'reply', 'points', 'community', 'thread', 'comment', 'ban', 'GIF', 'user', 'moderator', 'badge'],
    'Word 3': ['comment', 'reply', 'context', 'upvote', 'spam', 'user', 'ask', 'reader', 'explicit', 'flair',
               'upvote', 'comment', 'thread', 'discussion', 'report', 'image', 'moderator', 'poster', 'sub', 'message'],
    'Expected Word': ['reply', 'discussion', 'summary', 'downvote', 'bait', 'admin', 'Q&A', 'lurker', 'SFW', 'tag',
                      'comment', 'points', 'subreddit', 'post', 'reply', 'ban', 'sticker', 'user', 'admin', 'flair']}

analogy_df_reddit = pd.DataFrame(data_reddit)

# Climate-politics vocabulary (contains many multi-word entries)
data_politics_climate = {
    'Word 1': ['EPA', 'Paris Agreement', 'Biden', 'UN', 'Congress', 'carbon tax', 'renewable energy', 'climate bill', 'COP26', 'Green New Deal',
               'regulation', 'emissions', 'legislation', 'government', 'policy', 'administration', 'carbon footprint', 'international', 'president', 'senator'],
    'Word 2': ['regulation', 'international accord', 'administration', 'global body', 'legislature', 'carbon pricing', 'clean energy', 'policy', 'summit', 'policy',
               'rules', 'treaty', 'law', 'leadership', 'initiative', 'impact', 'effort', 'negotiation', 'leader', 'law'],
    'Word 3': ['Paris Agreement', 'UN', 'Biden', 'G7', 'senator', 'cap-and-trade', 'climate action', 'agenda', 'COP21', 'climate legislation',
               'treaty', 'agreement', 'regulation', 'administration', 'program', 'target', 'campaign', 'deal', 'conference', 'bill'],
    'Expected Word': ['international accord', 'agreement', 'administration', 'global body', 'congress', 'carbon pricing', 'clean energy', 'policy', 'summit', 'policy',
                      'regulation', 'treaty', 'law', 'leadership', 'initiative', 'impact', 'effort', 'negotiation', 'leader', 'law']
}

analogy_df_politics_climate = pd.DataFrame(data_politics_climate)




### cbow

In [None]:
# Analogy accuracy of the CBOW embeddings on each of the three datasets.
# Each print names the model and dataset (same format as the GloVe and BGE
# cells) so the three output lines can be told apart.

## Climate Change related words:
accuracy_cbow_climate_change = evaluate_analogy_dataset(analogy_df_climate_change, cbow_embeddings)
print(f"\nAnalogy Task Accuracy (CBOW - Climate Change): {accuracy_cbow_climate_change:.2%}")

## Reddit related words:
accuracy_cbow_reddit = evaluate_analogy_dataset(analogy_df_reddit, cbow_embeddings)
print(f"\nAnalogy Task Accuracy (CBOW - Reddit): {accuracy_cbow_reddit:.2%}")

## Politics/Climate related words:
accuracy_cbow_politics = evaluate_analogy_dataset(analogy_df_politics_climate, cbow_embeddings)
print(f"\nAnalogy Task Accuracy (CBOW - Politics/Climate): {accuracy_cbow_politics:.2%}")


### glove

In [None]:
# Analogy accuracy of the GloVe embeddings: score all three datasets first,
# then report them in the order climate change, Reddit, politics/climate.
accuracy_glove_climate_change = evaluate_analogy_dataset(analogy_df_climate_change, glove_embeddings)
accuracy_glove_reddit = evaluate_analogy_dataset(analogy_df_reddit, glove_embeddings)
accuracy_glove_politics_climate = evaluate_analogy_dataset(analogy_df_politics_climate, glove_embeddings)

print(f"\nAnalogy Task Accuracy (GloVe - Climate Change): {accuracy_glove_climate_change:.2%}")
print(f"\nAnalogy Task Accuracy (GloVe - Reddit): {accuracy_glove_reddit:.2%}")
print(f"\nAnalogy Task Accuracy (GloVe - Politics/Climate): {accuracy_glove_politics_climate:.2%}")


### bge

In [None]:
# Analogy accuracy of the BGE embeddings: score all three datasets first,
# then report them in the order climate change, Reddit, politics/climate.
accuracy_bge_climate_change = evaluate_analogy_dataset(analogy_df_climate_change, bge_embeddings)
accuracy_bge_reddit = evaluate_analogy_dataset(analogy_df_reddit, bge_embeddings)
accuracy_bge_politics_climate = evaluate_analogy_dataset(analogy_df_politics_climate, bge_embeddings)

print(f"\nAnalogy Task Accuracy (BGE - Climate Change): {accuracy_bge_climate_change:.2%}")
print(f"\nAnalogy Task Accuracy (BGE - Reddit): {accuracy_bge_reddit:.2%}")
print(f"\nAnalogy Task Accuracy (BGE - Politics/Climate): {accuracy_bge_politics_climate:.2%}")


### skipgram

In [None]:
# Analogy accuracy of the Skip-gram embeddings on each of the three datasets.
# Each print names the model and dataset (same format as the GloVe and BGE
# cells) so the three output lines can be told apart.

## Climate Change related words:
accuracy_skipgram_climate_change = evaluate_analogy_dataset(analogy_df_climate_change, skipgram_embeddings)
# Keep the original misspelled name as an alias in case other cells use it.
accuracy_skipram_climate_change = accuracy_skipgram_climate_change
print(f"\nAnalogy Task Accuracy (Skip-gram - Climate Change): {accuracy_skipgram_climate_change:.2%}")

## Reddit related words:
accuracy_skipgram_reddit = evaluate_analogy_dataset(analogy_df_reddit, skipgram_embeddings)
print(f"\nAnalogy Task Accuracy (Skip-gram - Reddit): {accuracy_skipgram_reddit:.2%}")

## Politics/Climate related words:
accuracy_skipgram_politics = evaluate_analogy_dataset(analogy_df_politics_climate, skipgram_embeddings)
print(f"\nAnalogy Task Accuracy (Skip-gram - Politics/Climate): {accuracy_skipgram_politics:.2%}")
