# Compare performance of CBOW and Skipgram

In [4]:
import pandas as pd
import os
import numpy as np
from scipy.stats import spearmanr
from sklearn.metrics.pairwise import cosine_similarity


try:
    import google.colab
    in_colab = True
    local_path = "/content/drive/MyDrive/DLSS/"
    drive.mount('/content/drive')

except ImportError:
    in_colab = False
    ## get current directory
    current_wd = os.getcwd()
    ## move one up to go to main directory
    local_path = os.path.dirname(current_wd) + "/"

print("CWD: ", local_path)


CWD:  d:\dlss-project24/


## Functions

In [5]:
def df_to_embeddings_dict(df):
    embeddings = {}
    for _, row in df.iterrows():
        word = row['word']  # Assuming the word column is named 'word'
        vector = row.iloc[3:].to_numpy(dtype=np.float32)  # Convert remaining columns to numpy array
        embeddings[word] = vector
    return embeddings

def compute_similarities(word_pairs, embeddings):
    similarities = []
    for word1, word2 in word_pairs:
        if word1 in embeddings and word2 in embeddings:
            vec1 = embeddings[word1].reshape(1, -1)
            vec2 = embeddings[word2].reshape(1, -1)
            similarity = cosine_similarity(vec1, vec2)[0][0]
        else:
            similarity = 0  # Handle OOV words
        similarities.append(similarity)
    return similarities

## Data Prep

### prepare embeddings

In [10]:
embeddings_cbow_df = pd.read_csv(local_path + "data/embeddings/embeddings_CBOW_sampled_100k_all_text.csv")
embeddings_skipgram_df = pd.read_csv(local_path + "data/embeddings/embeddings_skipgram_sampled_100k_all_text.csv")

## to dict
skipgram_embeddings = df_to_embeddings_dict(embeddings_skipgram_df)
cbow_embeddings = df_to_embeddings_dict(embeddings_cbow_df)

### prepare wordsim

In [11]:
wordsim353_df = pd.read_csv(local_path + "data/external_data/wordsim353crowd.csv")

## split up into list of word pairs and list of scores
word_pairs = wordsim353_df[['Word 1', 'Word 2']].values.tolist()
human_scores = wordsim353_df['Human (Mean)'].values

wordsim353_df

Unnamed: 0,Word 1,Word 2,Human (Mean)
0,admission,ticket,5.5360
1,alcohol,chemistry,4.1250
2,aluminum,metal,6.6250
3,announcement,effort,2.0625
4,announcement,news,7.1875
...,...,...,...
348,weapon,secret,2.5000
349,weather,forecast,5.4375
350,Wednesday,news,1.1250
351,wood,forest,7.9375


## Word Similarity 

In [12]:
skipgram_similarities = compute_similarities(word_pairs, skipgram_embeddings)
cbow_similarities = compute_similarities(word_pairs, cbow_embeddings)

## calculate correlation
correlation_skipgram = spearmanr(skipgram_similarities, human_scores).correlation
correlation_cbow = spearmanr(cbow_similarities, human_scores).correlation

# Step 5: Output the Results
print(f"Spearman Correlation (Skip-gram): {correlation_skipgram:.2f}")
print(f"Spearman Correlation (CBOW): {correlation_cbow:.2f}")

Spearman Correlation (Skip-gram): 0.05
Spearman Correlation (CBOW): 0.05


## Word Analogy

In [13]:
import spacy
import numpy as np
from scipy.spatial.distance import cosine
import pandas as pd

# Load SpaCy model for lemmatization
nlp = spacy.load('en_core_web_sm')

# Function to lemmatize words
def lemmatize_word(word):
    doc = nlp(word)
    return doc[0].lemma_

# Function to predict the fourth word in an analogy
def predict_analogy_word(word_a, word_b, word_c, word_to_vec):
    # Lemmatize the words to match the vocabulary
    word_a = lemmatize_word(word_a)
    word_b = lemmatize_word(word_b)
    word_c = lemmatize_word(word_c)

    # Retrieve the vectors for the words
    vec_a = word_to_vec.get(word_a)
    vec_b = word_to_vec.get(word_b)
    vec_c = word_to_vec.get(word_c)

    if vec_a is None or vec_b is None or vec_c is None:
        return None

    # Calculate the target vector: vec_b - vec_a + vec_c
    target_vec = vec_b - vec_a + vec_c

    # Find the closest word to the target vector
    best_word = None
    best_similarity = float('inf')
    
    for word, vec in word_to_vec.items():
        if word not in {word_a, word_b, word_c}:
            similarity = cosine(target_vec, vec)
            if similarity < best_similarity:
                best_similarity = similarity
                best_word = word
    
    return best_word

def evaluate_analogy_dataset(analogy_df, word_to_vec):
    correct = 0
    total = len(analogy_df)

    for index, row in analogy_df.iterrows():
        word_a = row['Word 1']
        word_b = row['Word 2']
        word_c = row['Word 3']
        expected_word_d = row['Expected Word']

        predicted_word_d = predict_analogy_word(word_a, word_b, word_c, word_to_vec)

        # Compare the predicted word with the expected word
        if predicted_word_d == lemmatize_word(expected_word_d):
            correct += 1
        
        print(f"Analogy: {word_a} is to {word_b} as {word_c} is to {predicted_word_d} (Expected: {expected_word_d})")
    
    # Calculate accuracy
    accuracy = correct / total
    return accuracy

# Example DataFrame of analogies
data_climate_change = {
    'Word 1': ['coal', 'fossil', 'emission', 'climate', 'carbon', 'global', 'electric', 'renewable', 'methane', 'solar',
               'carbon', 'temperature', 'ice', 'deforestation', 'dioxide', 'sea', 'wind', 'greenhouse', 'sustainability', 'pollution'],
    'Word 2': ['fossil', 'energy', 'reduction', 'warming', 'dioxide', 'warming', 'vehicle', 'energy', 'gas', 'power',
               'dioxide', 'rise', 'melt', 'reduction', 'gas', 'level', 'turbine', 'emissions', 'policy', 'impact'],
    'Word 3': ['solar', 'renewable', 'pollution', 'environment', 'methane', 'cooling', 'car', 'wind', 'CO2', 'wind',
               'carbon', 'impact', 'melting', 'forestation', 'gas', 'carbon', 'turbine', 'mitigation', 'climate', 'change'],
    'Expected Word': ['renewable', 'resource', 'control', 'sustainability', 'gas', 'cooling', 'car', 'turbine', 'greenhouse', 'energy',
                      'emission', 'rise', 'melt', 'deforestation', 'gas', 'level', 'wind', 'policy', 'impact', 'policy']}

analogy_df_climate_change = pd.DataFrame(data_climate_change)

data_reddit = {
    'Word 1': ['OP', 'thread', 'TL;DR', 'upvote', 'troll', 'mod', 'AMA', 'lurker', 'NSFW', 'flair',
               'comment', 'karma', 'subreddit', 'post', 'reply', 'ban', 'meme', 'user', 'admin', 'tag'],
    'Word 2': ['post', 'discussion', 'summary', 'downvote', 'bait', 'admin', 'Q&A', 'reader', 'SFW', 'label',
               'reply', 'points', 'community', 'thread', 'comment', 'ban', 'GIF', 'user', 'moderator', 'badge'],
    'Word 3': ['comment', 'reply', 'context', 'upvote', 'spam', 'user', 'ask', 'reader', 'explicit', 'flair',
               'upvote', 'comment', 'thread', 'discussion', 'report', 'image', 'moderator', 'poster', 'sub', 'message'],
    'Expected Word': ['reply', 'discussion', 'summary', 'downvote', 'bait', 'admin', 'Q&A', 'lurker', 'SFW', 'tag',
                      'comment', 'points', 'subreddit', 'post', 'reply', 'ban', 'sticker', 'user', 'admin', 'flair']}

analogy_df_reddit = pd.DataFrame(data_reddit)

data_politics_climate = {
    'Word 1': ['EPA', 'Paris Agreement', 'Biden', 'UN', 'Congress', 'carbon tax', 'renewable energy', 'climate bill', 'COP26', 'Green New Deal',
               'regulation', 'emissions', 'legislation', 'government', 'policy', 'administration', 'carbon footprint', 'international', 'president', 'senator'],
    'Word 2': ['regulation', 'international accord', 'administration', 'global body', 'legislature', 'carbon pricing', 'clean energy', 'policy', 'summit', 'policy',
               'rules', 'treaty', 'law', 'leadership', 'initiative', 'impact', 'effort', 'negotiation', 'leader', 'law'],
    'Word 3': ['Paris Agreement', 'UN', 'Biden', 'G7', 'senator', 'cap-and-trade', 'climate action', 'agenda', 'COP21', 'climate legislation',
               'treaty', 'agreement', 'regulation', 'administration', 'program', 'target', 'campaign', 'deal', 'conference', 'bill'],
    'Expected Word': ['international accord', 'agreement', 'administration', 'global body', 'congress', 'carbon pricing', 'clean energy', 'policy', 'summit', 'policy',
                      'regulation', 'treaty', 'law', 'leadership', 'initiative', 'impact', 'effort', 'negotiation', 'leader', 'law']
}

analogy_df_politics_climate = pd.DataFrame(data_politics_climate)




#### CBOW

In [14]:
## Climate Change related words:
accuracy_cbow_climate_change = evaluate_analogy_dataset(analogy_df_climate_change, cbow_embeddings)
print(f"\nAnalogy Task Accuracy: {accuracy_cbow_climate_change:.2%}")


  dist = 1.0 - uv / math.sqrt(uu * vv)


Analogy: coal is to fossil as solar is to oddball (Expected: renewable)
Analogy: fossil is to energy as renewable is to antler (Expected: resource)
Analogy: emission is to reduction as pollution is to antler (Expected: control)
Analogy: climate is to warming as environment is to synagogue (Expected: sustainability)
Analogy: carbon is to dioxide as methane is to antler (Expected: gas)
Analogy: global is to warming as cooling is to synagogue (Expected: cooling)
Analogy: electric is to vehicle as car is to synagogue (Expected: car)
Analogy: renewable is to energy as wind is to synagogue (Expected: turbine)
Analogy: methane is to gas as CO2 is to None (Expected: greenhouse)
Analogy: solar is to power as wind is to antler (Expected: energy)
Analogy: carbon is to dioxide as carbon is to antler (Expected: emission)
Analogy: temperature is to rise as impact is to antler (Expected: rise)
Analogy: ice is to melt as melting is to antler (Expected: melt)
Analogy: deforestation is to reduction as f

In [15]:
## reddit related words:
accuracy_cbow_reddit = evaluate_analogy_dataset(analogy_df_reddit, cbow_embeddings)
print(f"\nAnalogy Task Accuracy: {accuracy_cbow_reddit:.2%}")


Analogy: OP is to post as comment is to synagogue (Expected: reply)
Analogy: thread is to discussion as reply is to reus (Expected: discussion)
Analogy: TL;DR is to summary as context is to None (Expected: summary)
Analogy: upvote is to downvote as upvote is to synagogue (Expected: downvote)
Analogy: troll is to bait as spam is to shedding (Expected: bait)
Analogy: mod is to admin as user is to synagogue (Expected: admin)
Analogy: AMA is to Q&A as ask is to None (Expected: Q&A)
Analogy: lurker is to reader as reader is to antler (Expected: lurker)
Analogy: NSFW is to SFW as explicit is to None (Expected: SFW)
Analogy: flair is to label as flair is to synagogue (Expected: tag)
Analogy: comment is to reply as upvote is to antler (Expected: comment)
Analogy: karma is to points as comment is to synagogue (Expected: points)
Analogy: subreddit is to community as thread is to synagogue (Expected: subreddit)
Analogy: post is to thread as discussion is to antler (Expected: post)
Analogy: reply 

In [16]:
## reddit related words:
accuracy_cbow_reddit = evaluate_analogy_dataset(analogy_df_politics_climate, cbow_embeddings)
print(f"\nAnalogy Task Accuracy: {accuracy_cbow_reddit:.2%}")


Analogy: EPA is to regulation as Paris Agreement is to None (Expected: international accord)
Analogy: Paris Agreement is to international accord as UN is to None (Expected: agreement)
Analogy: Biden is to administration as Biden is to None (Expected: administration)
Analogy: UN is to global body as G7 is to None (Expected: global body)
Analogy: Congress is to legislature as senator is to None (Expected: congress)
Analogy: carbon tax is to carbon pricing as cap-and-trade is to synagogue (Expected: carbon pricing)
Analogy: renewable energy is to clean energy as climate action is to synagogue (Expected: clean energy)
Analogy: climate bill is to policy as agenda is to synagogue (Expected: policy)
Analogy: COP26 is to summit as COP21 is to None (Expected: summit)
Analogy: Green New Deal is to policy as climate legislation is to None (Expected: policy)
Analogy: regulation is to rules as treaty is to shedding (Expected: regulation)
Analogy: emissions is to treaty as agreement is to synagogue 

#### Skipgram

In [17]:
## Climate Change related words:
accuracy_skipram_climate_change = evaluate_analogy_dataset(analogy_df_climate_change, skipgram_embeddings)
print(f"\nAnalogy Task Accuracy: {accuracy_skipram_climate_change:.2%}")


Analogy: coal is to fossil as solar is to oddball (Expected: renewable)
Analogy: fossil is to energy as renewable is to antler (Expected: resource)
Analogy: emission is to reduction as pollution is to antler (Expected: control)
Analogy: climate is to warming as environment is to synagogue (Expected: sustainability)
Analogy: carbon is to dioxide as methane is to antler (Expected: gas)
Analogy: global is to warming as cooling is to synagogue (Expected: cooling)
Analogy: electric is to vehicle as car is to synagogue (Expected: car)
Analogy: renewable is to energy as wind is to synagogue (Expected: turbine)
Analogy: methane is to gas as CO2 is to None (Expected: greenhouse)
Analogy: solar is to power as wind is to antler (Expected: energy)
Analogy: carbon is to dioxide as carbon is to antler (Expected: emission)
Analogy: temperature is to rise as impact is to antler (Expected: rise)
Analogy: ice is to melt as melting is to antler (Expected: melt)
Analogy: deforestation is to reduction as f

In [None]:
## reddit related words:
accuracy_skipgram_reddit = evaluate_analogy_dataset(analogy_df_reddit, skipgram_embeddings)
print(f"\nAnalogy Task Accuracy: {accuracy_skipgram_reddit:.2%}")


Analogy: OP is to comment as post is to ask (Expected: OP)
Analogy: thread is to discussion as reply is to difficult (Expected: discussion)
Analogy: TL;DR is to summary as context is to None (Expected: summary)
Analogy: upvote is to downvote as upvote is to None (Expected: downvote)
Analogy: troll is to bait as spam is to None (Expected: bait)
Analogy: mod is to admin as user is to None (Expected: admin)
Analogy: AMA is to Q&A as ask is to None (Expected: Q&A)
Analogy: lurker is to poster as reader is to None (Expected: lurker)
Analogy: NSFW is to SFW as explicit is to None (Expected: SFW)
Analogy: flair is to tag as label is to None (Expected: tag)

Analogy Task Accuracy: 0.00%
