# Word Embeddings

### Creating Word Embedding Models

In [1]:
import numpy as np
import gensim
from gensim.models import Word2Vec
from gensim import corpora, models, similarities, downloader
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
import re


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\zachm\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
print("Downloading word2vec model")
word2vec_model = gensim.downloader.load('word2vec-google-news-300')
print("Downloading gigaword model")
glove_model = gensim.downloader.load('glove-wiki-gigaword-100')
print("Downloading fasttext model")
fasttext_model = gensim.downloader.load('fasttext-wiki-news-subwords-300')

Downloading word2vec model
Downloading gigaword model
Downloading fasttext model


In [3]:
def tokenize(text):
    text = text.lower()
    text = re.sub(r'[^\w\s\d]', '', text)
    tokens = word_tokenize(text)
    # print(tokens)
    return tokens

In [4]:
def wordToVector(word, model):
    try:
        return model[word]
    except KeyError:
        # Handle out-of-vocabulary words
        return np.zeros(model.vector_size)  # Return zero vector for OOV words

In [5]:
def textToVectors(text, model):
    tokens = tokenize(text)  # Tokenize the text
    vectors = [wordToVector(token, model) for token in tokens]  # Convert words to vectors
    vectors = np.array(vectors)
    return vectors

In [6]:
word2vec_vectors = textToVectors("The quick brown fox jumped. Over the lazy, brown Dog!", word2vec_model)
glove_vectors = textToVectors("The quick brown fox jumped. Over the lazy, brown Dog!", glove_model)
fasttext_vectors = textToVectors("The quick brown fox jumped. Over the lazy, brown Dog!", fasttext_model)

# print(word2vec_vectors)
# print(glove_vectors)
# print(fasttext_vectors)

### Vector Compression with PCA

It seems like using PCA results in vectors losing any meaning to the word embedding models. Although this might technically preserve a lot of data, we would need a new model to interpret them.

In [7]:
from sklearn.decomposition import PCA
import numpy as np

In [8]:
def compressVectorsPCA(data, target_count):
    pca = PCA(n_components=target_count)
    compressed_data = pca.fit_transform(data.T).T
    return compressed_data

In [9]:
print(np.shape(word2vec_vectors))
if len(word2vec_vectors.shape) == 1:
    word2vec_vectors = np.expand_dims(word2vec_vectors, axis=0)
compressed_word2vec_vectors = compressVectorsPCA(word2vec_vectors, 7)
print(np.shape(compressed_word2vec_vectors))

(10, 300)
(7, 300)


### Vector Compression With Averaging - UNFINISHED

In [74]:
import math

#Example usage:
#[v1, v2, v3, v4], 2 -> [(v1 + v2) / 2, (v3 + v4) / 2]
#[v1, v2, v3], 2 -> [(v1 + (v2/2)) / 1.5, ((v2/2) + v3) / 1.5]

def compressVectorsWithAveraging(vectors, compression_size):
    group_size = len(vectors) / compression_size
    compressed_vectors = []
    i = 0
    while (i < len(vectors) - group_size):
        # print(i)
        group_end = i + group_size
        # print(np.shape(vectors[0]))
        new_vec = np.zeros_like(vectors[0])

        while i < group_end:
            new_vec = []
            if (group_end - i) >= 1:
                dist_to_prev_whole_number = (i % 1)
                dist_to_next_whole_number = 1 - dist_to_prev_whole_number

                new_vec = dist_to_next_whole_number * np.array(vectors[i])
                print(new_vec)
                i += 1

        # print(np.shape(new_vec))
        

    return np.array(compressed_vectors)


In [75]:
word2vec_vectors = textToVectors("The quick brown fox jumped. Over the lazy, brown Dog!", word2vec_model)

# for vec in word2vec_vectors:
#     print(np.linalg.norm(vec))
print()
compressed_vectors = compressVectorsWithAveraging(word2vec_vectors, 6)
# for vec in compressed_vectors:
#     print(np.linalg.norm(vec))

# very_compressed_vectors = compressVectorsWithAveraging(word2vec_vectors, 3)


[ 0.08007812  0.10498047  0.04980469  0.0534668  -0.06738281 -0.12060547
  0.03515625 -0.11865234  0.04394531  0.03015137 -0.05688477 -0.07617188
  0.01287842  0.04980469 -0.08496094 -0.06347656  0.00628662 -0.04321289
  0.02026367  0.01330566 -0.01953125  0.09277344 -0.171875   -0.00131989
  0.06542969  0.05834961 -0.08251953  0.0859375  -0.00318909  0.05859375
 -0.03491211 -0.0123291  -0.0480957  -0.00302124  0.05639648  0.01495361
 -0.07226562 -0.05224609  0.09667969  0.04296875 -0.03540039 -0.07324219
  0.03271484 -0.06176758  0.00787354  0.0035553  -0.00878906  0.0390625
  0.03833008  0.04443359  0.06982422  0.01263428 -0.00445557 -0.03320312
 -0.04272461  0.09765625 -0.02160645 -0.0378418   0.01190186 -0.01391602
 -0.11328125  0.09326172 -0.03930664 -0.11621094  0.02331543 -0.01599121
  0.02636719  0.10742188 -0.00466919  0.09619141  0.0279541  -0.05395508
  0.08544922 -0.03686523 -0.02026367 -0.08544922  0.125       0.14453125
  0.0267334   0.15039062  0.05273438 -0.18652344  0

KeyboardInterrupt: 

In [55]:
for i, vec in enumerate(compressed_vectors):
    best_matches = vectorToBestWords(vec, word2vec_model, num_words=5)
    print(f"best matches for compressed vector {i}: ", best_matches)

for i, vec in enumerate(very_compressed_vectors):
    best_matches = vectorToBestWords(vec, word2vec_model, num_words=5)
    print(f"best matches for very compressed vector {i}: ", best_matches)

best matches for compressed vector 0:  [('quick', 0.9999999), ('swift', 0.6208426), ('speedy', 0.5804499), ('fast', 0.57016057), ('easy', 0.56960756)]
best matches for compressed vector 1:  [('fox', 0.99999994), ('foxes', 0.77625567), ('squirrel', 0.6794781), ('rabbit', 0.6482737), ('squirrels', 0.638612)]
best matches for compressed vector 2:  [('over', 0.99999994), ('past', 0.5859714), ('Over', 0.5610154), ('overthe', 0.55483913), ('within', 0.4844896)]
best matches for compressed vector 3:  [('the', 0.9999998), ('this', 0.5937378), ('in', 0.5429296), ('that', 0.526257), ('ofthe', 0.51502824)]
best matches for compressed vector 4:  [('brown', 0.99999994), ('brownish', 0.70204836), ('reddish_brown', 0.69402885), ('reddish', 0.6582767), ('white', 0.65807706)]
best matches for very compressed vector 0:  [('fox', 0.99999994), ('foxes', 0.77625567), ('squirrel', 0.6794781), ('rabbit', 0.6482737), ('squirrels', 0.638612)]
best matches for very compressed vector 1:  [('the', 0.9999998), ('t

### Vector To Best Match Word(s)

In [47]:
def vectorToBestWords(target_vector, model, num_words=5):
    # Check if model is a full Word2Vec model
    if hasattr(model, 'wv'):
        vectors = model.wv.vectors
        index_to_key = model.wv.index_to_key
    # If not, assume it's a KeyedVectors object
    else:
        vectors = model.vectors
        index_to_key = model.index_to_key

    # Compute cosine similarity between target vector and all word vectors
    similarity_scores = np.dot(vectors, target_vector) / (np.linalg.norm(vectors, axis=1) * np.linalg.norm(target_vector))

    # Get indices of top N words with highest similarity scores
    top_indices = similarity_scores.argsort()[-num_words:][::-1]

    # Get top N words and their similarity scores
    similar_words = [(index_to_key[idx], similarity_scores[idx]) for idx in top_indices]

    return similar_words

In [11]:
print("Performing Vector -> Word using Glove")
target_vector_random = np.random.rand(100)  # Random target vector
similar_words_random = vectorToBestWords(target_vector_random, glove_model, num_words=5)

target_word = "fast"
target_vector_word = wordToVector(target_word, glove_model)
similar_words_word = vectorToBestWords(target_vector_word, glove_model, num_words=5)

print("Best matches for random vector: ", similar_words_random)
print(f"Best matches for specific vector for {target_word}: ", similar_words_word)

print('\n')

print("Performing Vector -> Word using Word2Vec")
target_vector_random = np.random.rand(300)  # Random target vector
similar_words_random = vectorToBestWords(target_vector_random, word2vec_model, num_words=5)

target_word = "fast"
target_vector_word = wordToVector(target_word, word2vec_model)
similar_words_word = vectorToBestWords(target_vector_word, word2vec_model, num_words=5)

print("Best matches for random vector: ", similar_words_random)
print(f"Best matches for specific vector for {target_word}: ", similar_words_word)

Performing Vector -> Word using Glove
Best matches for random vector:  [('sunita', 0.3848831591768465), ('trnopolje', 0.36304550206512476), ('eros', 0.35788758373142765), ('omarska', 0.3487484460830099), ('mig-19', 0.34581193864475784)]
Best matches for specific vector for fast:  [('fast', 0.99999994), ('slow', 0.795973), ('faster', 0.75118226), ('pace', 0.7462931), ('speed', 0.71333927)]


Performing Vector -> Word using Word2Vec
Best matches for random vector:  [('Legislative_Scorecard', 0.23590085064348204), ('AP_HOCKEY_NEWS', 0.23459775667067212), ('TRENDING_UP', 0.2237782728651971), ('Website_http://www.cgi.com', 0.2216507669298352), ('TRENDING_DOWN', 0.22140191310845694)]
Best matches for specific vector for fast:  [('fast', 1.0000001), ('quick', 0.5701606), ('rapidly', 0.5525555), ('Fast', 0.5490224), ('quickly', 0.5393723)]


# Calculating Loss

In [12]:
import torch
compressed_vector_1 = np.random.rand(7, 300)
print('vector1\n', compressed_vector_1)

compressed_vector_2 = np.random.rand(7, 300)
print('vector2\n', compressed_vector_2)

# Loss calculation
import torch.nn.functional as F


# Convert numpy arrays to PyTorch tensors
compressed_vector_1 = torch.from_numpy(compressed_vector_1).float()
compressed_vector_2 = torch.from_numpy(compressed_vector_2).float()


def cosine_similarity_loss(compressed_vectors_1, compressed_vectors_2):
    # Normalize the compressed vectors
    compressed_vectors_1 = F.normalize(compressed_vectors_1, dim=1)
    compressed_vectors_2 = F.normalize(compressed_vectors_2, dim=1)
    
    # Calculate the cosine similarity between the compressed vectors
    cosine_similarities = F.cosine_similarity(compressed_vectors_1, compressed_vectors_2)
    
    # Define the target labels (1 for similar pairs)
    target = torch.ones_like(cosine_similarities)
    
    # Calculate the MSE loss
    loss = F.mse_loss(cosine_similarities, target)
    
    return loss

loss = cosine_similarity_loss(compressed_vector_1, compressed_vector_2)
print('loss', loss)


vector1
 [[7.39749257e-01 8.39965802e-02 7.03839622e-01 ... 5.73710887e-01
  4.62328625e-01 3.18578680e-01]
 [8.22803691e-01 4.67929492e-01 1.76120936e-01 ... 9.44050707e-01
  8.30725811e-01 1.29721382e-01]
 [9.82973136e-01 3.21305293e-01 7.44089749e-01 ... 2.67614653e-01
  1.93074362e-01 5.24516552e-01]
 ...
 [7.77721563e-01 3.98477142e-01 4.73752422e-02 ... 6.38497226e-01
  1.51927150e-02 3.23426927e-05]
 [1.79944801e-01 7.81132929e-03 3.40351758e-01 ... 2.26077714e-01
  9.68746871e-01 1.14903095e-01]
 [9.02326391e-01 4.94349923e-01 7.32896543e-02 ... 2.86647824e-01
  9.53655073e-01 9.08831145e-01]]
vector2
 [[0.2669689  0.49207948 0.21668173 ... 0.3196332  0.24785025 0.96875742]
 [0.96472003 0.3580558  0.63984578 ... 0.02406716 0.82997334 0.57165175]
 [0.31159191 0.29564814 0.72713214 ... 0.74641691 0.39253504 0.84042203]
 ...
 [0.21530968 0.67735914 0.02367151 ... 0.72195895 0.58287576 0.98881385]
 [0.58858401 0.33128256 0.60276719 ... 0.69665234 0.28861399 0.9899224 ]
 [0.67471068

In [13]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# Define some similar and dissimilar sentences
similar_sentences = ["The cat sat on the mat", "A cat is sitting on the mat"]
dissimilar_sentences = ["The cat sat on the mat", "Apple is a fruit"]

# Similar sentences are about the same sport (soccer)
similar_sentences = ["The soccer team won the match", "The football squad was victorious in the game"]
dissimilar_sentences = ["The soccer team won the match", "The basketball player scored a three-pointer"]

# Similar sentences are about the same cooking action (baking a cake)
similar_sentences = ["She is baking a cake for the party", "A cake is being baked for the celebration"]
dissimilar_sentences = ["She is baking a cake for the party", "He is frying an egg for breakfast"]


# Tokenize the sentences
similar_sentences = [tokenize(sentence) for sentence in similar_sentences]
dissimilar_sentences = [tokenize(sentence) for sentence in dissimilar_sentences]

# print('similar_sentence tokens:', similar_sentences)
# print('dissimilar_sentences tokens:', dissimilar_sentences)

# Train a Word2Vec model on the sentences
model = Word2Vec(similar_sentences + dissimilar_sentences, min_count=1)

# Convert the sentences to vectors
similar_vectors = [model.wv[sentence].mean(axis=0) for sentence in similar_sentences]
dissimilar_vectors = [model.wv[sentence].mean(axis=0) for sentence in dissimilar_sentences]


# print('similar_vectors:\n', similar_vectors)
# print('dissimilar_vectors:\n', dissimilar_vectors)

# Convert the vectors to PyTorch tensors
similar_vectors = torch.tensor(np.array(similar_vectors)).float()
dissimilar_vectors = torch.tensor(np.array(dissimilar_vectors)).float()

# Calculate the loss for the similar and dissimilar sentences
similar_loss = cosine_similarity_loss(similar_vectors[0].unsqueeze(0), similar_vectors[1].unsqueeze(0))
dissimilar_loss = cosine_similarity_loss(dissimilar_vectors[0].unsqueeze(0), dissimilar_vectors[1].unsqueeze(0))


# close to 0 == similar, closer to 1 == dissimilar
print('Similar loss', similar_loss)
print('Dissimilar loss', dissimilar_loss)

Similar loss tensor(0.1246)
Dissimilar loss tensor(0.6277)
