# Word Embeddings

### Creating Word Embedding Models

In [3]:
import numpy as np
import gensim
from gensim.models import Word2Vec
from gensim import corpora, models, similarities, downloader
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')
import re


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\zachm\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [4]:
# Fetch the three pretrained embedding models by name through gensim's
# downloader module; a progress line is printed before each load starts.
print("Downloading word2vec model")
word2vec_model = downloader.load('word2vec-google-news-300')

print("Downloading gigaword model")
glove_model = downloader.load('glove-wiki-gigaword-100')

print("Downloading fasttext model")
fasttext_model = downloader.load('fasttext-wiki-news-subwords-300')

Downloading word2vec model
Downloading gigaword model
Downloading fasttext model


In [5]:
def tokenize(text):
    """Lowercase ``text``, strip punctuation, and split it into word tokens.

    Every character that is neither a word character nor whitespace is
    removed before the remaining text is passed to ``word_tokenize``.

    Args:
        text: The raw string to tokenize.

    Returns:
        The list of tokens produced by ``word_tokenize``.
    """
    without_punctuation = re.sub(r'[^\w\s\d]', '', text.lower())
    return word_tokenize(without_punctuation)

In [6]:
def wordToVector(word, model):
    """Look up the embedding vector for ``word``.

    ``model`` may be a KeyedVectors-style object (indexable by word) or a
    full model that exposes such an object through a ``wv`` attribute, which
    is the same pair of inputs ``vectorToBestWords`` accepts.

    Args:
        word: The token to look up.
        model: The embedding model, or a wrapper exposing it as ``model.wv``.

    Returns:
        The stored vector for ``word``. If the lookup raises ``KeyError``
        (out-of-vocabulary word), a zero vector of length ``vector_size`` is
        returned instead.
    """
    # Full models keep their word vectors under ``.wv``; index that object
    # rather than the model itself.
    keyed_vectors = model.wv if hasattr(model, 'wv') else model
    try:
        return keyed_vectors[word]
    except KeyError:
        # Out-of-vocabulary word: fall back to an all-zero placeholder.
        return np.zeros(keyed_vectors.vector_size)

In [7]:
def textToVectors(text, model):
    """Convert ``text`` into an array holding one embedding per token.

    The text is split with ``tokenize`` and each token is looked up with
    ``wordToVector``.

    Args:
        text: The raw string to embed.
        model: The embedding model handed to ``wordToVector``.

    Returns:
        A NumPy array built from the per-token vectors, in token order.
    """
    return np.array([wordToVector(token, model) for token in tokenize(text)])

In [8]:
# Embed one sample sentence with each of the three pretrained models.
sample_sentence = "The quick brown fox jumped. Over the lazy, brown Dog!"

word2vec_vectors = textToVectors(sample_sentence, word2vec_model)
glove_vectors = textToVectors(sample_sentence, glove_model)
fasttext_vectors = textToVectors(sample_sentence, fasttext_model)

### Vector Compression with PCA

In [9]:
from sklearn.decomposition import PCA
import numpy as np

In [10]:
def compressVectorsPCA(data, target_count):
    """Reduce the number of rows in ``data`` to ``target_count`` using PCA.

    PCA is fitted on the transpose of ``data``, so the reduction happens
    along the first axis of ``data``; the second axis keeps its length.

    Args:
        data: 2-D array to compress.
        target_count: Number of principal components, i.e. the number of
            rows in the result.

    Returns:
        The transformed array, transposed back to the orientation of ``data``.
    """
    reducer = PCA(n_components=target_count)
    projected = reducer.fit_transform(data.T)
    return projected.T

In [11]:
# Show the shape before and after compressing the word2vec embeddings to 7 rows.
print(word2vec_vectors.shape)

# compressVectorsPCA transposes its input, so a lone 1-D vector is first
# promoted to a single-row matrix.
if word2vec_vectors.ndim == 1:
    word2vec_vectors = word2vec_vectors[np.newaxis, :]

compressed_word2vec_vectors = compressVectorsPCA(word2vec_vectors, 7)
print(compressed_word2vec_vectors.shape)

(10, 300)
(7, 300)


### Vector To Best Match Word(s)

In [19]:
def vectorToBestWords(target_vector, model, num_words=5):
    """Return the vocabulary words whose embeddings best match ``target_vector``.

    Matching uses cosine similarity between ``target_vector`` and every
    vector in the model's vocabulary.

    Args:
        target_vector: 1-D vector with the same length as the model's vectors.
        model: Either a full model exposing ``model.wv`` or a
            KeyedVectors-style object with ``vectors`` and ``index_to_key``.
        num_words: Maximum number of matches to return. Values of zero or
            less yield an empty list.

    Returns:
        A list of ``(word, similarity)`` tuples ordered from most to least
        similar. Cosine similarity is undefined when either vector has zero
        norm (for example the zero vector used for out-of-vocabulary words);
        such pairs are scored as 0.0 instead of NaN.
    """
    # A full Word2Vec model keeps its vectors under ``.wv``; anything else is
    # treated as a KeyedVectors-style object.
    keyed_vectors = model.wv if hasattr(model, 'wv') else model
    vectors = keyed_vectors.vectors
    index_to_key = keyed_vectors.index_to_key

    # Slicing with ``[-0:]`` would select the whole vocabulary, so the
    # "no words requested" case is handled explicitly.
    if num_words <= 0:
        return []

    norm_products = np.linalg.norm(vectors, axis=1) * np.linalg.norm(target_vector)

    # Zero norms produce 0/0 here; silence the warning and overwrite those
    # entries below. Left as NaN they would sort last in argsort and be
    # reported as the best matches.
    with np.errstate(divide='ignore', invalid='ignore'):
        similarity_scores = np.dot(vectors, target_vector) / norm_products
    similarity_scores[~(norm_products > 0)] = 0.0

    # Highest similarity first, truncated to the requested count.
    top_indices = similarity_scores.argsort()[::-1][:num_words]

    return [(index_to_key[idx], similarity_scores[idx]) for idx in top_indices]

In [20]:
# Run the same vector -> word demo against two of the pretrained models.
# Each entry is (display label, model, length of the random query vector).
target_word = "fast"
demo_models = [("Glove", glove_model, 100), ("Word2Vec", word2vec_model, 300)]

for position, (label, embedding_model, dimension) in enumerate(demo_models):
    # Blank separator between the per-model reports.
    if position > 0:
        print('\n')

    print(f"Performing Vector -> Word using {label}")

    # Nearest words to a random vector.
    target_vector_random = np.random.rand(dimension)
    similar_words_random = vectorToBestWords(target_vector_random, embedding_model, num_words=5)

    # Nearest words to the embedding of a real word.
    target_vector_word = wordToVector(target_word, embedding_model)
    similar_words_word = vectorToBestWords(target_vector_word, embedding_model, num_words=5)

    print("Best matches for random vector: ", similar_words_random)
    print(f"Best matches for specific vector for {target_word}: ", similar_words_word)

Performing Vector -> Word using Glove
Best matches for random vector:  [('db2', 0.40853849455847435), ('ldap', 0.38845534847767554), ('sqlite', 0.3835625956867372), ('zupljanin', 0.3681825436365551), ('eros', 0.365523298471245)]
Best matches for specific vector for fast:  [('fast', 0.99999994), ('slow', 0.795973), ('faster', 0.75118226), ('pace', 0.7462931), ('speed', 0.71333927)]


Performing Vector -> Word using Word2Vec
Best matches for random vector:  [('week_begs##Jun##', 0.2413109026849108), ('Cindy_Crawford', 0.23725296150794603), ('Ovidiu_Rom', 0.2289217162930197), ("l'_Affaire", 0.22402579167550726), ('Sons_Grosset_&', 0.2196883822516193)]
Best matches for specific vector for fast:  [('fast', 1.0000001), ('quick', 0.5701606), ('rapidly', 0.5525555), ('Fast', 0.5490224), ('quickly', 0.5393723)]


### Calculating Loss

In [None]:
import torch
import torch.nn.functional as F

# Two random 7 x 300 arrays, the same shape the PCA step above produced,
# used as stand-in inputs for the loss calculation.
compressed_vector_1, compressed_vector_2 = (np.random.rand(7, 300) for _ in range(2))
print('vector1\n', compressed_vector_1)
print('vector2\n', compressed_vector_2)

# Convert the NumPy arrays to float32 PyTorch tensors for the loss function.
compressed_vector_1 = torch.from_numpy(compressed_vector_1).to(torch.float32)
compressed_vector_2 = torch.from_numpy(compressed_vector_2).to(torch.float32)


def cosine_similarity_loss(compressed_vectors_1, compressed_vectors_2):
    """Mean squared gap between row-wise cosine similarity and 1.

    Both inputs are normalised along dimension 1, the cosine similarity of
    matching rows is computed, and the result is compared against a target
    of all ones with mean squared error.

    Args:
        compressed_vectors_1: 2-D tensor holding the first set of row vectors.
        compressed_vectors_2: 2-D tensor holding the second set of row vectors.

    Returns:
        A scalar tensor; 0 when every pair of rows points the same way.
    """
    unit_rows_1 = F.normalize(compressed_vectors_1, dim=1)
    unit_rows_2 = F.normalize(compressed_vectors_2, dim=1)

    # One similarity value per pair of rows.
    row_similarity = F.cosine_similarity(unit_rows_1, unit_rows_2)

    # Every pair is treated as "should be similar", hence a target of ones.
    return F.mse_loss(row_similarity, torch.ones_like(row_similarity))

# Sanity check of the loss on the two random tensors built above. Values from
# np.random.rand are non-negative, so every row-wise cosine similarity here is
# positive.
loss = cosine_similarity_loss(compressed_vector_1, compressed_vector_2)
print('loss', loss)


vector1
 [[0.37493081 0.89925593 0.68222099 ... 0.80354179 0.16098218 0.74905248]
 [0.57839804 0.10557084 0.05983191 ... 0.55696641 0.25766353 0.47368311]
 [0.90015595 0.96018257 0.92317255 ... 0.64130156 0.13815145 0.43976601]
 ...
 [0.81866496 0.29370606 0.31808073 ... 0.61089443 0.6236832  0.96462161]
 [0.85069772 0.52448952 0.61051311 ... 0.56898204 0.42140225 0.88865234]
 [0.32056013 0.96210239 0.39238596 ... 0.49111334 0.73328403 0.7347346 ]]
vector2
 [[0.75036178 0.24471017 0.53446834 ... 0.84077725 0.90364866 0.83368172]
 [0.38934142 0.04549182 0.94027247 ... 0.83180536 0.56619628 0.39809632]
 [0.19763913 0.79561056 0.26689064 ... 0.2096431  0.7927108  0.91064082]
 ...
 [0.48262378 0.5227481  0.36765888 ... 0.57648586 0.57718368 0.47268441]
 [0.04774399 0.84220641 0.11906403 ... 0.34460865 0.00331085 0.50848442]
 [0.36156734 0.81143025 0.33937332 ... 0.41617547 0.51920126 0.70450542]]
loss tensor(0.0626)


In [None]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# Candidate sentence pairs. In each entry the first tuple holds two sentences
# expected to be similar and the second tuple two expected to be dissimilar.
example_pairs = {
    "cat": (
        ("The cat sat on the mat", "A cat is sitting on the mat"),
        ("The cat sat on the mat", "Apple is a fruit"),
    ),
    # Similar sentences are about the same sport (soccer)
    "soccer": (
        ("The soccer team won the match", "The football squad was victorious in the game"),
        ("The soccer team won the match", "The basketball player scored a three-pointer"),
    ),
    # Similar sentences are about the same cooking action (baking a cake)
    "baking": (
        ("She is baking a cake for the party", "A cake is being baked for the celebration"),
        ("She is baking a cake for the party", "He is frying an egg for breakfast"),
    ),
}

# Only the baking example feeds the comparison below.
similar_text, dissimilar_text = example_pairs["baking"]

# Tokenize the sentences
similar_sentences = [tokenize(sentence) for sentence in similar_text]
dissimilar_sentences = [tokenize(sentence) for sentence in dissimilar_text]

# Train a Word2Vec model on just these four token lists
model = Word2Vec(similar_sentences + dissimilar_sentences, min_count=1)

# Represent each sentence by the mean of its word vectors, stacked into a
# float tensor with one row per sentence
similar_vectors = torch.tensor(
    np.array([model.wv[tokens].mean(axis=0) for tokens in similar_sentences])
).float()
dissimilar_vectors = torch.tensor(
    np.array([model.wv[tokens].mean(axis=0) for tokens in dissimilar_sentences])
).float()

# Score each pair. The loss normalises along dim 1, so every sentence vector
# is passed as a single-row 2-D slice.
similar_loss = cosine_similarity_loss(similar_vectors[:1], similar_vectors[1:])
dissimilar_loss = cosine_similarity_loss(dissimilar_vectors[:1], dissimilar_vectors[1:])

# The loss is the squared gap between cosine similarity and 1:
# 0 == same direction, 1 == orthogonal, larger values == opposed
print('Similar loss', similar_loss)
print('Dissimilar loss', dissimilar_loss)

Similar loss tensor(0.1246)
Dissimilar loss tensor(0.6277)
