### Word Embedding Models ###

In [1]:
import numpy as np
import gensim
from gensim.models import Word2Vec
from gensim import corpora, models, similarities, downloader
import nltk
from nltk.tokenize import word_tokenize


In [2]:
print("Downloading word2vec model")
word2vec_model = gensim.downloader.load('word2vec-google-news-300')
print("Downloading gigaword model")
glove_model = gensim.downloader.load('glove-wiki-gigaword-100')
print("Downloading fasttext model")
fasttext_model = gensim.downloader.load('fasttext-wiki-news-subwords-300')

Downloading word2vec model
Downloading gigaword model
Downloading fasttext model


In [3]:
def tokenize(text):
    text = text.lower()
    tokens = word_tokenize(text)
    return tokens

In [4]:
def wordToVector(word, model):
    try:
        return model[word]
    except KeyError:
        # Handle out-of-vocabulary words
        return np.zeros(model.vector_size)  # Return zero vector for OOV words

In [5]:
def textToVectors(text, model):
    tokens = tokenize(text)  # Tokenize the text
    vectors = [wordToVector(token, model) for token in tokens]  # Convert words to vectors
    vectors = np.array(vectors)
    return vectors

In [6]:
nltk.download('punkt') # needed this dependency to run this module

word2vec_vectors = textToVectors("The quick brown fox jumped. Over the lazy, brown Dog!", word2vec_model)
glove_vectors = textToVectors("The quick brown fox jumped. Over the lazy, brown Dog!", glove_model)
fasttext_vectors = textToVectors("The quick brown fox jumped. Over the lazy, brown Dog!", fasttext_model)

# print(word2vec_vectors)
# print(glove_vectors)
# print(fasttext_vectors)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\zachm\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [7]:
from sklearn.decomposition import PCA
import numpy as np

In [8]:
def compressVectorsPCA(data, target_count):
    pca = PCA(n_components=target_count)
    compressed_data = pca.fit_transform(data.T).T
    return compressed_data

In [9]:
print(np.shape(word2vec_vectors))
if len(word2vec_vectors.shape) == 1:
    word2vec_vectors = np.expand_dims(word2vec_vectors, axis=0)
compressed_word2vec_vectors = compressVectorsPCA(word2vec_vectors, 7)
print(np.shape(compressed_word2vec_vectors))

(13, 300)
(7, 300)


# Calculating Loss

In [10]:
import torch
compressed_vector_1 = np.random.rand(7, 300)
print('vector1\n', compressed_vector_1)

compressed_vector_2 = np.random.rand(7, 300)
print('vector2\n', compressed_vector_2)

# Loss calculation
import torch.nn.functional as F


# Convert numpy arrays to PyTorch tensors
compressed_vector_1 = torch.from_numpy(compressed_vector_1).float()
compressed_vector_2 = torch.from_numpy(compressed_vector_2).float()


def cosine_similarity_loss(compressed_vectors_1, compressed_vectors_2):
    # Normalize the compressed vectors
    compressed_vectors_1 = F.normalize(compressed_vectors_1, dim=1)
    compressed_vectors_2 = F.normalize(compressed_vectors_2, dim=1)
    
    # Calculate the cosine similarity between the compressed vectors
    cosine_similarities = F.cosine_similarity(compressed_vectors_1, compressed_vectors_2)
    
    # Define the target labels (1 for similar pairs)
    target = torch.ones_like(cosine_similarities)
    
    # Calculate the MSE loss
    loss = F.mse_loss(cosine_similarities, target)
    
    return loss

loss = cosine_similarity_loss(compressed_vector_1, compressed_vector_2)
print('loss', loss)


vector1
 [[0.5998198  0.87476283 0.18173789 ... 0.34574115 0.46002654 0.4450597 ]
 [0.54420733 0.60855119 0.07550424 ... 0.24339172 0.49459274 0.45631048]
 [0.77527884 0.94949736 0.52133708 ... 0.40341107 0.61114772 0.42465905]
 ...
 [0.03183342 0.81069096 0.20642733 ... 0.43671026 0.04127853 0.97592815]
 [0.73900767 0.74957235 0.47429497 ... 0.79351917 0.5721325  0.60575341]
 [0.86258475 0.89757387 0.76513914 ... 0.75768511 0.75769022 0.74978716]]
vector2
 [[0.19516028 0.02849873 0.92067956 ... 0.97349843 0.97812601 0.91455581]
 [0.9757859  0.96299027 0.28742315 ... 0.87991771 0.74952072 0.44609446]
 [0.14679533 0.79318475 0.08814172 ... 0.39968938 0.80172621 0.66361208]
 ...
 [0.45635654 0.12651347 0.37567982 ... 0.32325817 0.61019748 0.5223292 ]
 [0.08723689 0.79460083 0.76671261 ... 0.8334818  0.69967357 0.04181071]
 [0.24358481 0.36747512 0.88149076 ... 0.50479809 0.72011864 0.95091518]]
loss tensor(0.0655)


In [14]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# Define some similar and dissimilar sentences
similar_sentences = ["The cat sat on the mat", "A cat is sitting on the mat"]
dissimilar_sentences = ["The cat sat on the mat", "Apple is a fruit"]

# Similar sentences are about the same sport (soccer)
similar_sentences = ["The soccer team won the match", "The football squad was victorious in the game"]
dissimilar_sentences = ["The soccer team won the match", "The basketball player scored a three-pointer"]

# Similar sentences are about the same cooking action (baking a cake)
similar_sentences = ["She is baking a cake for the party", "A cake is being baked for the celebration"]
dissimilar_sentences = ["She is baking a cake for the party", "He is frying an egg for breakfast"]


# Tokenize the sentences
similar_sentences = [tokenize(sentence) for sentence in similar_sentences]
dissimilar_sentences = [tokenize(sentence) for sentence in dissimilar_sentences]

# print('similar_sentence tokens:', similar_sentences)
# print('dissimilar_sentences tokens:', dissimilar_sentences)

# Train a Word2Vec model on the sentences
model = Word2Vec(similar_sentences + dissimilar_sentences, min_count=1)

# Convert the sentences to vectors
similar_vectors = [model.wv[sentence].mean(axis=0) for sentence in similar_sentences]
dissimilar_vectors = [model.wv[sentence].mean(axis=0) for sentence in dissimilar_sentences]


# print('similar_vectors:\n', similar_vectors)
# print('dissimilar_vectors:\n', dissimilar_vectors)

# Convert the vectors to PyTorch tensors
similar_vectors = torch.tensor(np.array(similar_vectors)).float()
dissimilar_vectors = torch.tensor(np.array(dissimilar_vectors)).float()

# Calculate the loss for the similar and dissimilar sentences
similar_loss = cosine_similarity_loss(similar_vectors[0].unsqueeze(0), similar_vectors[1].unsqueeze(0))
dissimilar_loss = cosine_similarity_loss(dissimilar_vectors[0].unsqueeze(0), dissimilar_vectors[1].unsqueeze(0))


# close to 0 == similar, closer to 1 == dissimilar
print('Similar loss', similar_loss)
print('Dissimilar loss', dissimilar_loss)

Similar loss tensor(0.1246)
Dissimilar loss tensor(0.6277)
