In [1]:
import numpy as np
import gensim
from gensim.models import Word2Vec
from gensim import corpora, models, similarities, downloader
import nltk
from nltk.tokenize import word_tokenize


In [2]:
print("Downloading word2vec model")
word2vec_model = gensim.downloader.load('word2vec-google-news-300')
print("Downloading gigaword model")
glove_model = gensim.downloader.load('glove-wiki-gigaword-100')
print("Downloading fasttext model")
fasttext_model = gensim.downloader.load('fasttext-wiki-news-subwords-300')

Downloading word2vec model
Downloading gigaword model
Downloading fasttext model


In [3]:
def tokenize(text):
    text = text.lower()
    tokens = word_tokenize(text)
    return tokens

In [4]:
def word_to_vector(word, model):
    try:
        return model[word]
    except KeyError:
        # Handle out-of-vocabulary words
        return np.zeros(model.vector_size)  # Return zero vector for OOV words

In [5]:
def text_to_vectors(text, model):
    tokens = tokenize(text)  # Tokenize the text
    vectors = [word_to_vector(token, model) for token in tokens]  # Convert words to vectors
    vectors = np.array(vectors)
    return vectors

In [11]:
nltk.download('punkt') # needed this dependency to run this module

word2vec_vectors = text_to_vectors("The quick brown fox jumped. Over the lazy, brown Dog!", word2vec_model)
glove_vectors = text_to_vectors("The quick brown fox jumped. Over the lazy, brown Dog!", glove_model)
fasttext_vectors = text_to_vectors("The quick brown fox jumped. Over the lazy, brown Dog!", fasttext_model)

# print(word2vec_vectors)
# print(glove_vectors)
# print(fasttext_vectors)

[[ 0.08007812  0.10498047  0.04980469 ...  0.00366211  0.04760742
  -0.06884766]
 [ 0.32421875  0.01373291 -0.02819824 ...  0.05175781 -0.09033203
  -0.12792969]
 [ 0.00787354  0.12890625  0.02734375 ... -0.01373291  0.12060547
   0.19238281]
 ...
 [ 0.00787354  0.12890625  0.02734375 ... -0.01373291  0.12060547
   0.19238281]
 [ 0.05126953 -0.02233887 -0.17285156 ...  0.41601562 -0.35546875
   0.22265625]
 [ 0.          0.          0.         ...  0.          0.
   0.        ]]
[array([-0.038194, -0.24487 ,  0.72812 , -0.39961 ,  0.083172,  0.043953,
       -0.39141 ,  0.3344  , -0.57545 ,  0.087459,  0.28787 , -0.06731 ,
        0.30906 , -0.26384 , -0.13231 , -0.20757 ,  0.33395 , -0.33848 ,
       -0.31743 , -0.48336 ,  0.1464  , -0.37304 ,  0.34577 ,  0.052041,
        0.44946 , -0.46971 ,  0.02628 , -0.54155 , -0.15518 , -0.14107 ,
       -0.039722,  0.28277 ,  0.14393 ,  0.23464 , -0.31021 ,  0.086173,
        0.20397 ,  0.52624 ,  0.17164 , -0.082378, -0.71787 , -0.41531 ,
    

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jeremysaccount/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [20]:
from sklearn.decomposition import PCA
import numpy as np

In [17]:
def compress_vectors_PCA(data, target_count):
    pca = PCA(n_components=target_count)
    compressed_data = pca.fit_transform(data.T).T
    return compressed_data

In [21]:
print(np.shape(word2vec_vectors))
if len(word2vec_vectors.shape) == 1:
    word2vec_vectors = np.expand_dims(word2vec_vectors, axis=0)
compressed_word2vec_vectors = compress_vectors_PCA(word2vec_vectors, 7)
print(np.shape(compressed_word2vec_vectors))

(13, 300)
(7, 300)


In [279]:
import torch
compressed_vector_1 = np.random.rand(7, 300)
print('vector1\n', compressed_vector_1)

compressed_vector_2 = np.random.rand(7, 300)
print('vector2\n', compressed_vector_2)

# Loss calculation
import torch.nn.functional as F


# Convert numpy arrays to PyTorch tensors
compressed_vector_1 = torch.from_numpy(compressed_vector_1).float()
compressed_vector_2 = torch.from_numpy(compressed_vector_2).float()


def cosine_similarity_loss(compressed_vectors_1, compressed_vectors_2):
    # Normalize the compressed vectors
    compressed_vectors_1 = F.normalize(compressed_vectors_1, dim=1)
    compressed_vectors_2 = F.normalize(compressed_vectors_2, dim=1)
    
    # Calculate the cosine similarity between the compressed vectors
    cosine_similarities = F.cosine_similarity(compressed_vectors_1, compressed_vectors_2)
    
    # Define the target labels (1 for similar pairs)
    target = torch.ones_like(cosine_similarities)
    
    # Calculate the MSE loss
    loss = F.mse_loss(cosine_similarities, target)
    
    return loss

loss = cosine_similarity_loss(compressed_vector_1, compressed_vector_2)
print('loss', loss)


vector1
 [[0.17056278 0.63398972 0.22195671 ... 0.73470834 0.3838552  0.30160935]
 [0.90708827 0.41043103 0.18526834 ... 0.10107154 0.62454734 0.54306655]
 [0.39615635 0.05131884 0.36681253 ... 0.08435408 0.59527782 0.29017821]
 ...
 [0.75252669 0.95907083 0.76910504 ... 0.16224513 0.99325054 0.27598611]
 [0.93851294 0.79834325 0.90471731 ... 0.37005262 0.91908562 0.95298253]
 [0.49607554 0.96702053 0.12645085 ... 0.73388498 0.80529343 0.78325553]]
vector2
 [[9.55563819e-01 8.61345089e-01 4.42591448e-01 ... 7.29694985e-01
  7.08775187e-01 8.70417130e-04]
 [6.95753741e-01 4.64971869e-01 4.80071002e-01 ... 1.67756501e-01
  3.48512074e-01 1.34929799e-01]
 [3.32020260e-01 9.15812456e-01 5.68121525e-02 ... 6.98273541e-01
  6.79387284e-01 8.44683693e-01]
 ...
 [8.04641777e-01 9.71625005e-01 8.14557447e-01 ... 9.25714297e-01
  8.53355438e-01 6.13825311e-01]
 [5.73262482e-01 2.19780806e-01 7.86116139e-01 ... 5.56453054e-01
  2.77839302e-01 9.50746149e-02]
 [5.30035597e-02 7.75117205e-01 2.7508

In [304]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

# Define some similar and dissimilar sentences
similar_sentences = ["The cat sat on the mat", "A cat is sitting on the mat"]
dissimilar_sentences = ["The cat sat on the mat", "Apple is a fruit"]

# Similar sentences are about the same sport (soccer)
similar_sentences = ["The soccer team won the match", "The football squad was victorious in the game"]
dissimilar_sentences = ["The soccer team won the match", "The basketball player scored a three-pointer"]

# Similar sentences are about the same cooking action (baking a cake)
similar_sentences = ["She is baking a cake for the party", "A cake is being baked for the celebration"]
dissimilar_sentences = ["She is baking a cake for the party", "He is frying an egg for breakfast"]


# Tokenize the sentences
similar_sentences = [tokenize(sentence) for sentence in similar_sentences]
dissimilar_sentences = [tokenize(sentence) for sentence in dissimilar_sentences]

# print('similar_sentence tokens:', similar_sentences)
# print('dissimilar_sentences tokens:', dissimilar_sentences)

# Train a Word2Vec model on the sentences
model = Word2Vec(similar_sentences + dissimilar_sentences, min_count=1)

# Convert the sentences to vectors
similar_vectors = [model.wv[sentence].mean(axis=0) for sentence in similar_sentences]
dissimilar_vectors = [model.wv[sentence].mean(axis=0) for sentence in dissimilar_sentences]


# print('similar_vectors:\n', similar_vectors)
# print('dissimilar_vectors:\n', dissimilar_vectors)

# Convert the vectors to PyTorch tensors
similar_vectors = torch.tensor(np.array(similar_vectors)).float()
dissimilar_vectors = torch.tensor(np.array(dissimilar_vectors)).float()

# Calculate the loss for the similar and dissimilar sentences
similar_loss = cosine_similarity_loss(similar_vectors[0].unsqueeze(0), similar_vectors[1].unsqueeze(0))
dissimilar_loss = cosine_similarity_loss(dissimilar_vectors[0].unsqueeze(0), dissimilar_vectors[1].unsqueeze(0))

print('Similar loss', similar_loss)
print('Dissimilar loss', dissimilar_loss)

Similar loss tensor(0.1246)
Dissimilar loss tensor(0.6277)
