<a href="https://colab.research.google.com/github/testanalyst/AILaunchpad/blob/main/Getting_Into_Embeddings.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import gensim.downloader as api
from gensim.models import Word2Vec

# Load the pre-trained Word2Vec vectors registered in gensim's downloader under
# the name 'word2vec-google-news-300'. The result is kept in a module-level name
# because get_sentence_embedding reads `word2vec_model` as a global.
# NOTE(review): the first call presumably downloads a large model file and caches
# it locally, so this cell may be slow on a fresh runtime — confirm.
word2vec_model = api.load('word2vec-google-news-300')

# Example function to get embeddings for a sentence
def get_sentence_embedding(sentence):
    """Return the average Word2Vec vector of a sentence's in-vocabulary words.

    The sentence is split on whitespace; tokens that are not present in the
    global ``word2vec_model`` are ignored, and the remaining word vectors are
    averaged element-wise.

    Args:
        sentence: Text to embed (a single string).

    Returns:
        A 1-D vector with ``word2vec_model.vector_size`` entries. If the
        sentence is empty or none of its tokens are in the vocabulary, a
        zero vector of that size is returned instead of raising
        ``ZeroDivisionError``.
    """
    # Local import keeps this cell self-contained; numpy is only needed for
    # the zero-vector fallback below.
    import numpy as np

    words = sentence.split()
    word_vectors = [word2vec_model[word] for word in words if word in word2vec_model]
    if not word_vectors:
        # Nothing to average: avoid dividing by len([]) == 0.
        return np.zeros(word2vec_model.vector_size, dtype=np.float32)
    return sum(word_vectors) / len(word_vectors)

# Example sentence
# get_sentence_embedding skips any token that is not in the Word2Vec vocabulary,
# so only in-vocabulary words contribute to the averaged vector.
sentence = "My bank is 2 miles away from my office"
# `embedding` is kept as a notebook-level name; the next cell reads it to
# report the vector's dimensionality.
embedding = get_sentence_embedding(sentence)
# Prints the averaged vector itself.
print(embedding)

# TODO: instead of the sentence embedding, get the word embedding for the single word 'Bank'

[ 1.36956107e-02  2.34063459e-03  6.29611537e-02  6.82101771e-02
 -4.33892161e-02  9.19681136e-03 -1.69118252e-02 -1.34128153e-01
  3.21926549e-02  1.26125753e-01 -8.24652798e-03 -4.54372820e-03
  3.86945941e-02  1.65337455e-02 -8.53307545e-02  8.93690288e-02
  9.67764333e-02  4.94113490e-02 -1.90904411e-03 -2.01585554e-02
 -3.22265625e-02  3.63498256e-02  5.62337227e-02 -1.60386832e-03
  4.38537598e-02 -7.64702708e-02 -9.97297466e-02  2.08062064e-02
  9.28005651e-02 -1.00341797e-01  7.27640763e-02 -3.82351354e-02
 -3.46272774e-02  9.35829990e-03 -9.09665450e-02 -7.01226108e-03
 -6.70030387e-03  3.77061628e-02 -3.81910540e-02  7.80707449e-02
  5.42144775e-02 -8.20583776e-02  7.90608749e-02  4.54033725e-03
 -5.82139753e-02  1.56690814e-02 -2.14640293e-02  2.82864049e-02
  1.94769967e-02  6.22558594e-03  2.57161465e-02  5.01725934e-02
  8.94300640e-02  2.25423183e-02 -7.37304688e-02  9.10305455e-02
 -1.30588114e-01 -2.83135311e-03 -9.81987827e-03 -7.40152970e-02
 -5.61794713e-02  3.45187

In [2]:
# The length of the averaged Word2Vec sentence vector is its number of
# embedding dimensions.
dimensionality = len(embedding)
print(f"Dimensionality of the embedding: {dimensionality}")


Dimensionality of the embedding: 300


In [3]:
!pip install transformers



In [5]:
from transformers import BertTokenizer, BertModel
import torch

# Load the pre-trained 'bert-base-uncased' tokenizer and model.
# Both are bound to notebook-level names because the embedding functions in
# this notebook read `tokenizer` and `model` as globals.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Function to get embeddings for a sentence using BERT
# NOTE: this reuses the name of the Word2Vec helper defined in the first cell,
# so once this cell has run, get_sentence_embedding refers to the BERT version.
def get_sentence_embedding(sentence):
    """Return a mean-pooled BERT embedding for a single sentence.

    Args:
        sentence: Text to embed (a single string).

    Returns:
        A 1-D ``torch.Tensor``: the model's last hidden state averaged over
        every token position produced by the tokenizer (including any special
        tokens it adds), with the batch dimension removed.
    """
    # truncation=True caps the token sequence at the tokenizer's maximum
    # length, so very long text is clipped instead of crashing inside the model.
    inputs = tokenizer(sentence, return_tensors='pt', truncation=True)
    # Inference only: no gradients are needed to read out embeddings.
    with torch.no_grad():
        outputs = model(**inputs)
    # Average over dim=1 (the token axis), then drop the leading batch axis.
    sentence_embedding = outputs.last_hidden_state.mean(dim=1).squeeze(0)
    return sentence_embedding

# Example sentence
# Same sentence as the Word2Vec example, now embedded with the BERT-based
# get_sentence_embedding defined in this cell.
sentence = "My bank is 2 miles away from my office"
embedding_BERT = get_sentence_embedding(sentence)
# Prints the pooled sentence tensor itself.
print(embedding_BERT)

tensor([ 7.5392e-03,  1.2506e-01,  1.4103e-01, -3.4694e-02,  2.7930e-01,
         8.0504e-02,  2.0659e-01,  5.0561e-01,  3.0793e-01, -2.2332e-01,
         2.6898e-01, -3.2897e-01, -1.3582e-02,  3.4275e-01, -1.4591e-01,
         1.1451e-01,  1.3383e-01,  9.2703e-02, -1.2177e-01,  3.1151e-01,
        -1.3779e-01, -2.0864e-01,  9.8295e-02,  4.8174e-01,  1.6992e-01,
         1.8185e-01, -1.9642e-01,  1.3874e-01, -2.8608e-01, -1.8854e-01,
         1.6423e-01, -1.6490e-01,  6.7301e-02, -7.1841e-02,  3.9710e-01,
        -4.5112e-01,  1.2719e-01,  9.4895e-02, -4.8089e-01,  4.7171e-01,
        -4.6157e-01, -1.4512e-01, -7.6379e-02,  6.9886e-02, -2.2302e-01,
        -3.5039e-01,  1.6231e-01, -3.3414e-01, -1.1659e-01, -1.6587e-01,
        -3.1789e-01,  2.4395e-01, -3.4452e-01,  1.2679e-01,  2.9435e-01,
         7.6291e-01, -7.2761e-02, -4.8699e-01, -2.8132e-01, -1.2864e-01,
         5.9352e-01, -1.4877e-01, -2.3235e-02, -3.2729e-01,  3.8141e-02,
         8.8554e-02,  3.3424e-01,  2.0919e-01, -4.7

In [6]:
# The length of the pooled BERT sentence vector is its number of embedding
# dimensions. This rebinds `dimensionality`, which previously held the
# Word2Vec value.
dimensionality = len(embedding_BERT)
print(f"Dimensionality of the embedding from BERT: {dimensionality}")

Dimensionality of the embedding from BERT: 768


In [8]:
from sklearn.metrics.pairwise import cosine_similarity

# Function to get embeddings for a sentence using BERT
def get_sentence_embedding_BERT(sentence):
    """Return a mean-pooled BERT embedding for a single sentence as a NumPy array.

    Args:
        sentence: Text to embed (a single string).

    Returns:
        A 1-D ``numpy.ndarray``: the model's last hidden state averaged over
        every token position produced by the tokenizer (including any special
        tokens it adds), with the batch dimension removed. Returning NumPy
        lets the vector be passed straight to scikit-learn.
    """
    # truncation=True caps the token sequence at the tokenizer's maximum
    # length, so very long text is clipped instead of crashing inside the model.
    inputs = tokenizer(sentence, return_tensors='pt', truncation=True)
    # Inference only: no gradients are needed to read out embeddings.
    with torch.no_grad():
        outputs = model(**inputs)
    # Average over dim=1 (the token axis), then drop the leading batch axis.
    sentence_embedding_BERT = outputs.last_hidden_state.mean(dim=1).squeeze(0)
    return sentence_embedding_BERT.numpy()

# Example sentences
# sentence2 rephrases sentence1 without reusing the words "bank" or "office".
sentence1 = "My bank is 2 miles away from my office"
sentence2 = "The financial institution is located near my workplace"

# Get embeddings for the sentences
embedding_BERT1 = get_sentence_embedding_BERT(sentence1)
embedding_BERT2 = get_sentence_embedding_BERT(sentence2)

# Calculate cosine similarity
# Each vector is wrapped in a list so it is passed as a single-row 2-D input;
# [0][0] then pulls the one pairwise score out of the returned result.
similarity = cosine_similarity([embedding_BERT1], [embedding_BERT2])[0][0]
print(f"Cosine Similarity between the two sentences: {similarity}")


Cosine Similarity between the two sentences: 0.7859391570091248


In [12]:
from sklearn.metrics.pairwise import cosine_similarity

# Function to get embeddings for a sentence using BERT
# NOTE: get_sentence_embedding_BERT is also declared in an earlier cell; it is
# re-declared here so this cell does not depend on that one having been run.
def get_sentence_embedding_BERT(sentence):
    """Return a mean-pooled BERT embedding for a single sentence as a NumPy array.

    Args:
        sentence: Text to embed (a single string).

    Returns:
        A 1-D ``numpy.ndarray``: the model's last hidden state averaged over
        every token position produced by the tokenizer (including any special
        tokens it adds), with the batch dimension removed. Returning NumPy
        lets the vector be passed straight to scikit-learn.
    """
    # truncation=True caps the token sequence at the tokenizer's maximum
    # length, so very long text is clipped instead of crashing inside the model.
    inputs = tokenizer(sentence, return_tensors='pt', truncation=True)
    # Inference only: no gradients are needed to read out embeddings.
    with torch.no_grad():
        outputs = model(**inputs)
    # Average over dim=1 (the token axis), then drop the leading batch axis.
    sentence_embedding_BERT = outputs.last_hidden_state.mean(dim=1).squeeze(0)
    return sentence_embedding_BERT.numpy()

# Example sentences
# Both sentences contain the word "bank"; sentence2 uses it in the
# river-bank sense.
sentence1 = "My bank is 2 miles away from my office"
sentence2 = "The ganga river's bank is located near my home"

# Get embeddings for the sentences
embedding_BERT1 = get_sentence_embedding_BERT(sentence1)
embedding_BERT2 = get_sentence_embedding_BERT(sentence2)

# Calculate cosine similarity
# Each vector is wrapped in a list so it is passed as a single-row 2-D input;
# [0][0] then pulls the one pairwise score out of the returned result.
similarity = cosine_similarity([embedding_BERT1], [embedding_BERT2])[0][0]
print(f"Cosine Similarity between the two sentences: {similarity}")

Cosine Similarity between the two sentences: 0.7335179448127747
