## Text Embeddings

In [2]:
# Uncomment to install the dependencies into this kernel's environment:
# %pip install transformers sentence-transformers

In [3]:
from transformers import AutoTokenizer, AutoModel
import torch

# Checkpoint name handed to both from_pretrained() calls below, so the
# tokenizer and the model are always loaded from the same pre-trained weights.
# NOTE(review): "Fill-Mask" is presumably the task tag this checkpoint is
# listed under; confirm it is still the intended choice for embeddings.
model_name = "bert-base-uncased"

# Instantiate the matching tokenizer / model pair (tokenizer first, then model).
tokenizer, model = (
    AutoTokenizer.from_pretrained(model_name),
    AutoModel.from_pretrained(model_name),
)



In [4]:
# The two example texts to embed. Adjacent string literals are concatenated by
# Python, so each parenthesised group below is a single sentence.
sentences = [
    (
        "Embedding text using a Hugging Face open-source model involves "
        "converting text into numerical representations using pre-trained "
        "language models. Here's a step-by-step guide on how to do this:"
    ),
    (
        "This code provides a comprehensive example of generating text "
        "embeddings using two popular methods from the Hugging Face ecosystem. "
        "It ensures you have all the necessary steps documented, from loading "
        "models to generating and printing embeddings."
    ),
]

# Tokenize both texts as one batch of PyTorch tensors; padding is enabled so
# the shorter text is filled up to the length of the longer one.
inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")
print("Tokenized Inputs:", inputs)

Tokenized Inputs: {'input_ids': tensor([[  101,  7861,  8270,  4667,  3793,  2478,  1037, 17662,  2227,  2330,
          1011,  3120,  2944,  7336, 16401,  3793,  2046, 15973, 15066,  2478,
          3653,  1011,  4738,  2653,  4275,  1012,  2182,  1005,  1055,  1037,
          3357,  1011,  2011,  1011,  3357,  5009,  2006,  2129,  2000,  2079,
          2023,  1024,   102,     0,     0,     0,     0],
        [  101,  2023,  3642,  3640,  1037,  7721,  2742,  1997, 11717,  3793,
          7861,  8270,  4667,  2015,  2478,  2048,  2759,  4725,  2013,  1996,
         17662,  2227, 16927,  1012,  2009, 21312,  2017,  2031,  2035,  1996,
          4072,  4084,  8832,  1010,  2013, 10578,  4275,  2000, 11717,  1998,
          8021,  7861,  8270,  4667,  2015,  1012,   102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0,

In [5]:
# Run the tokenized batch through the model with gradient tracking disabled.
with torch.no_grad():
    outputs = model(**inputs)

# Sentence-level vector = hidden state at position 0 (the [CLS] token) of each
# sequence. Indexing [:, 0] keeps the batch and hidden dimensions, giving a
# tensor of shape (batch_size, hidden_size).
embeddings = outputs.last_hidden_state[:, 0]
print("Transformers Embeddings:\n", embeddings)


Transformers Embeddings:
 tensor([[-0.3644, -0.4560, -0.3025,  ..., -0.3133,  0.2756,  0.9795],
        [-0.5201, -0.5001,  0.1235,  ..., -0.3319, -0.2964,  0.4896]])


In [6]:
import torch
from transformers import AutoModel, AutoTokenizer

# --- Approach 1: plain Hugging Face transformers ------------------------------
# Checkpoint shared by the tokenizer and the model.
model_name = "bert-base-uncased"

# Reload both pieces so this cell does not depend on whatever `tokenizer` and
# `model` currently point to in the kernel.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

# Two short sentences to embed; reused by the sentence-transformers part below.
sentences = [
    "This is a sample sentence.",
    "This is another example.",
]

# Tokenize as one padded batch of PyTorch tensors.
inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")

# Inspect the batch: the full mapping first, then the two fields of interest.
print("Tokenized Inputs:", inputs)
print("input_ids:", inputs["input_ids"])
print("attention_mask:", inputs["attention_mask"])

# Forward pass with gradient tracking disabled.
with torch.no_grad():
    outputs = model(**inputs)

# Keep position 0 of every sequence (the [CLS] token) as the sentence vector.
# Resulting shape: (batch_size, hidden_size).
embeddings = outputs.last_hidden_state[:, 0]
print("Transformers Embeddings:\n", embeddings)

# --- Approach 2: sentence-transformers ----------------------------------------
from sentence_transformers import SentenceTransformer

# NOTE: from here on `model` refers to the SentenceTransformer (not the BERT
# model above) and `embeddings` to its output; later cells call model.encode().
model = SentenceTransformer("all-MiniLM-L6-v2")  # smaller, faster model suitable for embeddings

# encode() takes the raw strings directly; no separate tokenizer call is made here.
embeddings = model.encode(sentences)
print("Sentence-Transformers Embeddings:\n", embeddings)




Tokenized Inputs: {'input_ids': tensor([[ 101, 2023, 2003, 1037, 7099, 6251, 1012,  102],
        [ 101, 2023, 2003, 2178, 2742, 1012,  102,    0]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0]])}
input_ids: tensor([[ 101, 2023, 2003, 1037, 7099, 6251, 1012,  102],
        [ 101, 2023, 2003, 2178, 2742, 1012,  102,    0]])
attention_mask: tensor([[1, 1, 1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 1, 1, 0]])
Transformers Embeddings:
 tensor([[-0.1993, -0.2101, -0.1950,  ..., -0.4733,  0.0861,  0.7103],
        [-0.0188, -0.1547, -0.2812,  ..., -0.3443,  0.4917,  0.6778]])




Sentence-Transformers Embeddings:
 [[ 7.78064504e-02  7.64624849e-02  3.77088003e-02  6.09389693e-02
   4.88075763e-02  7.11166440e-03  2.06367504e-02  2.86464114e-02
   6.35215044e-02  1.49654932e-02  7.55234212e-02 -5.63334711e-02
  -4.15647781e-04 -2.09397804e-02  6.40888736e-02  2.22408436e-02
   4.88932505e-02 -5.74298501e-02 -2.99323350e-02  4.03885581e-02
   3.07553802e-02  3.82084250e-02  4.59010452e-02  7.84995593e-03
   4.37707407e-03  3.39148790e-02 -1.33846579e-02  5.27541600e-02
   1.02368444e-01  2.79702421e-04 -5.75644895e-02  4.39351685e-02
   8.33893940e-02  1.62850320e-02  7.24604726e-02  7.32017355e-03
  -2.33296100e-02  5.43991625e-02 -5.93921356e-03  2.92546861e-02
   4.84408438e-02 -3.95350754e-02  3.56571041e-02  7.58858770e-03
  -1.70145128e-02 -3.67177129e-02 -3.15289609e-02  9.60672367e-03
  -1.58553571e-02  4.69976105e-02 -7.87202269e-02 -3.92243192e-02
  -1.03012361e-01 -8.17152113e-03  1.21502066e-02  2.70931479e-02
   9.10838135e-03  3.93027999e-02  3.8550

In [7]:
# Compact version of the two approaches: transformers first, then
# sentence-transformers, both applied to the same `sentences`.
import torch
from transformers import AutoModel, AutoTokenizer

# Part 1: transformers. Tokenizer and model come from the same checkpoint.
model_name = "bert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)

sentences = [
    "This is a sample sentence.",
    "This is another example.",
]
inputs = tokenizer(sentences, padding=True, truncation=True, return_tensors="pt")

# No gradients are needed for a plain forward pass.
with torch.no_grad():
    outputs = model(**inputs)

# Hidden state at position 0 ([CLS] token) of each sequence.
embeddings = outputs.last_hidden_state[:, 0]
print("Transformers Embeddings:\n", embeddings)

# Part 2: sentence-transformers.
from sentence_transformers import SentenceTransformer

# NOTE: `model` and `embeddings` are rebound here; after this cell `model` is
# the SentenceTransformer that the following cells call .encode() on.
model = SentenceTransformer("all-MiniLM-L6-v2")
embeddings = model.encode(sentences)
print("Sentence-Transformers Embeddings:\n", embeddings)


Transformers Embeddings:
 tensor([[-0.1993, -0.2101, -0.1950,  ..., -0.4733,  0.0861,  0.7103],
        [-0.0188, -0.1547, -0.2812,  ..., -0.3443,  0.4917,  0.6778]])
Sentence-Transformers Embeddings:
 [[ 7.78064504e-02  7.64624849e-02  3.77088003e-02  6.09389693e-02
   4.88075763e-02  7.11166440e-03  2.06367504e-02  2.86464114e-02
   6.35215044e-02  1.49654932e-02  7.55234212e-02 -5.63334711e-02
  -4.15647781e-04 -2.09397804e-02  6.40888736e-02  2.22408436e-02
   4.88932505e-02 -5.74298501e-02 -2.99323350e-02  4.03885581e-02
   3.07553802e-02  3.82084250e-02  4.59010452e-02  7.84995593e-03
   4.37707407e-03  3.39148790e-02 -1.33846579e-02  5.27541600e-02
   1.02368444e-01  2.79702421e-04 -5.75644895e-02  4.39351685e-02
   8.33893940e-02  1.62850320e-02  7.24604726e-02  7.32017355e-03
  -2.33296100e-02  5.43991625e-02 -5.93921356e-03  2.92546861e-02
   4.84408438e-02 -3.95350754e-02  3.56571041e-02  7.58858770e-03
  -1.70145128e-02 -3.67177129e-02 -3.15289609e-02  9.60672367e-03
  -1.5

In [31]:
# Encode three one-sentence batches. `model` must expose .encode() here, i.e.
# it is expected to be the SentenceTransformer bound in an earlier cell.
emb_1 = model.encode(["What is the meaning of life?"])
emb_2 = model.encode(["How does one spend their time well on Earth?"])
emb_3 = model.encode(["Would you like a salad?"])

# Printed in reverse order (emb_3, emb_2, emb_1). Every print uses the same
# label, so the order is the only way to tell the outputs apart.
for emb in (emb_3, emb_2, emb_1):
    print("Sentence-Transformers Embeddings:\n", emb)

Sentence-Transformers Embeddings:
 [[-9.94613543e-02 -1.68131087e-02  9.17111151e-03  3.07198409e-02
  -1.75803900e-02 -2.73174737e-02  3.38393785e-02 -4.18204516e-02
   1.02727991e-02 -3.12325545e-02  4.94470708e-02 -3.62283215e-02
   2.78369281e-02 -2.82143708e-02  3.30406316e-02 -3.95042226e-02
   9.72072333e-02  3.73475589e-02 -8.82984474e-02  7.26361871e-02
  -1.48148283e-01  5.43499514e-02  3.49745452e-02 -1.87307615e-02
  -6.81444854e-02 -2.79122293e-02  4.48073074e-02  1.38033479e-02
  -7.84473717e-02 -2.09084731e-02 -1.13782780e-02  3.58933806e-02
   4.17363979e-02  1.47404606e-02 -3.74239706e-03 -3.54777426e-02
   7.62145817e-02 -4.88542095e-02 -1.00998543e-02 -4.64923074e-03
   5.59268147e-03  3.19846645e-02  3.46678868e-02 -3.50903859e-03
  -1.18434522e-02 -5.98165877e-02 -2.63321251e-02  3.44077758e-02
   8.23089853e-02 -2.86310911e-02  4.88948170e-03  2.44142860e-02
  -3.21370326e-02 -2.72304006e-02  4.01028395e-02  8.56546462e-02
  -2.44694017e-02 -5.77869266e-03 -1.3660

In [13]:
# Two sentences that share the words "kids", "play" and "park" but arrange
# them differently.
in_1, in_2 = (
    "The kids play in the park.",
    "The play was for kids in the park.",
)

In [16]:
# Show both input sentences, one per line.
print(in_1, in_2, sep="\n")

The kids play in the park.
The play was for kids in the park.


In [15]:
# Hand-picked word lists for the two sentences: the same three words, listed
# in a different order for each.
in_pp_1 = "kids play park".split()
in_pp_2 = "play kids park".split()

In [21]:
# Embed each word list with model.encode(): first in_pp_1, then in_pp_2.
embeddings_1, embeddings_2 = map(model.encode, (in_pp_1, in_pp_2))

In [22]:
import numpy as np

# Stack the per-word embeddings of the first list along a new leading axis and
# report the resulting array shape.
emb_array_1 = np.stack(embeddings_1, axis=0)
print(emb_array_1.shape)

(3, 384)


In [23]:
# Same stacking for the second word list, followed by its shape.
emb_array_2 = np.stack(embeddings_2, axis=0)
print(emb_array_2.shape)

(3, 384)


In [24]:
# Average over axis 0 (the stacked words) to get one vector per word list.
emb_1_mean = emb_array_1.mean(axis=0)
emb_2_mean = emb_array_2.mean(axis=0)

In [25]:
# Shapes of the two averaged vectors, one per line.
print(emb_1_mean.shape, emb_2_mean.shape, sep="\n")

(384,)
(384,)


In [26]:
# First four components of each averaged vector, one vector per line.
# in_pp_1 and in_pp_2 hold the same words in a different order, so comparing
# these two lines shows how much the averaged vectors reflect word order.
print(emb_1_mean[:4], emb_2_mean[:4], sep="\n")

[ 0.00600321  0.02638755  0.02692896 -0.00738498]
[ 0.00600321  0.02638755  0.02692896 -0.00738498]


In [27]:
# Encode each full sentence as its own one-element batch (in_1, then in_2).
embedding_1, embedding_2 = (model.encode([text]) for text in (in_1, in_2))

In [29]:
# Each encode() call above was given a single sentence, so element 0 is that
# sentence's vector.
vector_1, vector_2 = embedding_1[0], embedding_2[0]

# Compare the leading four components of the two sentence vectors.
print("Vector 1 first 4 values:", vector_1[:4])
print("Vector 2 first 4 values:", vector_2[:4])

Vector 1 first 4 values: [ 0.03020821 -0.04249752  0.0494425  -0.00938033]
Vector 2 first 4 values: [ 0.00743058  0.06007196  0.01971415 -0.04297099]
