In [1]:
import os
import torch
from transformers import AutoModel, AutoTokenizer
from sklearn.preprocessing import normalize

# Queries are prefixed with the retrieval instruction prompt; documents are embedded as-is.
query_prompt = "Instruct: Given a web search query, retrieve relevant passages that answer the query.\nQuery: "
queries = [
    "What are some ways to reduce stress?",
    "What are the benefits of drinking green tea?",
]
queries = [query_prompt + query for query in queries]
# docs do not need any prompts
docs = [
    "There are many effective ways to reduce stress. Some common techniques include deep breathing, meditation, and physical activity. Engaging in hobbies, spending time in nature, and connecting with loved ones can also help alleviate stress. Additionally, setting boundaries, practicing self-care, and learning to say no can prevent stress from building up.",
    "Green tea has been consumed for centuries and is known for its potential health benefits. It contains antioxidants that may help protect the body against damage caused by free radicals. Regular consumption of green tea has been associated with improved heart health, enhanced cognitive function, and a reduced risk of certain types of cancer. The polyphenols in green tea may also have anti-inflammatory and weight loss properties.",
]

# The path of your model after cloning it
model_dir = "/home/thanhnx/.cache/huggingface/hub/models--dunzhang--stella_en_400M_v5/snapshots/1bb50bc7bb726810eac2140e62155b88b0df198f"

vector_dim = 1024
vector_linear_directory = f"2_Dense_{vector_dim}"
model = AutoModel.from_pretrained(model_dir, trust_remote_code=True).cuda().eval()
# You can also load the model without `use_memory_efficient_attention` and `unpad_inputs`;
# in that configuration it can also run on CPU.
# model = AutoModel.from_pretrained(model_dir, trust_remote_code=True,use_memory_efficient_attention=False,unpad_inputs=False).cuda().eval()
tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)

# Projection head mapping the pooled hidden state to `vector_dim` dimensions.
vector_linear = torch.nn.Linear(in_features=model.config.hidden_size, out_features=vector_dim)
# The checkpoint keys carry a "linear." prefix that a bare nn.Linear does not expect.
# weights_only=True restricts unpickling to tensors and plain containers, which is all a
# state dict needs, and avoids the warning PyTorch emits for the implicit default.
vector_linear_dict = {
    k.replace("linear.", ""): v
    for k, v in torch.load(
        os.path.join(model_dir, f"{vector_linear_directory}/pytorch_model.bin"),
        weights_only=True,
    ).items()
}
vector_linear.load_state_dict(vector_linear_dict)
vector_linear.cuda()


def embed_texts(texts):
    """Embed a list of strings with masked mean pooling plus the dense projection.

    Args:
        texts: list of strings to embed. Queries must already carry `query_prompt`.

    Returns:
        A `(vectors, last_hidden)` tuple:
        * `vectors` - NumPy array with one row per text, produced by
          `sklearn.preprocessing.normalize` on the projected embeddings.
        * `last_hidden` - the model's last hidden states (on the GPU) with padded
          positions zeroed out.
    """
    with torch.no_grad():
        input_data = tokenizer(texts, padding="longest", truncation=True, max_length=512, return_tensors="pt")
        input_data = {k: v.cuda() for k, v in input_data.items()}
        attention_mask = input_data["attention_mask"]
        last_hidden_state = model(**input_data)[0]
        # Zero out padded positions so they do not contribute to the sum below.
        last_hidden = last_hidden_state.masked_fill(~attention_mask[..., None].bool(), 0.0)
        # Mean over real tokens only: divide by the per-sequence count of attended tokens.
        pooled = last_hidden.sum(dim=1) / attention_mask.sum(dim=1)[..., None]
        vectors = normalize(vector_linear(pooled).cpu().numpy())
    return vectors, last_hidden


# Embed the queries
query_vectors, query_last_hidden = embed_texts(queries)

# Embed the documents
# `last_hidden` stays bound at module level (to the documents' hidden states, as before)
# because a later cell inspects its shape.
docs_vectors, last_hidden = embed_texts(docs)

print(query_vectors.shape, docs_vectors.shape)
# (2, 1024) (2, 1024)

# Rows are normalized, so the dot product is the cosine similarity (queries x docs).
similarities = query_vectors @ docs_vectors.T
print(similarities)
# [[0.8397531  0.29900077]
#  [0.32818374 0.80954516]]


  from .autonotebook import tqdm as notebook_tqdm
Some weights of the model checkpoint at /home/thanhnx/.cache/huggingface/hub/models--dunzhang--stella_en_400M_v5/snapshots/1bb50bc7bb726810eac2140e62155b88b0df198f were not used when initializing NewModel: ['new.pooler.dense.bias', 'new.pooler.dense.weight']
- This IS expected if you are initializing NewModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing NewModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


(2, 1024) (2, 1024)
[[0.8397528  0.29900083]
 [0.32818374 0.8095453 ]]


  torch.load(os.path.join(model_dir, f"{vector_linear_directory}/pytorch_model.bin")).items()


In [10]:
# Tokenize the current `queries`, padding every sequence out to 512 tokens.
# NOTE(review): in file order this runs before the [MASK] queries are defined in the
# next cell, yet the saved outputs show [MASK] token ids - confirm the execution order.
inputs = tokenizer(
    queries,
    add_special_tokens=True,
    padding="max_length",
    truncation=True,
    max_length=512,
    return_tensors="pt",
)

In [2]:
# Probe queries: each one ends with a filler word followed by two [MASK] tokens.
# Note that, unlike the first cell, no instruction prompt is prepended here.
queries = [
    text + " [MASK] [MASK]"
    for text in (
        "What are some ways to reduce stress? haha",
        "What are the benefits of drinking green tea? hoho",
    )
]

In [17]:
# Display the token ids of the first tokenized sequence (including any padding).
inputs['input_ids'][0]

tensor([ 101, 2054, 2024, 2070, 3971, 2000, 5547, 6911, 1029, 5292, 3270,  103,
         103,  102,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,   

In [20]:
# Boolean mask over the first sequence marking positions whose token id is not 0
# (the saved output shows 0 being used as the padding id), then show those tokens.
non_zero_pos = inputs["input_ids"][0].ne(0)
inputs["input_ids"][0][non_zero_pos]

tensor([ 101, 2054, 2024, 2070, 3971, 2000, 5547, 6911, 1029, 5292, 3270,  103,
         103,  102])

In [24]:
# Keep only the attended (non-padding) tokens of the first sequence.
# Index with the boolean mask directly: wrapping it in a Python list relied on PyTorch
# treating a list that contains a tensor as a tuple of indices, a legacy behaviour.
inputs['input_ids'][0][inputs['attention_mask'][0] == 1]

tensor([ 101, 2054, 2024, 2070, 3971, 2000, 5547, 6911, 1029, 5292, 3270,  103,
         103,  102])

In [25]:
# Positions (1-D index tensor) of every [MASK] token in the first sequence.
mask_position = torch.where(
    inputs["input_ids"][0].eq(tokenizer.mask_token_id)
)[0]

In [28]:
# Index of the first [MASK] token in the first sequence, as a plain Python int.
int(mask_position[0])

11

In [15]:
# Turn every row of token ids back into text (special tokens are kept).
decoded = tokenizer.batch_decode(sequences=inputs["input_ids"])

In [18]:
# Shape of whatever zero-masked hidden states `last_hidden` is currently bound to.
last_hidden.size()

torch.Size([2, 83, 1024])

In [32]:
# Run the encoder on `inputs` and zero out the hidden states at padded positions.
# Inference only: no_grad() avoids building an autograd graph, matching how the
# embedding cell calls the model.
with torch.no_grad():
    input_data = {k: v.cuda() for k, v in inputs.items()}
    # Reuse the mask already moved to the GPU instead of copying it a second time.
    attention_mask = input_data["attention_mask"]
    last_hidden_state = model(**input_data)[0]
    last_hidden = last_hidden_state.masked_fill(~attention_mask[..., None].bool(), 0.0)

In [33]:
# Confirm the dimensions of the zero-masked hidden states from the forward pass.
last_hidden.size()

torch.Size([2, 15, 1024])

In [34]:
# Display the full tokenizer output (input_ids, token_type_ids, attention_mask).
inputs

{'input_ids': tensor([[ 101, 2054, 2024, 2070, 3971, 2000, 5547, 6911, 1029, 5292, 3270,  103,
          103,  102,    0],
        [ 101, 2054, 2024, 1996, 6666, 1997, 5948, 2665, 5572, 1029, 7570, 6806,
          103,  103,  102]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [35]:
# Locate every [MASK] token in the batch as a (row_indices, column_indices) pair,
# print it, then gather the hidden-state vector at each of those positions.
masked_token_index = torch.where(inputs["input_ids"].eq(tokenizer.mask_token_id))
print(masked_token_index)
masked_token_hidden_state = last_hidden[masked_token_index]


(tensor([0, 0, 1, 1]), tensor([11, 12, 12, 13]))


In [27]:
# Re-check the dimensions of `last_hidden` before selecting the [MASK] positions.
last_hidden.size()

torch.Size([2, 14, 1024])

In [42]:
# Boolean mask over the whole batch marking [MASK] tokens.
mask_pos = inputs["input_ids"].eq(tokenizer.mask_token_id)

# Boolean indexing flattens the selection: one row per [MASK] token, in batch order.
res = last_hidden[mask_pos]

In [47]:
# Regroup the flat per-[MASK] rows of `res` as (batch, masks_per_example, hidden).
# Batch and hidden sizes are read from the tensors instead of being hard-coded, so a
# different batch or an uneven total mask count raises rather than silently mis-grouping.
# Assumes every example contains the same number of [MASK] tokens.
res_ = res.view(mask_pos.shape[0], -1, res.shape[-1])

In [44]:
# Hidden states at the [MASK] positions of the first example only.
mask_pos0 = inputs["input_ids"][0].eq(tokenizer.mask_token_id)
res0 = last_hidden[0][mask_pos0]

In [50]:
# Hidden states at the [MASK] positions of the second example only.
mask_pos1 = inputs["input_ids"][1].eq(tokenizer.mask_token_id)
res1 = last_hidden[1][mask_pos1]

In [52]:
# Element-wise comparison of the regrouped rows for example 1 with the rows selected
# directly for example 0.
# NOTE(review): this compares example 1 against example 0, so all-False is expected;
# if the goal was to validate the regrouping, the operand was presumably meant to be
# `res1` - confirm.
res_[1] == res0

tensor([[False, False, False,  ..., False, False, False],
        [False, False, False,  ..., False, False, False]], device='cuda:0')