In [3]:
import torch
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel, AutoConfig, AutoModelForQuestionAnswering
from datasets import load_dataset

In [5]:
task_name_to_instruct = {"example": "Given a question, retrieve passages that answer the question",}

query_prefix = "Instruct: "+task_name_to_instruct["example"]+"\nQuery: "
queries = [
    'are judo throws allowed in wrestling?', 
    'how to become a radiology technician in michigan?'
    ]
passage_prefix = ""
passages = [
    "Since you're reading this, you are probably someone from a judo background or someone who is just wondering how judo techniques can be applied under wrestling rules. So without further ado, let's get to the question. Are Judo throws allowed in wrestling? Yes, judo throws are allowed in freestyle and folkstyle wrestling. You only need to be careful to follow the slam rules when executing judo throws. In wrestling, a slam is lifting and returning an opponent to the mat with unnecessary force.",
    "Below are the basic steps to becoming a radiologic technologist in Michigan:Earn a high school diploma. As with most careers in health care, a high school education is the first step to finding entry-level employment. Taking classes in math and science, such as anatomy, biology, chemistry, physiology, and physics, can help prepare students for their college studies and future careers.Earn an associate degree. Entry-level radiologic positions typically require at least an Associate of Applied Science. Before enrolling in one of these degree programs, students should make sure it has been properly accredited by the Joint Review Committee on Education in Radiologic Technology (JRCERT).Get licensed or certified in the state of Michigan."
]

In [6]:
# huggingface token
hf_token = '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'

config = AutoConfig.from_pretrained('nvidia/NV-Embed-v1', use_auth_token=hf_token, trust_remote_code=True)

# get Nvidia model
model = AutoModel.from_pretrained('nvidia/NV-Embed-v1', config=config, use_auth_token=hf_token, trust_remote_code=True)

tokenizer = AutoTokenizer.from_pretrained('nvidia/NV-Embed-v1', use_auth_token=hf_token)




Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [20]:
# queries
max_length = 4096
query_embeddings = model.encode(queries, instruction=query_prefix, max_length=max_length)
passage_embeddings = model.encode(passages, instruction=passage_prefix, max_length=max_length)

# normalise embeddings
query_embeddings = F.normalize(query_embeddings, p=2, dim=1)
passage_embeddings = F.normalize(passage_embeddings, p=2, dim=1)

  'input_ids': torch.tensor(batch_dict.get('input_ids').to(batch_dict.get('input_ids')).long()),
  'input_ids': torch.tensor(batch_dict.get('input_ids').to(batch_dict.get('input_ids')).long()),


In [23]:
scores = (query_embeddings @ passage_embeddings.T) * 100
print(scores.tolist())

[[77.94023132324219, 0.4248908758163452], [3.757723093032837, 79.60116577148438]]


In [22]:
print(query_embeddings)
print(passage_embeddings)

cosine_sim = torch.matmul(query_embeddings, passage_embeddings.T)
ground_truth = torch.arange(len(queries))

# Get the top-1 prediction for each query
_, top_k_indices = torch.topk(cosine_sim, k=1, dim=1)
top_k_indices = top_k_indices.squeeze()

# Calculate accuracy
accuracy = (top_k_indices == ground_truth).float().mean().item()

print(f'Accuracy: {accuracy:.4f}')

tensor([[-0.0152, -0.0073, -0.0190,  ..., -0.0372, -0.0128, -0.0261],
        [ 0.0044,  0.0101,  0.0243,  ..., -0.0306, -0.0120,  0.0363]])
tensor([[-0.0095,  0.0006, -0.0150,  ..., -0.0281, -0.0132, -0.0083],
        [ 0.0234,  0.0313,  0.0258,  ..., -0.0356, -0.0245,  0.0211]])
Accuracy: 1.0000


In [34]:
# Preprocess function
def preprocess_function(examples, _):
    max_length = 384
    tokenized = tokenizer(
        examples["question"], examples["context"],
        truncation="only_second",
        max_length=max_length,
        stride=128,
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
        return_tensors="pt",  
    )
    # Ensure all tokenized sequences have the same length
    for key in tokenized.keys():
        tokenized[key] = tokenized[key][:, :max_length]
    return tokenized

def encode_texts(model, tokenizer, texts, instruction, max_length):
    inputs = tokenizer(texts, padding=True, truncation=True, max_length=max_length, return_tensors='pt')
    with torch.no_grad():
        outputs = model(**inputs)
    embeddings = outputs.last_hidden_state.mean(dim=1)  # Assuming embeddings are taken from the mean of last hidden state
    return embeddings

In [38]:
# testing on SQuAD data
dataset = load_dataset("squad_v2", split="validation")
tokenized_dataset = dataset.map(lambda example: tokenizer(example['question'], example['context'], truncation=True, padding='max_length'), batched=True)

Map:   0%|          | 0/11873 [00:00<?, ? examples/s]

Asking to pad to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no padding.
Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [None]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    contexts = [c.strip() for c in examples["context"]]
    return tokenizer(questions, contexts, truncation=True, padding="max_length", max_length=384)

tokenized_squad = dataset.map(preprocess_function, batched=True, remove_columns=dataset["train"].column_names)

In [42]:
model2 = AutoModelForQuestionAnswering.from_pretrained("nvidia/NV-Embed-v1")


ValueError: Unrecognized configuration class <class 'transformers_modules.nvidia.NV-Embed-v1.7f6188488250b5bd3a334d93dfce0f1291f240e3.configuration_nvembed.NVEmbedConfig'> for this kind of AutoModel: AutoModelForQuestionAnswering.
Model type should be one of AlbertConfig, BartConfig, BertConfig, BigBirdConfig, BigBirdPegasusConfig, BloomConfig, CamembertConfig, CanineConfig, ConvBertConfig, Data2VecTextConfig, DebertaConfig, DebertaV2Config, DistilBertConfig, ElectraConfig, ErnieConfig, ErnieMConfig, FalconConfig, FlaubertConfig, FNetConfig, FunnelConfig, GPT2Config, GPTNeoConfig, GPTNeoXConfig, GPTJConfig, IBertConfig, LayoutLMv2Config, LayoutLMv3Config, LEDConfig, LiltConfig, LlamaConfig, LongformerConfig, LukeConfig, LxmertConfig, MarkupLMConfig, MBartConfig, MegaConfig, MegatronBertConfig, MobileBertConfig, MPNetConfig, MptConfig, MraConfig, MT5Config, MvpConfig, NezhaConfig, NystromformerConfig, OPTConfig, QDQBertConfig, ReformerConfig, RemBertConfig, RobertaConfig, RobertaPreLayerNormConfig, RoCBertConfig, RoFormerConfig, SplinterConfig, SqueezeBertConfig, T5Config, UMT5Config, XLMConfig, XLMRobertaConfig, XLMRobertaXLConfig, XLNetConfig, XmodConfig, YosoConfig.