In [1]:
# Optional setup: uncomment the line below to install the notebook's dependencies.
# The cells further down also `import torch` and request return_tensors="pt",
# so PyTorch has to be available in the kernel environment as well.
# !pip install pandas numpy langchain-core langchain-community langchain-chroma transformers

# 2.Similar to RAG

## 2.1. RealmConfig and RealmEmbedder

In [2]:
from transformers import RealmConfig, RealmEmbedder

# Build the embedder directly from a default RealmConfig (nothing is loaded
# from a checkpoint here), then keep a handle on the config the model holds.
model = RealmEmbedder(RealmConfig())
configuration = model.config

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Display the configuration read back from the model.
configuration

RealmConfig {
  "attention_probs_dropout_prob": 0.1,
  "bos_token_id": 0,
  "eos_token_id": 2,
  "hidden_act": "gelu_new",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "max_span_width": 10,
  "model_type": "realm",
  "num_attention_heads": 12,
  "num_block_records": 13353718,
  "num_candidates": 8,
  "num_hidden_layers": 12,
  "pad_token_id": 1,
  "reader_beam_size": 5,
  "reader_layer_norm_eps": 0.001,
  "reader_seq_len": 320,
  "retriever_proj_size": 128,
  "searcher_beam_size": 5000,
  "span_hidden_size": 256,
  "transformers_version": "4.45.1",
  "type_vocab_size": 2,
  "vocab_size": 30522
}

## 2.2. RealmTokenizer

In [4]:
from transformers import RealmTokenizer

model_name = "google/realm-cc-news-pretrained-encoder"
tokenizer = RealmTokenizer.from_pretrained(model_name)

# Two groups of two candidate sentences each, encoded with max_length=10.
text = [
    ["Hello world!", "Nice to meet you!"],
    ["The cute cat.", "The adorable dog."],
]
tokenized_text = tokenizer.batch_encode_candidates(
    text,
    max_length=10,
    return_tensors="pt",
)




## 2.3.RealmTokenizerFast

In [5]:
from transformers import RealmTokenizerFast

# Name the checkpoint in this cell instead of inheriting `model_name` from the
# RealmTokenizer cell: later cells rebind `model_name` to other checkpoints, so
# re-running this cell out of order would otherwise load a different tokenizer.
model_name = "google/realm-cc-news-pretrained-encoder"
tokenizer = RealmTokenizerFast.from_pretrained(model_name)

# Same candidate groups as in the RealmTokenizer example.
text = [["Hello world!", "Nice to meet you!"], ["The cute cat.", "The adorable dog."]]

tokenized_text = tokenizer.batch_encode_candidates(text, max_length=10, return_tensors="pt")


## 2.4. RealmEmbedder

In [6]:
from transformers import AutoTokenizer, RealmEmbedder
import torch

model_name = "google/realm-cc-news-pretrained-embedder"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = RealmEmbedder.from_pretrained(model_name)

inputs = tokenizer("Hello my dog is cute", return_tensors="pt")

# Inference only: nothing is back-propagated in this notebook, so skip the
# autograd bookkeeping for the forward pass.
with torch.no_grad():
    outputs = model(**inputs)

# Keep the projected score from the embedder output for later inspection.
projected_score = outputs.projected_score

In [7]:
# Display the complete output object returned by the embedder forward pass.
outputs

RealmEmbedderOutput(projected_score=tensor([[ 0.0104,  0.1440, -0.2996, -0.0171, -0.2020,  0.3074,  0.1743,  0.0951,
          0.0742,  0.2060, -0.2306,  0.1745,  0.0702,  0.0442, -0.0107, -0.1233,
          0.2895,  0.1332,  0.2101, -0.2831,  0.1967, -0.2140, -0.2875, -0.1658,
         -0.0781, -0.1009, -0.1484,  0.1377, -0.0052,  0.1274, -0.1648, -0.2175,
         -0.0926,  0.1561, -0.1623,  0.0720, -0.1116,  0.0968, -0.2673,  0.1744,
          0.1238, -0.1646, -0.1151, -0.2973,  0.0232,  0.2000,  0.0569, -0.0054,
         -0.1775,  0.0825,  0.0477,  0.0840, -0.0737,  0.2533,  0.0917, -0.3854,
         -0.2690, -0.1063, -0.1390, -0.1299,  0.3417,  0.1054, -0.3920,  0.2655,
          0.0299,  0.0093, -0.1478, -0.1488,  0.1248,  0.1710,  0.1008, -0.0475,
          0.1224,  0.3072, -0.0770,  0.2762, -0.2797,  0.0218,  0.0072, -0.0360,
         -0.1487,  0.0146,  0.0945,  0.1217,  0.2518,  0.1535, -0.0093,  0.1245,
         -0.1556,  0.0351, -0.4017, -0.0254, -0.0615,  0.2113,  0.0956, -

## 2.5.RealmScorer

In [8]:
import torch
from transformers import AutoTokenizer, RealmScorer

model_name = "google/realm-cc-news-pretrained-scorer"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# num_candidates=2 lines up with the two candidates listed per query below.
model = RealmScorer.from_pretrained(model_name, num_candidates=2)

# One query per row, each paired with its own list of candidate texts.
input_texts = ["How are you?", "What is the item in the picture?"]
candidates_texts = [
    ["Hello world!", "Nice to meet you!"],
    ["A cute cat.", "An adorable dog."],
]

inputs = tokenizer(input_texts, padding=True, return_tensors="pt")
candidates_inputs = tokenizer.batch_encode_candidates(
    candidates_texts,
    max_length=10,
    return_tensors="pt",
)

# Query tensors are unpacked from `inputs`; candidate tensors are passed explicitly.
outputs = model(
    **inputs,
    candidate_input_ids=candidates_inputs.input_ids,
    candidate_attention_mask=candidates_inputs.attention_mask,
    candidate_token_type_ids=candidates_inputs.token_type_ids,
)

relevance_score = outputs.relevance_score
relevance_score


Some weights of the model checkpoint at google/realm-cc-news-pretrained-scorer were not used when initializing RealmScorer: ['query_embedder.realm.embeddings.position_ids']
- This IS expected if you are initializing RealmScorer from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RealmScorer from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tensor([[0.6536, 0.3909],
        [0.3008, 0.2741]], grad_fn=<ViewBackward0>)

In [9]:
# Display the complete scorer output (relevance_score plus the other fields it carries).
outputs

RealmScorerOutput(relevance_score=tensor([[0.6536, 0.3909],
        [0.3008, 0.2741]], grad_fn=<ViewBackward0>), query_score=tensor([[-9.9364e-02,  1.9061e-02, -1.1568e-01, -2.4949e-03,  5.2143e-02,
          3.2825e-02,  9.0304e-02,  8.8276e-02, -4.8526e-02, -5.8189e-02,
         -4.8693e-02,  5.9679e-02,  2.7601e-02, -7.4258e-03, -1.0389e-02,
          3.7499e-02, -6.7484e-02,  3.5330e-02,  6.4482e-02, -1.8363e-02,
         -4.6311e-02, -4.0528e-02, -4.3615e-03, -9.2813e-02, -8.6109e-02,
          1.9582e-02, -1.2501e-01, -1.3312e-02,  1.4748e-02, -1.3577e-02,
         -3.0497e-02, -5.4675e-03, -2.9807e-02, -1.4008e-02, -1.1179e-01,
          8.3566e-02, -1.7059e-02,  2.0392e-02, -6.7560e-03,  1.0769e-02,
         -5.6942e-02, -3.7400e-02, -6.4183e-02, -5.9047e-02,  2.3588e-02,
          7.9742e-02,  5.8374e-02,  5.3356e-02,  3.0724e-02,  6.6851e-02,
          7.1329e-02, -1.0509e-01, -3.7729e-02,  1.5201e-02, -9.1713e-03,
         -9.0360e-02, -1.1937e-01, -4.9125e-02,  7.3877e-02, 

## 2.6. RealmForOpenQA

In [10]:
import torch
from transformers import AutoTokenizer, RealmForOpenQA, RealmRetriever

model_name = "google/realm-orqa-nq-openqa"

# Tokenizer, retriever and QA model are all loaded from the same checkpoint;
# the retriever is handed to the model when it is constructed.
tokenizer = AutoTokenizer.from_pretrained(model_name)
retriever = RealmRetriever.from_pretrained(model_name)
model = RealmForOpenQA.from_pretrained(model_name, retriever=retriever)

question = "Who is the pioneer in modern computer science?"
question_ids = tokenizer([question], return_tensors="pt")

# Reference answer as bare token ids: no special tokens, masks or segment ids.
answer_ids = tokenizer(
    ["alan mathison turing"],
    add_special_tokens=False,
    return_token_type_ids=False,
    return_attention_mask=False,
).input_ids

# return_dict=False makes the model return a tuple, unpacked here.
reader_output, predicted_answer_ids = model(
    **question_ids,
    answer_ids=answer_ids,
    return_dict=False,
)
predicted_answer = tokenizer.decode(predicted_answer_ids)
loss = reader_output.loss

In [None]:
# Display the decoded predicted answer together with the reader loss.
predicted_answer, loss