### RAG Model

In [1]:
# !pip install transformers
# conda install -c conda-forge faiss-gpu

In [2]:
from transformers import RagTokenizer
from transformers import RagRetriever
from transformers import RagTokenForGeneration
from transformers import RagSequenceForGeneration
from transformers import RagModel
from transformers import DPRContextEncoderTokenizer
import torch

In [3]:
## Sample1 (use "rag-token-nq")
tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-nq")
retriever = RagRetriever.from_pretrained("facebook/rag-token-nq", index_name="exact", use_dummy_dataset=True)

# Initialize with RagRetriever
model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", retriever=retriever)

inputs = tokenizer("How many people live in Paris?", return_tensors="pt")

# with tokenizer.as_target_tokenizer():
targets = tokenizer("In Paris, there are 10 million people.", return_tensors="pt")
input_ids = inputs["input_ids"]
labels = targets["input_ids"]
outputs = model(input_ids=input_ids, labels=labels)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called fr




In [4]:
# Optional: use retriever separately
model = RagTokenForGeneration.from_pretrained("facebook/rag-token-nq", use_dummy_dataset=True)
# 1. Encode
question_hidden_states = model.question_encoder(input_ids)[0]

# 2. Retrieve
docs_dict = retriever(input_ids.numpy(), question_hidden_states.detach().numpy(), return_tensors="pt")
doc_scores = torch.bmm(question_hidden_states.unsqueeze(1), docs_dict["retrieved_doc_embeds"].float().transpose(1, 2)).squeeze(1)

# 3. Forward to generator
outputs = model(context_input_ids=docs_dict["context_input_ids"], context_attention_mask=docs_dict["context_attention_mask"], doc_scores=doc_scores, decoder_input_ids=labels)
outputs

Some weights of the model checkpoint at facebook/rag-token-nq were not used when initializing RagTokenForGeneration: ['rag.question_encoder.question_encoder.bert_model.pooler.dense.weight', 'rag.question_encoder.question_encoder.bert_model.pooler.dense.bias']
- This IS expected if you are initializing RagTokenForGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RagTokenForGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RetrievAugLMMarginOutput(loss=None, logits=tensor([[[-1.9397e+01,  5.6320e+00, -2.2659e+00,  ...,  5.9408e+00,
           7.4354e+00,  1.8106e+00],
         [-1.0601e+01,  1.5022e+00,  7.8418e+00,  ...,  1.4301e+00,
           1.4129e+00, -8.5760e-02],
         [-3.5679e+00,  2.5542e-01,  2.2238e+00,  ...,  1.8642e-01,
           8.1896e-02,  3.9636e-01],
         ...,
         [ 1.6917e+00,  1.7313e-01,  1.3211e+01,  ...,  1.5808e-01,
           1.6088e-01,  1.9634e-01],
         [ 1.4014e+00,  1.7203e-01,  1.3208e+01,  ...,  1.8200e-01,
           1.7199e-01,  1.4298e-01],
         [ 1.3044e+00,  1.7966e-01,  1.3211e+01,  ...,  1.8430e-01,
           1.7046e-01,  1.1160e-01]],

        [[-1.4260e-01,  1.6169e-02,  2.2944e+00,  ..., -3.6549e-02,
          -2.1869e-02,  4.8434e-02],
         [-6.0029e-02,  9.7634e-02,  2.5408e+00,  ...,  1.0386e-01,
           1.1406e-01,  9.1566e-02],
         [-5.3237e-01,  8.3934e-02,  2.2713e+00,  ...,  9.1527e-02,
           9.5105e-02,  1.0102e-0

In [5]:
# Optional: directly generate
generated = model.generate(context_input_ids=docs_dict["context_input_ids"], context_attention_mask=docs_dict["context_attention_mask"], doc_scores=doc_scores)
generated_string = tokenizer.batch_decode(generated, skip_special_tokens=True)
generated_string



[' 270,000,000']

In [6]:
## Sample2: use "rag-token-nq"
tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-base")   
retriever = RagRetriever.from_pretrained("facebook/rag-token-base", index_name="exact", use_dummy_dataset=True)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called fr

In [7]:
model = RagTokenForGeneration.from_pretrained("facebook/rag-token-base", retriever=retriever)

Some weights of the model checkpoint at facebook/rag-token-base were not used when initializing RagTokenForGeneration: ['rag.question_encoder.question_encoder.bert_model.pooler.dense.weight', 'rag.question_encoder.question_encoder.bert_model.pooler.dense.bias']
- This IS expected if you are initializing RagTokenForGeneration from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RagTokenForGeneration from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
question = "Who is the greatest scientist in the world?"
input_ids = tokenizer(question, return_tensors="pt").input_ids
output = model.generate(input_ids)
answer = tokenizer.batch_decode(output, skip_special_tokens=True)[0]
print(answer)



11. Albert Einstein / theoretical physics or mathematics. Einstein was deeply impressed by Mahat


In [9]:
question_hidden_states = model.question_encoder(input_ids)[0]
docs = retriever(input_ids.numpy(), question_hidden_states.detach().numpy(), return_tensors="pt")
docs

{'context_input_ids': tensor([[    0,  8098, 27648,  ...,     1,     1,     1],
        [    0,  8098, 27648,  ...,     1,     1,     1],
        [    0,  8098, 27648,  ...,     1,     1,     1],
        [    0, 11502,  1653,  ...,     1,     1,     1],
        [    0,  1817,  1589,  ...,     1,     1,     1]]), 'context_attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'retrieved_doc_embeds': tensor([[[ 0.1247,  0.2402, -0.4992,  ..., -0.1777,  0.6764, -0.1529],
         [-0.0267,  0.4492, -0.2772,  ..., -0.3479,  0.5957,  0.1257],
         [ 0.1171,  0.2806,  0.0444,  ..., -0.1164,  0.6655, -0.2059],
         [-0.0199,  0.3306, -0.4458,  ..., -0.5286,  0.7065,  0.3285],
         [-0.1162,  0.0919, -0.4464,  ..., -0.0072,  0.9548, -0.4109]]]), 'doc_ids': tensor([[2116, 2064, 2083,  641, 9715]], dtype=torch.int32)}

In [10]:
## Document context
doc_context = tokenizer.batch_decode(docs["context_input_ids"], skip_special_tokens=True)[0]
print(doc_context)

 Albert Einstein / theoretical physics or mathematics. He strongly advocated the idea of a democratic global government that would check the power of nation-states in the framework of a world federation. The FBI created a secret dossier on Einstein in 1932, and by the time of his death his FBI file was 1,427 pages long. Einstein was deeply impressed by Mahatma Gandhi. He exchanged written letters with Gandhi, and called him "a role model for the generations to come" in a letter writing about him. Einstein spoke of his spiritual outlook in a wide array of original writings and interviews. Einstein stated that // who is the greatest scientist in the world?


In [11]:
## Document title
doc_titles = tokenizer.batch_decode(docs["doc_ids"], skip_special_tokens=True)
print(doc_titles)

[' cross Sp Sun Department PH']


In [12]:
## Top tokens
probs = model(input_ids, labels=output, return_dict=True).logits
probs = probs.softmax(dim=-1)
top_tokens = tokenizer.batch_decode(probs[0,-1].topk(5).indices)
print(top_tokens)

['.', 'ge', 'k', 'ar', 't']


In [13]:
## Ackknowledges:
## 1. Transformers (Huggingface.co)
## 2. Udemy courses
## 3. OpenAI
## All above for practices only.