In [1]:
from transformers import RagTokenizer, RagRetriever, RagSequenceForGeneration
from datasets import Dataset
import faiss
import numpy as np

#1) Prepare docs (list of strings)

docs = [
"Earth is the third planet from the Sun.",
"Jupiter has a strong magnetic field and many moons.",
# ... your doc passages ...
]

#2) Build simple FAISS index with embeddings (example using sentence-transformers)

from sentence_transformers import SentenceTransformer
embedder = SentenceTransformer("all-MiniLM-L6-v2")
doc_embeddings = embedder.encode(docs, convert_to_numpy=True)

d = doc_embeddings.shape[1]
index = faiss.IndexFlatIP(d)
faiss.normalize_L2(doc_embeddings)
index.add(doc_embeddings)

#3) Build HF components

tokenizer = RagTokenizer.from_pretrained("facebook/rag-token-base")

#HuggingFace RagRetriever expects specific index format; here we use a simple retriever wrapper:

retriever = RagRetriever.from_pretrained(
"facebook/rag-token-base",
index_name="custom",
passages=docs,                # small example: pass docs directly
)

model = RagSequenceForGeneration.from_pretrained("facebook/rag-token-base", retriever=retriever)

#4) Ask a question

question = "Which planet is third from the Sun?"
inputs = tokenizer(question, return_tensors="pt")
generated = model.generate(input_ids=inputs["input_ids"], attention_mask=inputs["attention_mask"], do_deduplication=True, num_return_sequences=1)
print(tokenizer.batch_decode(generated, skip_special_tokens=True))

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'DPRQuestionEncoderTokenizerFast'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'RagTokenizer'. 
The class this function is called from is 'BartTokenizer'.
The tokenizer class you load from this checkpoint is not the same type as the class this function is called fr

ValueError: Please provide `dataset_path` and `index_path` after calling `dataset.save_to_disk(dataset_path)` and `dataset.get_index('embeddings').save(index_path)`.