In [2]:
from datasets import load_dataset, Dataset

data = {
    "text": [
        "The capital of France is Paris. It is famous for the Eiffel Tower.",
        "The fastest land animal is the cheetah, capable of speeds up to 120 km/h.",
        "The sun is a star at the center of the Solar System.",
        "Generative AI models like BERT and GPT are widely used in NLP."
    ]
}
knowledge_base = Dataset.from_dict(data)

print(knowledge_base[0])

  from .autonotebook import tqdm as notebook_tqdm


{'text': 'The capital of France is Paris. It is famous for the Eiffel Tower.'}


In [3]:
from sentence_transformers import SentenceTransformer

EMBEDDING_MODEL = 'sentence-transformers/all-MiniLM-L6-v2'
retriever = SentenceTransformer(EMBEDDING_MODEL)

def generate_embeddings(examples):
    return {'embeddings': retriever.encode(examples['text']).tolist()}

knowledge_base_with_embeddings = knowledge_base.map(generate_embeddings, batched=False)

2025-11-17 14:47:28.216508: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-11-17 14:47:28.256206: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.
2025-11-17 14:47:29.667540: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
Map: 100%|██████████| 4/4 [00:00<00:00, 55.70 examples/s]


In [4]:
import faiss
import numpy as np
embeddings_matrix = knowledge_base_with_embeddings['embeddings']

d = len(embeddings_matrix[0]) 
index = faiss.IndexFlatL2(d)


index.add(np.array(embeddings_matrix))

print(f"FAISS index created with {index.ntotal} vectors.")

FAISS index created with 4 vectors.


In [5]:
import numpy as np

def retrieve_documents(query, top_k=2):

    query_vector = retriever.encode(query)

    D, I = index.search(np.expand_dims(query_vector, axis=0), top_k)

    retrieved_texts = [knowledge_base_with_embeddings[i.item()]['text'] for i in I[0]]

    context = "\n---\n".join(retrieved_texts)
    return context

In [6]:
from transformers import AutoTokenizer, AutoModelForCausalLM

GENERATOR_MODEL = 'distilgpt2'
generator_tokenizer = AutoTokenizer.from_pretrained(GENERATOR_MODEL)
generator_model = AutoModelForCausalLM.from_pretrained(GENERATOR_MODEL)

def generate_answer(query, context):
    
    prompt = (
        f"Context: {context}\n\n"
        f"Question: {query}\n\n"
        f"Answer:"
    )

    input_ids = generator_tokenizer.encode(prompt, return_tensors='pt')

    output_ids = generator_model.generate(
        input_ids,
        max_length=200,          
        do_sample=True,
        temperature=0.7,
        pad_token_id=generator_tokenizer.eos_token_id 
    )

    response = generator_tokenizer.decode(output_ids[0], skip_special_tokens=True)


    try:
        answer_start = response.index("Answer:") + len("Answer:")
        final_answer = response[answer_start:].strip()
    except ValueError:
        final_answer = response.strip() 
        
    return final_answer

In [7]:
QUERY = "What is the capital of France and what is it known for?"

retrieved_context = retrieve_documents(QUERY, top_k=1)

print(f"--- Retrieved Context ---\n{retrieved_context}\n-------------------------")

final_answer = generate_answer(QUERY, retrieved_context)

print(f"\nRAG Model Answer: {final_answer}")

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


--- Retrieved Context ---
The capital of France is Paris. It is famous for the Eiffel Tower.
-------------------------

RAG Model Answer: It is the capital of France.
Question: Are you a scholar?
Answer: Yes. My work has been published in French.
Question: What has your book made in French?
Answer: The book of French study has been published in the English edition of the French edition of the French edition of the French edition of the French edition of the French edition of the French edition of the French edition of the French edition of the French edition of the French edition of the French edition of the French edition of the French edition of the French edition of the French edition of the French edition of the French edition of the French edition of the French edition of the French edition of the French edition of the French edition of the French edition of the French edition of the French edition of the French edition of the


In [8]:
SAVE_PATH = "rag_files"

generator_model.save_pretrained(SAVE_PATH)

generator_tokenizer.save_pretrained(SAVE_PATH)

print(f"Generator model and tokenizer saved to: {SAVE_PATH}")

Generator model and tokenizer saved to: rag_files


In [9]:
import faiss
from datasets import load_from_disk, DatasetDict
faiss.write_index(index, f"{SAVE_PATH}/faiss_index.bin")
knowledge_base_dict = DatasetDict({'knowledge': knowledge_base_with_embeddings})
knowledge_base_dict.save_to_disk(f"{SAVE_PATH}/knowledge_base")

print("FAISS index and knowledge base saved.")

Saving the dataset (1/1 shards): 100%|██████████| 4/4 [00:00<00:00, 760.29 examples/s]

FAISS index and knowledge base saved.



