Code reference: Hugging Face, https://huggingface.co/ngxson/demo_simple_rag_py/blob/main/demo.py

Text file source: Wikipedia, https://en.wikipedia.org/wiki/Smartphone

In [None]:
from sentence_transformers import SentenceTransformer
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch
import numpy as np

In [None]:
# Start from an empty list so `dataset` is bound before the file is opened.
dataset = []
# Keep one entry per non-blank line, with surrounding whitespace removed.
with open('smartphones.txt', 'r', encoding='utf-8') as file:
    dataset = [entry for entry in map(str.strip, file) if entry]
print(f"Loaded {len(dataset)} entries")

In [None]:
# Bare expression: in a notebook this shows the loaded entries as the cell output.
dataset

In [None]:
# Model identifiers: one for sentence embeddings, one for text generation.
EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"
LANGUAGE_MODEL = "google-t5/t5-base"

embedder = SentenceTransformer(EMBEDDING_MODEL) # a sentence embedding model
# The tokenizer and the T5 model are both loaded from LANGUAGE_MODEL.
tokenizer = T5Tokenizer.from_pretrained(LANGUAGE_MODEL)
t5_model = T5ForConditionalGeneration.from_pretrained(LANGUAGE_MODEL)

In [None]:
# In-memory vector store: a list of (chunk, embedding) tuples.
VECTOR_DB = []

def add_chunk_to_db(chunk):
    """Embed ``chunk`` and append the ``(chunk, embedding)`` pair to ``VECTOR_DB``."""
    VECTOR_DB.append((chunk, embedder.encode(chunk)))

# Embed and store every dataset entry, printing 1-based progress after each one.
for i, chunk in enumerate(dataset):
    add_chunk_to_db(chunk)
    print(f"Added chunk {i+1}/{len(dataset)} to database")

In [None]:
# Inspect the first stored (chunk, embedding) pair.
VECTOR_DB[0]

In [None]:
def cosine_similarity(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    The result is ``dot(a, b) / (||a|| * ||b||)``. When either vector has
    zero length the ratio is undefined (0/0 would yield NaN and a NumPy
    warning), so ``0.0`` is returned instead; this keeps similarity scores
    comparable when they are sorted.
    """
    norm_product = np.linalg.norm(a) * np.linalg.norm(b)
    if norm_product == 0:
        # No direction to compare against: treat as "no similarity".
        return 0.0
    return np.dot(a, b) / norm_product

In [None]:
# retrieval
def retrieve(query, top_n=3):
    """Return the ``top_n`` stored chunks most similar to ``query``.

    The query is embedded with ``embedder`` and scored against every entry
    of ``VECTOR_DB`` using ``cosine_similarity``. The result is a list of
    ``(chunk, similarity)`` tuples ordered from highest to lowest similarity.
    """
    query_emb = embedder.encode(query)

    scored = []
    for chunk, emb in VECTOR_DB:
        scored.append((chunk, cosine_similarity(query_emb, emb)))

    # Highest similarity first; ties keep their VECTOR_DB order (stable sort).
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    return ranked[:top_n]

In [None]:
# Read the user's question and fetch the best-matching chunks (retrieve's default top_n).
query = input("Ask me a question: ")
retrieved = retrieve(query)

# Show each retrieved chunk with its similarity score rounded to two decimals.
print("\nRetrieved knowledge:")
for chunk, sim in retrieved:
    print(f" - (similarity: {sim:.2f}) {chunk}")

In [None]:
# augmentation: format the retrieved chunks as a bulleted list and place them,
# together with the user's question, into the prompt template.
context = "\n".join([f"- {chunk}" for chunk, _ in retrieved])
prompt = f"""
You are a helpful chatbot. Use only the following information to answer the question.
Do not add extra knowledge.

Context:
{context}

Question: {query}
Answer:
"""

# generation: tokenize the prompt into PyTorch tensors, truncated to at most
# 512 tokens, then generate up to 128 new tokens.
# NOTE(review): the question and "Answer:" come last in the prompt, so a long
# context could push them past the truncation point — confirm.
inputs = tokenizer(prompt, return_tensors="pt", max_length=512, truncation=True)
outputs = t5_model.generate(**inputs, max_new_tokens=128)

# Decode the first generated sequence, omitting special tokens.
print("\nChatbot response:")
print(tokenizer.decode(outputs[0], skip_special_tokens=True))