In [2]:
import faiss
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from transformers import pipeline




  from .autonotebook import tqdm as notebook_tqdm


In [9]:
# 1. Load FAISS index and processed chunks
index = faiss.read_index("../models/embeddings.faiss")
df = pd.read_csv("../data/processed/chunks_embedded.csv")

# 2. Load the same embedding model used before
embed_model = SentenceTransformer("all-mpnet-base-v2")

# 3. Load a small language model for answer generation
# You can use a local or HuggingFace-hosted model.
# This one runs on CPU and gives good quality short answers.
qa_model = pipeline("text2text-generation", model="google/flan-t5-small")

# 4. Helper: retrieve top relevant chunks
def retrieve_context(query, k=3):
    q_emb = embed_model.encode([query])
    D, I = index.search(np.array(q_emb).astype("float32"), k)
    contexts = df.iloc[I[0]]["text"].tolist()
    return contexts

# 5. Helper: build the full prompt for the LLM
def build_prompt(query, contexts):
    context_text = "\n\n".join(contexts)
    prompt = f"Answer the question based only on the context below.\n\nContext:\n{context_text}\n\nQuestion: {query}\n\nAnswer briefly:"
    return prompt

# 6. Helper: get final answer
def answer_query(query):
    contexts = retrieve_context(query)
    prompt = build_prompt(query, contexts)
    answer = qa_model(prompt, max_new_tokens=200)[0]["generated_text"]
    return answer, contexts

# 7. Example query
query = "What is the main topic discussed in the document?"
answer, contexts = answer_query(query)

print("🧠 Question:", query)
print("\n💬 Answer:", answer)
print("\n📚 Top supporting contexts:\n")
for c in contexts:
    print("-", c[:200], "...\n")

Device set to use cpu
Token indices sequence length is longer than the specified maximum sequence length for this model (3747 > 512). Running this sequence through the model will result in indexing errors


🧠 Question: What is the main topic discussed in the document?

💬 Answer: Let us lay the cornerstone of American freedom without fear. To hesitate is to perish Bolvar establishes Gran Colombia 220 Life without industry is guilt Stephenson’s Rocket enters service 226 You may choose to look the other way, but you can never again say you did not know The Slave Trade Abolition Act 228 Society was cut in two The 1848 revolutions 230 This enterprise will return immense rewards The construction of the Suez Canal 236 Endless forms most beautiful and most wonderful have been and are being evolved Darwin publishes On the Origin of Species 238 Let us arm. Let us fight for our brothers The Expedition of the Thousand 242 These sad scenes of death and sorrow, when are they to come to an end? The Siege of Lucknow243 Better to abolish serfdom from above, than to wait for it to abolish itself from below Russia emancipates the serfs 244 Government

📚 Top supporting contexts:

- of the Bastille 214 I must