In [12]:

from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
from langchain.llms import HuggingFacePipeline
from langchain.prompts import PromptTemplate
from langchain.chains import LLMChain
import faiss
import numpy as np
import random
import torch

# Step 2: Load a dataset of QA pairs
ds = load_dataset("tatsu-lab/alpaca", split="train")
qa_pairs = [{"question": x["instruction"], "answer": x["output"]} for x in ds]
random.shuffle(qa_pairs)
qa_pairs = qa_pairs[:1000]  # limit for fast local use

# Step 3: Embed documents using sentence-transformers
model = SentenceTransformer("all-MiniLM-L6-v2")
documents = [f"Q: {q['question']}\nA: {q['answer']}" for q in qa_pairs]
embeddings = model.encode(documents, show_progress_bar=True)

# Step 4: Build FAISS vector index
index = faiss.IndexFlatL2(embeddings.shape[1])
index.add(np.array(embeddings))

# Step 5: Load Phi-2 model (CPU-safe float32)
model_id = "microsoft/phi-2"
tokenizer = AutoTokenizer.from_pretrained(model_id)
phi_model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32).cpu()

# Step 6: Set up text generation pipeline
pipe = pipeline("text-generation", model=phi_model, tokenizer=tokenizer, max_new_tokens=200)

# Step 7: LangChain LLM wrapper
llm = HuggingFacePipeline(pipeline=pipe)

# Step 8: LangChain Prompt Template
prompt = PromptTemplate(
    input_variables=["context", "question"],
    template="""You are a helpful assistant. Use the following context to answer the question.
Context:
{context}

Question:
{question}

Answer:"""
)

# Step 9: Combine prompt and LLM into a chain
rag_chain = LLMChain(prompt=prompt, llm=llm)

# Step 10: Final chatbot function
def ask_chatbot(user_input, top_k=3):
    query_emb = model.encode([user_input])
    D, I = index.search(np.array(query_emb), k=top_k)
    context = "\n\n".join([documents[i] for i in I[0]])
    return rag_chain.invoke({"context": context, "question": user_input})["text"]


print(ask_chatbot("what is data privacy?"))

Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Device set to use cpu
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


You are a helpful assistant. Use the following context to answer the question.
Context:
Q: Rewrite this short introduction to make it more engaging.
A: Are we doing enough to protect our data from the ever-growing risk of privacy violations? In this paper, we explore the potential implications of privacy rules and regulations on data confidentiality, equipping ourselves with the knowledge needed to keep our data secure.

Q: Describe 3 ways to protect your data
A: 1. Using strong passwords and maintaining good password hygiene, such as avoiding reusing passwords and regularly changing them.
2. Installing and regularly updating antivirus and anti–malware software. 
3. Encrypting data, such as with the encryption software or a Virtual Private Network (VPN).

Q: Find an example of the given kind of data.
A: An example of qualitative data is an opinion survey which asks respondents to rate certain aspects of a product or service on a scale from one (really bad) to five (really good). This t