<a href="https://colab.research.google.com/github/ywangumichigan/EECS595-Project/blob/main/RAG%20new.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [13]:
!pip install --quiet wikipedia
!pip install --quiet nltk
!pip install --quiet gensim
!pip install --quiet faiss-cpu
!pip install --quiet langchain-text-splitters
!pip install --quiet transformers
!pip install --quiet torch


In [14]:
# --- Imports ---
import nltk
import numpy as np
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
import faiss
from langchain_text_splitters import RecursiveCharacterTextSplitter
from transformers import AutoTokenizer, AutoModelForCausalLM

# --- Download tokenizer data ---
nltk.download('punkt')

# --- Step 1: Collect Wikipedia data ---
import wikipedia

topics = ["Algebra", "Calculus", "Derivative", "Integral",
          "Matrix (mathematics)", "Probability", "Statistics",
          "Geometry", "Trigonometry", "Number theory"]

wiki_corpus = []
for topic in topics:
    try:
        page = wikipedia.page(topic)
        wiki_corpus.append({"title": topic, "content": page.content})
        print(f"Collected: {topic}")
    except:
        print(f"Skipped {topic}")



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Collected: Algebra
Collected: Calculus
Skipped Derivative
Collected: Integral
Collected: Matrix (mathematics)
Collected: Probability
Collected: Statistics
Collected: Geometry
Collected: Trigonometry
Collected: Number theory


In [15]:
# --- Step 2: Split long pages into smaller chunks ---
nltk.download('punkt_tab')

splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,      # around 500 characters per chunk
    chunk_overlap=100,   # slight overlap to preserve context
)

chunks = []
titles = []
for doc in wiki_corpus:
    doc_chunks = splitter.split_text(doc["content"])
    chunks.extend(doc_chunks)
    titles.extend([doc["title"]] * len(doc_chunks))

print("Total chunks:", len(chunks))


Total chunks: 1391


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [16]:

# --- Step 3: Train Word2Vec embeddings on all chunks ---
sentences = [word_tokenize(chunk.lower()) for chunk in chunks]
model_w2v = Word2Vec(sentences, vector_size=200, window=5, min_count=1, sg=1)

# --- Step 4: Turn each chunk into a vector ---
def doc_vector(model, text):
    words = [w for w in word_tokenize(text.lower()) if w in model.wv]
    if len(words) == 0:
        return np.zeros(model.vector_size)
    return np.mean(model.wv[words], axis=0)

doc_vectors = np.array([doc_vector(model_w2v, chunk) for chunk in chunks])


In [17]:

# --- Step 5: Create FAISS index ---
d = model_w2v.vector_size
index = faiss.IndexFlatL2(d)
index.add(doc_vectors.astype("float32"))
print("Indexed", index.ntotal, "chunks")

# --- Step 6: Define retrieval + answer function ---

from transformers import AutoTokenizer, AutoModelForSeq2SeqLM

tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-large")
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-large")


Indexed 1391 chunks


In [18]:

def ask(query, k=3):
    # Embed query
    query_vec = doc_vector(model_w2v, query).astype("float32").reshape(1, -1)

    D, I = index.search(query_vec, k)
    retrieved_chunks = [chunks[i][:1000] for i in I[0]]
    context = "\n\n".join(retrieved_chunks)

    # Build prompt
    prompt = f"""
    Please answer the question based on the retrieved context below.
    Context:
    {context}

    Question: {query}
    """

    # Generate answer
    inputs = tokenizer(prompt, return_tensors="pt", truncation=True)
    outputs = model.generate(**inputs, max_new_tokens=150)
    answer = tokenizer.decode(outputs[0], skip_special_tokens=True)



    print("Top related titles:", [titles[i] for i in I[0]])
    print("\nLLM Answer:\n")
    return answer

# --- Step 7: Example queries ---
ask("what's the result of 1+1")


Top related titles: ['Probability', 'Statistics', 'Geometry']

LLM Answer:



'unanswerable'

In [19]:
import langchain

print("LangChain version:", langchain.__version__)



LangChain version: 1.1.0


In [20]:
# --- Install dependencies ---
!pip install --quiet wikipedia langchain langchain-community langchain-text-splitters faiss-cpu sentence-transformers transformers

# --- Imports ---
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate

# --- Step 1: Collect small Wikipedia sample ---
import re
from langchain_text_splitters import RecursiveCharacterTextSplitter
import wikipedia

topics = ["Algebra", "Calculus", "Derivative", "Integral",
          "Matrix (mathematics)", "Probability", "Statistics",
          "Geometry", "Trigonometry", "Number theory"]

def prune_wiki(text):
    stop_markers = [
        "== See also ==",
        "== References ==",
        "== External links ==",
        "== Further reading =="
    ]
    for marker in stop_markers:
        idx = text.find(marker)
        if idx != -1:
            text = text[:idx]
    return text

raw_docs = []
for topic in topics:
    try:
        page = wikipedia.page(topic)
        content = prune_wiki(page.content)
        raw_docs.append({"title": topic, "content": content})
        print("Collected:", topic)
    except:
        print("Skipped:", topic)

splitter = RecursiveCharacterTextSplitter(
    chunk_size=800,
    chunk_overlap=150,
    separators=["\n\n", "\n", ". ", " ", ""],
)

docs = []
for d in raw_docs:
    chunks = splitter.create_documents(
        [d["content"]],
        metadatas=[{"title": d["title"]}]
    )
    docs.extend(chunks)

print("Total chunks:", len(docs))



Collected: Algebra
Collected: Calculus
Skipped: Derivative
Collected: Integral
Collected: Matrix (mathematics)
Collected: Probability
Collected: Statistics
Collected: Geometry
Collected: Trigonometry
Collected: Number theory
Total chunks: 851


In [21]:


# --- Step 2: Split into small text chunks ---
splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=100)

# adjust the keys here to match your actual dict structure
texts = [item["content"] for item in wiki_corpus]
metadatas = [{"title": item.get("title", "")} for item in wiki_corpus]

docs = splitter.create_documents(texts, metadatas=metadatas)
print("Total chunks:", len(docs))

# --- Step 3: Create embeddings and FAISS vectorstore ---
embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-mpnet-base-v2"
)

vectorstore = FAISS.from_documents(docs, embeddings)
retriever = vectorstore.as_retriever(
    search_type="mmr",
    search_kwargs={"k": 6, "lambda_mult": 0.7}
)



Total chunks: 1391


In [22]:
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM, pipeline
from langchain_community.llms import HuggingFacePipeline  # make sure this import is from langchain_community

# --- Step 4: Load small question-answering model ---
model_id = "google/flan-t5-large"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)
pipe = pipeline("text2text-generation", model=model, tokenizer=tokenizer, max_new_tokens=150)

llm = HuggingFacePipeline(pipeline=pipe)


Device set to use cpu


In [23]:


# --- Step 5: Build QA chain with  customprompt ---
from langchain_core.prompts import PromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_core.output_parsers import StrOutputParser

prompt = PromptTemplate(
    template=(
        "Use the following context to answer the question concisely.\n\n"
        "Context:\n{context}\n\n"
        "Question:\n{question}\n\n"
        "Answer:"
    ),
    input_variables=["context", "question"],
)

def format_docs(docs):
    # docs is a list of Document objects from your retriever
    return "\n\n".join(doc.page_content for doc in docs)

# RAG chain: question (str) -> answer (str)
qa_chain = (
    {
        "context": retriever | format_docs,   # retrieve docs then format
        "question": RunnablePassthrough(),    # pass the original question through
    }
    | prompt                                  # fill the PromptTemplate
    | llm                                     # your HuggingFacePipeline LLM
    | StrOutputParser()                       # get back a plain string
)

# Example query
question = "What is the result of 1+1?"
answer = qa_chain.invoke(question)
print(answer)



1 1 s + 1 2 s + 1 3 s


In [24]:
# --- Step 6: Example queries ---
query = "What is the volume of a box with a height of 6 cm, width 7 cm, and length 9 cm?"
print("\nQuery:", query)
print("\nAnswer:", qa_chain.invoke({"query": query})["result"])

query2 = "Who discovered calculus?"
print("\nQuery:", query2)
print("\nAnswer:", qa_chain.invoke({"query": query2})["result"])


query3 = "What is the result of 1+1"
print("\nQuery:", query3)
print("\nAnswer:", qa_chain.invoke({"query": query3})["result"])


Query: What is the volume of a box with a height of 6 cm, width 7 cm, and length 9 cm?


AttributeError: 'dict' object has no attribute 'replace'

In [None]:

query4 = "What is the integral of 4x "
print("\nQuery:", query4)
print("\nAnswer:", qa_chain.invoke({"query": query4})["result"])