# RAG
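A minimal retrieval-augmented generation (RAG) pipeline: load a PDF, split it into chunks, embed the chunks into a Chroma vector store, and answer questions about the document with a quantized Llama 3 model.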

In [None]:
# Installs

# LLM stuff
!pip install torch transformers accelerate bitsandbytes
# LangChain (splitting)
!pip install langchain langchain-community langchain-core langchain-text-splitters langchain-chroma
# Embeddings
!pip install langchain-huggingface sentence-transformers
# Vector Store, Documents
!pip install chromadb pymupdf

# Runtime -> Restart session



In [None]:
# Imports

# Standard library
import hashlib  # deterministic chunk ids for the vector store
import warnings

import torch # GPU usage
# LLM Stuff
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    BitsAndBytesConfig,
    pipeline,
)

# Notebook display
from IPython.display import Markdown

# LangChain
from langchain_community.document_loaders import PyMuPDFLoader, WikipediaLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_community.llms.huggingface_pipeline import HuggingFacePipeline
from langchain_classic.chains import RetrievalQA

# Prompting
from langchain_core.prompts import PromptTemplate

# Suppress warnings
warnings.filterwarnings("ignore")

In [None]:
# Load LLM + Tokenizer (Requires HF_TOKEN in Colab secrets)
model_id = "meta-llama/Meta-Llama-3-8B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_id)
# Use the end-of-sequence token as the padding token.
tokenizer.pad_token_id = tokenizer.eos_token_id

# Quantization: load model weights with less precision to save memory
quant_config = BitsAndBytesConfig(load_in_8bit=True)

llm_model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="auto",# choose device (cuda if available, else CPU)
    dtype=torch.bfloat16 if torch.cuda.is_available() else torch.float32,
    quantization_config=quant_config,
)

# Pipeline: wraps tokenization, generation and decoding behind a single call
gen_pipe = pipeline(
    task="text-generation",
    model=llm_model,
    tokenizer=tokenizer,
    # Sampling is enabled explicitly: temperature only has an effect when the
    # generator samples, and relying on the checkpoint's own generation config
    # to turn sampling on would make the setting below silently optional.
    do_sample=True,
    temperature=0.1,# controls variation
    repetition_penalty=1.2,# discourages repetition
    max_new_tokens=256,# response size limit
    pad_token_id=tokenizer.eos_token_id,
    return_full_text=False,# only output new text (set to true to include prompt + context)
)

# Adapter that exposes the Hugging Face pipeline as a LangChain LLM
llm = HuggingFacePipeline(pipeline=gen_pipe)

tokenizer_config.json:   0%|          | 0.00/51.0k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.09M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/73.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/654 [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/23.9k [00:00<?, ?B/s]

Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/187 [00:00<?, ?B/s]

Device set to use cuda:0


In [None]:
# Load Documents (PDF)
# The path points at a file uploaded to the Colab session storage.

pdf_path = "/content/ap-us-history-course-and-exam-description.pdf"
pdf_loader = PyMuPDFLoader(pdf_path)
docs = pdf_loader.load()

In [None]:
# Chunking: break the loaded documents into smaller, overlapping pieces
splitter = RecursiveCharacterTextSplitter(
    separators=["\n\n", "\n", ".", " ", ""],
    chunk_overlap=60,# prevent info loss at chunk edges
    chunk_size=1000,
)

chunks = splitter.split_documents(documents=docs)

In [None]:
# Embeddings: turn text into numerical vectors
model_name = "sentence-transformers/all-mpnet-base-v2"

# Run the embedding model on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    model_kwargs = {"device": "cuda"}
else:
    model_kwargs = {"device": "cpu"}

hf_embeddings = HuggingFaceEmbeddings(
    model_kwargs=model_kwargs,
    model_name=model_name,
)

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/438M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [None]:
# Vector Store (search for similar embeddings)
persist_directory = "./vector_store"

# Deterministic ids make this cell safe to re-run: the store is persisted on
# disk, so inserting the same chunks without explicit ids would add them again
# under new ids and retrieval would start returning duplicates. With stable ids
# a re-run overwrites the existing entries instead.
# The position is part of the id so that two chunks with identical text still
# get distinct ids within one batch.
# NOTE: entries left over from a previous, different input are not removed;
# delete the persist directory to rebuild the store from scratch.
chunk_ids = [
    hashlib.sha256(
        f"{position}:{chunk.metadata.get('source', '')}:{chunk.page_content}".encode("utf-8")
    ).hexdigest()
    for position, chunk in enumerate(chunks)
]

vectordb = Chroma.from_documents(
    documents=chunks,
    embedding=hf_embeddings,
    ids=chunk_ids,
    persist_directory=persist_directory,
)

# Retriever with the store's default search settings
retriever = vectordb.as_retriever()

In [None]:
# Prompt template
# {context} receives the retrieved chunks and {question} the user's query.
# The template ends with an explicit "Answer:" cue so the model's completion
# starts with the answer itself instead of generating its own answer label.
rag_template = """Use the given context to answer the question.
If you don't know the answer, say you don't know. Do not make up facts.
Keep the answer concise.

Context:

{context}

Question:

{question}

Answer:
"""
prompt = PromptTemplate.from_template(rag_template)

In [None]:
# Q&A chain: fetch the most similar chunks, insert them into the prompt's
# {context} slot ("stuff" chain type) and let the LLM write the answer.
qa_chain = RetrievalQA.from_chain_type(
    llm=llm,
    chain_type="stuff",
    retriever=vectordb.as_retriever(search_kwargs={"k": 3}),# retrieve top-k chunks
    chain_type_kwargs={"prompt": prompt},
    return_source_documents=True,# keep the retrieved chunks alongside the answer
)

In [None]:
# Ask a question and render the model's answer as Markdown
question = "What do I need to know about George Washington for AP US History?"
qa_response = qa_chain.invoke({"query": question})

Markdown(qa_response["result"])

Answer:
According to the context, as an AP U.S. History student, you should know that George Washington's military leadership was one factor that contributed to the success of the Patriot cause despite significant Loyalist opposition. This information can be found in the Optional Activity "Match Claims with Evidence" under Period 3: 1754-1800.