In [None]:
# !pip install langchain langchain-community langchain-chroma transformers sentence-transformers pypdf


In [7]:
import os
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_chroma import Chroma
from langchain_community.embeddings import HuggingFaceEmbeddings
from transformers import pipeline

# Load PDFs from a folder
def load_docs(folder_path):
    docs = []
    for file in os.listdir(folder_path):
        if file.endswith(".pdf"):
            loader = PyPDFLoader(os.path.join(folder_path, file))
            docs.extend(loader.load())
    return docs

# Update this path to where your PDFs are stored
docs = load_docs("data/")
print("PDF Pages Loaded:", len(docs))

PDF Pages Loaded: 3


In [8]:
# Split PDFs into chunks
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=500,
    chunk_overlap=80
)
chunks = text_splitter.split_documents(docs)
print("Chunks Created:", len(chunks))

Chunks Created: 12


In [9]:
# Embeddings
embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")

# Save texts into Chroma vector DB
texts = [c.page_content for c in chunks]
db = Chroma(
    collection_name="rag_store",
    embedding_function=embedding_model
)
db.add_texts(texts)

# Retriever
retriever = db.as_retriever(search_kwargs={"k": 3})

  embedding_model = HuggingFaceEmbeddings(model_name="all-MiniLM-L6-v2")
Error while fetching `HF_TOKEN` secret value from your vault: 'Requesting secret HF_TOKEN timed out. Secrets can only be fetched when running from the Colab UI.'.
You are not authenticated with the Hugging Face Hub in this notebook.
If the error persists, please let us know by opening an issue on GitHub (https://github.com/huggingface/huggingface_hub/issues/new).


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [10]:
# Local LLM
llm = pipeline(
    "text2text-generation",
    model="google/flan-t5-base",
    max_new_tokens=150
)

config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/990M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

Device set to use cpu


In [11]:
# Agent brain
def agent_controller(query):
    q = query.lower()
    if any(word in q for word in ["pdf", "document", "data", "summarize", "information", "find"]):
        return "search"
    return "direct"

In [12]:
# RAG
def rag_answer(query):
    action = agent_controller(query)

    if action == "search":
        print(f"🕵️ Agent decided to SEARCH document for: '{query}'")
        results = retriever.invoke(query)
        context = "\n".join([r.page_content for r in results])
        final_prompt = f"Use this context:\n{context}\n\nAnswer:\n{query}"
    else:
        print(f"🤖 Agent decided to answer DIRECTLY: '{query}'")
        final_prompt = query

    response = llm(final_prompt)[0]["generated_text"]
    return response

# Test 1: A document-specific question
query = "Give me a 5-point summary from the PDF"
print(rag_answer(query))

print("-" * 20)

# Test 2: A general knowledge question
print(rag_answer("What is an Ideal Resume Format? Explain in 50 words."))

🕵️ Agent decided to SEARCH document for: 'Give me a 5-point summary from the PDF'
Data Scientist Centre for Non-Communicable Diseases (CNCD) | Aug 2018 – May 2021 Model Evaluation ML & NLP Frameworks: TensorFlow, PyTorch, Scikit-learn, Hugging Face, LangChain, LangGraph, LlamaIndex, BERT, GPT, T5, LLaMA Data Engineering & Big Data: SQL, PySpark, Pandas, NumPy, ETL Pipelines, BigQuery, Hadoop, Spark, Hive MLOps & Deployment: MLflow, Docker, Kubernetes, Jenkins, GitHub Actions,
--------------------
🤖 Agent decided to answer DIRECTLY: 'What is an Ideal Resume Format? Explain in 50 words.'
An Ideal Resume Format is a format for a resume to be written in a professional manner. An Ideal Resume Format is a format for a resume to be written in a professional manner. An Ideal Resume Format is a format for a resume to be written in a professional manner. An Ideal Resume Format is a format for a resume to be written in a professional manner. An Ideal Resume Format is a format for a resume to be w