#### **Explanation**
* **Introduces Embeddings-Based Memory**
  Teaches how to convert text into vector representations using a SentenceTransformer embedding model (`all-MiniLM-L6-v2`); OpenAI is used only to generate the final answer.

* **Uses ChromaDB for Vector Storage**
  Demonstrates how an AI agent can store and search through vectorized documents with persistent memory.

* **Loads External PDF Knowledge**
  Prepares students to bring in real-world knowledge from documents (e.g., PDF reports, manuals, articles).

* **Performs Similarity Search**
  Uses ChromaDB to find relevant context based on semantic meaning, not just keyword matching.

* **Injects Retrieved Context into LLM Prompt**
  Shows how to ground LLM responses by combining retrieved knowledge with user questions.

* **Implements Retrieval-Augmented Generation Lite**
  Lays the foundation for full RAG systems by walking through the basic building blocks:
  `Document → Embed → Store → Retrieve → Inject → Respond`

* **Console-Based and Interactive**
  Keeps the experience user-driven, helping learners understand how input affects output at each stage.

* **Prepares for Full RAG Pipelines**
  Sets up future upgrades such as:
  * LangChain Retriever & RAGChain
  * Prompt templates
  * Metadata filtering
  * Chunking strategies

In [None]:
!pip install -U openai --quiet
!pip install -U chromadb --quiet
!pip install -U sentence-transformers --quiet
!pip install -U pymupdf --quiet

In [None]:
# === Mount Google Drive so the API key file stored there can be read ===
from google.colab import drive

# Directory inside the Colab VM under which the Drive contents are exposed.
drive_path = "/content/drive"
drive.mount(drive_path)

Mounted at /content/drive


In [None]:
import json

# Location of the JSON file (on the mounted Drive) that holds the API key.
secret_file_path = "/content/drive/My Drive/Secret_Keys/OpenAI_Secret_Key.json"

# Decode explicitly as UTF-8 so the read does not depend on the platform's
# default locale encoding.
with open(secret_file_path, "r", encoding="utf-8") as f:
    data = json.load(f)

# Keep the key in a variable only; never echo or print it, because notebook
# cell outputs are saved with the notebook and would leak the credential.
secret_Key = data["openai_api_key"]

'sk-proj-***REDACTED***'

In [None]:
# Import required modules
import os
import fitz  # PyMuPDF
import chromadb
from chromadb.config import Settings
from sentence_transformers import SentenceTransformer
from uuid import uuid4
from openai import OpenAI

In [None]:
# Configure OpenAI key (use your own)
os.environ["OPENAI_API_KEY"] = secret_Key
client = OpenAI()  # reads the key from the OPENAI_API_KEY environment variable

# ChromaDB setup
PERSIST_DIR = "/content/chroma_rag_pdf"
# chromadb.Client(Settings(persist_directory=...)) builds an in-memory client in
# current Chroma releases (persistence is off unless explicitly enabled), so the
# collection would be lost when the runtime restarts. PersistentClient writes
# the collection to PERSIST_DIR instead.
chroma_client = chromadb.PersistentClient(
    path=PERSIST_DIR,
    settings=Settings(anonymized_telemetry=False),
)
collection = chroma_client.get_or_create_collection(name="rag_pdf_collection")

# Load SentenceTransformer model
embedder = SentenceTransformer("all-MiniLM-L6-v2")

In [None]:
# Function to read PDF text from Google Drive path
def load_pdf_text(drive_path):
    """Return the text of every page of a PDF, concatenated in page order.

    Args:
        drive_path: Filesystem path of the PDF to open (for example a file
            on the mounted Google Drive).

    Returns:
        A single string containing the extracted text of all pages.
    """
    # Open the document as a context manager so it is closed even if text
    # extraction fails part-way through.
    with fitz.open(drive_path) as doc:
        # join() avoids repeatedly re-copying a growing string.
        return "".join(page.get_text() for page in doc)

In [None]:
# Chunking logic (packs newline-separated lines into chunks of roughly max_len characters)
def chunk_text(text, max_len=300):
    """Group the lines of *text* into chunks of roughly *max_len* characters.

    The text is split on newlines and consecutive lines are accumulated until
    adding the next line would reach ``max_len``; the accumulated lines then
    become one chunk. A single line longer than ``max_len`` is kept whole as
    its own chunk rather than being cut.

    Args:
        text: The full text to split.
        max_len: Soft upper bound on the accumulated chunk length.

    Returns:
        A list of non-empty, whitespace-stripped chunks. Empty or
        whitespace-only input yields an empty list.
    """
    chunks = []
    current = ""
    for para in text.split("\n"):
        if len(current) + len(para) < max_len:
            current += para + "\n"
            continue
        # The buffer is full. It can still be empty or whitespace-only here
        # (e.g. when the very first line is already too long), so only emit
        # it when it actually contains text.
        finished = current.strip()
        if finished:
            chunks.append(finished)
        current = para + "\n"
    # Flush whatever is left, again skipping whitespace-only remainders.
    finished = current.strip()
    if finished:
        chunks.append(finished)
    return chunks

In [None]:
# Ingest PDF into ChromaDB
def ingest_pdf(drive_pdf_path):
    """Extract, chunk, embed and store one PDF in the Chroma collection.

    Args:
        drive_pdf_path: Path of the PDF to ingest.

    Prints a one-line summary of what was stored; returns nothing.
    """
    text = load_pdf_text(drive_pdf_path)
    # Drop empty chunks: they carry no information but would still be
    # embedded, stored and later retrieved as "context".
    chunks = [chunk for chunk in chunk_text(text) if chunk]
    if not chunks:
        # Happens for PDFs with no extractable text (e.g. image-only scans).
        # Skip the add call, which expects a non-empty batch.
        print(f"No text could be extracted from {drive_pdf_path}; nothing ingested.")
        return
    embeddings = embedder.encode(chunks).tolist()
    # Random IDs: ingesting the same PDF twice stores its chunks twice.
    ids = [str(uuid4()) for _ in chunks]
    collection.add(ids=ids, documents=chunks, embeddings=embeddings)
    print(f"Ingested {len(chunks)} chunks from {drive_pdf_path}")

In [None]:
# Query ChromaDB + Inject into LLM prompt
def ask_question_with_context(question, top_k=5):
    """Answer *question* with the LLM, grounded in the most similar stored chunks.

    The question is embedded, the closest chunks are fetched from the Chroma
    collection, and they are placed in the prompt ahead of the question.

    Args:
        question: The user's question.
        top_k: Maximum number of chunks to retrieve as context.

    Prints the model's answer; returns nothing.
    """
    stored = collection.count()
    if stored == 0:
        # With nothing to retrieve the model would answer without any
        # grounding, which defeats the purpose of the retrieval step.
        print("No documents have been ingested yet; ingest a PDF before asking questions.")
        return

    query_embedding = embedder.encode([question]).tolist()[0]
    results = collection.query(
        query_embeddings=[query_embedding],
        # Never request more neighbours than the collection holds.
        n_results=min(top_k, stored),
    )
    # One query embedding was sent, so the matches are in the first result list.
    relevant_chunks = results["documents"][0]

    context = "\n\n".join(relevant_chunks)
    prompt = f"Use the context below to answer the question.\n\nContext:\n{context}\n\nQuestion: {question}\nAnswer:"

    response = client.chat.completions.create(
        model="gpt-4o-mini",
        messages=[{"role": "user", "content": prompt}]
    )
    # content may be None; fall back to an empty string so the concatenation
    # below cannot raise TypeError.
    answer = response.choices[0].message.content or ""
    print("\nLLM Answer:\n" + answer)

In [None]:
#  Console-based interaction
def run_rag_agent():
    """Run the interactive console loop for the RAG PDF agent.

    Commands (the command word is case-insensitive):
        ingest <path>   ingest the PDF at <path>
        ask <question>  answer <question> using the ingested documents
        exit            leave the loop

    The loop also ends when the input stream is closed. Errors raised while
    handling a command are reported and the loop keeps running.
    """
    print("RAG PDF Agent (type 'ingest <google_drive_path>', 'ask <question>', or 'exit')")
    while True:
        try:
            user_input = input("You > ").strip()
        except EOFError:
            # Input stream closed: treat it like 'exit'.
            break

        if user_input.lower() == "exit":
            break

        # Split on any run of whitespace so extra spaces after the command
        # do not end up inside the path or question.
        parts = user_input.split(None, 1)
        command = parts[0].lower() if parts else ""
        argument = parts[1] if len(parts) == 2 else ""

        # Match the whole command word; a prefix test would also accept
        # inputs such as "ingestion ..." or "asking ...".
        if command == "ingest":
            if not argument:
                print("Usage: ingest <google_drive_path>")
                continue
            handler = ingest_pdf
        elif command == "ask":
            if not argument:
                print("Usage: ask <your question>")
                continue
            handler = ask_question_with_context
        else:
            print("Unknown command. Use 'ingest <path>' or 'ask <question>'")
            continue

        try:
            handler(argument)
        except Exception as exc:  # console boundary: report, keep the session alive
            print(f"Error: {exc}")

# Start the agent: this call blocks, reading console commands in a loop,
# until the user types 'exit'.
run_rag_agent()

RAG PDF Agent (type 'ingest <google_drive_path>', 'ask <question>', or 'exit')
You > exit
