# Install and Imports

In [1]:
!pip install pypdf
!pip install google-generativeai
!pip install chromadb
!pip install typing

Collecting pypdf
  Downloading pypdf-5.1.0-py3-none-any.whl.metadata (7.2 kB)
Downloading pypdf-5.1.0-py3-none-any.whl (297 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m298.0/298.0 kB[0m [31m3.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pypdf
Successfully installed pypdf-5.1.0
Collecting chromadb
  Downloading chromadb-0.6.3-py3-none-any.whl.metadata (6.8 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.2.2.post1-py3-none-any.whl.metadata (6.5 kB)
Collecting chroma-hnswlib==0.7.6 (from chromadb)
  Downloading chroma_hnswlib-0.7.6-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (252 bytes)
Collecting fastapi>=0.95.2 (from chromadb)
  Downloading fastapi-0.115.7-py3-none-any.whl.metadata (27 kB)
Collecting uvicorn>=0.18.3 (from uvicorn[standard]>=0.18.3->chromadb)
  Downloading uvicorn-0.34.0-py3-none-any.whl.metadata (6.5 kB)
Collecting posthog>=2.4.0 (from chromadb)
  Downloading posthog-3.9.3-py2.py3-

In [2]:
import requests
from pypdf import PdfReader
import os
import re
import google.generativeai as genai
from chromadb import Documents, EmbeddingFunction, Embeddings
import chromadb
from chromadb.config import Settings
from typing import List, Dict

# Download and load PDF

In [3]:
def download_pdf(url, save_path, timeout=30):
    """Download the file at ``url`` and write its raw bytes to ``save_path``.

    Args:
        url: Address of the PDF to fetch.
        save_path: Local path the response body is written to (binary mode).
        timeout: Seconds passed to ``requests.get`` as its timeout so a stalled
            server cannot block the notebook forever.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (raised by ``response.raise_for_status()``).
    """
    response = requests.get(url, timeout=timeout)
    # Fail loudly on an error status instead of saving an error page as a ".pdf".
    response.raise_for_status()
    with open(save_path, 'wb') as f:
        f.write(response.content)

def load_pdf(file_path):
    """Return the text of every page of the PDF at ``file_path`` as one string.

    Pages are read in order through ``PdfReader``. A page whose
    ``extract_text()`` result is empty or ``None`` contributes nothing, and
    page texts are concatenated without any separator.
    """
    pages = PdfReader(file_path).pages
    extracted = (page.extract_text() for page in pages)
    return "".join(piece for piece in extracted if piece)

# ToDo:
- Text splitting
- ChromaDB
- Prompt Construction

In [4]:
# TODO: Students implement text splitting function
def split_text(text):
    """
    Split the input text into meaningful chunks.
    Returns a list of text chunks.

    Exercise placeholder: raises NotImplementedError until it is implemented,
    so a forgotten TODO fails loudly instead of silently returning None.
    """
    raise NotImplementedError("split_text has not been implemented yet")

# Custom embedding function using Gemini API
class GeminiEmbeddingFunction(EmbeddingFunction):
    """Chroma embedding function that delegates to ``genai.embed_content``.

    NOTE(review): no code visible in this notebook passes this class to a
    collection as its embedding function - confirm whether that is intended.
    """

    def __call__(self, input: Documents) -> Embeddings:
        """Embed ``input`` and return the ``"embedding"`` entry of the response.

        ``genai`` is (re)configured on every call with the value of the
        ``GEMINI_API_KEY`` environment variable.
        """
        genai.configure(api_key=os.getenv("GEMINI_API_KEY"))
        response = genai.embed_content(
            model="models/embedding-001",
            content=input,
            task_type="retrieval_document",
            title="Custom query",
        )
        return response["embedding"]

# TODO: Students implement ChromaDB creation and querying
def create_chroma_db(documents: List[str], path: str, name: str):
    """
    Create a ChromaDB collection with the provided documents.
    Returns the database instance and name.

    Hint: Use the following to create a client that persists to ``path``:
    client = chromadb.PersistentClient(path=path)

    (The older ``chromadb.Client(Settings(chroma_db_impl="duckdb+parquet",
    persist_directory=path))`` form is a legacy configuration from before
    chromadb 0.4 and should not be used with current releases.)

    Exercise placeholder: raises NotImplementedError until it is implemented,
    so a forgotten TODO fails loudly instead of silently returning None.
    """
    raise NotImplementedError("create_chroma_db has not been implemented yet")

def get_relevant_passage(query: str, db, n_results: int):
    """
    Retrieve the most relevant passages for the given query.
    Returns a list of relevant text passages.

    Exercise placeholder: raises NotImplementedError until it is implemented,
    so a forgotten TODO fails loudly instead of silently returning None.
    """
    raise NotImplementedError("get_relevant_passage has not been implemented yet")

# TODO: Students implement prompt construction
def make_rag_prompt(query: str, relevant_passage: str):
    """
    Construct a prompt for the generation model using the query and retrieved passage.
    Returns the formatted prompt string.

    Exercise placeholder: raises NotImplementedError until it is implemented,
    so a forgotten TODO fails loudly instead of silently returning None.
    """
    raise NotImplementedError("make_rag_prompt has not been implemented yet")


In [5]:
# Implement the text splitting function
def split_text(text: str, max_length: int = 500) -> List[str]:
    """
    Split the input text into chunks of at most ``max_length`` characters.

    The text is cut at sentence boundaries (one or more spaces following
    ``.``, ``!`` or ``?``) and consecutive sentences are packed into a chunk,
    joined by single spaces, for as long as the joined chunk fits.

    Args:
        text: The text to split.
        max_length: Maximum chunk length in characters, joining spaces included.

    Returns:
        A list of non-empty chunks; ``[]`` when ``text`` is empty or only
        whitespace. A single sentence longer than ``max_length`` is kept whole
        as its own (oversized) chunk rather than being cut mid-sentence.
    """
    # Drop empty / whitespace-only pieces so they can never become chunks.
    sentences = [s for s in re.split(r'(?<=[.!?]) +', text) if s.strip()]
    chunks: List[str] = []
    current_chunk: List[str] = []
    current_length = 0

    for sentence in sentences:
        # The joining space counts towards the chunk length, except before the
        # first sentence of a chunk.
        added_length = len(sentence) + (1 if current_chunk else 0)
        # Only flush a non-empty chunk; otherwise an over-long first sentence
        # would emit an empty string as a chunk.
        if current_chunk and current_length + added_length > max_length:
            chunks.append(" ".join(current_chunk))
            current_chunk = [sentence]
            current_length = len(sentence)
        else:
            current_chunk.append(sentence)
            current_length += added_length

    if current_chunk:  # Add any remaining text
        chunks.append(" ".join(current_chunk))
    return chunks

# Implement ChromaDB creation
def create_chroma_db(documents, path, name, batch_size: int = 100):
    """Open (or create) a persistent Chroma collection and add missing documents.

    Each document's id is its position in ``documents`` (``"0"``, ``"1"``, ...).
    A document is added only if its id is not already present in the
    collection, so re-running with the same list adds nothing. Note that this
    also means a *different* list written to an existing collection is skipped
    wherever its positions collide with stored ids.

    Args:
        documents: Text chunks to store.
        path: Directory handed to ``chromadb.PersistentClient``.
        name: Collection name.
        batch_size: How many documents are sent per ``collection.add`` call.

    Returns:
        A ``(db_client, collection)`` tuple.
    """
    db_client = chromadb.PersistentClient(path=path)
    collection = db_client.get_or_create_collection(name=name)

    # Only the ids are needed here; include=[] avoids loading every stored
    # document and metadata record just to read the "ids" field.
    existing_ids = set(collection.get(include=[])['ids'])

    new_ids = []
    new_documents = []
    for i, doc in enumerate(documents):
        doc_id = str(i)
        if doc_id not in existing_ids:
            new_ids.append(doc_id)
            new_documents.append(doc)

    # Add in batches rather than one call per document; the range is empty
    # when there is nothing new, so add() is never called with empty lists.
    for start in range(0, len(new_ids), batch_size):
        stop = start + batch_size
        collection.add(documents=new_documents[start:stop], ids=new_ids[start:stop])

    return db_client, collection



# Implement passage retrieval
def get_relevant_passage(query: str, db, n_results: int = 1,
                         collection_name: str = "rag_experiment") -> List[str]:
    """Return the stored passages that best match ``query``.

    Args:
        query: The user's question, passed to the collection as query text.
        db: Client object exposing ``get_collection(name=...)``.
        n_results: Maximum number of passages to return.
        collection_name: Collection to search; defaults to the name
            previously hard-coded here.

    Returns:
        The matching passages, or ``[]`` when the collection is empty,
        ``n_results`` is not positive, or the query returns no documents.
    """
    collection = db.get_collection(name=collection_name)

    # Never ask for more results than the collection holds, and skip the
    # query entirely when there is nothing to return.
    n_results = min(n_results, collection.count())
    if n_results <= 0:
        return []

    results = collection.query(query_texts=[query], n_results=n_results)
    return results["documents"][0] if results["documents"] else []


# Implement prompt construction
def make_rag_prompt(query: str, relevant_passage: str) -> str:
    """
    Build the generation prompt: an instruction line, the retrieved context,
    the user's query and a trailing "Answer:" cue, separated by blank lines.
    """
    sections = (
        "Use the following context to answer the query:",
        f"Context: {relevant_passage}",
        f"Query: {query}",
        "Answer:",
    )
    return "\n\n".join(sections)


# LLM Response Generation

In [6]:
def generate_answer(prompt: str, model_name: str = 'gemini-pro'):
    """Generate an answer for ``prompt`` with a Gemini generative model.

    The API key is read from the ``GEMINI_API_KEY`` environment variable and
    is never written by this function. Set it before calling, for example
    once in a configuration cell with
    ``os.environ["GEMINI_API_KEY"] = getpass.getpass()``.

    Args:
        prompt: The full prompt text sent to the model.
        model_name: Name passed to ``genai.GenerativeModel``.

    Returns:
        The ``text`` attribute of the generation result.

    Raises:
        ValueError: If ``GEMINI_API_KEY`` is unset or empty.
    """
    gemini_api_key = os.getenv("GEMINI_API_KEY")
    if not gemini_api_key:
        raise ValueError("Gemini API Key not provided. Please provide GEMINI_API_KEY as an environment variable")
    genai.configure(api_key=gemini_api_key)
    model = genai.GenerativeModel(model_name)
    result = model.generate_content(prompt)
    return result.text

# Main execution
## ToDo:
 - Chat history
 - Multiple file ingest

In [7]:
# chat history
# Maps each user query to the list of answers recorded for it, in call order.
chat_history: Dict[str, List[str]] = {}

def add_to_chat_history(user_query: str, answer: str):
    """Append ``answer`` to the answers stored for ``user_query`` in ``chat_history``.

    A query seen for the first time gets a fresh list.
    """
    chat_history.setdefault(user_query, []).append(answer)

def display_chat_history():
    """Print the contents of ``chat_history``.

    Output is a blank line and a "Chat History:" header, then for every query
    a "Q:" line followed by one indented "-> A:" line per recorded answer.
    """
    lines = ["\nChat History:"]
    for query, responses in chat_history.items():
        lines.append(f"Q: {query}")
        lines.extend(f"  -> A: {resp}" for resp in responses)
    print("\n".join(lines))

# multiple file ingest
def process_multiple_pdfs(pdf_urls: List[str], db_path: str, db_name: str):
    """Download several PDFs, chunk their text and store all chunks in one collection.

    Each URL is downloaded into the current working directory under the last
    component of its URL path; query strings and fragments are ignored. When
    the path has no file name (for example a URL ending in "/"), the name
    ``document_<n>.pdf`` is used, where ``n`` is the 1-based position of the
    URL in ``pdf_urls``.

    Args:
        pdf_urls: URLs of the PDFs to ingest, processed in order.
        db_path: Directory passed on to ``create_chroma_db``.
        db_name: Collection name passed on to ``create_chroma_db``.

    Returns:
        The database client returned by ``create_chroma_db``.
    """
    # Local import so this cell does not depend on an edit to the imports cell.
    from urllib.parse import urlparse

    all_text_chunks = []
    for index, url in enumerate(pdf_urls, start=1):
        # Use only the URL *path*: splitting the raw URL on "/" would keep
        # "?query" / "#fragment" in the file name and yields "" for ".../".
        filename = os.path.basename(urlparse(url).path) or f"document_{index}.pdf"
        download_pdf(url, filename)
        text = load_pdf(filename)
        all_text_chunks.extend(split_text(text))

    # create ChromaDB
    db, _ = create_chroma_db(all_text_chunks, db_path, db_name)
    return db


In [10]:
def main():
    """Run the RAG pipeline once for a single, interactively entered query.

    Steps: download the whitepaper PDF, extract and chunk its text, store the
    chunks in the ``rag_experiment`` collection under ``./chroma_db``, read a
    query from standard input, retrieve up to three passages and print the
    generated answer (or a notice when no passage was found).
    """
    # Set up configurations
    pdf_url = "https://services.google.com/fh/files/misc/ai_adoption_framework_whitepaper.pdf"
    pdf_path = "ai_adoption_framework_whitepaper.pdf"
    db_folder = "chroma_db"
    db_name = "rag_experiment"

    # Create database directory (no error if it already exists)
    os.makedirs(db_folder, exist_ok=True)

    # Download and process PDF
    download_pdf(pdf_url, pdf_path)
    pdf_text = load_pdf(pdf_path)

    # Split text into chunks
    chunked_text = split_text(pdf_text)

    # Create and set up database; the collection handle is not needed here
    db_path = os.path.join(os.getcwd(), db_folder)
    db_client, _ = create_chroma_db(chunked_text, db_path, db_name)

    # Process user query
    query = input("Please enter your query: ")
    relevant_text = get_relevant_passage(query, db_client, n_results=3)

    # Generate and display answer
    if relevant_text:
        # Separate passages with a blank line; joining with "" would fuse the
        # last word of one passage with the first word of the next.
        final_prompt = make_rag_prompt(query, "\n\n".join(relevant_text))
        answer = generate_answer(final_prompt)
        print("\nGenerated Answer:", answer)
    else:
        print("No relevant information found for the given query.")

# Run the pipeline only when this code executes as the top-level module
# (__name__ is "__main__"), not when it is imported from elsewhere.
if __name__ == "__main__":
    main()

# Example query:
# What are the key components of the AI adoption framework?

Please enter your query: What are the key components of the AI adoption framework?

Generated Answer: The context suggests that the key components of the AI Adoption Framework are Learn , Lead , Govern , and Secure, which together with  Innovate, and  Deliver, make up the six key components.
