In [1]:
# # Cell 1: Install Dependencies
# # Run this cell first to ensure all necessary libraries are installed.
# # If you're running this in a fresh environment or devcontainer,
# # ensure these are installed in your active Python environment.
# !pip install -qU langchain-community pypdf chromadb langchain-groq python-dotenv sentence-transformers langchain-chroma

# print("Dependencies installed.")

In [16]:
import os
from dotenv import load_dotenv
import glob
from pathlib import Path

from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain_chroma import Chroma
from langchain_groq import ChatGroq
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains import create_retrieval_chain
from langchain_core.prompts import ChatPromptTemplate

In [3]:
# --- 1. Environment Setup ---

# Load environment variables from .env file (if it exists)
load_dotenv()

# Set your API keys from environment variables
# IMPORTANT: Never hardcode your API keys directly in the script for production!
# Create a .env file in the same directory as this script with:
# GROQ_API_KEY="your_groq_api_key_here"
groq_api_key = os.getenv("GROQ_API_KEY")


# --- Configuration ---
PDF_DIRECTORY = "./documents" # Create this directory and place your PDFs here
CHUNK_SIZE = 1000
CHUNK_OVERLAP = 200
COLLECTION_NAME = "pdf_rag_collection"
GROQ_MODEL_NAME = "llama3-8b-8192" # e.g., "mixtral-8x7b-32768", "llama3-70b-8192"


# Configuration for Hugging Face Embeddings model
HF_EMBEDDING_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"


In [4]:
# --- 2. Load PDF Documents from Directory ---

def load_pdf_documents_from_directory(directory_path: str):
    """Loads all PDF documents from a specified directory."""
    print(f"Loading documents from directory: {directory_path}")
    all_documents = []
    
    # Ensure the directory exists
    if not Path(directory_path).exists():
        Path(directory_path).mkdir(parents=True, exist_ok=True)
        print(f"Created directory: {directory_path}. Please place your PDF documents here.")
        return None # Return None as no docs are present yet

    pdf_files = glob.glob(os.path.join(directory_path, "*.pdf"))

    if not pdf_files:
        print(f"No PDF files found in {directory_path}. Please ensure PDFs are present.")
        return None

    for file_path in pdf_files:
        try:
            loader = PyPDFLoader(file_path)
            documents = loader.load()
            all_documents.extend(documents)
            print(f"Loaded {len(documents)} pages from {file_path}")
        except Exception as e:
            print(f"Error loading PDF {file_path}: {e}")
    
    if not all_documents:
        print("No documents were successfully loaded. Exiting.")
        return None

    print(f"Successfully loaded a total of {len(all_documents)} pages from {len(pdf_files)} PDF files.")
    return all_documents


In [5]:
# --- 3. Split Document into Chunks ---

def split_documents(documents, chunk_size: int, chunk_overlap: int):
    """Splits loaded documents into smaller, manageable chunks."""
    print(f"Splitting documents into chunks (size={chunk_size}, overlap={chunk_overlap})...")
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    chunks = text_splitter.split_documents(documents)
    print(f"Created {len(chunks)} chunks.")
    return chunks

In [6]:
# --- 4. Create Embeddings (Changed to Hugging Face) ---
def create_embeddings(model_name: str):
    """Initializes Hugging Face Embeddings model."""
    print(f"Initializing Hugging Face Embeddings (model: {model_name})...")
    # HuggingFaceEmbeddings will download the model the first time it's used
    embeddings = HuggingFaceEmbeddings(
        model_name=model_name,
        model_kwargs={'device': 'cpu'} # Use 'cuda' if you have a GPU
    )
    print("Hugging Face Embeddings initialized.")
    return embeddings

In [7]:
# --- 5. Store Chunks and Embeddings in ChromaDB ---
def create_vector_store(chunks, embeddings, collection_name: str):
    """Creates and persists a ChromaDB vector store from document chunks and embeddings."""
    print(f"Creating/Loading ChromaDB vector store (collection: {collection_name})...")
    # Using a persistent client to store data in a local directory 'chroma_db'
    vectorstore = Chroma.from_documents(
        documents=chunks,
        embedding=embeddings,
        collection_name=collection_name,
        persist_directory="./chroma_db" # Data will be saved here
    )
    # The .persist() method is often not needed with recent langchain-chroma versions
    # when persist_directory is provided, as persistence is handled implicitly.
    # vectorstore.persist() # Removed this line
    print("ChromaDB vector store created and persisted.")
    return vectorstore

In [8]:
# --- 6. Set up Retriever ---
def setup_retriever(vectorstore):
    """Converts the vector store into a retriever."""
    print("Setting up retriever...")
    # 'k' parameter controls how many top-k relevant documents are retrieved
    retriever = vectorstore.as_retriever(search_kwargs={"k": 5})
    print("Retriever set up.")
    return retriever

In [9]:
# --- 7. Initialize Groq LLM ---
def initialize_groq_llm(api_key: str, model_name: str):
    """Initializes the Groq LLM."""
    print(f"Initializing Groq LLM (model: {model_name})...")
    llm = ChatGroq(
        temperature=0.0, # Adjust temperature for creativity vs. factual accuracy
        groq_api_key=api_key,
        model_name=model_name
    )
    print("Groq LLM initialized.")
    return llm

In [10]:
# --- 8. Build the RAG Chain ---
def build_rag_chain(llm, retriever):
    """Builds the RAG (Retrieval Augmented Generation) chain."""
    print("Building RAG chain...")

    prompt = ChatPromptTemplate.from_messages([
        ("system", "You are an AI assistant for answering questions based on the provided context only. "
                   "If the answer is not in the context, clearly state that you don't have enough information.\n\n"
                   "Context: {context}"),
        ("human", "{input}")
    ])

    document_chain = create_stuff_documents_chain(llm, prompt)
    rag_chain = create_retrieval_chain(retriever, document_chain)
    print("RAG chain built successfully.")
    return rag_chain

In [11]:
# --- Main Execution ---

# Load and split the documents from the directory
documents = load_pdf_documents_from_directory(PDF_DIRECTORY)
if documents is None:
    print("Exiting due to PDF loading error or no documents found.")
    exit() # Use exit() for a script to stop execution

Loading documents from directory: ./documents
Loaded 7 pages from ./documents/singapore-court-case1.pdf
Loaded 5 pages from ./documents/singapore-court-case2.pdf
Successfully loaded a total of 12 pages from 2 PDF files.


In [12]:
chunks = split_documents(documents, CHUNK_SIZE, CHUNK_OVERLAP)

Splitting documents into chunks (size=1000, overlap=200)...
Created 58 chunks.


In [14]:
# Create embeddings (Now using Hugging Face model name)
embeddings = create_embeddings(HF_EMBEDDING_MODEL_NAME)

Initializing Hugging Face Embeddings (model: sentence-transformers/all-MiniLM-L6-v2)...
Hugging Face Embeddings initialized.


In [15]:
vectorstore = create_vector_store(chunks, embeddings, COLLECTION_NAME)

Creating/Loading ChromaDB vector store (collection: pdf_rag_collection)...
ChromaDB vector store created and persisted.


In [37]:
# Setup retriever
retriever = setup_retriever(vectorstore)

Setting up retriever...
Retriever set up.


In [46]:
# Initialize LLM
llm = initialize_groq_llm(groq_api_key, GROQ_MODEL_NAME)

Initializing Groq LLM (model: llama3-8b-8192)...
Groq LLM initialized.


In [48]:
# Build RAG chain
rag_chain = build_rag_chain(llm, retriever)

Building RAG chain...
RAG chain built successfully.


In [50]:
user_question = "What is the content of the penalty rule?"

# Invoke the RAG chain with the user's question
response = rag_chain.invoke({"input": user_question})

print("\n--- Answer ---")
print(response["answer"])

print("\n--- Sources (from retrieved chunks) ---")
if response["context"]:
    for i, doc in enumerate(response["context"]):
        page_number = doc.metadata.get('page', 'N/A')
        source_file = os.path.basename(doc.metadata.get('source', 'N/A'))
        print(f"Document {i+1} (File: {source_file}, Page: {page_number}):")
        content_snippet = str(doc.page_content)[:200]
        print(f"  Content snippet: {content_snippet}...")
    print("--------------------------------------")
else:
    print("No relevant context found.")


--- Answer ---
According to the provided context, the penalty rule was developed in relation to the very specific situation of enforcement of penal bonds. However, the context does not explicitly state the content of the penalty rule. It only mentions that the CA did not agree with the historical approach adopted by the court in Andrews and that the penalty rule would be confined to the sphere of secondary obligations, without interfering with the primary obligations between the contracting parties.

--- Sources (from retrieved chunks) ---
Document 1 (File: singapore-court-case1.pdf, Page: 2):
  Content snippet: CA did not agree with the historical approach adopted by the court in Andrews.2 The penalty rule was 
developed in relation to the very specific situation of enforcement of penal bonds;3 there was no ...
Document 2 (File: singapore-court-case1.pdf, Page: 2):
  Content snippet: CA did not agree with the historical approach adopted by the court in Andrews.2 The penalty rule was 