## Policy Document Indexing for RAG

In [None]:
import os
import shutil
from datetime import datetime
from dotenv import load_dotenv
from PyPDF2 import PdfReader
import re
import string
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

# Folder that holds the PDF documents to index
path = 'assets/documents'

# Collect the names of all PDF files in the folder (extension match is
# case-insensitive). os.listdir() returns entries in arbitrary order, so the
# list is sorted to keep the resulting chunk order reproducible between runs.
files = sorted(f for f in os.listdir(path) if f.lower().endswith('.pdf'))

# Load environment variables from .env file
load_dotenv()

# Function to extract text from a PDF file
def extract_text_from_pdf(file_path: str) -> str:
    """
    Extracts all text from a PDF file using PyPDF2.

    The text of consecutive pages is joined with a newline so that the last
    line of one page and the first line of the next page are not fused into
    a single line. Pages that yield no text are skipped.

    Args:
        file_path (str): Path to the PDF file.

    Returns:
        str: Extracted raw text (empty string if no page yields text).
    """
    reader = PdfReader(file_path)
    page_texts = []
    for page in reader.pages:
        page_text = page.extract_text()
        if page_text:  # skip pages where extraction returns None or ""
            page_texts.append(page_text)
    return "\n".join(page_texts)

# We start by splitting the document into sections for later text preprocessing
def split_into_sections(text: str) -> dict:
    """
    Splits text into sections based on detected headings.

    A line is treated as a heading when it has at most six words, starts
    with an uppercase character and does not end with a period. Text that
    appears before the first heading is stored under "Document".

    Blank lines are ignored. When the same heading occurs more than once,
    the content of the later occurrences is appended to the existing entry
    instead of overwriting it, so no text is lost.

    Args:
        text (str): Raw document text.

    Returns:
        dict: {heading: content}, in order of first appearance. Headings
        that are not followed by any content are omitted.
    """
    collected = {}  # heading -> list of content lines
    current_heading = "Document"

    for line in text.splitlines():
        stripped = line.strip()
        if not stripped:
            continue  # blank lines carry no content

        # Heuristic: heading if short, capitalized, and not ending with period
        is_heading = (
            len(stripped.split()) <= 6
            and stripped[0].isupper()
            and not stripped.endswith('.')
        )
        if is_heading:
            current_heading = stripped
        else:
            # setdefault merges content of repeated headings into one entry
            collected.setdefault(current_heading, []).append(stripped)

    return {heading: " ".join(lines) for heading, lines in collected.items()}

# Function to clean and remove noise from text
# We observe that the pdfs don't contain any page numbers, or images
def clean_text(text: str, lowercase: bool = True) -> str:
    """
    Normalizes extracted PDF text before further processing.

    Steps, in order:
    - optionally lowercase the text
    - turn newlines and tabs into spaces
    - drop every character listed in string.punctuation
    - collapse whitespace runs into a single space and trim both ends

    Args:
        text (str): Raw extracted text.
        lowercase (bool): Convert to lowercase (default True).

    Returns:
        str: The normalized text.
    """
    normalized = text.lower() if lowercase else text

    # One translation table: line breaks/tabs -> space, punctuation -> removed
    table = str.maketrans("\n\t", "  ", string.punctuation)
    normalized = normalized.translate(table)

    # Squeeze any remaining whitespace runs and trim the result
    return re.sub(r"\s+", " ", normalized).strip()

# Splitter settings: chunk_size is the number of characters per chunk,
# chunk_overlap is the overlap kept between chunks to maintain context
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

# Parallel lists: all_metadatas[i] describes all_chunks[i]
all_chunks = []
all_metadatas = []

for file in files:
    pdf_path = os.path.join(path, file)
    pdf_text = extract_text_from_pdf(pdf_path)
    sections = split_into_sections(pdf_text)

    for section_title, content in sections.items():
        chunks = text_splitter.split_text(content)
        all_chunks += chunks
        # One fresh metadata dict per chunk, recording its file and section
        all_metadatas += [{"source": file, "section": section_title} for _ in chunks]


In [None]:
# Directory passed to Chroma as the on-disk location of the index
persist_directory = "./persist"

# Optional reset of the on-disk store (currently disabled).
# NOTE(review): with the reset disabled, re-running this cell presumably adds
# the same chunks to an already-populated store — confirm against Chroma's
# behavior before relying on repeated runs.
# # Remove old data
# if os.path.exists(persist_directory):
#     shutil.rmtree(persist_directory)

# # Create folder with proper permissions
# os.makedirs(persist_directory, exist_ok=True)

# Build the vector store from the chunk texts and their per-chunk metadata,
# using OpenAI embeddings with the API key read from the environment.
embeddings = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))
vectorstore = Chroma.from_texts(
                                all_chunks, 
                                embedding=embeddings, 
                                metadatas=all_metadatas,
                                persist_directory=persist_directory
                                )

# Explicitly write the store to persist_directory
# NOTE(review): newer Chroma versions may persist automatically — confirm
# whether this call is still required for the installed version.
vectorstore.persist()

In [None]:

# Sample questions used to sanity-check retrieval against the vector store
queries = [
    "What's the maternity leave policy?",
    "What is the eligibility for Tuition Reimbursement",
    "How much can employees contribute to 401-k?",
    "Do I have to manually enroll for 401-k?",
    "I work in Finance, can I work remotely?"
]

for q in queries:
    # Three most similar chunks for this question
    results = vectorstore.similarity_search(q, k=3)
    print(f"Query: {q}\n")

    for rank, doc in enumerate(results, start=1):
        # Fall back to "unknown" when a chunk carries no source/section metadata
        meta = doc.metadata
        source = meta.get("source", "unknown")
        section = meta.get("section", "unknown")
        print(f"Result {rank} (from {source}, section: {section}):\n{doc.page_content}\n")

    # Visual separator between queries
    print("=" * 50 + "\n")


### Advanced RAG Methods

Metadata filtering - Useful when answers should come only from specific files, or when the search should be restricted to a specific section of the data.

In [None]:
# Metadata filtering: restrict the similarity search to chunks from one file
query = "What is the maternity leave policy?"

# Only consider chunks whose "source" metadata is 'vacation-policy.pdf'
results = vectorstore.similarity_search(
    query, 
    k=3,
    filter={"source": "vacation-policy.pdf"}  # Metadata filter
)

# Print rank, origin (file - section) and text of each hit
for i, doc in enumerate(results):
    print(f"{i+1}. {doc.metadata['source']} - {doc.metadata['section']}")
    print(doc.page_content, "\n")


Query expansion - Automatically expand the query with related terms to improve retrieval.

In [None]:
query = "Maternity leave policy"

# Simple query expansion (you could also use an LLM to generate expansions):
# the original query followed by related terms, all comma-separated
expanded_terms = ["parental leave", "pregnancy leave", "childcare leave"]
expanded_query = ", ".join([query] + expanded_terms)

# Search with the expanded query instead of the original one
results = vectorstore.similarity_search(expanded_query, k=3)

for i, doc in enumerate(results):
    meta = doc.metadata
    print(f"{i+1}. {meta['source']} - {meta['section']}")
    print(doc.page_content, "\n")


HyDE - Generate a “hypothetical answer” for the query, then retrieve documents closest to that answer.

In [None]:
from langchain.chat_models import ChatOpenAI

# Chat model (temperature=0) used to draft the hypothetical answer
llm = ChatOpenAI(model_name="gpt-3.5-turbo", temperature=0)

# Step 1: Generate hypothetical answer
# NOTE: `query` is not assigned in this cell; it holds whatever value an
# earlier cell left behind, so run order matters.
prompt = f"Generate a concise hypothetical answer to this question: '{query}'"
hypothetical_answer = llm.predict(prompt)

# Step 2: Retrieve documents using embedding of the hypothetical answer
# NOTE(review): a new OpenAIEmbeddings() is created here instead of reusing
# the `embeddings` object the store was built with — presumably both resolve
# to the same model and API key; confirm.
embedding_fn = OpenAIEmbeddings()
hypothetical_vector = embedding_fn.embed_query(hypothetical_answer)

# Chroma supports querying via embedding directly
results = vectorstore.similarity_search_by_vector(hypothetical_vector, k=3)

# Print rank, origin (file - section) and text of each hit
for i, doc in enumerate(results):
    print(f"{i+1}. {doc.metadata['source']} - {doc.metadata['section']}")
    print(doc.page_content, "\n")


# Make the code more modular

### Check the last-modified time of the files so the vector store is re-created only when the documents have changed
The code below is included in ```rag.py```. It checks whether a vector store exists and whether it is up to date with the PDF documents in a given folder; if not, the store is re-created. It also contains the processing of the documents.

In [1]:
import os
import re
import string
from dotenv import load_dotenv
from PyPDF2 import PdfReader
from langchain.text_splitter import RecursiveCharacterTextSplitter
import json
from datetime import datetime
from langchain_community.vectorstores import Chroma
from langchain.embeddings.openai import OpenAIEmbeddings

load_dotenv()  # Load environment variables from .env

class PDFProcessor:
    """
    Processes PDF documents:
    - Extracts raw text
    - Splits text into sections based on headings
    - Cleans text for NLP
    - Splits text into chunks with metadata

    Args:
        pdf_folder (str): Folder that is scanned (non-recursively) for files
            whose name ends in ".pdf" (case-insensitive).
        chunk_size (int): chunk_size forwarded to RecursiveCharacterTextSplitter.
        chunk_overlap (int): chunk_overlap forwarded to
            RecursiveCharacterTextSplitter.
    """
    def __init__(self, pdf_folder: str, chunk_size: int = 500, chunk_overlap: int = 50):
        self.pdf_folder = pdf_folder
        self.chunk_size = chunk_size
        self.chunk_overlap = chunk_overlap
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=self.chunk_size,
            chunk_overlap=self.chunk_overlap
        )
        # os.listdir() order is arbitrary; sort so chunk order is reproducible
        self.files = sorted(f for f in os.listdir(pdf_folder) if f.lower().endswith(".pdf"))

    def extract_text(self, file_path: str) -> str:
        """
        Extracts raw text from a PDF using PyPDF2.

        Page texts are joined with a newline so the last line of one page is
        not fused with the first line of the next. Pages that yield no text
        are skipped; the result is "" when no page yields text.
        """
        reader = PdfReader(file_path)
        page_texts = []
        for page in reader.pages:
            page_text = page.extract_text()
            if page_text:  # skip pages where extraction returns None or ""
                page_texts.append(page_text)
        return "\n".join(page_texts)

    def split_into_sections(self, text: str) -> dict:
        """
        Splits text into sections based on detected headings.

        A line counts as a heading when it has at most six words, starts with
        an uppercase character and does not end with a period. Text before
        the first heading is stored under "Document". Blank lines are
        ignored, and content under a repeated heading is appended to the
        existing entry rather than overwriting it.

        Returns:
            dict: {heading: content}, in order of first appearance. Headings
            without any content are omitted.
        """
        collected = {}  # heading -> list of content lines
        current_heading = "Document"

        for line in text.splitlines():
            stripped = line.strip()
            if not stripped:
                continue  # blank lines carry no content

            is_heading = (
                len(stripped.split()) <= 6
                and stripped[0].isupper()
                and not stripped.endswith(".")
            )
            if is_heading:
                current_heading = stripped
            else:
                # setdefault merges content of repeated headings into one entry
                collected.setdefault(current_heading, []).append(stripped)

        return {heading: " ".join(lines) for heading, lines in collected.items()}

    def clean_text(self, text: str, lowercase: bool = True) -> str:
        """
        Cleans text: lowercases (optional), removes punctuation, normalizes spaces.

        Args:
            text (str): Text to clean.
            lowercase (bool): Convert to lowercase (default True).

        Returns:
            str: Text without string.punctuation characters, with every
            whitespace run collapsed to one space and both ends trimmed.
        """
        if lowercase:
            text = text.lower()
        text = text.replace("\n", " ").replace("\t", " ")
        text = text.translate(str.maketrans("", "", string.punctuation))
        text = re.sub(r"\s+", " ", text)
        return text.strip()

    def process_pdfs(self):
        """
        Processes all PDFs in the folder:
        - Extracts text
        - Splits into sections
        - Cleans each section and splits it into chunks
        - Returns chunks and metadata

        Returns:
            tuple: (all_chunks, all_metadatas) as parallel lists;
            all_metadatas[i] is {"source": file name, "section": heading}
            for all_chunks[i].
        """
        all_chunks = []
        all_metadatas = []

        for file in self.files:
            pdf_path = os.path.join(self.pdf_folder, file)
            raw_text = self.extract_text(pdf_path)
            sections = self.split_into_sections(raw_text)

            for section_title, content in sections.items():
                cleaned_content = self.clean_text(content)
                chunks = self.text_splitter.split_text(cleaned_content)
                all_chunks.extend(chunks)
                # Build a separate dict per chunk: `[d] * n` would make every
                # chunk share one dict, so editing one would edit them all.
                all_metadatas.extend(
                    {"source": file, "section": section_title} for _ in chunks
                )

        return all_chunks, all_metadatas




In [5]:
# rag.py
# Contains PDFProcessor and VectorStoreManager definitions

# main.py
from rag import PDFProcessor, VectorStoreManager
from langchain.embeddings.openai import OpenAIEmbeddings
import os
from dotenv import load_dotenv

# Load environment variables from .env (OPENAI_API_KEY is read below)
load_dotenv()

PDF_FOLDER = "assets/documents"  # folder containing the PDF files
PERSIST_DIR = "./persist"  # directory handed to the manager as persist_dir
embeddings = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))

# Extract, clean and chunk every PDF in PDF_FOLDER.
# NOTE(review): this runs before the manager is consulted, so the PDFs are
# processed even if the existing store turns out to be current — confirm
# whether that cost is acceptable.
processor = PDFProcessor(pdf_folder=PDF_FOLDER)
chunks, metadatas = processor.process_pdfs()

# VectorStoreManager is imported from rag.py and is not defined in this
# notebook section; it receives the folder, the persist directory, the
# embedding function and the prepared chunks with their metadata.
manager = VectorStoreManager(
    pdf_folder=PDF_FOLDER,
    persist_dir=PERSIST_DIR,
    embeddings=embeddings,
    chunks=chunks,
    metadatas=metadatas
)

# Presumably loads the persisted store when it is up to date and rebuilds it
# otherwise (see the printed status message) — verify in rag.py.
vectorstore = manager.load_or_create()


Vector store is up to date. Last updated: 2025-08-22 17:09:21.983432
