## Policy Document Indexing for RAG

In [1]:
import os
import shutil
from datetime import datetime
from dotenv import load_dotenv
from PyPDF2 import PdfReader
import re
import string
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain_community.vectorstores import Chroma

# Path to the documents
path = 'assets/documents'

# Collect the PDF files in the folder (case-insensitive extension match).
# os.listdir returns entries in arbitrary order, so the names are sorted to
# give a stable file order across runs; later cells derive ids from the
# position of each file in this list.
files = sorted(f for f in os.listdir(path) if f.lower().endswith('.pdf'))

# Load environment variables from .env file
load_dotenv()


# Function to extract text from a PDF file
def extract_text_from_pdf(file_path: str) -> str:
    """
    Extracts all text from a PDF file using PyPDF2.

    Pages are joined with a newline so that the last line of one page is not
    fused with the first line of the next page. Pages that yield no text
    are skipped.

    Args:
        file_path (str): Path to the PDF file.

    Returns:
        str: Extracted raw text ("" when no page yields text).
    """
    reader = PdfReader(file_path)
    # extract_text() may return None/"" for an empty page; normalise to ""
    page_texts = [page.extract_text() or "" for page in reader.pages]
    return "\n".join(page_text for page_text in page_texts if page_text)

# We start by splitting the document into sections for later text preprocessing
def split_into_sections(text: str) -> dict:
    """
    Splits text into sections based on detected headings.

    A line is treated as a heading when it is non-empty, has at most six
    words, starts with an uppercase character and does not end with a period.
    Text that appears before the first heading is stored under "Document".

    Blank lines are skipped, sections without any content are omitted, and
    when the same heading occurs more than once its content is appended to
    the existing entry instead of overwriting it.

    Args:
        text (str): Raw document text.

    Returns:
        dict: Mapping {heading: content}, in order of first appearance.
    """
    sections = {}

    def store(heading, content_lines):
        # Persist one finished section, merging with an earlier section that
        # carries the same heading so no content is lost.
        body = " ".join(content_lines).strip()
        if not body:
            return
        if heading in sections:
            sections[heading] = f"{sections[heading]} {body}"
        else:
            sections[heading] = body

    current_heading = "Document"
    current_content = []

    for line in text.splitlines():
        stripped = line.strip()
        if not stripped:
            # Blank lines carry no content and would only add stray spaces
            continue

        # Heuristic: heading if short, capitalized, and not ending with period
        is_heading = (
            len(stripped.split()) <= 6
            and stripped[0].isupper()
            and not stripped.endswith('.')
        )
        if is_heading:
            store(current_heading, current_content)
            current_heading = stripped
            current_content = []
        else:
            current_content.append(stripped)

    # save last section
    store(current_heading, current_content)
    return sections




In [2]:

# Character-based splitter: 500-character chunks, 50 characters of overlap
# between neighbouring chunks to maintain context
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)

all_chunks = []
all_metadatas = []

for file in files:
    pdf_path = os.path.join(path, file)
    pdf_text = extract_text_from_pdf(pdf_path)
    sections = split_into_sections(pdf_text)

    for section_title, content in sections.items():
        chunks = text_splitter.split_text(content)
        all_chunks += chunks
        # One metadata dict per chunk (same order as all_chunks), recording
        # the originating file and section
        all_metadatas += [{"source": file, "section": section_title} for _ in chunks]


In [5]:
persist_directory = "./persist"

# # Remove old data
# if os.path.exists(persist_directory):
#     shutil.rmtree(persist_directory)

# Make sure the persistence folder exists (no-op when it is already there)
os.makedirs(persist_directory, exist_ok=True)

# Deterministic id per chunk: "<source>::<section>::<position in that section>".
# Without explicit ids every run of this cell adds a fresh copy of each chunk
# to the persisted collection; with stable ids a re-run is expected to
# overwrite the existing records instead.
# NOTE(review): records written by earlier runs (without these ids) are not
# removed by this change; clear the folder once if duplicates already exist.
chunk_ids = []
chunks_seen_per_section = {}
for metadata in all_metadatas:
    section_key = (metadata["source"], metadata["section"])
    position = chunks_seen_per_section.get(section_key, 0)
    chunks_seen_per_section[section_key] = position + 1
    chunk_ids.append(f"{metadata['source']}::{metadata['section']}::{position}")

embeddings = OpenAIEmbeddings(openai_api_key=os.getenv("OPENAI_API_KEY"))
vectorstore = Chroma.from_texts(
    all_chunks,
    embedding=embeddings,
    metadatas=all_metadatas,
    ids=chunk_ids,
    persist_directory=persist_directory,
)

vectorstore.persist()

In [4]:

# Sample questions used to sanity-check retrieval from the vector store
queries = [
    "What's the maternity leave policy?",
    "What is the eligibility for Tuition Reimbursement"
]

separator = "="*50 + "\n"

for q in queries:
    # Top-3 most similar chunks for this question
    results = vectorstore.similarity_search(q, k=3)
    print(f"Query: {q}\n")

    for i, doc in enumerate(results):
        # Fall back to "unknown" when a metadata field is missing
        source, section = (doc.metadata.get(key, "unknown") for key in ("source", "section"))
        print(f"Result {i+1} (from {source}, section: {section}):\n{doc.page_content}\n")

    print(separator)


Query: What's the maternity leave policy?

Result 1 (from childcare-policy.pdf, section: Support for Working Parents):
TechLance’s parental leave policies integrate closely with our childcare support programs. New mothers receive 12 weeks of paid maternity leave, while non-birth parents receive six weeks of paid paternity leave. Adoptive parents receive eight weeks of paid leave that can be shared between both parents. Employees must have been with the company for at least 12 months to qualify for paid parental leave, though unpaid leave options may be available for newer employees under FMLA guidelines.During

Result 2 (from childcare-policy.pdf, section: Support for Working Parents):
for newer employees under FMLA guidelines.During parental leave, health insurance and other beneﬁts continue, and we guarantee a spot in our on-site childcare center for employees returning from maternity or paternity leave. We also oﬀer ﬂexible return-to-work arrangements, including part-time schedules 

### Create a function that checks each file's last-modified time; if no file has changed, we don't need to re-create the vector store

In [None]:
import pandas as pd
# Give every PDF an index (its position in `files`) and record it together
# with the file's last-modified timestamp
file_index = []
for idx, file in enumerate(files):
    if not file.lower().endswith('.pdf'):
        continue  # non-PDF entries are skipped but still consume an index
    file_path = os.path.join(path, file)
    mod_time = os.path.getmtime(file_path)
    record = {
        'index': idx,
        'file': file,
        'last_modified': datetime.fromtimestamp(mod_time),
    }
    file_index.append(record)

file_index_df = pd.DataFrame(file_index)
print(file_index_df)

### Text Extraction, Cleaning, Preprocessing

In [None]:
# Use the PyPDF2 library to extract text from the policy PDFs.
# Define functions that extract the PDF text, remove noise, and split the text into sections.

import re
import string
from PyPDF2 import PdfReader

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Extracts all text from a PDF file using PyPDF2.

    Pages are joined with a newline so that the last line of one page is not
    fused with the first line of the next page. Pages that yield no text
    are skipped.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        str: Extracted raw text ("" when no page yields text).
    """
    reader = PdfReader(pdf_path)
    # extract_text() may return None/"" for an empty page; normalise to ""
    page_texts = [page.extract_text() or "" for page in reader.pages]
    return "\n".join(page_text for page_text in page_texts if page_text)

# We start by splitting the document into sections for later text preprocessing
def split_into_sections(text: str) -> dict:
    """
    Splits text into sections based on detected headings.

    A line is treated as a heading when it is non-empty, has at most six
    words, starts with an uppercase character and does not end with a period.
    Text that appears before the first heading is stored under "Document".

    Blank lines are skipped, sections without any content are omitted, and
    when the same heading occurs more than once its content is appended to
    the existing entry instead of overwriting it.

    Args:
        text (str): Raw document text.

    Returns:
        dict: Mapping {heading: content}, in order of first appearance.
    """
    sections = {}

    def store(heading, content_lines):
        # Persist one finished section, merging with an earlier section that
        # carries the same heading so no content is lost.
        body = " ".join(content_lines).strip()
        if not body:
            return
        if heading in sections:
            sections[heading] = f"{sections[heading]} {body}"
        else:
            sections[heading] = body

    current_heading = "Document"
    current_content = []

    for line in text.splitlines():
        stripped = line.strip()
        if not stripped:
            # Blank lines carry no content and would only add stray spaces
            continue

        # Heuristic: heading if short, capitalized, and not ending with period
        is_heading = (
            len(stripped.split()) <= 6
            and stripped[0].isupper()
            and not stripped.endswith('.')
        )
        if is_heading:
            store(current_heading, current_content)
            current_heading = stripped
            current_content = []
        else:
            current_content.append(stripped)

    # save last section
    store(current_heading, current_content)
    return sections

# Function to clean and remove noise from text
# We observe that the pdfs don't contain any page numbers, or images
def clean_text(text: str, lowercase: bool = True) -> str:
    """
    Cleans extracted PDF text for preprocessing:
    - Normalize Unicode compatibility characters (NFKC), so typographic
      ligatures and non-breaking spaces become their plain equivalents
    - Lowercase (optional)
    - Remove line breaks, tabs
    - Remove punctuation (ASCII and Unicode, e.g. curly quotes and dashes)
    - Normalize spaces

    Args:
        text (str): Raw extracted text.
        lowercase (bool): Convert to lowercase (default True).

    Returns:
        str: Cleaned text ready for NLP tasks.
    """
    import unicodedata  # stdlib; imported locally so this cell stays self-contained

    # Fold compatibility forms first: PDF extraction can yield single-glyph
    # ligatures (U+FB00..U+FB04) that would otherwise never match plain words.
    text = unicodedata.normalize("NFKC", text)

    # Convert to lowercase if needed
    if lowercase:
        text = text.lower()

    # Replace newlines and tabs with space
    text = text.replace("\n", " ").replace("\t", " ")

    # Remove ASCII punctuation and symbols listed in string.punctuation
    text = text.translate(str.maketrans("", "", string.punctuation))

    # Remove remaining (non-ASCII) punctuation such as curly quotes and dashes;
    # every Unicode punctuation category starts with "P"
    text = "".join(ch for ch in text if not unicodedata.category(ch).startswith("P"))

    # Remove multiple spaces
    text = re.sub(r"\s+", " ", text)

    return text.strip()


In [None]:
from tqdm import tqdm

# One record per (file, section) holding the cleaned section text
segments = []
for file in tqdm(files):
    if not file.lower().endswith('.pdf'):
        continue
    pdf_path = os.path.join(path, file)
    raw_text = extract_text_from_pdf(pdf_path)
    sections = split_into_sections(raw_text)
    segments.extend(
        {'file': file, 'section': heading, 'text': clean_text(content)}
        for heading, content in sections.items()
    )

# Convert to DataFrame for easy processing
segments_df = pd.DataFrame(segments)

# Attach each file's index and last-modified time to its sections
chroma_db_input = segments_df.merge(file_index_df, on='file', how='left')

# Create a unique index by combining file index and section order:
# "<file index>_<running section number within that file>"
file_part = chroma_db_input['index'].astype(str)
section_part = chroma_db_input.groupby('file').cumcount().astype(str)
chroma_db_input['unique_id'] = file_part + '_' + section_part


### We will check later if we need to split it into smaller chunks

### Chroma collection

In [None]:
pip install langchain_community

In [None]:
# Peek at the cleaned text stored in the row labelled 0
chroma_db_input.loc[0, 'text']

In [None]:
# Display the merged table: one row per (file, section) with the cleaned text,
# the file's index / last-modified time and the derived unique_id.
chroma_db_input

In [None]:
from langchain.docstore.document import Document
from langchain_community.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings

# NOTE(review): this is the same folder the chunk-level index above is
# written to, so both sets of records share one persisted store.
persist_directory = "./persist"

# Convert your segments DataFrame into Document objects
documents = [
    Document(
        page_content=row['text'],
        metadata={
            'file': row['file'],
            'section': row['section'],
            'chunk_id': row['unique_id']
        }
    )
    for _, row in chroma_db_input.iterrows()
]

# Use unique_id as the record id as well as metadata. Without explicit ids
# every run of this cell adds another copy of each document to the persisted
# store; stable ids are expected to make a re-run overwrite instead.
document_ids = chroma_db_input['unique_id'].tolist()

# Create embeddings and Chroma vector store
embeddings = OpenAIEmbeddings()
vectorstore = Chroma.from_documents(
    documents,
    embeddings,
    ids=document_ids,
    persist_directory=persist_directory
)

vectorstore.persist()
print("Chroma vector store created and persisted!")

In [None]:
# Create Chroma vector store using LangChain integration
from langchain_community.vectorstores import Chroma
from langchain.embeddings import OpenAIEmbeddings  # or use another embedding model

persist_directory = "./persist"

# Prepare documents and metadatas
documents = chroma_db_input["text"].tolist()

# Chroma metadata values must be plain str/int/float/bool, but last_modified
# holds pandas Timestamps; store them as ISO-8601 strings instead.
metadata_frame = chroma_db_input[["file", "section", "last_modified", "unique_id"]].assign(
    last_modified=lambda frame: frame["last_modified"].map(lambda ts: ts.isoformat())
)
metadatas = metadata_frame.to_dict(orient="records")

# Stable ids (one per row) so a re-run targets the same records
ids = chroma_db_input["unique_id"].tolist()

# Initialize embeddings (replace with your preferred embedding model if needed)
embedding_function = OpenAIEmbeddings()

# Create the Chroma vector store
vectordb = Chroma.from_texts(
    texts=documents,
    embedding=embedding_function,
    metadatas=metadatas,
    ids=ids,
    persist_directory=persist_directory
)

vectordb.persist()
print("Chroma vector store created and persisted!")

In [None]:
pip install langchain chromadb PyPDF2 tiktoken openai


In [None]:
# Display the list of PDF file names collected from the documents folder
files