## Policy Document Indexing for RAG

In [4]:
import os
from datetime import datetime

# Folder that holds the policy documents
path = 'assets/documents'

# List every entry in the folder, then report the listing and its size
files = os.listdir(path)
print(files, len(files), sep="\n")

['tuition-reimbursement-policy.pdf', 'health-insurance-policy.pdf', 'work-from-home-policy.pdf', 'gym-policy.pdf', 'vacation-policy.pdf', '401k-retirement-policy.pdf', 'life-insurance-policy.pdf', 'childcare-policy.pdf']
8


### Check the last-modified time of each file: if nothing has changed, we won't need to re-create the vector store

In [None]:
# Show when each document was last modified
for file in files:
    file_path = os.path.join(path, file)
    # st_mtime is the same POSIX timestamp that os.path.getmtime() reports
    mod_time = os.stat(file_path).st_mtime
    print(f"{file}: {datetime.fromtimestamp(mod_time)}")

### Text Extraction, Cleaning, Preprocessing

In [None]:
# Use the PyPDF2 library to extract text from the policy PDFs.
# The functions below extract the text from a PDF, remove the noise, and split the text into sections.

import re
import string
from PyPDF2 import PdfReader

# Function to extract text from a PDF file
def extract_text_from_pdf(pdf_path: str) -> str:
    """
    Extracts all text from a PDF file using PyPDF2.

    Page texts are joined with a newline so that the last line of one page
    is never fused with the first line of the next page. Without the
    separator, a heading that opens a page would be glued onto the previous
    page's final line and become invisible to line-based processing.

    Args:
        pdf_path (str): Path to the PDF file.

    Returns:
        str: Extracted raw text, one page after another separated by a
        newline. Pages that yield no text are skipped; the result is an
        empty string when no page yields text.
    """
    reader = PdfReader(pdf_path)
    # `or ""` keeps the original guard for pages whose extraction yields nothing
    page_texts = (page.extract_text() or "" for page in reader.pages)
    return "\n".join(chunk for chunk in page_texts if chunk)

# We start by splitting the document into sections for later text preprocessing
def split_into_sections(text: str) -> dict:
    """
    Splits text into sections based on detected headings.

    A line is treated as a heading when it has at most six words, starts
    with an uppercase character, and does not end with a period. This is a
    heuristic: short body lines that match the same pattern are also taken
    as headings.

    Text that appears before the first detected heading is stored under the
    key "Document". When the same heading occurs more than once, the bodies
    are concatenated (space-separated) instead of the later one overwriting
    the earlier one. Blank lines are ignored, and a heading with no body
    text produces no entry.

    Args:
        text (str): Raw text, one logical line per text line.

    Returns:
        dict: Mapping {heading: content}, in order of first appearance.
    """
    sections = {}
    current_heading = "Document"
    current_content = []

    def _flush(heading: str, content_lines: list) -> None:
        # Store the collected lines under `heading`, merging with any earlier
        # section of the same name so no content is silently lost.
        body = " ".join(content_lines).strip()
        if not body:
            return
        if heading in sections:
            sections[heading] = f"{sections[heading]} {body}"
        else:
            sections[heading] = body

    for line in text.splitlines():
        stripped = line.strip()
        if not stripped:
            # Blank lines carry no content and must not create empty sections
            continue

        # Heuristic: heading if short, capitalized, and not ending with period
        is_heading = (
            len(stripped.split()) <= 6
            and stripped[0].isupper()
            and not stripped.endswith('.')
        )

        if is_heading:
            # Close the section collected so far, then start a new one
            _flush(current_heading, current_content)
            current_heading = stripped
            current_content = []
        else:
            current_content.append(stripped)

    # Save whatever was collected after the last heading
    _flush(current_heading, current_content)

    return sections

# Function to clean and remove noise from text
# We observe that the PDFs don't contain any page numbers or images
def clean_text(text: str, lowercase: bool = True) -> str:
    """
    Cleans extracted PDF text for preprocessing:
    - Lowercase (optional)
    - Remove punctuation (ASCII and Unicode)
    - Replace line breaks, tabs and runs of whitespace with a single space

    Punctuation is deleted, not replaced by a space, so "don't" becomes
    "dont" and "5,000" becomes "5000". Besides the ASCII characters in
    ``string.punctuation``, every character whose Unicode general category
    starts with "P" is removed, which covers the curly quotes, en/em dashes,
    bullets and ellipses that commonly appear in PDF text.

    Args:
        text (str): Raw extracted text.
        lowercase (bool): Convert to lowercase (default True).

    Returns:
        str: Cleaned text ready for NLP tasks.
    """
    # Local import so this function does not depend on another cell's imports
    import unicodedata

    # Convert to lowercase if needed
    if lowercase:
        text = text.lower()

    # Build the deletion table from the distinct characters actually present,
    # so the Unicode category lookup runs once per character kind only.
    removable = {
        ord(ch): None
        for ch in set(text)
        if ch in string.punctuation or unicodedata.category(ch).startswith("P")
    }
    text = text.translate(removable)

    # \s+ also matches newlines and tabs, so a single pass normalizes them all
    text = re.sub(r"\s+", " ", text)

    return text.strip()


In [19]:
import pandas as pd
from tqdm import tqdm  # for progress bar, optional
tqdm.pandas()

# Walk through every policy PDF, split it into headed sections, and gather one
# cleaned record per section; these records are the segments for the vector store.
segments = []
for file in tqdm(files):
    # Anything in the folder that is not a PDF is ignored
    if not file.lower().endswith('.pdf'):
        continue

    pdf_path = os.path.join(path, file)
    raw_text = extract_text_from_pdf(pdf_path)
    sections = split_into_sections(raw_text)

    for heading, content in sections.items():
        cleaned = clean_text(content)
        segments.append({'file': file, 'section': heading, 'text': cleaned})

# One row per (file, section) pair, for easy processing
segments_df = pd.DataFrame(segments)
segments_df.head()

100%|██████████| 8/8 [00:07<00:00,  1.02it/s]



Unnamed: 0,file,section,text
0,tuition-reimbursement-policy.pdf,Introduction,TechLance is committed to supporting the profe...
1,tuition-reimbursement-policy.pdf,Eligibility Requirements and Performance Stand...,"To be eligible for tuition reimbursement, empl..."
2,tuition-reimbursement-policy.pdf,Covered Educational Expenses and Limits,TechLance covers a range of educational expens...
3,tuition-reimbursement-policy.pdf,Application and Approval Process,The application process must be completed at l...
4,tuition-reimbursement-policy.pdf,Academic Performance Requirements,"To receive reimbursement, you must successfull..."
