In [1]:
import pdfplumber
from pathlib import Path
import pandas as pd
from operator import itemgetter
import json
import tiktoken
import openai
import chromadb

In [2]:
# Location of the policy PDF (a relative path, so it is resolved against the working directory)
pdf_file_path = Path("Principal-Sample-Life-Insurance-Policy.pdf")
print(
    f"PDF file path: {pdf_file_path}",
    f"File exists: {pdf_file_path.exists()}",
    sep="\n",
)

PDF file path: Principal-Sample-Life-Insurance-Policy.pdf
File exists: True


In [5]:
# Open the PDF file and inspect a single page as a quick sanity check
with pdfplumber.open(pdf_file_path) as pdf:

    # pdf.pages is zero-indexed, so index 9 selects the tenth page of the document
    single_page = pdf.pages[9]

    # Extract the text of the selected page
    text = single_page.extract_text()

    # Extract the tables of the selected page
    tables = single_page.extract_tables()

    # Print the extracted text
    print(text)

T he legally recognized union of two eligible individuals of the same sex established according to
law.
Civil Union Partner
For two persons to establish a Civil Union in Rhode Island, it shall be necessary that they satisfy
all of the following criteria:
a. not be a party to another Civil Union or marriage in Rhode Island;
b. be of the same sex and therefore be excluded from the marriage laws of Rhode Island or
any other state;
c. be at least 18 years of age;
d. not be related to the other proposed party to the Civil Union.
NOTE: For the purposes of this Group Policy, the term "spouse" will include Civil Union
Partner, except as otherwise provided in this Group Policy.
Date of Issue
The date this Group Policy is placed in force: November 1, 2007.
Dependent
a. A Member's spouse, if that spouse:
(1) is legally married to the Member; and
(2) is not in the Armed Forces of any country; and
(3) is not insured under this Group Policy as a Member.
A Member's spouse will also include a Civil Un

In [14]:
# Function to extract text from a PDF file, page by page, keeping tables in place.

def extract_text_from_pdf(pdf_path):
    """Return the text of each page of a PDF as ``[page label, page text]`` pairs.

    For every page:
      1. Tables are located with ``page.find_tables()``.
      2. Words that ``check_bboxes`` reports as lying inside a table's bounding
         box are excluded from the running text.
      3. The remaining words and the tables are clustered on their ``top``
         coordinate (tolerance 5) so that they keep the vertical order they have
         on the page.
      4. Each cluster contributes its words as one space-joined line, followed by
         the JSON serialisation of every table in that cluster.

    NOTE: ``check_bboxes(word, table_bbox)`` is not defined inside this function;
    it must already exist in the notebook namespace when this function is called.

    Args:
        pdf_path: Path of the PDF file, passed straight to ``pdfplumber.open``.

    Returns:
        A list with one ``["Page N", text]`` entry per page (N starts at 1),
        where ``text`` is that page's lines joined by single spaces.
    """
    full_text = []

    with pdfplumber.open(pdf_path) as pdf:
        # enumerate() supplies the 1-based page number used in the page label
        for page_index, page in enumerate(pdf.pages, start=1):
            page_no = f"Page {page_index}"

            found_tables = page.find_tables()
            table_bboxes = [table.bbox for table in found_tables]
            # Give each table a 'top' key so it can be clustered alongside the words
            table_items = [{'table': table.extract(), 'top': table.bbox[1]} for table in found_tables]
            non_table_words = [
                word for word in page.extract_words()
                if not any(check_bboxes(word, table_bbox) for table_bbox in table_bboxes)
            ]

            lines = []
            for cluster in pdfplumber.utils.cluster_objects(non_table_words + table_items, itemgetter('top'), tolerance=5):
                # A cluster can mix words and tables when their tops are within the
                # tolerance, so each item is handled by its own kind instead of
                # assuming the whole cluster looks like its first element.
                words = [item['text'] for item in cluster if 'text' in item]
                if words:
                    lines.append(' '.join(words))
                lines.extend(json.dumps(item['table']) for item in cluster if 'table' in item)

            full_text.append([page_no, " ".join(lines)])

    return full_text

In [16]:
# Run the extraction over the whole policy document
full_text = extract_text_from_pdf(pdf_path=pdf_file_path)

In [20]:
# One row per page: page label, page text, and the name of the source file
extracted_text_df = pd.DataFrame(
    full_text,
    columns=['Page No.', 'Page_Text'],
).assign(**{'Document Name': pdf_file_path.name})

In [21]:
# Display the per-page extraction results
extracted_text_df

Unnamed: 0,Page No.,Page_Text,Document Name
0,Page 1,DOROTHEA GLAUSE S655 RHODE ISLAND JOHN DOE 01/...,Principal-Sample-Life-Insurance-Policy.pdf
1,Page 2,This page left blank intentionally,Principal-Sample-Life-Insurance-Policy.pdf
2,Page 3,POLICY RIDER GROUP INSURANCE POLICY NO: S655 C...,Principal-Sample-Life-Insurance-Policy.pdf
3,Page 4,This page left blank intentionally,Principal-Sample-Life-Insurance-Policy.pdf
4,Page 5,PRINCIPAL LIFE INSURANCE COMPANY (called The P...,Principal-Sample-Life-Insurance-Policy.pdf
...,...,...,...
59,Page 60,I f a Dependent who was insured dies during th...,Principal-Sample-Life-Insurance-Policy.pdf
60,Page 61,Section D - Claim Procedures Article 1 - Notic...,Principal-Sample-Life-Insurance-Policy.pdf
61,Page 62,A claimant may request an appeal of a claim de...,Principal-Sample-Life-Insurance-Policy.pdf
62,Page 63,This page left blank intentionally,Principal-Sample-Life-Insurance-Policy.pdf


In [22]:
# Spot-check the text of the row labelled 2 to confirm the page was read correctly

extracted_text_df.loc[2, 'Page_Text']

'POLICY RIDER GROUP INSURANCE POLICY NO: S655 COVERAGE: Life EMPLOYER: RHODE ISLAND JOHN DOE Effective on the later of the Date of Issue of this Group Policy or March 1, 2005, the following will apply to your Policy: From time to time The Principal may offer or provide certain employer groups who apply for coverage with The Principal a Financial Services Hotline and Grief Support Services or any other value added service for the employees of that employer group. In addition, The Principal may arrange for third party service providers (i.e., optometrists, health clubs), to provide discounted goods and services to those employer groups who apply for coverage with The Principal or who become insureds/enrollees of The Principal. While The Principal has arranged these goods, services and/or third party provider discounts, the third party service providers are liable to the applicants/insureds/enrollees for the provision of such goods and/or services. The Principal is not responsible for the

In [23]:
# Let's also check the length of all the texts as there might be some empty pages or pages with very few words that we can drop

# Count whitespace-separated words. Splitting on any whitespace (rather than on a single
# space character) yields 0 for an empty page and is not inflated by repeated spaces.
extracted_text_df['Text_Length'] = extracted_text_df['Page_Text'].str.split().str.len()

In [24]:
# Display the per-page text lengths computed above
extracted_text_df['Text_Length']

0      30
1       5
2     230
3       5
4     110
     ... 
59    285
60    418
61    322
62      5
63      8
Name: Text_Length, Length: 64, dtype: int64

In [25]:
# Drop near-empty pages: keep only the rows whose Text_Length is 10 or more

insurance_pdfs_data = extracted_text_df[extracted_text_df['Text_Length'].ge(10)]
insurance_pdfs_data

Unnamed: 0,Page No.,Page_Text,Document Name,Text_Length
0,Page 1,DOROTHEA GLAUSE S655 RHODE ISLAND JOHN DOE 01/...,Principal-Sample-Life-Insurance-Policy.pdf,30
2,Page 3,POLICY RIDER GROUP INSURANCE POLICY NO: S655 C...,Principal-Sample-Life-Insurance-Policy.pdf,230
4,Page 5,PRINCIPAL LIFE INSURANCE COMPANY (called The P...,Principal-Sample-Life-Insurance-Policy.pdf,110
5,Page 6,TABLE OF CONTENTS PART I - DEFINITIONS PART II...,Principal-Sample-Life-Insurance-Policy.pdf,153
6,Page 7,Section A – Eligibility Member Life Insurance ...,Principal-Sample-Life-Insurance-Policy.pdf,176
7,Page 8,Section A - Member Life Insurance Schedule of ...,Principal-Sample-Life-Insurance-Policy.pdf,171
8,Page 9,P ART I - DEFINITIONS When used in this Group ...,Principal-Sample-Life-Insurance-Policy.pdf,387
9,Page 10,T he legally recognized union of two eligible ...,Principal-Sample-Life-Insurance-Policy.pdf,251
10,Page 11,(2) has been placed with the Member or spouse ...,Principal-Sample-Life-Insurance-Policy.pdf,299
11,Page 12,An institution that is licensed as a Hospital ...,Principal-Sample-Life-Insurance-Policy.pdf,352


In [26]:
# Inspect the filtered frame: column names, first rows, shape and dtypes
print("Current columns in insurance_pdfs_data:", list(insurance_pdfs_data.columns), sep="\n")
print("\nFirst few rows:", insurance_pdfs_data.head(), sep="\n")
print(
    f"\nDataFrame shape: {insurance_pdfs_data.shape}",
    f"\nData types:\n{insurance_pdfs_data.dtypes}",
    sep="\n",
)

Current columns in insurance_pdfs_data:
['Page No.', 'Page_Text', 'Document Name', 'Text_Length']

First few rows:
  Page No.                                          Page_Text  \
0   Page 1  DOROTHEA GLAUSE S655 RHODE ISLAND JOHN DOE 01/...   
2   Page 3  POLICY RIDER GROUP INSURANCE POLICY NO: S655 C...   
4   Page 5  PRINCIPAL LIFE INSURANCE COMPANY (called The P...   
5   Page 6  TABLE OF CONTENTS PART I - DEFINITIONS PART II...   
6   Page 7  Section A – Eligibility Member Life Insurance ...   

                                Document Name  Text_Length  
0  Principal-Sample-Life-Insurance-Policy.pdf           30  
2  Principal-Sample-Life-Insurance-Policy.pdf          230  
4  Principal-Sample-Life-Insurance-Policy.pdf          110  
5  Principal-Sample-Life-Insurance-Policy.pdf          153  
6  Principal-Sample-Life-Insurance-Policy.pdf          176  

DataFrame shape: (60, 4)

Data types:
Page No.         object
Page_Text        object
Document Name    object
Text_Length      

In [27]:
# Enhanced metadata for insurance_pdfs_data
import datetime
import re

# Work on a copy so the filtered frame itself is not modified by this cell
enhanced_data = insurance_pdfs_data.copy()

# 1. Document Processing Timestamp (same value for every row)
enhanced_data['Processing_Timestamp'] = datetime.datetime.now()

# 2. Document Type/Category
enhanced_data['Document_Type'] = 'Life Insurance Policy'

# 3. Page Number (numeric for sorting)
# expand=False makes str.extract return a Series instead of a one-column DataFrame
enhanced_data['Page_Number'] = enhanced_data['Page No.'].str.extract(r'(\d+)', expand=False).astype(int)

# 4. Text Statistics
enhanced_data['Word_Count'] = enhanced_data['Page_Text'].apply(lambda x: len(x.split()))
enhanced_data['Character_Count'] = enhanced_data['Page_Text'].apply(len)
# Splitting on terminators leaves an empty fragment after the final '.', '!' or '?';
# only fragments that contain text are counted as sentences.
enhanced_data['Sentence_Count'] = enhanced_data['Page_Text'].apply(
    lambda x: sum(1 for fragment in re.split(r'[.!?]+', x) if fragment.strip())
)

# 5. Content Classification (basic heuristics)
def classify_content(text):
    """Assign a coarse content category to a page by keyword matching.

    The text is lower-cased and each category's keywords are looked up as plain
    substrings. Categories are tried in the order listed below and the first one
    with a matching keyword wins; a page matching none of them is labelled
    'General Content'.
    """
    # Ordered (category, keywords) rules: earlier entries take precedence
    keyword_rules = (
        ('Table of Contents', ('table of contents', 'contents')),
        ('Policy Details', ('premium', 'benefit', 'coverage')),
        ('Definitions', ('definition', 'definitions')),
        ('Rider/Endorsement', ('rider', 'endorsement')),
        ('Claims Information', ('claim', 'claims')),
    )
    lowered = text.lower()
    for category, keywords in keyword_rules:
        for keyword in keywords:
            if keyword in lowered:
                return category
    return 'General Content'

# Add the remaining metadata columns in one step (column order follows the keyword order)
enhanced_data = enhanced_data.assign(
    # 5. Heuristic content category of each page
    Content_Category=enhanced_data['Page_Text'].apply(classify_content),
    # 6. Text Quality Indicators
    Has_Tables=enhanced_data['Page_Text'].apply(lambda x: '[' in x and ']' in x),
    Text_Density=enhanced_data['Character_Count'] / (enhanced_data['Character_Count'].max() + 1),
    # 7. Document Structure
    Is_First_Page=enhanced_data['Page_Number'] == 1,
    Is_Last_Page=enhanced_data['Page_Number'] == enhanced_data['Page_Number'].max(),
    # 8. Processing Source Information
    Extraction_Method='PDFPlumber',
    File_Size_Bytes=pdf_file_path.stat().st_size if pdf_file_path.exists() else None,
)

print("Enhanced metadata columns added:", list(enhanced_data.columns), sep="\n")
print(f"\nNew DataFrame shape: {enhanced_data.shape}")

Enhanced metadata columns added:
['Page No.', 'Page_Text', 'Document Name', 'Text_Length', 'Processing_Timestamp', 'Document_Type', 'Page_Number', 'Word_Count', 'Character_Count', 'Sentence_Count', 'Content_Category', 'Has_Tables', 'Text_Density', 'Is_First_Page', 'Is_Last_Page', 'Extraction_Method', 'File_Size_Bytes']

New DataFrame shape: (60, 17)


In [28]:
# Show a sample of the enhanced frame plus a few summary statistics
print(
    "Sample of enhanced insurance_pdfs_data with metadata:",
    enhanced_data[['Page No.', 'Content_Category', 'Word_Count', 'Has_Tables', 'Text_Density']].head(10),
    sep="\n",
)

print(
    "\nContent Category Distribution:",
    enhanced_data['Content_Category'].value_counts(),
    sep="\n",
)

print(
    "\nPages with Tables:",
    f"Total pages with tables: {enhanced_data['Has_Tables'].sum()}",
    sep="\n",
)

print(
    "\nText Statistics Summary:",
    enhanced_data[['Word_Count', 'Character_Count', 'Sentence_Count']].describe(),
    sep="\n",
)

Sample of enhanced insurance_pdfs_data with metadata:
   Page No.   Content_Category  Word_Count  Has_Tables  Text_Density
0    Page 1    General Content          30       False      0.069578
2    Page 3     Policy Details         230       False      0.543301
4    Page 5     Policy Details         110       False      0.262398
5    Page 6  Table of Contents         153       False      0.378608
6    Page 7  Table of Contents         176       False      0.429312
7    Page 8  Table of Contents         171       False      0.398594
8    Page 9        Definitions         387       False      0.824574
9   Page 10        Definitions         251       False      0.510733
10  Page 11     Policy Details         299       False      0.655440
11  Page 12        Definitions         352       False      0.771281

Content Category Distribution:
Content_Category
Policy Details       49
Definitions           5
General Content       3
Table of Contents     3
Name: count, dtype: int64

Pages with Tabl

In [29]:
# Additional metadata suggestions for RAG applications

# Update the original insurance_pdfs_data with the enhanced version
insurance_pdfs_data = enhanced_data.copy()

# Print the metadata overview as a single block of text. An empty string in the
# list produces the blank line that separates two sections. The filename column
# is listed under its actual name, 'Document Name'.
print("\n".join([
    "METADATA ADDED TO insurance_pdfs_data:",
    "=" * 50,
    "",
    "1. TEMPORAL METADATA:",
    "   - Processing_Timestamp: When the document was processed",
    "",
    "2. DOCUMENT IDENTIFICATION:",
    "   - Document_Type: Category of the document",
    "   - Document Name: Original filename",
    "   - File_Size_Bytes: Size of the source file",
    "",
    "3. PAGE-LEVEL METADATA:",
    "   - Page_Number: Numeric page number for sorting",
    "   - Is_First_Page/Is_Last_Page: Document structure indicators",
    "",
    "4. TEXT ANALYSIS METADATA:",
    "   - Word_Count: Number of words per page",
    "   - Character_Count: Number of characters per page",
    "   - Sentence_Count: Number of sentences per page",
    "   - Text_Density: Relative density of text content",
    "   - Text_Length: Original length metric",
    "",
    "5. CONTENT CLASSIFICATION:",
    "   - Content_Category: Semantic classification of page content",
    "   - Has_Tables: Whether the page contains table data",
    "",
    "6. PROCESSING METADATA:",
    "   - Extraction_Method: Tool used for extraction",
    "",
    "7. ADDITIONAL METADATA YOU COULD ADD:",
    "   - Embedding_Vector: Text embeddings for similarity search",
    "   - Chunk_ID: For text chunking strategies",
    "   - Language: Document language detection",
    "   - Confidence_Score: OCR confidence if applicable",
    "   - Keywords: Extracted key terms",
    "   - Named_Entities: Person/Organization/Location entities",
    "   - Section_Header: Detected section titles",
    "   - Relevance_Score: Business importance ranking",
    "   - Last_Updated: Document modification date",
    "   - Access_Level: Security/privacy classification",
]))

print(f"\nFinal DataFrame shape: {insurance_pdfs_data.shape}")
print(f"Total metadata columns: {len(insurance_pdfs_data.columns)}")

METADATA ADDED TO insurance_pdfs_data:

1. TEMPORAL METADATA:
   - Processing_Timestamp: When the document was processed

2. DOCUMENT IDENTIFICATION:
   - Document_Type: Category of the document
   - Document_Name: Original filename
   - File_Size_Bytes: Size of the source file

3. PAGE-LEVEL METADATA:
   - Page_Number: Numeric page number for sorting
   - Is_First_Page/Is_Last_Page: Document structure indicators

4. TEXT ANALYSIS METADATA:
   - Word_Count: Number of words per page
   - Character_Count: Number of characters per page
   - Sentence_Count: Number of sentences per page
   - Text_Density: Relative density of text content
   - Text_Length: Original length metric

5. CONTENT CLASSIFICATION:
   - Content_Category: Semantic classification of page content
   - Has_Tables: Whether the page contains table data

6. PROCESSING METADATA:
   - Extraction_Method: Tool used for extraction

7. ADDITIONAL METADATA YOU COULD ADD:
   - Embedding_Vector: Text embeddings for similarity search

In [30]:
# Set the API key
# Read the key from a local file and strip surrounding whitespace: the newline that
# normally ends a text file is not part of the key and must not be sent with it.
with open("OpenAI_API_Key.txt", "r") as f:
    openai.api_key = f.read().strip()

In [31]:
# Import the OpenAI Embedding Function into chroma

from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction

In [32]:
# Directory in which the chroma collections are to be stored

chroma_data_path = "ChromaDB_Data"

In [33]:
import chromadb

In [34]:
# Create a persistent Chroma client that stores its data under chroma_data_path
# (without the path argument the client falls back to its default location and the
# directory configured above is never used)

client = chromadb.PersistentClient(path=chroma_data_path)

In [35]:
# Build the embedding function backed by the OpenAI embedding model named in `model`

model = "text-embedding-ada-002"
embedding_function = OpenAIEmbeddingFunction(
    model_name=model,
    api_key=openai.api_key,
)

In [36]:
# Get (or create) the chroma collection and attach the embedding_function so that it uses OpenAI embeddings to embed the documents

insurance_collection = client.get_or_create_collection(
    name='RAG_on_Insurance',
    embedding_function=embedding_function,
)

In [37]:
# Turn the page text and the content category columns of the dataframe into plain lists for chroma

documents_list = insurance_pdfs_data.loc[:, "Page_Text"].to_list()
metadata_list = insurance_pdfs_data.loc[:, 'Content_Category'].to_list()

In [38]:
# Try to add the documents and metadata to the collection along with generic integer IDs.
# metadata_list holds plain strings, and chroma rejects that with a ValueError because it
# expects every metadata entry to be a dict (or None). The error is caught and displayed
# so that it stays visible without stopping a full run of the notebook.

try:
    insurance_collection.add(
        documents=documents_list,
        ids=[str(i) for i in range(0, len(documents_list))],
        metadatas=metadata_list
    )
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: Expected metadata to be a dict or None, got str as metadata in add.

In [39]:
# Combine all metadata columns into a single dictionary column for ChromaDB

# Define which columns are metadata (exclude the main content columns).
# The combined 'metadata' column is excluded as well: once it has been added to the
# dataframe, re-running this cell would otherwise nest the previous dictionaries
# inside the new ones.
content_columns = ['Page No.', 'Page_Text', 'Document Name']
metadata_columns = [
    col for col in insurance_pdfs_data.columns
    if col not in content_columns and col != 'metadata'
]

print("Metadata columns to combine:")
print(metadata_columns)

# Function to create metadata dictionary for each row
def create_metadata_dict(row, columns=None):
    """Build a metadata dictionary from one dataframe row.

    Args:
        row: A row indexed by column name, e.g. the Series handed over by
            ``DataFrame.apply(..., axis=1)`` (any mapping supporting ``row[col]``
            works).
        columns: Names of the columns to copy into the dictionary. When omitted,
            the notebook-level ``metadata_columns`` list is used.

    Returns:
        dict mapping each selected column name to its value, where
        ``pd.Timestamp`` values are converted to ISO-format strings and values
        exposing ``.item()`` (numpy/pandas scalars) are converted to native
        Python types; every other value is stored unchanged.
    """
    selected_columns = metadata_columns if columns is None else columns
    metadata_dict = {}
    for col in selected_columns:
        value = row[col]
        # Convert timestamps to string for JSON compatibility
        if isinstance(value, pd.Timestamp):
            metadata_dict[col] = value.isoformat()
        # Convert numpy/pandas types to native Python types
        elif hasattr(value, 'item'):
            metadata_dict[col] = value.item()
        else:
            metadata_dict[col] = value
    return metadata_dict

# Build one metadata dictionary per row and store it in a new 'metadata' column
insurance_pdfs_data['metadata'] = insurance_pdfs_data.apply(create_metadata_dict, axis=1)

# Show the first dictionary as a sample, then the resulting column counts
print("\nSample metadata dictionary:", insurance_pdfs_data['metadata'].iloc[0], sep="\n")

print(
    f"\nDataFrame now has {len(insurance_pdfs_data.columns)} columns",
    f"New column 'metadata' contains dictionaries with {len(metadata_columns)} metadata fields",
    sep="\n",
)

Metadata columns to combine:
['Text_Length', 'Processing_Timestamp', 'Document_Type', 'Page_Number', 'Word_Count', 'Character_Count', 'Sentence_Count', 'Content_Category', 'Has_Tables', 'Text_Density', 'Is_First_Page', 'Is_Last_Page', 'Extraction_Method', 'File_Size_Bytes']

Sample metadata dictionary:
{'Text_Length': 30, 'Processing_Timestamp': '2025-08-01T19:36:08.387381', 'Document_Type': 'Life Insurance Policy', 'Page_Number': 1, 'Word_Count': 30, 'Character_Count': 188, 'Sentence_Count': 1, 'Content_Category': 'General Content', 'Has_Tables': False, 'Text_Density': 0.0695780903034789, 'Is_First_Page': True, 'Is_Last_Page': False, 'Extraction_Method': 'PDFPlumber', 'File_Size_Bytes': 222772}

DataFrame now has 18 columns
New column 'metadata' contains dictionaries with 14 metadata fields


In [41]:
# Put the documents and the combined metadata into the list format used for ChromaDB.
# Nothing is uploaded in this cell (the upload was skipped here because of an SSL issue).

# Documents and their metadata dictionaries, in matching order
documents_list_new = insurance_pdfs_data["Page_Text"].to_list()
metadata_list_new = insurance_pdfs_data['metadata'].to_list()

print(
    "Sample document and metadata for ChromaDB:",
    f"Document (first 100 chars): {documents_list_new[0][:100]}...",
    "\nFull metadata example:",
    metadata_list_new[0],
    sep="\n",
)

print("\n" + "=" * 60, "CHROMADB READY FORMAT:", "=" * 60, sep="\n")
print(f"Total documents: {len(documents_list_new)}")
print(f"Total metadata entries: {len(metadata_list_new)}")

# Print (not run) the call that would add the data to the collection
print("\nCode to add to ChromaDB collection:")
print("""
insurance_collection.add(
    documents=documents_list_new,
    ids=[str(i) for i in range(len(documents_list_new))],
    metadatas=metadata_list_new
)
""")

# Check that every entry is a dict and that all entries share one set of keys
print("\nMetadata structure validation:")
print(f"All metadata are dictionaries: {all(isinstance(m, dict) for m in metadata_list_new)}")
print(f"All metadata have same keys: {len({str(sorted(m)) for m in metadata_list_new}) == 1}")
print(f"Metadata keys: {list(metadata_list_new[0])}")

# Final structure of the dataframe
print("\nFinal insurance_pdfs_data columns:", list(insurance_pdfs_data.columns), sep="\n")
print(f"\nDataFrame shape: {insurance_pdfs_data.shape}")

Sample document and metadata for ChromaDB:
Document (first 100 chars): DOROTHEA GLAUSE S655 RHODE ISLAND JOHN DOE 01/01/2014 711 HIGH STREET GEORGE RI 02903 GROUP POLICY F...

Full metadata example:
{'Text_Length': 30, 'Processing_Timestamp': '2025-08-01T19:36:08.387381', 'Document_Type': 'Life Insurance Policy', 'Page_Number': 1, 'Word_Count': 30, 'Character_Count': 188, 'Sentence_Count': 1, 'Content_Category': 'General Content', 'Has_Tables': False, 'Text_Density': 0.0695780903034789, 'Is_First_Page': True, 'Is_Last_Page': False, 'Extraction_Method': 'PDFPlumber', 'File_Size_Bytes': 222772}

CHROMADB READY FORMAT:
Total documents: 60
Total metadata entries: 60

Code to add to ChromaDB collection:

insurance_collection.add(
    documents=documents_list_new,
    ids=[str(i) for i in range(len(documents_list_new))],
    metadatas=metadata_list_new
)


Metadata structure validation:
All metadata are dictionaries: True
All metadata have same keys: True
Metadata keys: ['Text_Length', 'Proc

In [None]:
# Upload the documents together with their combined metadata dictionaries.
# The IDs are fixed ("0", "1", ...) and the client is persistent, so upsert() is used:
# it updates records whose IDs already exist and adds the rest, which keeps this cell
# re-runnable against an already populated collection.
insurance_collection.upsert(
    documents=documents_list_new,
    ids=[str(i) for i in range(len(documents_list_new))],
    metadatas=metadata_list_new
)