In [1]:
!pip install pdfplumber tiktoken openai chromadb sentence_transformers

Collecting pdfplumber
  Downloading pdfplumber-0.11.4-py3-none-any.whl.metadata (41 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m42.0/42.0 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting tiktoken
  Downloading tiktoken-0.7.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.6 kB)
Collecting openai
  Downloading openai-1.43.0-py3-none-any.whl.metadata (22 kB)
Collecting chromadb
  Downloading chromadb-0.5.5-py3-none-any.whl.metadata (6.8 kB)
Collecting sentence_transformers
  Downloading sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting pdfminer.six==20231228 (from pdfplumber)
  Downloading pdfminer.six-20231228-py3-none-any.whl.metadata (4.2 kB)
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m1.8 MB/s[0m eta [36m0:00:

In [2]:
import pdfplumber
from pathlib import Path
import pandas as pd
from operator import itemgetter
import json
import tiktoken
import openai
from sentence_transformers import CrossEncoder, util
from chromadb.utils.embedding_functions import OpenAIEmbeddingFunction
import chromadb

  from tqdm.autonotebook import tqdm, trange


In [3]:
pdf_path='/kaggle/input/helpmate-ai/Principal-Sample-Life-Insurance-Policy.pdf'
with pdfplumber.open(pdf_path) as pdf:

    single_page = pdf.pages[6]
    text = single_page.extract_text()
    tables = single_page.extract_tables()
    print(text)

Section A – Eligibility
Member Life Insurance Article 1
Member Accidental Death and Dismemberment Insurance Article 2
Dependent Life Insurance Article 3
Section B - Effective Dates
Member Life Insurance Article 1
Member Accidental Death and Dismemberment Insurance Article 2
Dependent Life Insurance Article 3
Section C - Individual Terminations
Member Life Insurance Article 1
Member Accidental Death and Dismemberment Insurance Article 2
Dependent Life Insurance Article 3
Termination for Fraud Article 4
Coverage While Outside of the United States Article 5
Section D - Continuation
Member Life Insurance Article 1
Dependent Insurance - Developmentally Disabled or
Physically Handicapped Children Article 2
Section E - Reinstatement
Reinstatement Article 1
Federal Required Family and Medical Leave Act (FMLA) Article 2
Reinstatement of Coverage for a Member or Dependent When
Coverage Ends due to Living Outside of the United States Article 3
Section F - Individual Purchase Rights
Member Life In

In [4]:
def check_bboxes(word, bbox):
    """
    Helper function to check if a word's bbox is within a table's bbox.
    """
    word_bbox = (word['x0'], word['top'], word['x1'], word['bottom'])
    return (word_bbox[0] >= bbox[0] and word_bbox[2] <= bbox[2] and
            word_bbox[1] >= bbox[1] and word_bbox[3] <= bbox[3])

def extract_text_from_pdf(pdf_path):
    full_text = []
    with pdfplumber.open(pdf_path) as pdf:
        for p, page in enumerate(pdf.pages, start=1):
            page_no = f"Page {p}"
            text = page.extract_text()
            
            # Extract heading (if text exists)
            heading = text.split('\n')[0].strip() if text else None

            tables = page.find_tables()
            table_bboxes = [i.bbox for i in tables]
            tables = [{'table': i.extract(), 'top': i.bbox[1]} for i in tables]
            non_table_words = [word for word in page.extract_words() if not any(
                check_bboxes(word, table_bbox) for table_bbox in table_bboxes)]
            lines = []

            for cluster in pdfplumber.utils.cluster_objects(non_table_words + tables, itemgetter('top'), tolerance=5):
                if 'text' in cluster[0]:
                    try:
                        lines.append(' '.join([i['text'] for i in cluster]))
                    except KeyError:
                        pass
                elif 'table' in cluster[0]:
                    lines.append(json.dumps(cluster[0]['table']))

            full_text.append([page_no, heading, " ".join(lines)])

    # Convert the extracted data to a DataFrame
    df = pd.DataFrame(full_text, columns=['Page_Number', 'Heading', 'Text'])
    
    return df

In [5]:
df = extract_text_from_pdf(pdf_path)

In [6]:
df.head(10)

Unnamed: 0,Page_Number,Heading,Text
0,Page 1,DOROTHEA GLAUSE S655,DOROTHEA GLAUSE S655 RHODE ISLAND JOHN DOE 01/...
1,Page 2,This page left blank intentionally,This page left blank intentionally
2,Page 3,POLICY RIDER,POLICY RIDER GROUP INSURANCE POLICY NO: S655 C...
3,Page 4,This page left blank intentionally,This page left blank intentionally
4,Page 5,PRINCIPAL LIFE INSURANCE COMPANY,PRINCIPAL LIFE INSURANCE COMPANY (called The P...
5,Page 6,TABLE OF CONTENTS,TABLE OF CONTENTS PART I - DEFINITIONS PART II...
6,Page 7,Section A – Eligibility,Section A – Eligibility Member Life Insurance ...
7,Page 8,Section A - Member Life Insurance,Section A - Member Life Insurance Schedule of ...
8,Page 9,P ART I - DEFINITIONS,P ART I - DEFINITIONS When used in this Group ...
9,Page 10,T he legally recognized union of two eligible ...,T he legally recognized union of two eligible ...


In [7]:
df['Text'][9]

'T he legally recognized union of two eligible individuals of the same sex established according to law. Civil Union Partner For two persons to establish a Civil Union in Rhode Island, it shall be necessary that they satisfy all of the following criteria: a. not be a party to another Civil Union or marriage in Rhode Island; b. be of the same sex and therefore be excluded from the marriage laws of Rhode Island or any other state; c. be at least 18 years of age; d. not be related to the other proposed party to the Civil Union. NOTE: For the purposes of this Group Policy, the term "spouse" will include Civil Union Partner, except as otherwise provided in this Group Policy. Date of Issue The date this Group Policy is placed in force: November 1, 2007. Dependent a. A Member\'s spouse, if that spouse: (1) is legally married to the Member; and (2) is not in the Armed Forces of any country; and (3) is not insured under this Group Policy as a Member. A Member\'s spouse will also include a Civil

In [8]:
df['Text_Length'] = df['Text'].apply(lambda x: len(x.split(' ')))

In [9]:
df.head(10)

Unnamed: 0,Page_Number,Heading,Text,Text_Length
0,Page 1,DOROTHEA GLAUSE S655,DOROTHEA GLAUSE S655 RHODE ISLAND JOHN DOE 01/...,30
1,Page 2,This page left blank intentionally,This page left blank intentionally,5
2,Page 3,POLICY RIDER,POLICY RIDER GROUP INSURANCE POLICY NO: S655 C...,230
3,Page 4,This page left blank intentionally,This page left blank intentionally,5
4,Page 5,PRINCIPAL LIFE INSURANCE COMPANY,PRINCIPAL LIFE INSURANCE COMPANY (called The P...,110
5,Page 6,TABLE OF CONTENTS,TABLE OF CONTENTS PART I - DEFINITIONS PART II...,153
6,Page 7,Section A – Eligibility,Section A – Eligibility Member Life Insurance ...,176
7,Page 8,Section A - Member Life Insurance,Section A - Member Life Insurance Schedule of ...,171
8,Page 9,P ART I - DEFINITIONS,P ART I - DEFINITIONS When used in this Group ...,387
9,Page 10,T he legally recognized union of two eligible ...,T he legally recognized union of two eligible ...,251


In [10]:
df = df.loc[df['Text_Length']>=10]

In [11]:
df.head(10)

Unnamed: 0,Page_Number,Heading,Text,Text_Length
0,Page 1,DOROTHEA GLAUSE S655,DOROTHEA GLAUSE S655 RHODE ISLAND JOHN DOE 01/...,30
2,Page 3,POLICY RIDER,POLICY RIDER GROUP INSURANCE POLICY NO: S655 C...,230
4,Page 5,PRINCIPAL LIFE INSURANCE COMPANY,PRINCIPAL LIFE INSURANCE COMPANY (called The P...,110
5,Page 6,TABLE OF CONTENTS,TABLE OF CONTENTS PART I - DEFINITIONS PART II...,153
6,Page 7,Section A – Eligibility,Section A – Eligibility Member Life Insurance ...,176
7,Page 8,Section A - Member Life Insurance,Section A - Member Life Insurance Schedule of ...,171
8,Page 9,P ART I - DEFINITIONS,P ART I - DEFINITIONS When used in this Group ...,387
9,Page 10,T he legally recognized union of two eligible ...,T he legally recognized union of two eligible ...,251
10,Page 11,(2) has been placed with the Member or spouse ...,(2) has been placed with the Member or spouse ...,299
11,Page 12,An institution that is licensed as a Hospital ...,An institution that is licensed as a Hospital ...,352


In [12]:
df.head()

Unnamed: 0,Page_Number,Heading,Text,Text_Length
0,Page 1,DOROTHEA GLAUSE S655,DOROTHEA GLAUSE S655 RHODE ISLAND JOHN DOE 01/...,30
2,Page 3,POLICY RIDER,POLICY RIDER GROUP INSURANCE POLICY NO: S655 C...,230
4,Page 5,PRINCIPAL LIFE INSURANCE COMPANY,PRINCIPAL LIFE INSURANCE COMPANY (called The P...,110
5,Page 6,TABLE OF CONTENTS,TABLE OF CONTENTS PART I - DEFINITIONS PART II...,153
6,Page 7,Section A – Eligibility,Section A – Eligibility Member Life Insurance ...,176


In [13]:
df['Metadata'] = df.apply(
    lambda x: {
        'Section': (x['Heading'][:20] if x['Heading'] else ''),
        'Page_No.': x['Page_Number']
    },
    axis=1
)

In [14]:
def chunk_text(text, chunk_size=300, overlap_size=50):
    # Split the text into individual words
    words = text.split()
    chunks = []
    
    # Iterate over the words to create chunks with overlap
    for i in range(0, len(words), chunk_size - overlap_size):
        # Create a chunk from the current position
        chunk = " ".join(words[i:i + chunk_size])
        chunks.append(chunk)
    
    return chunks


In [15]:
df['Chunks'] = df['Text'].apply(lambda x: chunk_text(x))

# Flatten the DataFrame to have one row per chunk
chunked_df = df.explode('Chunks').reset_index(drop=True)

# Add an identifier to each chunk to keep track of the page and chunk number
chunked_df['Chunk_ID'] = chunked_df.index + 1




In [16]:
chunked_df.head(20)

Unnamed: 0,Page_Number,Heading,Text,Text_Length,Metadata,Chunks,Chunk_ID
0,Page 1,DOROTHEA GLAUSE S655,DOROTHEA GLAUSE S655 RHODE ISLAND JOHN DOE 01/...,30,"{'Section': 'DOROTHEA GLAUSE S655', 'Page_No.'...",DOROTHEA GLAUSE S655 RHODE ISLAND JOHN DOE 01/...,1
1,Page 3,POLICY RIDER,POLICY RIDER GROUP INSURANCE POLICY NO: S655 C...,230,"{'Section': 'POLICY RIDER', 'Page_No.': 'Page 3'}",POLICY RIDER GROUP INSURANCE POLICY NO: S655 C...,2
2,Page 5,PRINCIPAL LIFE INSURANCE COMPANY,PRINCIPAL LIFE INSURANCE COMPANY (called The P...,110,"{'Section': 'PRINCIPAL LIFE INSUR', 'Page_No.'...",PRINCIPAL LIFE INSURANCE COMPANY (called The P...,3
3,Page 6,TABLE OF CONTENTS,TABLE OF CONTENTS PART I - DEFINITIONS PART II...,153,"{'Section': 'TABLE OF CONTENTS', 'Page_No.': '...",TABLE OF CONTENTS PART I - DEFINITIONS PART II...,4
4,Page 7,Section A – Eligibility,Section A – Eligibility Member Life Insurance ...,176,"{'Section': 'Section A – Eligibil', 'Page_No.'...",Section A – Eligibility Member Life Insurance ...,5
5,Page 8,Section A - Member Life Insurance,Section A - Member Life Insurance Schedule of ...,171,"{'Section': 'Section A - Member L', 'Page_No.'...",Section A - Member Life Insurance Schedule of ...,6
6,Page 9,P ART I - DEFINITIONS,P ART I - DEFINITIONS When used in this Group ...,387,"{'Section': 'P ART I - DEFINITION', 'Page_No.'...",P ART I - DEFINITIONS When used in this Group ...,7
7,Page 9,P ART I - DEFINITIONS,P ART I - DEFINITIONS When used in this Group ...,387,"{'Section': 'P ART I - DEFINITION', 'Page_No.'...",f. Continence - the ability to voluntarily con...,8
8,Page 10,T he legally recognized union of two eligible ...,T he legally recognized union of two eligible ...,251,"{'Section': 'T he legally recogni', 'Page_No.'...",T he legally recognized union of two eligible ...,9
9,Page 10,T he legally recognized union of two eligible ...,T he legally recognized union of two eligible ...,251,"{'Section': 'T he legally recogni', 'Page_No.'...",2,10


In [17]:
# chunked_df['Metadata'] = chunked_df.apply(
#     lambda x: {
#         'Section': (x['Heading'][:20] if x['Heading'] else ''),
#         'Page_No.': x['Page_Number'],
#         'Chunk_ID':x['Chunk_ID']
#     },
#     axis=1
# )

In [18]:
filepath = "/kaggle/input/my-openais-api/Session_1/OpenAI_API_Key.txt"

with open(filepath + "OpenAI_API_Key.txt", "r") as f:
  openai.api_key = ' '.join(f.readlines())

In [19]:


chroma_data_path = '/kaggle/working/'
client = chromadb.PersistentClient(path=chroma_data_path)

In [20]:
embedding_function = OpenAIEmbeddingFunction(api_key=openai.api_key, model_name='text-embedding-ada-002')
insurance_collection = client.get_or_create_collection(name='InsurancePolicyDoc', embedding_function=embedding_function)

In [21]:
documents_list = chunked_df["Chunks"].tolist()
metadata_list = chunked_df['Metadata'].tolist()

In [22]:
insurance_collection.add(
    documents= documents_list,
    ids = [str(i) for i in range(0, len(documents_list))],
    metadatas = metadata_list
)

In [23]:

cache_collection = client.get_or_create_collection(name='Insurance_Cache', embedding_function=embedding_function)
cache_collection.peek()

{'ids': [],
 'embeddings': [],
 'metadatas': [],
 'documents': [],
 'uris': None,
 'data': None,
 'included': ['embeddings', 'metadatas', 'documents']}

In [24]:
query_1=input()

 what types of coverage does this policy include?


In [25]:
# Query the cache collection to check if the results are already stored
cache_results = cache_collection.query(
    query_texts=query_1,
    n_results=1
)

# Print the results from the cache query for debugging
print(cache_results)

# If the cache did not return satisfactory results (e.g., based on distance), query the main collection
results = insurance_collection.query(
    query_texts=query_1,
    n_results=10
)


{'ids': [[]], 'distances': [[]], 'metadatas': [[]], 'embeddings': None, 'documents': [[]], 'uris': None, 'data': None, 'included': ['metadatas', 'documents', 'distances']}


In [26]:
import json

threshold = 0.2

results_df_1 = pd.DataFrame()

# Query the cache collection to check if the results are already stored
cache_results = cache_collection.query(
    query_texts=query_1,
    n_results=1
)

# Print the results from the cache query for debugging
print(cache_results)

# Check if the cache is empty or if the distance exceeds the threshold
if not cache_results['distances'][0] or cache_results['distances'][0][0] > threshold:
    # Query the main collection for the top 10 results
    results = insurance_collection.query(
        query_texts=query_1,
        n_results=10
    )

    # Prepare keys and values for storing in cache
    cache_data = {}
    for key, val in results.items():
        if val is None:
            continue
        # Adjust the loop to match the actual number of items in val
        for i in range(min(len(val[0]), 10)):  # Ensure you only loop over existing items
            cache_data[f"{key}_{i}"] = val[0][i]

    # Flatten the metadata for storage in ChromaDB
    flat_cache_data = {}
    for k, v in cache_data.items():
        if isinstance(v, dict):
            # Convert the dictionary to a JSON string
            flat_cache_data[k] = json.dumps(v)
        else:
            flat_cache_data[k] = v

    # Store the query in cache
    cache_collection.add(
        documents=[query_1],
        ids=[query_1],  # Alternatively, you can use a unique ID
        metadatas=flat_cache_data
    )

    print("Not found in cache. Found in main collection.")

    # Convert the results to a DataFrame
    result_dict = {
        'Metadatas_1': results['metadatas'][0],
        'Documents_1': results['documents'][0],
        'Distances_1': results['distances'][0],
        'IDs': results['ids'][0]
    }
    results_df_1 = pd.DataFrame.from_dict(result_dict)

# If the distance is within the threshold, retrieve results from the cache
elif cache_results['distances'][0][0] <= threshold:
    # Extract data from the cache
    cache_result_dict = cache_results['metadatas'][0][0]
    ids = []
    documents = []
    distances = []
    metadatas = []

    # Collect data based on keys
    for key, value in cache_result_dict.items():
        if 'ids' in key:
            ids.append(value)
        elif 'documents' in key:
            documents.append(value)
        elif 'distances' in key:
            distances.append(value)
        elif 'metadatas' in key:
            metadatas.append(value)

    print("Found in cache!")

    # Convert the cache data to a DataFrame
    results_df_1 = pd.DataFrame({
        'IDs_1': ids,
        'Documents_1': documents,
        'Distances_1': distances,
        'Metadatas_1': metadatas
    })

# Display the DataFrame with results
#print(results_df_1)


{'ids': [[]], 'distances': [[]], 'metadatas': [[]], 'embeddings': None, 'documents': [[]], 'uris': None, 'data': None, 'included': ['metadatas', 'documents', 'distances']}
Not found in cache. Found in main collection.


In [27]:
results_df_1.head()

Unnamed: 0,Metadatas_1,Documents_1,Distances_1,IDs
0,"{'Page_No.': 'Page 16', 'Section': 'PART II - ...","coverage, benefits, and participation privileg...",0.377665,20
1,"{'Page_No.': 'Page 55', 'Section': 'Exposure'}","""Automobile"" means a four-wheel passenger vehi...",0.383765,86
2,"{'Page_No.': 'Page 36', 'Section': 'A Member's...",state or federal law. Article 5 - Coverage Whi...,0.387754,52
3,"{'Page_No.': 'Page 11', 'Section': '(2) has be...",dependent on the Member for principal support....,0.388576,11
4,"{'Page_No.': 'Page 17', 'Section': 'a. be acti...",a. be actively engaged in business for profit ...,0.389656,21


In [28]:
query_2=input()

 what documentation is required when filing a claim?


In [29]:
cache_results= cache_collection.query(
         query_texts=query_2,
          n_results=1
     )
cache_results

{'ids': [['what types of coverage does this policy include?']],
 'distances': [[0.39654462972718474]],
 'metadatas': [[{'distances_0': 0.3776649363243732,
    'distances_1': 0.38376540582557356,
    'distances_2': 0.38775409643713393,
    'distances_3': 0.388576212809206,
    'distances_4': 0.38965634298891,
    'distances_5': 0.39450742548817425,
    'distances_6': 0.3948292433981987,
    'distances_7': 0.39487230539362395,
    'distances_8': 0.39498381181835585,
    'distances_9': 0.3997019708508814,
    'documents_0': "coverage, benefits, and participation privileges, may be made without the consent of any Member or Dependent. Payment of premium beyond the effective date of the change constitutes the Policyholder's consent to the change. Article 3 - Policyholder Eligibility Requirements To be an eligible group and to remain an eligible group, the Policyholder must: This policy has been updated effective January 1, 2014 PART II - POLICY ADMINISTRATION GC 6003 Section A - Contract, Pa

In [30]:
threshold = 0.2

results_df_2 = pd.DataFrame()

# Query the cache collection to check if the results are already stored
cache_results = cache_collection.query(
    query_texts=query_2,
    n_results=1
)

# Print the results from the cache query for debugging
print(cache_results)

# Check if the cache is empty or if the distance exceeds the threshold
if not cache_results['distances'][0] or cache_results['distances'][0][0] > threshold:
    # Query the main collection for the top 10 results
    results = insurance_collection.query(
        query_texts=query_2,
        n_results=10
    )

    # Prepare keys and values for storing in cache
    cache_data = {}
    for key, val in results.items():
        if val is None:
            continue
        # Adjust the loop to match the actual number of items in val
        for i in range(min(len(val[0]), 10)):  # Ensure you only loop over existing items
            cache_data[f"{key}_{i}"] = val[0][i]

    # Flatten the metadata for storage in ChromaDB
    flat_cache_data = {}
    for k, v in cache_data.items():
        if isinstance(v, dict):
            # Convert the dictionary to a JSON string
            flat_cache_data[k] = json.dumps(v)
        else:
            flat_cache_data[k] = v

    # Store the query in cache
    cache_collection.add(
        documents=[query_2],
        ids=[query_2],  # Alternatively, you can use a unique ID
        metadatas=flat_cache_data
    )

    print("Not found in cache. Found in main collection.")

    # Convert the results to a DataFrame
    result_dict = {
        'Metadatas_2': results['metadatas'][0],
        'Documents_2': results['documents'][0],
        'Distances_2': results['distances'][0],
        'IDs': results['ids'][0]
    }
    results_df_2 = pd.DataFrame.from_dict(result_dict)

# If the distance is within the threshold, retrieve results from the cache
elif cache_results['distances'][0][0] <= threshold:
    # Extract data from the cache
    cache_result_dict = cache_results['metadatas'][0][0]
    ids = []
    documents = []
    distances = []
    metadatas = []

    # Collect data based on keys
    for key, value in cache_result_dict.items():
        if 'ids' in key:
            ids.append(value)
        elif 'documents' in key:
            documents.append(value)
        elif 'distances' in key:
            distances.append(value)
        elif 'metadatas' in key:
            metadatas.append(value)

    print("Found in cache!")

    # Convert the cache data to a DataFrame
    results_df_2 = pd.DataFrame({
        'IDs_2': ids,
        'Documents_2': documents,
        'Distances_2': distances,
        'Metadatas_2': metadatas
    })

{'ids': [['what types of coverage does this policy include?']], 'distances': [[0.39654462972718474]], 'metadatas': [[{'distances_0': 0.3776649363243732, 'distances_1': 0.38376540582557356, 'distances_2': 0.38775409643713393, 'distances_3': 0.388576212809206, 'distances_4': 0.38965634298891, 'distances_5': 0.39450742548817425, 'distances_6': 0.3948292433981987, 'distances_7': 0.39487230539362395, 'distances_8': 0.39498381181835585, 'distances_9': 0.3997019708508814, 'documents_0': "coverage, benefits, and participation privileges, may be made without the consent of any Member or Dependent. Payment of premium beyond the effective date of the change constitutes the Policyholder's consent to the change. Article 3 - Policyholder Eligibility Requirements To be an eligible group and to remain an eligible group, the Policyholder must: This policy has been updated effective January 1, 2014 PART II - POLICY ADMINISTRATION GC 6003 Section A - Contract, Page 1", 'documents_1': '"Automobile" means 

In [31]:
results_df_2

Unnamed: 0,Metadatas_2,Documents_2,Distances_2,IDs
0,"{'Page_No.': 'Page 61', 'Section': 'Section D ...",Section D - Claim Procedures Article 1 - Notic...,0.33596,95
1,"{'Page_No.': 'Page 62', 'Section': 'A claimant...",of loss has been filed and before the appeal p...,0.354624,98
2,"{'Page_No.': 'Page 62', 'Section': 'A claimant...",A claimant may request an appeal of a claim de...,0.362261,97
3,"{'Page_No.': 'Page 61', 'Section': 'Section D ...",will be considered to be met when the appropri...,0.366767,96
4,"{'Page_No.': 'Page 29', 'Section': 'Insurance ...",by The Principal. A Member must submit Proof o...,0.386173,40
5,"{'Page_No.': 'Page 18', 'Section': 'c . a copy...",c . a copy of the form which contains the stat...,0.390515,23
6,"{'Page_No.': 'Page 54', 'Section': 'f . claim ...","f . claim requirements listed in PART IV, Sect...",0.401005,83
7,"{'Page_No.': 'Page 28', 'Section': 'Section B ...",to an individual policy; or (2) were eligible ...,0.409296,38
8,"{'Page_No.': 'Page 17', 'Section': 'a. be acti...",Written form Signed by the insured person; and...,0.414463,22
9,"{'Page_No.': 'Page 31', 'Section': 'Scheduled ...",Benefit amounts due to a request by the Member...,0.414665,44


In [32]:
query_3 = input()

 what happens if i miss a payment?


In [33]:
cache_results= cache_collection.query(
         query_texts=query_3,
          n_results=1
     )
cache_results

{'ids': [['what documentation is required when filing a claim?']],
 'distances': [[0.4857106171859851]],
 'metadatas': [[{'distances_0': 0.3359597039883748,
    'distances_1': 0.35462433451933106,
    'distances_2': 0.3622610727387903,
    'distances_3': 0.3667674024181407,
    'distances_4': 0.3861733920826373,
    'distances_5': 0.39051496084501053,
    'distances_6': 0.4010045736094843,
    'distances_7': 0.4092963392539709,
    'distances_8': 0.4144634866813787,
    'distances_9': 0.4146653561842776,
    'documents_0': 'Section D - Claim Procedures Article 1 - Notice of Claim Written notice must be sent to The Principal by or for a Member or Dependent who wishes to file claim for benefits under this Group Policy. This notice must be sent within 20 days after the date of the loss for which claim is being made. Failure to give notice within the time specified will not invalidate or reduce any claim if notice is given as soon as reasonably possible. Article 2 - Claim Forms The Princip

In [34]:
threshold = 0.2

results_df_3 = pd.DataFrame()

# Query the cache collection to check if the results are already stored
cache_results = cache_collection.query(
    query_texts=query_3,
    n_results=1
)

# Print the results from the cache query for debugging
#print(cache_results)

# Check if the cache is empty or if the distance exceeds the threshold
if not cache_results['distances'][0] or cache_results['distances'][0][0] > threshold:
    # Query the main collection for the top 10 results
    results = insurance_collection.query(
        query_texts=query_3,
        n_results=10
    )

    # Prepare keys and values for storing in cache
    cache_data = {}
    for key, val in results.items():
        if val is None:
            continue
        # Adjust the loop to match the actual number of items in val
        for i in range(min(len(val[0]), 10)):  # Ensure you only loop over existing items
            cache_data[f"{key}_{i}"] = val[0][i]

    # Flatten the metadata for storage in ChromaDB
    flat_cache_data = {}
    for k, v in cache_data.items():
        if isinstance(v, dict):
            # Convert the dictionary to a JSON string
            flat_cache_data[k] = json.dumps(v)
        else:
            flat_cache_data[k] = v

    # Store the query in cache
    cache_collection.add(
        documents=[query_3],
        ids=[query_3],  # Alternatively, you can use a unique ID
        metadatas=flat_cache_data
    )

    print("Not found in cache. Found in main collection.")

    # Convert the results to a DataFrame
    result_dict = {
        'Metadatas_3': results['metadatas'][0],
        'Documents_3': results['documents'][0],
        'Distances_3': results['distances'][0],
        'IDs': results['ids'][0]
    }
    results_df_3 = pd.DataFrame.from_dict(result_dict)

# If the distance is within the threshold, retrieve results from the cache
elif cache_results['distances'][0][0] <= threshold:
    # Extract data from the cache
    cache_result_dict = cache_results['metadatas'][0][0]
    ids = []
    documents = []
    distances = []
    metadatas = []

    # Collect data based on keys
    for key, value in cache_result_dict.items():
        if 'ids' in key:
            ids.append(value)
        elif 'documents' in key:
            documents.append(value)
        elif 'distances' in key:
            distances.append(value)
        elif 'metadatas' in key:
            metadatas.append(value)

    print("Found in cache!")

    # Convert the cache data to a DataFrame
    results_df_3 = pd.DataFrame({
        'IDs_3': ids,
        'Documents_3': documents,
        'Distances_3': distances,
        'Metadatas_3': metadatas
    })

Not found in cache. Found in main collection.


In [35]:
results_df_3

Unnamed: 0,Metadatas_3,Documents_3,Distances_3,IDs
0,"{'Page_No.': 'Page 23', 'Section': 'Section C ...",or d. fails to pay premium in accordance with ...,0.425369,32
1,"{'Page_No.': 'Page 54', 'Section': 'f . claim ...","Settlement of Proceeds provisions of PART IV, ...",0.451775,84
2,"{'Page_No.': 'Page 23', 'Section': 'Section C ...",Section C - Policy Termination Article 1 - Fai...,0.453811,31
3,"{'Page_No.': 'Page 20', 'Section': 'Section B ...",Section B - Premiums Article 1 - Payment Respo...,0.46758,26
4,"{'Page_No.': 'Page 54', 'Section': 'f . claim ...","f . claim requirements listed in PART IV, Sect...",0.472797,83
5,"{'Page_No.': 'Page 24', 'Section': 'T he Princ...",T he Principal may terminate the Policyholder'...,0.476565,33
6,"{'Page_No.': 'Page 47', 'Section': 'M ember's ...",insurance and recorded by the Policyholder or ...,0.477835,71
7,"{'Page_No.': 'Page 60', 'Section': 'I f a Depe...","before a change request is received, that paym...",0.479992,94
8,"{'Page_No.': 'Page 20', 'Section': 'Section B ...",Premium Rate Changes The Principal may change ...,0.483176,27
9,"{'Page_No.': 'Page 48', 'Section': 'c . If a b...",c . If a beneficiary dies at the same time or ...,0.483333,72


In [36]:
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L-6-v2')

config.json:   0%|          | 0.00/794 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]



In [37]:
results_df_3

Unnamed: 0,Metadatas_3,Documents_3,Distances_3,IDs
0,"{'Page_No.': 'Page 23', 'Section': 'Section C ...",or d. fails to pay premium in accordance with ...,0.425369,32
1,"{'Page_No.': 'Page 54', 'Section': 'f . claim ...","Settlement of Proceeds provisions of PART IV, ...",0.451775,84
2,"{'Page_No.': 'Page 23', 'Section': 'Section C ...",Section C - Policy Termination Article 1 - Fai...,0.453811,31
3,"{'Page_No.': 'Page 20', 'Section': 'Section B ...",Section B - Premiums Article 1 - Payment Respo...,0.46758,26
4,"{'Page_No.': 'Page 54', 'Section': 'f . claim ...","f . claim requirements listed in PART IV, Sect...",0.472797,83
5,"{'Page_No.': 'Page 24', 'Section': 'T he Princ...",T he Principal may terminate the Policyholder'...,0.476565,33
6,"{'Page_No.': 'Page 47', 'Section': 'M ember's ...",insurance and recorded by the Policyholder or ...,0.477835,71
7,"{'Page_No.': 'Page 60', 'Section': 'I f a Depe...","before a change request is received, that paym...",0.479992,94
8,"{'Page_No.': 'Page 20', 'Section': 'Section B ...",Premium Rate Changes The Principal may change ...,0.483176,27
9,"{'Page_No.': 'Page 48', 'Section': 'c . If a b...",c . If a beneficiary dies at the same time or ...,0.483333,72


In [38]:
cross_inputs_1 = [[query_1, response] for response in results_df_1['Documents_1']]
cross_rerank_scores_1 = cross_encoder.predict(cross_inputs_1)
cross_rerank_scores_1

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

array([-2.1723294, -5.0158787, -1.9306265, -2.5475593, -8.101703 ,
       -5.111147 , -9.388777 , -1.8970793, -4.3696127, -3.4865303],
      dtype=float32)

In [39]:
results_df_1['Reranked_scores'] = cross_rerank_scores_1
results_df_1

Unnamed: 0,Metadatas_1,Documents_1,Distances_1,IDs,Reranked_scores
0,"{'Page_No.': 'Page 16', 'Section': 'PART II - ...","coverage, benefits, and participation privileg...",0.377665,20,-2.172329
1,"{'Page_No.': 'Page 55', 'Section': 'Exposure'}","""Automobile"" means a four-wheel passenger vehi...",0.383765,86,-5.015879
2,"{'Page_No.': 'Page 36', 'Section': 'A Member's...",state or federal law. Article 5 - Coverage Whi...,0.387754,52,-1.930627
3,"{'Page_No.': 'Page 11', 'Section': '(2) has be...",dependent on the Member for principal support....,0.388576,11,-2.547559
4,"{'Page_No.': 'Page 17', 'Section': 'a. be acti...",a. be actively engaged in business for profit ...,0.389656,21,-8.101703
5,"{'Page_No.': 'Page 51', 'Section': 'Coverage D...",This policy has been updated effective January...,0.394507,79,-5.111147
6,"{'Page_No.': 'Page 6', 'Section': 'TABLE OF CO...",TABLE OF CONTENTS PART I - DEFINITIONS PART II...,0.394829,3,-9.388777
7,"{'Page_No.': 'Page 13', 'Section': 'a . A lice...",a . A licensed Doctor of Medicine (M.D.) or Os...,0.394872,14,-1.897079
8,"{'Page_No.': 'Page 55', 'Section': 'Exposure'}",Exposure Exposure to the elements will be pres...,0.394984,85,-4.369613
9,"{'Page_No.': 'Page 28', 'Section': 'Section B ...",to an individual policy; or (2) were eligible ...,0.399702,38,-3.48653


In [40]:
top_3_semantic_1 = results_df_1.sort_values(by='Distances_1')
top_3_semantic_1[:3]

Unnamed: 0,Metadatas_1,Documents_1,Distances_1,IDs,Reranked_scores
0,"{'Page_No.': 'Page 16', 'Section': 'PART II - ...","coverage, benefits, and participation privileg...",0.377665,20,-2.172329
1,"{'Page_No.': 'Page 55', 'Section': 'Exposure'}","""Automobile"" means a four-wheel passenger vehi...",0.383765,86,-5.015879
2,"{'Page_No.': 'Page 36', 'Section': 'A Member's...",state or federal law. Article 5 - Coverage Whi...,0.387754,52,-1.930627


In [41]:
top_3_rerank_1 = results_df_1.sort_values(by='Reranked_scores', ascending=False)
top_3_rerank_1[:3]

Unnamed: 0,Metadatas_1,Documents_1,Distances_1,IDs,Reranked_scores
7,"{'Page_No.': 'Page 13', 'Section': 'a . A lice...",a . A licensed Doctor of Medicine (M.D.) or Os...,0.394872,14,-1.897079
2,"{'Page_No.': 'Page 36', 'Section': 'A Member's...",state or federal law. Article 5 - Coverage Whi...,0.387754,52,-1.930627
0,"{'Page_No.': 'Page 16', 'Section': 'PART II - ...","coverage, benefits, and participation privileg...",0.377665,20,-2.172329


In [42]:
cross_inputs_2 = [[query_2, response] for response in results_df_2['Documents_2']]
cross_rerank_scores_2 = cross_encoder.predict(cross_inputs_2)
results_df_2['Reranked_scores'] = cross_rerank_scores_2

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [43]:
top_3_semantic_2 = results_df_2.sort_values(by='Distances_2')
top_3_semantic_2[:3]

Unnamed: 0,Metadatas_2,Documents_2,Distances_2,IDs,Reranked_scores
0,"{'Page_No.': 'Page 61', 'Section': 'Section D ...",Section D - Claim Procedures Article 1 - Notic...,0.33596,95,-0.474061
1,"{'Page_No.': 'Page 62', 'Section': 'A claimant...",of loss has been filed and before the appeal p...,0.354624,98,-2.255106
2,"{'Page_No.': 'Page 62', 'Section': 'A claimant...",A claimant may request an appeal of a claim de...,0.362261,97,-0.586972


In [44]:
top_3_rerank_2 = results_df_2.sort_values(by='Reranked_scores', ascending=False)
top_3_rerank_2[:3]

Unnamed: 0,Metadatas_2,Documents_2,Distances_2,IDs,Reranked_scores
0,"{'Page_No.': 'Page 61', 'Section': 'Section D ...",Section D - Claim Procedures Article 1 - Notic...,0.33596,95,-0.474061
2,"{'Page_No.': 'Page 62', 'Section': 'A claimant...",A claimant may request an appeal of a claim de...,0.362261,97,-0.586972
1,"{'Page_No.': 'Page 62', 'Section': 'A claimant...",of loss has been filed and before the appeal p...,0.354624,98,-2.255106


In [45]:
cross_inputs_3 = [[query_3, response] for response in results_df_3['Documents_3']]
cross_rerank_scores_3 = cross_encoder.predict(cross_inputs_3)

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

In [46]:
results_df_3['Reranked_scores'] = cross_rerank_scores_3
top_3_semantic_3 = results_df_3.sort_values(by='Distances_3')
top_3_semantic_3[:3]

Unnamed: 0,Metadatas_3,Documents_3,Distances_3,IDs,Reranked_scores
0,"{'Page_No.': 'Page 23', 'Section': 'Section C ...",or d. fails to pay premium in accordance with ...,0.425369,32,-9.93795
1,"{'Page_No.': 'Page 54', 'Section': 'f . claim ...","Settlement of Proceeds provisions of PART IV, ...",0.451775,84,-5.530196
2,"{'Page_No.': 'Page 23', 'Section': 'Section C ...",Section C - Policy Termination Article 1 - Fai...,0.453811,31,-7.300391


In [47]:
top_3_rerank_3 = results_df_3.sort_values(by='Reranked_scores', ascending=False)
top_3_rerank_3[:3]

Unnamed: 0,Metadatas_3,Documents_3,Distances_3,IDs,Reranked_scores
9,"{'Page_No.': 'Page 48', 'Section': 'c . If a b...",c . If a beneficiary dies at the same time or ...,0.483333,72,-5.348612
1,"{'Page_No.': 'Page 54', 'Section': 'f . claim ...","Settlement of Proceeds provisions of PART IV, ...",0.451775,84,-5.530196
6,"{'Page_No.': 'Page 47', 'Section': 'M ember's ...",insurance and recorded by the Policyholder or ...,0.477835,71,-6.560679


In [48]:
top_3_RAG_1 = top_3_rerank_1[["Documents_1", "Metadatas_1"]][:3]
top_3_RAG_2 = top_3_rerank_2[["Documents_2", "Metadatas_2"]][:3]
top_3_RAG_3 = top_3_rerank_3[["Documents_3", "Metadatas_3"]][:3]

In [49]:
# Define the function to generate the response. Provide a comprehensive prompt that passes the user query and the top 3 results to the model

def generate_response(query, top_3_RAG):
    """
    Generate a response using GPT-3.5's ChatCompletion based on the user query and retrieved information.
    """
    messages = [
        {"role": "system", "content": "You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents."},
        {"role": "user", "content": f"""
            You are a helpful assistant in the insurance domain who can effectively answer user queries about insurance policies and documents.
            You have a question asked by the user in '{query}' and you have some search results from a corpus of insurance documents in the dataframe '{top_3_RAG}'. These search results are essentially one page of an insurance document that may be relevant to the user query.

            The column 'documents' inside this dataframe contains the actual text from the policy document and the column 'metadata' contains the policy name and source page. The text inside the document may also contain tables in the format of a list of lists where each of the nested lists indicates a row.

            Use the documents in '{top_3_RAG}' to answer the query '{query}'. Frame an informative answer and also, use the dataframe to return the relevant policy names and page numbers as citations.

            Follow the guidelines below when performing the task:
            1. Try to provide relevant/accurate numbers if available.
            2. You don’t have to necessarily use all the information in the dataframe. Only choose information that is relevant.
            3. If the document text has tables with relevant information, please reformat the table and return the final information in a tabular format.
            4. Use the 'metadata' columns in the dataframe to retrieve and cite the policy name(s) and page number(s) as citation.
            5. If you can't provide the complete answer, please also provide any information that will help the user to search specific sections in the relevant cited documents.
            6. You are a customer-facing assistant, so do not provide any information on internal workings, just answer the query directly.

            The generated response should answer the query directly addressing the user and avoiding additional information. If you think that the query is not relevant to the document, reply that the query is irrelevant. Provide the final response as a well-formatted and easily readable text along with the citation. Provide your complete response first with all information, and then provide the citations.

            ### Few-Shot Examples

            ### Example 1: Basic Query about Coverage
            **Query:**  
            What does the policy say about coverage for accidental death?

            **Top 3 RAG Results:**  
            - **Document 1:** "This policy provides coverage for accidental death. The insured amount for accidental death is 200% of the base coverage amount if the death occurs within 90 days of the accident..."
            - **Document 2:** "Accidental death benefits are payable under this policy if the insured dies as a result of an accident. The benefit amount equals double the coverage amount, provided the death is a direct result of the accident and occurs within a specified time frame..."
            - **Document 3:** "In the event of accidental death, the policy pays an additional benefit, which is equal to twice the original coverage amount. This benefit is contingent on the death occurring within 180 days from the date of the accident..."

            **Response:**  
            The policy provides coverage for accidental death, where the benefit amount is typically 200% of the base coverage. The death must occur as a direct result of an accident and within a specified period, which varies between 90 to 180 days depending on the policy.  
            **Citations:**  
            Document 1: Policy X, Page 5  
            Document 2: Policy Y, Page 12  
            Document 3: Policy Z, Page 7

            ### Example 2: Query about Exclusions
            **Query:**  
            Are there any exclusions for pre-existing conditions in this policy?

            **Top 3 RAG Results:**  
            - **Document 1:** "This policy excludes coverage for any conditions that were diagnosed or treated within 12 months prior to the policy's start date. However, if the condition remains stable for 24 months after the policy's start date, it may be eligible for coverage..."
            - **Document 2:** "Pre-existing conditions are generally not covered under this policy unless explicitly stated otherwise. Any condition that has shown symptoms or required medical attention in the 12 months before the policy start date is excluded..."
            - **Document 3:** "Exclusions apply to pre-existing conditions if they were present within a 12-month window before the policy commencement. After a waiting period of 24 months, these conditions may be reconsidered for coverage..."

            **Response:**  
            The policy excludes coverage for pre-existing conditions that were diagnosed or treated within 12 months before the policy start date. However, if the condition remains stable and does not require treatment for 24 months after the policy start date, it may be eligible for coverage.  
            **Citations:**  
            Document 1: Policy X, Page 8  
            Document 2: Policy Y, Page 15  
            Document 3: Policy Z, Page 10

            ### Example 3: Query about Beneficiaries
            **Query:**  
            How can I update the beneficiary for my life insurance policy?

            **Top 3 RAG Results:**  
            - **Document 1:** "To update the beneficiary, the policyholder must submit a written request to the insurance company. The request should include the policy number, current beneficiary, and the new beneficiary's details..."
            - **Document 2:** "Beneficiary changes can be made by filling out the 'Beneficiary Change Form,' which must be signed by the policyholder and submitted to the insurance provider. The change takes effect upon the company's receipt and acceptance of the form..."
            - **Document 3:** "The policyholder has the right to change the beneficiary at any time by providing written notice to the insurer. The change will be recorded and acknowledged by the insurer upon receipt..."

            **Response:**  
            To update the beneficiary of your life insurance policy, you need to submit a written request or complete a 'Beneficiary Change Form' provided by your insurer. This request must be signed by you as the policyholder, and the change will take effect once the insurance company has received and acknowledged it.  
            **Citations:**  
            Document 1: Policy X, Page 3  
            Document 2: Policy Y, Page 6  
            Document 3: Policy Z, Page 4

            ### Example 4: Complex Query about Premium Payments
            **Query:**  
            What are the options if I miss a premium payment?

            **Top 3 RAG Results:**  
            - **Document 1:** "If a premium payment is missed, the policyholder typically has a 30-day grace period to make the payment without penalty. If payment is not received within this period, the policy may lapse, and coverage could be lost..."
            - **Document 2:** "The policy includes a 30-day grace period for missed payments. During this time, coverage remains in force. If payment is not made by the end of the grace period, the policyholder may request reinstatement of the policy, subject to underwriting..."
            - **Document 3:** "In the event of a missed premium, a 30-day grace period is granted. If the premium is not paid within this period, the policyholder may choose to reinstate the policy, which may require proof of insurability and payment of overdue premiums..."

            **Response:**  
            If you miss a premium payment, your policy provides a 30-day grace period during which you can make the payment without losing coverage. If the payment is not made within this period, the policy may lapse. However, you may have the option to reinstate the policy by providing proof of insurability and paying the overdue premiums.  
            **Citations:**  
            Document 1: Policy X, Page 10  
            Document 2: Policy Y, Page 11  
            Document 3: Policy Z, Page 9
        """},
    ]

    response = openai.chat.completions.create(
        model="gpt-4o-mini",
        messages=messages
    )

    return response.choices[0].message.content.split('\n')

In [50]:
# Generate the response - For Query 1

response = generate_response(query_1, top_3_RAG_1)
print("Query 1: ","\n",query_1,"\n_________________________________________________________________________________________________________________\n_________________________________________________________________________________________________________________\n")
# Print the response
print("\n".join(response))

Query 1:  
 what types of coverage does this policy include? 
_________________________________________________________________________________________________________________
_________________________________________________________________________________________________________________

The policy includes various types of coverage, although the specific details are spread across different sections of the documents. Here’s a summary of coverage types mentioned:

1. **Medical Services Coverage**: This generally includes coverage for a licensed Doctor of Medicine (M.D.) or Osteopathic Medicine (D.O.) for medically necessary services as detailed.
   
2. **Coverage Whitelist**: The policy outlines certain medical services and procedures that are specifically included or excluded from coverage under different sections of the policy.

3. **Participation Privileges**: These details refer to the coverage benefits and conditions under which members can participate in the plan, although speci

In [53]:
# Generate the response - For Query 2

response = generate_response(query_2, top_3_RAG_2)
print("Query 2: ","\n",query_2,"\n_________________________________________________________________________________________________________________\n_________________________________________________________________________________________________________________\n")
# Print the response
print("\n".join(response))

Query 2:  
 what documentation is required when filing a claim? 
_________________________________________________________________________________________________________________
_________________________________________________________________________________________________________________

To file a claim, you typically need to provide specific documentation that verifies the incident or loss you are claiming. Necessary documents often include:

1. **Claim Form:** A completed claim form specific to the type of insurance.
2. **Proof of Loss:** Documentation showing the date and nature of the incident leading to the claim.
3. **Supporting Evidence:** Any evidence supporting the claim, such as photographs, police reports, or receipts related to the loss.
4. **Identification:** A valid identification to verify your identity as the policyholder.
5. **Invoices or Estimates:** Relevant invoices or repair estimates if applicable, especially for property damage claims.

Additionally, if you 

In [57]:
top_3_RAG_3['Metadatas_3']

9    {'Page_No.': 'Page 48', 'Section': 'c . If a b...
1    {'Page_No.': 'Page 54', 'Section': 'f . claim ...
6    {'Page_No.': 'Page 47', 'Section': 'M ember's ...
Name: Metadatas_3, dtype: object

In [58]:
# Generate the response - For Query 3

response = generate_response(query_3, top_3_RAG_3)
print("Query 3: ","\n",query_3,"\n_________________________________________________________________________________________________________________\n_________________________________________________________________________________________________________________\n")
# Print the response
print("\n".join(response))

Query 3:  
 what happens if i miss a payment? 
_________________________________________________________________________________________________________________
_________________________________________________________________________________________________________________

If you miss a payment, typically, you will have a 30-day grace period to make the payment without facing any penalties. During this grace period, your insurance coverage usually remains in effect. If the payment is not made by the end of the grace period, your policy may lapse, and you will lose coverage. However, many policies also allow you the option to reinstate the policy later, although this may require proof of insurability and payment of any overdue premiums.

**Citations:**  
Document 1: Policy X, Page 10  
Document 2: Policy Y, Page 11  
Document 3: Policy Z, Page 9
