Part 1: Document Collection

Step 1: Setup Environment

In [None]:
!pip install langchain chromadb openai pypdf PyPDF2 pandas sentence-transformers langchain-community langchain-huggingface

Collecting langchain-huggingface
  Downloading langchain_huggingface-0.3.1-py3-none-any.whl.metadata (996 bytes)
Downloading langchain_huggingface-0.3.1-py3-none-any.whl (27 kB)
Installing collected packages: langchain-huggingface
Successfully installed langchain-huggingface-0.3.1


Step 2: Document Upload

In [None]:
# Mount Google Drive at /content/drive (a later cell writes a pickle file under it).
from google.colab import drive
drive.mount('/content/drive')
# Upload the policy PDFs through Colab's upload helper; the return value is kept in
# `uploaded`.
from google.colab import files
uploaded = files.upload()

Mounted at /content/drive


Saving gov1.pdf to gov1.pdf
Saving gov2.pdf to gov2.pdf
Saving Health1.pdf to Health1.pdf
Saving Health2.pdf to Health2.pdf
Saving Life1.pdf to Life1.pdf
Saving Life2.pdf to Life2.pdf
Saving Motor1.pdf to Motor1.pdf
Saving Motor2.pdf to Motor2.pdf


Step 3: Document Processing Function

In [None]:
from langchain_community.document_loaders import PyPDFLoader

def process_documents(file_paths, document_type):
    """
    Load PDF files with PyPDFLoader and tag every loaded document with metadata.

    Args:
        file_paths: Iterable of paths to PDF files. A path whose extension is
            not ``.pdf`` (compared case-insensitively) is skipped with a warning
            instead of being dropped silently.
        document_type: Category label stored on each document, e.g.
            'health', 'motor', 'life', 'government'.

    Returns:
        A flat list with the documents returned by ``PyPDFLoader(path).load()``
        for every accepted path, each having ``metadata['document_type']`` and
        ``metadata['source_file']`` set.
    """
    import warnings

    documents = []

    for file_path in file_paths:
        # Compare in lower case so 'Policy.PDF' is loaded as well.
        if not str(file_path).lower().endswith('.pdf'):
            warnings.warn(f"Skipping non-PDF file: {file_path}")
            continue

        loaded = PyPDFLoader(file_path).load()

        # Tag each document so searches can later be filtered by policy type.
        for doc in loaded:
            doc.metadata['document_type'] = document_type
            doc.metadata['source_file'] = file_path

        documents.extend(loaded)

    return documents


# Load each policy family separately so every document carries the matching
# 'document_type' tag, which the search step can use as a filter.
health_docs = process_documents(['Health1.pdf', 'Health2.pdf'], 'health')
motor_docs = process_documents(['Motor1.pdf', 'Motor2.pdf'], 'motor')
life_docs = process_documents(['Life1.pdf', 'Life2.pdf'], 'life')
government_docs = process_documents(['gov1.pdf','gov2.pdf'], 'government')




--- Health Documents ---
Total documents loaded: 67
Metadata of first document:
{'producer': 'Corel PDF Engine Version 25.0.0.230', 'creator': 'CorelDRAW 2024', 'creationdate': '2024-09-18T16:23:13+05:30', 'moddate': '2024-09-18T16:23:13+05:30', 'author': 'Narayanan', 'title': 'Policy - Star Comprehensive Insurance Policy - V.20.cdr', 'source': 'Health1.pdf', 'total_pages': 14, 'page': 0, 'page_label': '1', 'document_type': 'health', 'source_file': 'Health1.pdf'}
-------------------------

--- Motor Documents ---
Total documents loaded: 33
Metadata of first document:
{'producer': 'Corel PDF Engine Version 16.0.0.707', 'creator': 'CorelDRAW X6', 'creationdate': '2018-04-07T13:58:32+05:30', 'moddate': '2024-06-13T12:18:25+05:30', 'title': '', 'source': 'Motor1.pdf', 'total_pages': 24, 'page': 0, 'page_label': '1', 'document_type': 'motor', 'source_file': 'Motor1.pdf'}
-------------------------

--- Life Documents ---
Total documents loaded: 71
Metadata of first document:
{'producer': '

Step 4: Text Chunking (Important step!)

In [None]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Splitter configuration: chunks of at most 1000 units with a 200-unit overlap
# (presumably characters, the splitter's default length measure — confirm).
# The separators are listed from coarsest (blank line) to finest (empty string).
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000,
    chunk_overlap=200,
    separators=["\n\n", "\n", ". ", " ", ""]
)

# Combine every policy family into a single list before splitting.
all_documents = health_docs + motor_docs + life_docs + government_docs

splits = text_splitter.split_documents(all_documents)

print(f"Total chunks created: {len(splits)}")

Total chunks created: 1039


Step 5: Vector Database Setup

In [None]:
import os

from langchain_huggingface import HuggingFaceEmbeddings
# Same import path the reload cell uses; `langchain.vectorstores` is the legacy alias.
from langchain_community.vectorstores import Chroma

POLICY_VECTORDB_DIR = "/content/policy_vectordb"

embeddings = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)

# `Chroma.from_documents` adds to whatever already lives in `persist_directory`,
# so re-running this cell would index every chunk a second time and searches
# would return duplicated hits. Reuse the existing store when there is one.
# Delete POLICY_VECTORDB_DIR to force a rebuild after the source PDFs change.
if os.path.isdir(POLICY_VECTORDB_DIR) and os.listdir(POLICY_VECTORDB_DIR):
    vectorstore = Chroma(
        persist_directory=POLICY_VECTORDB_DIR,
        embedding_function=embeddings
    )
else:
    vectorstore = Chroma.from_documents(
        documents=splits,
        embedding=embeddings,
        persist_directory=POLICY_VECTORDB_DIR
    )

Step 6: Query Processing Function

In [None]:
def extract_query_info(query):
    """
    Extract age, gender, location and procedure from a free-text claim query.

    Args:
        query: Free-text description of the claim, e.g.
            "46M, knee surgery, Pune, 3-month policy".

    Returns:
        dict holding only the fields that were found:
            'age' (int), 'gender' ('male' or 'female'),
            'location' (lower-case city from a fixed list),
            'procedure' (lower-case phrase from a fixed list).
    """
    import re

    info = {}
    text = query.lower()

    # Age: a 1-3 digit number followed by an age unit or a gender marker.
    # The trailing \b is essential: without it the "m" of "3-month policy"
    # is read as a gender marker and the policy age becomes the person's age.
    age_match = re.search(
        r'\b(\d{1,3})[-\s]*(?:years?|yrs?|yo|y|male|female|m|f)\b',
        text
    )
    if age_match:
        info['age'] = int(age_match.group(1))

    # Gender: either a standalone word, or a marker glued to the age ("46M",
    # "25F"). The look-behind rejects letters and apostrophes so the "m" in
    # "I'm" is not mistaken for "male".
    male_pattern = r"(?<![\w'\u2019])(?:male|man|m)\b|\d[-\s]*(?:male|m)\b"
    female_pattern = r"(?<![\w'\u2019])(?:female|woman|f)\b|\d[-\s]*(?:female|f)\b"
    if re.search(male_pattern, text):
        info['gender'] = 'male'
    elif re.search(female_pattern, text):
        info['gender'] = 'female'

    # Location: first known city mentioned anywhere in the query.
    cities = ['pune', 'mumbai', 'delhi', 'bangalore', 'chennai']
    for city in cities:
        if city in text:
            info['location'] = city
            break

    # Procedure/condition: first known phrase mentioned anywhere in the query.
    procedures = ['knee surgery', 'heart surgery', 'accident', 'maternity']
    for proc in procedures:
        if proc in text:
            info['procedure'] = proc
            break

    return info

# Quick check of the extractor on a sample query.
test_query = "46-year-old male, knee surgery in Pune, 3-month-old insurance policy"
extracted_info = extract_query_info(test_query)
print(extracted_info)

{'age': 46, 'gender': 'male', 'location': 'pune', 'procedure': 'knee surgery'}


Step 7: Search Function

In [None]:
def search_relevant_docs(query, document_type=None, k=5):
    """
    Run a similarity search against the global ``vectorstore``.

    Args:
        query: Text to search for.
        document_type: Optional category; when truthy, the search is filtered
            to chunks whose ``document_type`` metadata equals this value.
        k: Number of results requested from the store.

    Returns:
        Whatever ``vectorstore.similarity_search`` returns for the request.
    """
    search_kwargs = {"k": k}

    # Only pass a metadata filter when a category was actually requested.
    if document_type:
        search_kwargs["filter"] = {"document_type": document_type}

    return vectorstore.similarity_search(query, **search_kwargs)
# Demo: search only the health policies and preview each hit.
query = "knee surgery coverage"
results = search_relevant_docs(query, document_type="health")
for i, doc in enumerate(results):
    print(f"\n--- Result {i+1} ---")
    print(f"Source: {doc.metadata.get('source_file', 'Unknown')}")
    print(f"Type: {doc.metadata.get('document_type', 'Unknown')}")
    # Show only the first 200 characters of the chunk to keep the output short.
    print(f"Content: {doc.page_content[:200]}...")


--- Result 1 ---
Source: Health1.pdf
Type: health
Content: b) To authorize doctors providing treatments or giving expert opinion and any other 
authority to supply the Company any information that may be required.  If the 
obligations are not met with due to ...

--- Result 2 ---
Source: Health1.pdf
Type: health
Content: b) To authorize doctors providing treatments or giving expert opinion and any other 
authority to supply the Company any information that may be required.  If the 
obligations are not met with due to ...

--- Result 3 ---
Source: Health1.pdf
Type: health
Content: Intra Vitreal 
injections
Robotic 
surgeries
Stereotactic
radio 
surgeries
Bronchical
Thermoplasty
Vaporisation 
of the 
prostate 
(Green laser 
treatment or 
holmium 
laser 
treatment)
IONM-(Intra 
O...

--- Result 4 ---
Source: Health1.pdf
Type: health
Content: Intra Vitreal 
injections
Robotic 
surgeries
Stereotactic
radio 
surgeries
Bronchical
Thermoplasty
Vaporisation 
of the 
prostate 
(Green laser 
tre

Step 8: Gemini Integration

In [None]:

!pip install google-generativeai

import google.generativeai as genai


genai.configure(api_key="AIzaSyCsjCgWLwuEXt2OIGRaFerZm8WhTEoiDYA")

model = genai.GenerativeModel('gemini-2.0-flash-exp')



Step 9: Decision Making Function

In [None]:
def make_decision(query, retrieved_docs, extracted_info):
    """
    Ask the global Gemini ``model`` for a claim decision based on retrieved chunks.

    Args:
        query: The user's original claim query.
        retrieved_docs: Documents exposing ``metadata`` (a dict) and
            ``page_content``; each one becomes a numbered context section.
        extracted_info: Structured details extracted from the query; embedded
            in the prompt as-is.

    Returns:
        The text of the model's response, or a string starting with
        "Error in decision making:" if the model call raises.
    """
    # One numbered section per retrieved document, joined into a single context.
    sections = []
    for position, doc in enumerate(retrieved_docs, start=1):
        source_name = doc.metadata.get('source_file', 'Unknown')
        sections.append(
            f"\n--- Document {position} ---\n"
            f"Source: {source_name}\n"
            f"Content: {doc.page_content}\n"
        )
    context = "".join(sections)

    prompt = f"""
    You are an insurance policy expert. Based on the following context and query, make a decision.

    QUERY: {query}
    EXTRACTED INFO: {extracted_info}

    CONTEXT FROM POLICY DOCUMENTS:
    {context}

    Please provide a decision in the following JSON format:
    {{
        "decision": "approved" or "rejected",
        "amount": "coverage amount if applicable, else null",
        "justification": "clear explanation with specific clause references",
        "confidence": "high/medium/low",
        "clauses_used": ["list of specific clauses referenced"]
    }}

    Be specific and reference exact clauses from the documents.
    """

    # Best effort: a failed model call is reported as text rather than raised.
    try:
        return model.generate_content(prompt).text
    except Exception as exc:
        return f"Error in decision making: {str(exc)}"


# Demo: retrieve the top 3 health-policy chunks and ask the model for a decision.
test_query = "46M, knee surgery, Pune, 3-month policy"
# The retrieval query is a hand-written phrase rather than test_query itself.
test_docs = search_relevant_docs("knee surgery coverage", document_type="health", k=3)
test_info = extract_query_info(test_query)

decision = make_decision(test_query, test_docs, test_info)
print("Decision:", decision)

Decision: ```json
{
        "decision": "approved",
        "amount": "Not specified, needs further information. The context indicates limits per treatment/procedure, but doesn't specify the coverage for knee surgery specifically.",
        "justification": "Based on the available information, the policy covers individuals between certain age ranges (Document 1 & 2, clause 6 implicitly suggests age is a factor), and the provided age (46) falls within a likely acceptable range. Document 3 outlines limits for various procedures. Although knee surgery is not directly listed, the sum insured is limited per person per policy period for each treatment/procedure.  The exact amount covered for knee surgery requires looking up specific procedure limits in the policy documents that were not provided in the context.",
        "confidence": "medium",
        "clauses_used": ["Health1.pdf, Document 1, Clause 6", "Health1.pdf, Document 3, mentions limits per person per policy period for each treatme

In [None]:
def process_insurance_query(user_query, policy_type=None):
    """
    Run extraction, retrieval and the model decision for one query, printing progress.

    Args:
        user_query: Free-text claim description.
        policy_type: Optional document category forwarded to the search step.

    Returns:
        dict with keys 'query', 'extracted_info', 'decision' (the value
        returned by make_decision) and 'documents_used' (number of retrieved
        documents).
    """
    print(f"Processing query: {user_query}")

    info = extract_query_info(user_query)
    print(f"Extracted info: {info}")

    docs = search_relevant_docs(user_query, document_type=policy_type, k=5)
    doc_count = len(docs)
    print(f"Found {doc_count} relevant documents")

    return dict(
        query=user_query,
        extracted_info=info,
        decision=make_decision(user_query, docs, info),
        documents_used=doc_count,
    )

# Demo: run the pipeline end to end, restricted to the health policies.
result = process_insurance_query(
    "46 year old male needs knee surgery in Pune, has 3 month old health policy",
    policy_type="health"
)

print("\n=== FINAL RESULT ===")
# 'decision' is still the model's raw text here (including any code fences).
print(result)

Processing query: 46 year old male needs knee surgery in Pune, has 3 month old health policy
Extracted info: {'age': 46, 'gender': 'male', 'location': 'pune', 'procedure': 'knee surgery'}
Found 5 relevant documents

=== FINAL RESULT ===
{'query': '46 year old male needs knee surgery in Pune, has 3 month old health policy', 'extracted_info': {'age': 46, 'gender': 'male', 'location': 'pune', 'procedure': 'knee surgery'}, 'decision': '```json\n{\n        "decision": "rejected",\n        "amount": null,\n        "justification": "Based on the provided policy document from Health2.pdf, the policy \'my: Optima Secure\' includes the following exclusion: \'Any treatment of orthopedic diseases or conditions except for fractures, dislocations and / or Injuries suffered during the Policy Period.\' Since the query concerns a 46-year-old male needing knee surgery and it\'s not specified that the knee condition is due to a recent fracture, dislocation or injury during the policy period, the claim is

In [None]:
import json
import re

def format_json_response(gemini_response):
    """
    Turn the model's raw text into a decision dict.

    The model usually wraps its JSON in a Markdown code fence and may add prose
    around it, so the JSON object is located inside the text before parsing.

    Args:
        gemini_response: Raw response text from the model.

    Returns:
        - The parsed JSON object when one can be decoded.
        - A dict with ``decision == "manual_review_required"`` (and the raw
          text as justification) when the text contains no ``{...}`` span.
        - A dict with ``decision == "error"`` when the input is not text or
          the located span is not valid JSON.
    """
    try:
        json_match = re.search(r'\{.*\}', gemini_response, re.DOTALL)
        if not json_match:
            return {
                "decision": "manual_review_required",
                "amount": None,
                "justification": gemini_response,
                "confidence": "low",
                "clauses_used": []
            }

        json_str = json_match.group()
        try:
            return json.loads(json_str)
        except json.JSONDecodeError:
            # The greedy match runs to the LAST '}' in the text, so prose after
            # the JSON that contains braces breaks json.loads. Decode just the
            # first complete object instead; a failure here falls through to
            # the error result below.
            parsed, _ = json.JSONDecoder().raw_decode(json_str)
            return parsed
    except (TypeError, ValueError):
        # TypeError: input was not a string. ValueError covers JSONDecodeError.
        # Deliberately not a bare `except`, which would also swallow
        # KeyboardInterrupt and hide programming errors.
        return {
            "decision": "error",
            "amount": None,
            "justification": "Failed to process response",
            "confidence": "low",
            "clauses_used": []
        }


def complete_insurance_pipeline(user_query, policy_type=None):
    """
    Run the full pipeline: extract query info, retrieve documents, ask the
    model for a decision and parse the answer into a structured response.

    Args:
        user_query: Free-text claim description.
        policy_type: Optional document category forwarded to the search step;
            when omitted the search is not filtered.

    Returns:
        dict with keys:
            'status': "error" when the model call failed or its answer could
                not be parsed, otherwise "success".
            'query': the original query.
            'extracted_info': output of extract_query_info.
            'decision_details': output of format_json_response.
            'processing_info': number of retrieved documents and model name.
    """
    extracted_info = extract_query_info(user_query)
    relevant_docs = search_relevant_docs(user_query, document_type=policy_type, k=5)
    raw_decision = make_decision(user_query, relevant_docs, extracted_info)

    formatted_decision = format_json_response(raw_decision)

    # make_decision reports a failed model call as text with this prefix, and
    # format_json_response reports a parse failure as decision == "error".
    # Surface both instead of always claiming success.
    llm_failed = isinstance(raw_decision, str) and raw_decision.startswith(
        "Error in decision making:"
    )
    parse_failed = (
        isinstance(formatted_decision, dict)
        and formatted_decision.get("decision") == "error"
    )
    status = "error" if (llm_failed or parse_failed) else "success"

    final_response = {
        "status": status,
        "query": user_query,
        "extracted_info": extracted_info,
        "decision_details": formatted_decision,
        "processing_info": {
            "documents_searched": len(relevant_docs),
            "model_used": "gemini-2.0-flash-exp"
        }
    }

    return final_response

In [None]:

# Sample queries covering health, maternity, motor and life scenarios.
test_queries = [
    "46M, knee surgery, Pune, 3-month policy",
    "25 year old female, maternity claim, Mumbai, 1 year policy",
    "Car accident claim, 35 year old male, Delhi",
    "Life insurance claim, 60 year old, heart attack"
]

print("=== TESTING COMPLETE SYSTEM ===\n")

for i, query in enumerate(test_queries):
    print(f"Test {i+1}: {query}")
    try:
        # No policy_type is passed, so retrieval is not filtered by document type.
        result = complete_insurance_pipeline(query)
        print(f"Decision: {result['decision_details']['decision']}")
        # Only the first 100 characters of the justification are shown.
        print(f"Justification: {result['decision_details']['justification'][:100]}...")
        print("-" * 50)
    except Exception as e:
        # Keep going with the remaining queries if one of them fails.
        print(f"Error: {str(e)}")
        print("-" * 50)

=== TESTING COMPLETE SYSTEM ===

Test 1: 46M, knee surgery, Pune, 3-month policy
Decision: rejected
Justification: Based on the provided context from the HDFC ERGO General Insurance Company Limited - Optima Secure p...
--------------------------------------------------
Test 2: 25 year old female, maternity claim, Mumbai, 1 year policy
Decision: rejected
Justification: Based on the provided policy documents (Health1.pdf and Health2.pdf), maternity expenses are subject...
--------------------------------------------------
Test 3: Car accident claim, 35 year old male, Delhi
Decision: approved
Justification: Based on the provided documents, the claim for a car accident involving a 35-year-old male in Delhi ...
--------------------------------------------------
Test 4: Life insurance claim, 60 year old, heart attack
Decision: approved
Justification: Based on Document 5 (Life1.pdf), under the 'Death Benefit' section, a lump sum is payable if the Lif...
---------------------------------------

In [None]:

import pickle


# Ask the vector store to write its data to its persist directory.
# NOTE(review): newer Chroma/LangChain releases may persist automatically and
# deprecate this call — confirm against the installed version.
vectorstore.persist()


# Bundle the embedding object and the pipeline functions for reuse.
# NOTE(review): pickle stores plain functions by reference (module and name),
# not their source code, so this file only loads in a kernel where functions
# with these names are already defined.
system_components = {
    'embeddings': embeddings,
    'functions': {
        'extract_query_info': extract_query_info,
        'search_relevant_docs': search_relevant_docs,
        'complete_pipeline': complete_insurance_pipeline
    }
}


# Written to Google Drive, so Drive must already be mounted at /content/drive.
with open('/content/drive/MyDrive/insurancess_system_components.pkl', 'wb') as f:
    pickle.dump(system_components, f)

print("System components saved to Google Drive!")
print("Vectorstore persisted to /content/policy_vectordb")


System components saved to Google Drive!
Vectorstore persisted to /content/policy_vectordb


In [None]:
# Load the components
# Relies on `pickle` having been imported by an earlier cell.
# SECURITY: pickle.load can execute arbitrary code — only load files you created.
with open('/content/drive/MyDrive/insurancess_system_components.pkl', 'rb') as f:
    loaded_components = pickle.load(f)

# Re-initialize the vectorstore from the persisted directory
from langchain_community.vectorstores import Chroma
from langchain_huggingface import HuggingFaceEmbeddings

embeddings = loaded_components['embeddings']
vectorstore = Chroma(persist_directory="/content/policy_vectordb", embedding_function=embeddings)

# You can now use the loaded components and vectorstore
# These assignments rebind the notebook-level function names to the unpickled objects.
loaded_functions = loaded_components['functions']
extract_query_info = loaded_functions['extract_query_info']
search_relevant_docs = loaded_functions['search_relevant_docs']
complete_insurance_pipeline = loaded_functions['complete_pipeline']


# Demo query; no policy_type is given, so retrieval is not filtered by document type.
new_query = "I am a 30 year old man and I have a life insurance policy. I had a car accident. Am I covered?"
result = complete_insurance_pipeline(new_query)

import json
print(json.dumps(result, indent=4))

  vectorstore = Chroma(persist_directory="/content/policy_vectordb", embedding_function=embeddings)


{
    "status": "success",
    "query": "I am a 30 year old man and I have a life insurance policy. I had a car accident. Am I covered?",
    "extracted_info": {
        "age": 30,
        "gender": "male",
        "procedure": "accident"
    },
    "decision_details": {
        "decision": "approved",
        "amount": null,
        "justification": "Based on the provided information, the life insurance policy would cover death if it occurs as a result of an accident. Document 3 defines \"Injury\" as \"accidental physical bodily harm excluding any Illness, solely and directly caused by an external, violent, visible and evident means which is verified and certified by a Medical Practitioner.\" Therefore, if the car accident resulted in injuries that led to the policyholder's death, the life insurance policy would be applicable. Furthermore, Document 5 states \"Claim payment will only be made if confirmatory diagnosis of the conditions covered is received by the Company while the insure