ParseText()-PDF loading & chunking

In [2]:
from pypdf import PdfReader
from langchain_text_splitters import RecursiveCharacterTextSplitter

def parseText(pdf_path):
    reader = PdfReader(pdf_path)

    policy_text = ""
    for page in reader.pages:
        policy_text += page.extract_text()

    splitter = RecursiveCharacterTextSplitter(
        chunk_size=400,
        chunk_overlap=50
    )

    chunks = splitter.split_text(policy_text)
    return chunks


SearchText()->Embedding + FAISS retrieval

In [3]:
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np

embed_model = SentenceTransformer("all-MiniLM-L6-v2")

def build_index(chunks):
    embeddings = embed_model.encode(chunks)
    dimension = embeddings.shape[1]

    index = faiss.IndexFlatL2(dimension)
    index.add(np.array(embeddings))

    return index, embeddings


def searchText(question, chunks, index, top_k=3):
    q_embedding = embed_model.encode([question])
    distances, indices = index.search(q_embedding, top_k)

    return "\n".join([chunks[i] for i in indices[0]])


ClassifyText()

In [4]:
def classifyText(question):
    if "rent" in question.lower():
        return "room_rent"
    elif "claim" in question.lower():
        return "claims"
    else:
        return "general"


SummarizeText()

In [5]:
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch

model_name = "Qwen/Qwen2-1.5B-Instruct"

tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
)

def summarizeText(context, question):
    prompt = f"""
You are an insurance policy assistant.

STRICT RULES:
- Use ONLY the provided policy context.
- DO NOT assume or guess.
- If the answer is not explicitly stated, say:
  "The policy does not explicitly mention this information."

Explain in simple language.

Policy Context:
{context}

Question:
{question}

Answer:
"""

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    output = model.generate(
        **inputs,
        max_new_tokens=200,
        do_sample=True
    )

    return tokenizer.decode(output[0], skip_special_tokens=True)


Some parameters are on the meta device because they were offloaded to the disk.


DisplayInformation()

In [None]:
def displayInformation(answer):
    print("\n Answer:")
    print(answer)


In [None]:
chunks = parseText("policy.pdf")
index, _ = build_index(chunks)
question = "What is the maximum room rent allowed?"
intent = classifyText(question)
context = searchText(question, chunks, index)
answer = summarizeText(context, question)
displayInformation(answer)



ðŸ“„ Answer:

You are an insurance policy assistant.

STRICT RULES:
- Use ONLY the provided policy context.
- DO NOT assume or guess.
- If the answer is not explicitly stated, say:
  "The policy does not explicitly mention this information."

Explain in simple language.

Policy Context:
Insured per illness/injury. 
 
The amounts payable under points no. 2 and 3 shall be at the rate applicable to the entitled room 
category. In case the insured opts for a room with rent higher than the entitled category as under poi nt 
no. 1, the charges payable under point 1, 2 and 3 shall be limited to the charges applicable to the entitled 
category.
restricted maximum up to 25% of the Sum Insured per illness/injury. 
 
2. Medical Practitioner , Surgeon, Anesthetist, Consultants, and Specialists Fees - All admissible claims under 
this section during the policy period restricted maximum up to 40% of the Sum Insured per illness/injury. 
 
3. Anesthesia, Blood, Oxygen, Operation Theatre Expens es, Su