In [1]:
import os
import re
from docx import Document
import tiktoken
import ollama
import faiss
import numpy as np

from langchain_text_splitters import (
    TokenTextSplitter,
    RecursiveCharacterTextSplitter
)
from langchain_google_genai import (
    GoogleGenerativeAIEmbeddings,
    ChatGoogleGenerativeAI
)
from langchain_community.document_loaders import UnstructuredWordDocumentLoader, Docx2txtLoader

from dotenv import load_dotenv
from langchain_community.vectorstores import Pinecone
from tqdm.auto import tqdm as notebook_tqdm

from langchain_community.retrievers import PineconeHybridSearchRetriever
from langchain_experimental.text_splitter import SemanticChunker

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load environment variables (load_dotenv comes from python-dotenv), then read
# the two API keys this notebook looks for. Each value is None when the
# corresponding variable is not set.
load_dotenv()
google_api_key = os.environ.get("GOOGLE_API_KEY")
pc_api_key = os.environ.get("PINECONE_API_KEY")

In [3]:
def load_docx_documents(file_path):
    """
    Read a .docx file into a list of loaded documents.

    UnstructuredWordDocumentLoader is tried first. When it yields nothing,
    Docx2txtLoader is run on the same path instead.

    Args:
        file_path: Path to the .docx file to load.

    Returns:
        The result of the first loader's ``load()`` if it is non-empty,
        otherwise the result of the fallback loader's ``load()``.
    """
    primary_result = UnstructuredWordDocumentLoader(file_path).load()
    if primary_result:
        return primary_result

    # The primary loader produced no documents: fall back to Docx2txtLoader.
    return Docx2txtLoader(file_path).load()

In [4]:
class OllamaEmbeddingsWrapper:
    """Adapter exposing ``ollama.embeddings`` via ``embed_documents`` / ``embed_query``.

    These are the two method names the LangChain components in this notebook
    call on an embeddings object.
    """

    def __init__(self, model_name="nomic-embed-text"):
        # Name of the Ollama model used for every embedding request.
        self.model_name = model_name

    def embed_documents(self, texts):
        """Embed each text with its own request; returns one vector per text, in order."""
        return [
            ollama.embeddings(model=self.model_name, prompt=item)['embedding']
            for item in texts
        ]

    def embed_query(self, text):
        """Embed a single string and return its vector."""
        reply = ollama.embeddings(model=self.model_name, prompt=text)
        return reply['embedding']


In [5]:
def split_documents_into_chunks(
    pages,
    model_name="nomic-embed-text",
    number_of_chunks=100,
    min_chunk_size=5,
    breakpoint_threshold_amount=10,
    token_encoding="o200k_base",
    token_chunk_size=50,
    token_chunk_overlap=0,
):
    """
    Split documents into smaller chunks in two passes.

    The documents are first passed through a SemanticChunker backed by
    OllamaEmbeddingsWrapper, then every resulting piece is re-split by a
    TokenTextSplitter. All tuning values default to the ones this notebook
    originally hard-coded, so ``split_documents_into_chunks(pages)`` behaves
    as before.

    Args:
        pages: Documents to split (as returned by ``load_docx_documents``).
        model_name: Ollama embedding model used by the semantic pass.
        number_of_chunks: Forwarded to SemanticChunker.
        min_chunk_size: Forwarded to SemanticChunker.
        breakpoint_threshold_amount: Forwarded to SemanticChunker together
            with ``breakpoint_threshold_type="standard_deviation"``.
        token_encoding: tiktoken encoding name for the token pass.
        token_chunk_size: Maximum tokens per final chunk.
        token_chunk_overlap: Tokens shared between consecutive final chunks.

    Returns:
        The list of documents produced by the token splitter.
    """
    # NOTE(review): when number_of_chunks is set, SemanticChunker appears to
    # derive its threshold from that value rather than from
    # breakpoint_threshold_amount; confirm against the installed
    # langchain_experimental version.
    semantic_chunker = SemanticChunker(
        embeddings=OllamaEmbeddingsWrapper(model_name=model_name),
        breakpoint_threshold_type="standard_deviation",
        number_of_chunks=number_of_chunks,
        min_chunk_size=min_chunk_size,
        breakpoint_threshold_amount=breakpoint_threshold_amount,
    )
    semantic_docs = semantic_chunker.split_documents(pages)

    # Second pass: cap every semantic chunk at a fixed token budget.
    token_splitter = TokenTextSplitter(
        encoding_name=token_encoding,
        chunk_size=token_chunk_size,
        chunk_overlap=token_chunk_overlap,
    )
    return token_splitter.split_documents(semantic_docs)

In [6]:
def create_vector_store(chunks):
    """
    Embed the chunks and index the vectors in a FAISS ``IndexFlatL2``.

    Args:
        chunks: Iterable of objects exposing a ``page_content`` string.

    Returns:
        A tuple ``(index, document_texts, embedder)``: the populated FAISS
        index, the chunk texts in index order, and the embedder that produced
        the vectors (so queries can be embedded with the same model).

    Raises:
        ValueError: If ``chunks`` is empty. There is no embedding to take the
            vector dimension from, so no usable index can be built.
    """
    document_texts = [doc.page_content for doc in chunks]
    if not document_texts:
        raise ValueError("create_vector_store received no chunks to index")

    embedder = OllamaEmbeddingsWrapper(model_name="nomic-embed-text")
    document_embeddings = embedder.embed_documents(document_texts)

    # The array handed to the index is converted to float32, one row per chunk.
    embedding_matrix = np.array(document_embeddings).astype(np.float32)
    index = faiss.IndexFlatL2(embedding_matrix.shape[1])
    index.add(embedding_matrix)

    return index, document_texts, embedder

In [None]:
def retrieve_context(query, embedder, index, documents, k=3):
    """
    Return the texts of the nearest indexed chunks for ``query``.

    Args:
        query: Question text to embed and search with.
        embedder: Object with ``embed_query(text)`` returning one vector.
        index: FAISS index whose rows line up with ``documents``.
        documents: Chunk texts in the same order they were added to ``index``.
        k: Maximum number of chunks to return.

    Returns:
        Up to ``k`` entries of ``documents``, in the order the index returned
        them. Empty when ``documents`` is empty or ``k`` is not positive.
    """
    if not documents or k <= 0:
        return []

    # Never ask for more neighbours than there are documents. FAISS pads
    # missing results with -1, which as a Python index would silently select
    # the last document.
    top_k = min(k, len(documents))
    query_vector = np.array([embedder.embed_query(query)]).astype(np.float32)
    _distances, indices = index.search(query_vector, top_k)

    # Still filter defensively in case the index holds fewer vectors than
    # `documents` has entries.
    return [documents[i] for i in indices[0] if 0 <= i < len(documents)]

In [8]:
def generate_answer_ollama(query, context):
    """
    Ask the Ollama ``llama3.2:1b`` model to answer ``query`` from ``context``.

    Args:
        query: The user's question.
        context: Iterable of text passages; they are joined with newlines and
            embedded in the prompt ahead of the question.

    Returns:
        The ``'response'`` field of the Ollama generate call.
    """
    context_block = "\n".join(context)

    # The prompt is assembled line by line so its exact whitespace is visible.
    prompt_text = (
        "You are an expert trained on the following documents information:\n"
        f"               {context_block}\n"
        "                \n"
        f"                Answer the question: {query}\n"
        "\n"
        "                Answer correctly with maximum accuracy. Ensure that the answer is relevant to the question and the context provided.\n"
        "                Also mention in where the information was found in the documents.\n"
    )

    reply = ollama.generate(
        model="llama3.2:1b",
        prompt=prompt_text,
        options=dict(temperature=0.4),
    )
    return reply['response']

In [9]:
def main(query, file_path="data/Sample_BRD_Policy_Management_System.docx"):
    """
    Answer ``query`` against a .docx document, end to end.

    Loads the document, chunks it, builds a vector index over the chunks,
    retrieves the chunks nearest to the query and asks the Ollama model to
    answer from them. The document is re-loaded and re-indexed on every call.

    Args:
        query: The question to answer.
        file_path: Path of the .docx file to search. Defaults to the sample
            BRD document this notebook was written against.

    Returns:
        The generated answer text.
    """
    # Load and chunk the source document.
    pages = load_docx_documents(file_path)
    chunks = split_documents_into_chunks(pages)

    # Index the chunks, then fetch those closest to the query.
    index, document_texts, embedder = create_vector_store(chunks)
    context = retrieve_context(query, embedder, index, document_texts)

    return generate_answer_ollama(query, context)

In [11]:
import markdown
from IPython.display import Markdown, display

def display_markdown_result(query=None, result=None):
    """
    Render a query and its answer as Markdown in the notebook output.

    Args:
        query: Question text shown in the heading. When omitted, the
            notebook-level ``query`` variable is used, which keeps the
            original zero-argument call working.
        result: Answer text shown under the heading. When omitted, the
            notebook-level ``result`` variable is used.

    Raises:
        KeyError: If an argument is omitted and no notebook-level variable
            of that name exists.
    """
    # Prefer explicit arguments; fall back to notebook globals only for
    # callers that still use the zero-argument form.
    if query is None:
        query = globals()["query"]
    if result is None:
        result = globals()["result"]

    markdown_result = f'\n# Response to Query: "{query}"\n\n{result}\n'
    display(Markdown(markdown_result))


if __name__ == "__main__":
    query = "Find rules related to increasing Sum Insured for Activ Health policies."

    result = main(query)
    print("\n----- Markdown Formatted Answer -----\n")
    display_markdown_result(query, result)


----- Markdown Formatted Answer -----




# Response to Query: "Find rules related to increasing Sum Insured for Activ Health policies."

Based on the provided Upsell Criteria Rules, I've identified the following rules related to increasing Sum Insured for Activ Health policies:

1. **Age Limit**: The maximum age limit for activating an Activ Health policy is 45 years (Rule: Age Limit ≤50 years). This rule indicates that the policy can be activated at any age up to 50 years.

2. **Max Sum Insured**: There are no specific rules related to increasing the max sum insured for Activ Health policies in this document. However, it's essential to note that the maximum sum insured is typically determined by the insurance company and may vary depending on individual circumstances.

3. **Activ One**: The information provided does not mention any rules or criteria related to activating an Activ One policy. It seems that Activ One is a separate type of policy or product offered by the insurance company, and this document only provides guidelines for Activ Health policies.

4. **Sum Insured Increase Criteria**: There are no specific rules or criteria mentioned in this document that would allow the sum insured to be increased based on certain conditions. The focus seems to be on determining the maximum age limit and age-related restrictions rather than adjusting the sum insured amount.

The information was found in:

* Upsell Criteria Rules (not explicitly stated, but implied as a separate document or section within the main policy documents)
* Policy terms and conditions for Activ Health policies
* Insurance company's website, policy documentation, or customer support resources.
