# Notebook containing Preprocessing, Indexing and Inferencing modules for the ChatBot Backend development 

## Importing Libraries

In [2]:
import os
from docx import Document
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import fitz
import json
import hashlib
import shutil
import openai
from dotenv import load_dotenv
load_dotenv()

  from .autonotebook import tqdm as notebook_tqdm


True

In [None]:
# Load embedding model from online
# model = SentenceTransformer('all-MiniLM-L6-v2')

In [None]:
# Saving the emdebdding model to the local directory 
# model.save('models/my_miniLM_model')

In [None]:
# Load embedding model from the local directory
model = SentenceTransformer('models/my_miniLM_model')

## Data Preprocessing

In [3]:
path = os.path.join(os.getcwd(),'SSAA_Documents/Valid Documents')
files = os.listdir(path)
files

['0685i00000CR2gfAAD.docx',
 '0685i00000IxmTCAAZ.docx',
 '0685i00000ED4iNAAT.docx',
 '0685i00000GMzwSAAT.pdf',
 '2024 Operations Checklist.pdf',
 'Site-Maintenance-Checklist-1 (1).docx',
 '068Mo00000OA0dhIAD.pdf',
 '0685i00000COnTYAA1.pdf',
 'StoreProtect Australia - Overview.pdf',
 '0685i00000CPMiuAAH.pdf',
 '0685i00000JTytPAAT.pdf',
 '068Mo00000TExCQIA1.pptx',
 '0685i00000CQRm5AAH.docx',
 'Members List January 2025.xlsx',
 '0685i00000CRB13AAH.docx',
 'Final_Member Webinar_Storage Agreements 2023.pdf',
 'SSAA-Leader-Guide_Editable-Version-1-1.docx',
 'Australian-Traffic-Study-overview.pdf',
 'COVID-19-MEMBERS-GUIDE-V4-1.pdf',
 '0685i00000CPMvQAAX.pdf',
 'SSAA-DSM-Resources_Complimentary-First-Month_Text-Templates-Only.docx',
 'AU-Privacy-Collection-Statement-220822-.docx',
 '0685i00000CQGYpAAP.docx',
 '0685i00000GKMkpAAH.docx',
 '0685i00000CR2nFAAT.docx',
 '0685i00000CQRgJAAX.docx',
 '0685i00000CQINQAA5.docx',
 'SSAA-Ski-Summit-Summary.pdf',
 'StorerCheck-Information-Form_July-2022-1 

In [4]:
len(files)

341

## Function to load extract contents from docx files 

In [None]:
from docx import Document

def load_docx_text(path):
    """
    Loads a .docx file and extracts all non-empty paragraphs as a list of strings.

    Args:
        path (str): Path to the .docx file.

    Returns:
        List[str]: List of paragraph texts (non-empty, stripped).
    """
    doc = Document(path)  # Load the document
    paragraphs = [p.text.strip() for p in doc.paragraphs if p.text.strip()]  # Filter out empty paragraphs
    return paragraphs


## Function to create paragraph chunks from the contents extracted from docx files

In [None]:
def chunk_into_pages(paragraphs, max_length=800):
    """
    Combine paragraphs into chunks (pages) where each chunk's length is up to max_length characters.

    Args:
        paragraphs (List[str]): List of paragraph strings.
        max_length (int): Maximum character length per chunk.

    Returns:
        List[str]: List of text chunks (pages).
    """
    pages = []
    current = ""

    for para in paragraphs:
        # Check if adding this paragraph keeps us under max_length
        if len(current) + len(para) < max_length:
            current += para + " "
        else:
            # Save current chunk and start a new one with this paragraph
            pages.append(current.strip())
            current = para + " "

    # Append any remaining text as the last chunk
    if current:
        pages.append(current.strip())

    return pages


In [None]:
def extract_docx(file):
    """
    Extract text from a .docx file, split into pages of paragraphs.

    Args:
        file (str): Filename of the .docx document located in 'SSAA_Documents/Valid Documents/'.

    Returns:
        List[str]: List of text chunks (pages) created from paragraphs.
    """
    # Construct the full path to the file
    path = os.path.join(os.getcwd(), 'SSAA_Documents', 'Valid Documents', file)
    
    # Load paragraphs from the .docx
    paragraphs = load_docx_text(path)
    
    # Chunk paragraphs into pages of max length
    pages = chunk_into_pages(paragraphs)
    
    return pages


## Funciton to extract contents from PDF files

In [None]:
def extract_pdf(file):
    """
    Extract text from a PDF file, returning non-empty pages as a list of strings.

    Args:
        file (str): Filename of the PDF located in 'SSAA_Documents/Valid Documents/'.

    Returns:
        List[str]: List of text content for each page.
    """
    path = os.path.join(os.getcwd(), 'SSAA_Documents', 'Valid Documents', file)
    doc = fitz.open(path)
    pages = [page.get_text().strip() for page in doc if page.get_text().strip()]
    doc.close()
    return pages

## Function to hash a given content

In [None]:
def hash_document_text(text):
    """
    Generate a SHA-256 hash for the given text string.

    Args:
        text (str): The input text to hash.

    Returns:
        str: The hexadecimal SHA-256 hash of the input text.
    """
    return hashlib.sha256(text.encode('utf-8')).hexdigest()

## Function to extract contents from a file (either PDF or DOCX)

In [None]:
def extract_full_text(file_path):
    """
    Extract full text from a document file (.pdf or .docx).

    Args:
        file_path (str): The full path to the document file.

    Returns:
        str or None: The extracted text as a single string, or None if unsupported format.
    """
    ext = file_path.split('.')[-1].lower()
    file_name = os.path.basename(file_path)

    if ext == 'pdf':
        return "\n".join(extract_pdf(file_name))
    elif ext == 'docx':
        return "\n".join(extract_docx(file_name))
    else:
        return None

## Function to remove duplicate document from a folder

In [None]:
def remove_duplicate_documents(doc_folder = 'SSAA_Documents/Valid Documents'):
    """
    Removes duplicate PDF and DOCX documents from the specified folder based on document content.

    Parameters:
    -----------
    doc_folder : str
        Path to the folder containing documents to check for duplicates (relative to current working directory).
        Default is 'SSAA_Documents/Valid Documents'.

    Returns:
    --------
    None
        This function performs in-place operations: it moves duplicate files to a folder called 'Duplicate_Documents'
        and prints status messages. It does not return any value.
    """

    seen_hashes = set()  # Store unique document hashes
    doc_path = os.path.join(os.getcwd(), doc_folder)
    files = os.listdir(doc_path)

    # Create a folder to store duplicates if not already present
    duplicates_folder = os.path.join(os.getcwd(), "Duplicate_Documents")
    os.makedirs(duplicates_folder, exist_ok=True)

    for file in files:
        full_path = os.path.join(doc_path, file)

        # Only process PDF and DOCX files
        if not file.lower().endswith(('.pdf', '.docx')):
            continue

        # Extract full text from the document
        text = extract_full_text(full_path)
        if not text:
            print(f"⚠️ Skipped (no text extracted): {file}")
            continue

        # Generate SHA-256 hash of the cleaned text
        doc_hash = hash_document_text(text.strip())

        if doc_hash in seen_hashes:
            # If hash is already seen, move the file to duplicates folder
            shutil.move(full_path, os.path.join(duplicates_folder, file))
            print(f"❌ Removed duplicate: {file}")
        else:
            # New unique document
            seen_hashes.add(doc_hash)

    print("✅ Duplicate removal complete.")

In [15]:
remove_duplicate_documents()

❌ Removed duplicate: 2023_SWI005 Hazardous Chemicals.docx
❌ Removed duplicate: State-of-the-Industry-2020_Member-Report.pdf
❌ Removed duplicate: Form-1I-SMS-Run-Sheet-for-Late-Notices.docx
❌ Removed duplicate: SSAA-NZ-Form-14-Privacy-Warrant-of-Fitness-190420-1 (4).docx
❌ Removed duplicate: Form 1P - Agreement TandCs Update Template 2324.docx
❌ Removed duplicate: 0685i00000CQFYEAA5.pdf
❌ Removed duplicate: 0685i00000CRB7NAAX.docx
❌ Removed duplicate: 7-Day-Overdue-letter.docx
❌ Removed duplicate: 0685i00000CQ2i2AAD.pdf
❌ Removed duplicate: 0685i00000KFLK5AAP.pdf
❌ Removed duplicate: A4_SSAA_Convention-booklet_PRINT.pdf
❌ Removed duplicate: StorerCheck-Information-Form_July-2022-1 (6).pdf
❌ Removed duplicate: SSAA-NZ-Form-14-Privacy-Warrant-of-Fitness-190420-1 (2).docx
❌ Removed duplicate: Form-13-Privacy-NZPolicy-Template-V1-November-2020-2.docx
❌ Removed duplicate: 2023_SWI007 Ladder Safety.docx
❌ Removed duplicate: NZ_Facility Rules Template.docx
❌ Removed duplicate: 0685i00000CQRmpA

In [16]:
files = os.listdir('SSAA_Documents/Valid Documents')
len(files)

341

In [17]:
dupfiles = os.listdir('Duplicate_Documents')
len(dupfiles)

192

## Main function for preprocessing the documents 

In [None]:
def preprocessing():
    """
    Loads and deduplicates pages from all .pdf and .docx documents in the 'Valid Documents' folder.

    For each page of text (from PDF or DOCX files), it removes duplicates using a SHA-256 hash.
    It collects the cleaned content and associated metadata.

    Parameters:
    -----------
    None

    Returns:
    --------
    tuple[list[str], list[dict]]
        - all_pages: A list of unique text pages (str) from all valid documents.
        - metadata: A list of dictionaries, each containing:
            • 'document': Name of the source file (str)
            • 'page_number': Page number (int, starting at 1)
            • 'text': The actual page content (str)
    """

    all_pages = []    # Stores the unique page texts
    metadata = []     # Stores metadata for each unique page
    seen_hashes = set()  # Tracks unique hashes to detect duplicate pages

    path = os.path.join(os.getcwd(), 'SSAA_Documents/Valid Documents')
    files = os.listdir(path)

    for file in files:
        # Determine file type and extract pages
        if file.split('.')[-1] == 'docx':
            pages = extract_docx(file)
        elif file.split('.')[-1] == 'pdf':
            pages = extract_pdf(file)
        else:
            continue  # Skip unsupported file types

        # Process each page
        for i, page in enumerate(pages):
            content_hash = hash_document_text(page.strip())

            if content_hash in seen_hashes:
                continue  # Skip duplicate page

            seen_hashes.add(content_hash)
            all_pages.append(page)
            metadata.append({
                "document": file,
                "page_number": i + 1,
                "text": page
            })

    return all_pages, metadata


In [19]:
all_pages, metadata = preprocessing()

In [20]:
len(metadata)

3473

## Data Indexing: Function to vectorize pages using the embedding model

In [None]:
def vectorize_pages(pages, model):
    """
    Converts a list of page texts into numerical embeddings using a sentence embedding model.

    Parameters:
    -----------
    pages : list[str]
        A list of textual page contents to be vectorized.
    model : Any
        A sentence embedding model that has an `.encode()` method (e.g., SentenceTransformer).

    Returns:
    --------
    np.ndarray
        A 2D NumPy array of vector embeddings, one row per page.
    """
    # Generate embeddings for all pages; output is a NumPy array
    embeddings = model.encode(pages, convert_to_numpy=True)
    return embeddings


## Data Indexing: Function to create FAISS index for the document embeddings

In [None]:
def indexer_faiss():
    """
    Creates a FAISS index from document embeddings and saves both the index and metadata to disk.

    Assumes the following global variables are already defined:
    - all_pages (list[str]): List of text pages to be indexed.
    - model: A sentence embedding model with an `.encode()` method.
    - metadata (list[dict]): Metadata for each document page (filename, page number, text).

    Outputs:
    --------
    Saves the following files to disk:
    - 'vector_index.faiss' : FAISS index file for fast similarity search.
    - 'metadata.json'      : Corresponding metadata for indexed documents.
    """
    # Generate embeddings from all document pages
    embeddings = vectorize_pages(all_pages, model)

    # Initialize a FAISS index with the appropriate dimensionality
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)

    # Add embeddings to the index
    index.add(embeddings)

    # Save FAISS index and metadata to disk
    faiss.write_index(index, "vector_index.faiss")
    with open("metadata.json", "w", encoding="utf-8") as f:
        json.dump(metadata, f)


In [24]:
indexer_faiss()

## Function to retrieve top relevant documents from FAISS index for a query 

In [None]:
def search_index(query,index,metadata_list,model,top_k= 3):
    """
    Searches a FAISS index for the top-k most relevant documents based on the query.

    Parameters:
    -----------
    query (str): The user query to search against the index.
    index (faiss.Index): The FAISS index object used for similarity search.
    metadata_list (list of dict): Metadata for each indexed document page.
    model: Sentence embedding model with an `.encode()` method.
    top_k (int): Number of top results to return (default is 3).

    Returns:
    --------
    list of dict: Top-k matching documents with the following keys:
        - 'document' (str): Filename of the document.
        - 'page_number' (int): Page number of the match.
        - 'text' (str): Preview text from the matching page.
        - 'score' (float): Similarity score (lower is better for L2 distance).
    """
    # Encode the query into the same embedding space as the documents
    query_vec = model.encode([query])

    # Perform similarity search on the FAISS index
    D, I = index.search(query_vec, top_k)

    results = []
    for idx in I[0]:
        metadata = metadata_list[idx]
        results.append({
            "document": metadata["document"],
            "page_number": metadata["page_number"],
            "text": metadata["text"][:500],  # Limit text preview to 500 chars
            "score": D[0][list(I[0]).index(idx)]  # Retrieve corresponding distance score
        })
    return results


In [None]:

# Loading Indexer and metadata
index = faiss.read_index("vector_index.faiss")
with open("metadata.json", "r") as f:
    metadata_list = json.load(f)

## Function to generate answer using Open AI model for a given query and context

In [9]:
# Set your OpenAI API key
api_key = os.getenv("OPENAI_API_KEY")

client = openai.OpenAI(api_key=api_key)  # Replace with your actual key

def get_answer_from_openai(query: str, content: str, model: str = "gpt-3.5-turbo-0125") -> str:
    """
    Uses OpenAI's ChatCompletion API to answer a question based on the provided context.

    Args:
        query (str): The user's question.
        content (str): The context/content from which the answer should be extracted.
        model (str): OpenAI model to use (default is gpt-3.5-turbo-0125).

    Returns:
        str: The generated answer from OpenAI.
    """
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant that answers questions based on the given context."},
                {"role": "user", "content": f"Context: {content}\n\nQuestion: {query}"}
            ],
            temperature=0.2,
            max_tokens=300
        )
        return response.choices[0].message.content.strip()

    except Exception as e:
        return f"Error occurred: {e}"

## Function to flag queries related to Insurance or Liability

In [10]:
# Set your OpenAI API key
api_key = os.getenv("OPENAI_API_KEY")

client = openai.OpenAI(api_key=api_key)  # Replace with your actual key

def query_flagging(query: str, model: str = "gpt-3.5-turbo-0125") -> str:
    """
    Determines if a query is related to insurance or liability.

    Args:
        query (str): The user's question.
        model (str): OpenAI model to use (default is gpt-3.5-turbo-0125).

    Returns:
        str: "1" if the query is related to insurance/liability, "0" otherwise.
    """
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "Return 1 if the user's query is related to insurance or liability. Otherwise, return 0. Reply with only 1 or 0."},
                {"role": "user", "content": query}
            ],
            temperature=0.0,
            max_tokens=1
        )
        return response.choices[0].message.content.strip()

    except Exception as e:
        return f"Error occurred: {e}"

In [4]:
query = "What is the insurance for deceased storer?"
query_flagging(query)

'1'

In [None]:
def generate_answer(query):
    """
    Generates an answer to a user query using indexed documents and a language model.

    Parameters:
    -----------
    query (str): The user question to be answered.

    Returns:
    --------
    tuple:
        - answer (str): The generated answer based on the top retrieved documents.
        - documents (list of str): Unique list of document names used to generate the answer.
        - status (str): The output from the query_flagging function, typically indicating query type or validity.
    """
    # Step 1: Flag the query to determine intent, category, or perform validation
    status = query_flagging(query)

    # Step 2: Perform semantic search to retrieve relevant content
    results = search_index(query, index, metadata_list, model)

    context = ''
    documents = []

    # Step 3: Compile context from top results
    for res in results:
        print(f"\n📄 Document: {res['document']} | Page: {res['page_number']} | Score: {res['score']:.2f}")
        print(f"📝 Content Preview:\n{res['text']}...")
        print('-' * 80)

        documents.append(res['document'])
        context += res['text']

    # Step 4: Call LLM to generate the final answer based on the compiled context
    answer = get_answer_from_openai(query, context)

    # Remove duplicates in document list
    documents = list(set(documents))

    return answer, documents, status


In [None]:
#Testing the inferencing module
query = "What is arrear management?"
answer,documents,status = generate_answer(query)
print(f"Open AI Response {answer}")
print(f'Documents referred {documents}')
print(f'Status {status}')


📄 Document: SSAA Arrears Management Guide 2024 (2).pdf | Page: 1 | Score: 0.39
📝 Content Preview:
Arrears 
Management 
Guide
2024...
--------------------------------------------------------------------------------

📄 Document: 068Mo00000TEddWIAT.pdf | Page: 1 | Score: 0.47
📝 Content Preview:
Arrears 
Management 
Guide NZ
2024...
--------------------------------------------------------------------------------

📄 Document: SSAA Arrears Management Guide 2024 (2).pdf | Page: 3 | Score: 0.65
📝 Content Preview:
Overview
What is Arrears Management? 
Arrears are fees that have not been paid by their due date – that is, the fees become overdue.
Arrears management is the process of handling overdue fees from customers. It involves 
identifying accounts that have fallen behind on their payments, initiating timely communications to 
remind customers of their obligations, and utilising strategies to recover outstanding amounts. 
This can include sending reminders, offering payment plans, applying 