In [56]:
import os
from docx import Document
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import fitz
import json

In [57]:
# Load the sentence-embedding model once; later cells use this same object
# to embed both the document pages and the search queries.
model = SentenceTransformer('all-MiniLM-L6-v2')

# Data Preprocessing

In [58]:

# Source documents live in a 'Documents' folder under the current working directory
path = os.path.join(os.getcwd(),'Documents')
files = os.listdir(path)
files  # bare last expression so the notebook displays the listing

['Final_Member Webinar_Storage Agreements 2023.pdf',
 'AU-Privacy-Collection-Statement-220822-.docx',
 'SSAA-Ski-Summit-Summary.pdf',
 'SSAA Arrears Management Guide 2024 (2).pdf',
 'Urbis-Storage-Index-December-2020-8p.pdf',
 'SSAA_Batteries in Self Storage 2024.pdf',
 'Final Draft_Managed Storage Specific Terms (1).docx',
 'State of the Industry_Consumer Insights.pdf',
 'Form-13-Privacy-NZPolicy-Template-V1-November-2020-2.docx',
 'Urbis-Storage-Index-August-2020.pdf',
 'Remote Management in Self Storage.pdf',
 'Trends Transforming Australasia_Simon Kuestenmacher.pdf',
 'SSAA_GC24_SoTI Launch Presentation.pdf',
 'Member Update_Storage Agreements 2023_F.pdf',
 'NZ-Customer-Storage-Agreement-Guidelines.v1.04-January-2023.pdf',
 'Urbis-Self-Storage-Sep-2017.pdf',
 'Urbis-Self-Storage-JUL-2018_LR.pdf',
 'Final Draft_Facility Rules Template.docx',
 'Urbis-Storage-Index.December-2016.pdf',
 'Urbis-Storage-Index-December-2021 (3).pdf',
 'SSAA 2023 Australian Standard Self Storage Licence Ag

In [59]:
# Step 1: Load .docx and extract paragraphs
def load_docx_text(path):
    """Read a .docx file and return its non-empty paragraphs.

    Args:
        path: Location of the .docx file; handed directly to ``Document``.

    Returns:
        list[str]: Paragraph texts with leading/trailing whitespace removed.
        Paragraphs that are empty after stripping are omitted.
    """
    stripped_texts = (para.text.strip() for para in Document(path).paragraphs)
    return [text for text in stripped_texts if text]

In [60]:
# Step 2: Chunk paragraphs into simulated pages
def chunk_into_pages(paragraphs, max_words=800):
    """Group consecutive paragraphs into simulated "pages" of bounded size.

    Paragraphs are never split. A page is closed as soon as adding the next
    paragraph would push it past ``max_words``; a single paragraph longer
    than ``max_words`` therefore becomes a page on its own.

    Args:
        paragraphs: Iterable of paragraph strings, in document order.
        max_words: Soft upper bound on whitespace-separated words per page.

    Returns:
        list[str]: One string per page, with the page's paragraphs joined by
        a single space. Never contains empty pages.
    """
    pages = []
    current_page = []
    word_count = 0

    for para in paragraphs:
        n_words = len(para.split())
        # Only flush when there is something to flush: without the
        # `current_page` guard, an oversized first paragraph (or one that
        # follows a flush) would emit an empty-string page.
        if current_page and word_count + n_words > max_words:
            pages.append(' '.join(current_page))
            current_page = []
            word_count = 0
        current_page.append(para)
        word_count += n_words

    # Emit whatever is left after the last paragraph
    if current_page:
        pages.append(' '.join(current_page))
    return pages

In [61]:
def extract_docx(file):
    """Turn one .docx file from the Documents folder into simulated pages.

    Args:
        file: File name (not a full path) inside ``<cwd>/Documents``.

    Returns:
        list[str]: The pages produced by ``chunk_into_pages`` (default page
        size) from the paragraphs returned by ``load_docx_text``.
    """
    docx_path = os.path.join(os.getcwd(), f'Documents/{file}')
    return chunk_into_pages(load_docx_text(docx_path))

In [62]:
def extract_pdf(file):
    """Extract the text of every non-blank page of a PDF in the Documents folder.

    Args:
        file: File name (not a full path) inside ``<cwd>/Documents``.

    Returns:
        list[str]: Stripped text of each page that contains any text, in
        page order. Pages with no extractable text are dropped, so a
        position in this list can differ from the real PDF page number.
    """
    path = os.path.join(os.getcwd(), f'Documents/{file}')
    pages = []
    # The context manager closes the document even if text extraction raises;
    # a trailing doc.close() would be skipped on error and leak the handle.
    with fitz.open(path) as doc:
        for page in doc:
            text = page.get_text().strip()
            if text:
                pages.append(text)
    return pages

In [63]:
def preprocessing():
    """Extract page texts and per-page metadata from every supported document.

    Scans ``<cwd>/Documents`` and processes ``.docx`` files with
    ``extract_docx`` and ``.pdf`` files with ``extract_pdf``; anything else
    is skipped.

    Returns:
        tuple[list[str], list[dict]]: ``(all_pages, metadata)`` where the two
        lists are aligned index-for-index. Each metadata entry has the keys
        ``"document"`` (file name), ``"page_number"`` (1-based position of
        the page in the extractor's output) and ``"text"`` (the page text).
    """
    all_pages = []
    metadata = []
    path = os.path.join(os.getcwd(), 'Documents')

    # os.listdir returns entries in arbitrary order; sorting keeps the page
    # ids stable between runs.
    for file in sorted(os.listdir(path)):
        # splitext + lower(): match '.PDF' / '.Docx' too, and do not mistake
        # an extension-less file literally named 'pdf' or 'docx' for a document.
        extension = os.path.splitext(file)[1].lower()
        if extension == '.docx':
            pages = extract_docx(file)
        elif extension == '.pdf':
            pages = extract_pdf(file)
        else:
            continue

        for page_number, page in enumerate(pages, start=1):
            all_pages.append(page)
            metadata.append({
                "document": file,
                "page_number": page_number,
                "text": page
            })
    return (all_pages, metadata)

In [64]:
# Run the extraction once; indexer_faiss reads all_pages and metadata as globals
all_pages, metadata = preprocessing()

# Data Indexer

In [65]:
def vectorize_pages(pages,model):
    """Embed a batch of page texts.

    Args:
        pages: Sequence of page strings to embed.
        model: Object exposing ``encode(texts, convert_to_numpy=...)``.

    Returns:
        Whatever ``model.encode`` returns when asked for NumPy output.
    """
    return model.encode(pages, convert_to_numpy=True)

In [66]:
def indexer_faiss():
    """Embed all pages, build a flat L2 FAISS index and persist it with its metadata.

    Reads the module-level ``all_pages``, ``metadata`` and ``model``.
    Writes two files using relative paths: ``vector_index.faiss`` (the index)
    and ``metadata.json`` (the metadata list, aligned with the index ids).

    Raises:
        ValueError: If there are no pages to index, or if ``all_pages`` and
            ``metadata`` have different lengths (search results could then
            no longer be mapped back to the right page).
    """
    if not all_pages:
        raise ValueError("No pages to index: all_pages is empty.")
    if len(all_pages) != len(metadata):
        raise ValueError(
            f"all_pages ({len(all_pages)}) and metadata ({len(metadata)}) are out of sync."
        )

    # FAISS expects a contiguous float32 matrix of shape (n_vectors, dim)
    embeddings = np.ascontiguousarray(vectorize_pages(all_pages, model), dtype=np.float32)
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings)

    # Persist the index and metadata together so they stay aligned on reload
    faiss.write_index(index, "vector_index.faiss")
    with open("metadata.json", "w", encoding="utf-8") as f:
        json.dump(metadata, f)


In [67]:
# Build the index; this writes vector_index.faiss and metadata.json (relative paths)
indexer_faiss()

In [68]:
def search_index(query, index, metadata_list, model, top_k=3):
    """Return the pages closest to ``query`` according to the vector index.

    Args:
        query: Search text.
        index: Object exposing ``search(vectors, k)`` that returns
            ``(distances, ids)``, each with one row per query vector.
        metadata_list: Per-page metadata dicts, indexed by the ids the index
            returns; each must have ``"document"``, ``"page_number"`` and
            ``"text"``.
        model: Object exposing ``encode(list_of_texts)``.
        top_k: Maximum number of results to return.

    Returns:
        list[dict]: Up to ``top_k`` results in the order returned by the
        index, each with ``"document"``, ``"page_number"``, ``"text"``
        (first 500 characters) and ``"score"`` (the distance reported by the
        index, as a Python float).
    """
    query_vec = model.encode([query])
    D, I = index.search(query_vec, top_k)

    results = []
    # Walk distances and ids in lockstep instead of looking each id up again
    # with list.index(), which is quadratic and wrong for repeated ids.
    for dist, idx in zip(D[0], I[0]):
        # FAISS pads with id -1 when fewer than top_k vectors exist; indexing
        # the list with -1 would silently return the last page instead.
        if idx < 0:
            continue
        entry = metadata_list[int(idx)]
        results.append({
            "document": entry["document"],
            "page_number": entry["page_number"],
            "text": entry["text"][:500],  # preview
            "score": float(dist)
        })
    return results

In [69]:

# Load the persisted FAISS index and its page metadata from disk
index = faiss.read_index("vector_index.faiss")
# metadata.json is written as UTF-8, so read it with the same explicit
# encoding rather than the platform default.
with open("metadata.json", "r", encoding="utf-8") as f:
    metadata_list = json.load(f)

# Example query
query = "What is Arrears Management?"
results = search_index(query, index, metadata_list, model)

print(f'Query : {query}')
# Display results
for res in results:
    print(f"\n📄 Document: {res['document']} | Page: {res['page_number']} | Score: {res['score']:.2f}")
    print(f"📝 Content Preview:\n{res['text']}...")
    print('-' * 80)

Query : What is Arrears Management?

📄 Document: SSAA Arrears Management Guide 2024 (2).pdf | Page: 1 | Score: 0.31
📝 Content Preview:
Arrears 
Management 
Guide
2024...
--------------------------------------------------------------------------------

📄 Document: Arrears Management Guide_NZ.pdf | Page: 1 | Score: 0.42
📝 Content Preview:
Arrears 
Management 
Guide NZ
2024...
--------------------------------------------------------------------------------

📄 Document: SSAA Arrears Management Guide 2024 (2).pdf | Page: 3 | Score: 0.52
📝 Content Preview:
Overview
What is Arrears Management? 
Arrears are fees that have not been paid by their due date – that is, the fees become overdue.
Arrears management is the process of handling overdue fees from customers. It involves 
identifying accounts that have fallen behind on their payments, initiating timely communications to 
remind customers of their obligations, and utilising strategies to recover outstanding amounts. 
This can include sending

In [70]:
import openai
from dotenv import load_dotenv
load_dotenv()
# The OpenAI API key is read from the OPENAI_API_KEY environment variable;
# do not hard-code it in the notebook.
api_key = os.getenv("OPENAI_API_KEY")

client = openai.OpenAI(api_key=api_key)  # module-level client used by get_answer_from_openai

def get_answer_from_openai(query: str, content: str, model: str = "gpt-3.5-turbo-0125") -> str:
    """
    Ask an OpenAI chat model to answer a question from the supplied context.

    Uses the module-level ``client``. This function is best-effort: any
    failure is reported in the returned string rather than raised.

    Args:
        query (str): The user's question.
        content (str): The context/content from which the answer should be extracted.
        model (str): OpenAI model to use (default is gpt-3.5-turbo-0125).

    Returns:
        str: The stripped answer text, or a message starting with
        ``"Error occurred: "`` if the request failed or the response
        carried no text.
    """
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant that answers questions based on the given context."},
                {"role": "user", "content": f"Context: {content}\n\nQuestion: {query}"}
            ],
            temperature=0.2,
            max_tokens=300
        )
        answer = response.choices[0].message.content
        # message.content may be None; report that explicitly instead of
        # letting .strip() fail with a confusing AttributeError message.
        if answer is None:
            return "Error occurred: the model returned no text content"
        return answer.strip()

    except Exception as e:
        # Deliberate catch-all: callers receive the error as text
        return f"Error occurred: {e}"

In [71]:
# Example query
query = "deceased storer"
results = search_index(query, index, metadata_list, model)
# Separate the retrieved previews with blank lines so text from different
# pages does not run together inside the prompt.
context = "\n\n".join(res['text'] for res in results)

print(f'Query : {query}')
# Display results
for res in results:
    print(f"\n📄 Document: {res['document']} | Page: {res['page_number']} | Score: {res['score']:.2f}")
    print(f"📝 Content Preview:\n{res['text']}...")
    print('-' * 80)

answer = get_answer_from_openai(query,context)
print(f'Answer from OpenAI model :\n{answer}')

Query : deceased storer

📄 Document: NZ-Customer-Storage-Agreement-Guidelines.v1.04-January-2023.pdf | Page: 47 | Score: 0.91
📝 Content Preview:
SSAA CUSTOMER STORAGE AGREEMENT GUIDELINES (CSA) 
© Self Storage Association of Australasia 2023 
47 
KJW - 449872/6 - 194743.3 
 
 
 
19. 
DECEASED STORER 
 
 
When a Storer dies, a Facility is not able to ‘release’ goods to any person other than the person to 
whom probate is granted. A probate document is an official court issued document. It will include a 
court stamp. 
 
19.1 Storer dies without a will 
Where a person dies without a will it is called “intestate”. The next of kin will need ...
--------------------------------------------------------------------------------

📄 Document: Self Storage Legal Landscape - Australia_Ritika Sardar and Ethan Holden.pdf | Page: 12 | Score: 0.95
📝 Content Preview:
Access to a space by others – relatives/executor 
when storer deceased (illustration cont.)
• On a storer’s death, the lock should not be