In [44]:
import os
from docx import Document
from sentence_transformers import SentenceTransformer
import faiss
import numpy as np
import fitz
import json
import hashlib
import shutil

In [2]:
# Load embedding model
model = SentenceTransformer('all-MiniLM-L6-v2')

In [9]:
model.save('models/my_miniLM_model')

In [12]:
model = SentenceTransformer('models/my_miniLM_model')

# Data Preprocessing

In [13]:

path = os.path.join(os.getcwd(),'Documents')
files = os.listdir(path)
files

['0685i00000CR2gfAAD.docx',
 '0685i00000IxmTCAAZ.docx',
 '0685i00000ED4iNAAT.docx',
 '0685i00000KG4zZAAT.pdf',
 '0685i00000GMzwSAAT.pdf',
 '068Mo00000OA0dhIAD.pdf',
 '0685i00000COnTYAA1.pdf',
 '0685i00000CPMiuAAH.pdf',
 '0685i00000JTytPAAT.pdf',
 '068Mo00000TExCQIA1.pptx',
 '0685i00000CQRm5AAH.docx',
 '068J3000004AFZ8IAO.pdf',
 '0685i00000CRB13AAH.docx',
 'Final_Member Webinar_Storage Agreements 2023.pdf',
 '0685i00000CPMvQAAX.pdf',
 'AU-Privacy-Collection-Statement-220822-.docx',
 '0685i00000CQGYpAAP.docx',
 '0685i00000GKMkpAAH.docx',
 '0685i00000CR2nFAAT.docx',
 '0685i00000CQRgJAAX.docx',
 '0685i00000CQINQAA5.docx',
 'SSAA-Ski-Summit-Summary.pdf',
 '0685i00000COmvjAAD.pdf',
 '0685i00000CPZoMAAX.pdf',
 '0685i00000CQfvpAAD.pdf',
 '0685i00000CQ2jRAAT.pdf',
 '068J3000002BYLxIAO.docx',
 '0685i00000MY8fWAAT.pdf',
 '068J3000001dN5UIAU.pdf',
 '068Mo00000QZCNaIAP.pdf',
 '068Mo00000Oxq9iIAB.pdf',
 '068Mo00000OxXXpIAN.pdf',
 '0685i00000COnSkAAL.pdf',
 '0685i00000CQ39TAAT.pdf',
 'SSAA Arrears Ma

In [14]:
# Step 1: Load .docx and extract paragraphs
def load_docx_text(path):
    doc = Document(path)
    paragraphs = [p.text.strip() for p in doc.paragraphs if p.text.strip()]
    return paragraphs

In [None]:
# # Step 2: Chunk paragraphs into simulated pages
# def chunk_into_pages(paragraphs, max_words=800):
#     pages = []
#     current_page = []
#     word_count = 0

#     for para in paragraphs:
#         words = para.split()
#         if word_count + len(words) > max_words:
#             pages.append(' '.join(current_page))
#             current_page = []
#             word_count = 0
#         current_page.append(para)
#         word_count += len(words)
    
#     if current_page:
#         pages.append(' '.join(current_page))
#     return pages

In [39]:
def chunk_into_pages(paragraphs, max_length=800):
    pages = []
    current = ""
    for para in paragraphs:
        if len(current) + len(para) < max_length:
            current += para + " "
        else:
            pages.append(current.strip())
            current = para + " "
    if current:
        pages.append(current.strip())
    return pages

In [16]:
def extract_docx(file):
    path = os.path.join(os.getcwd(),f'Documents/{file}')
    paragraphs = load_docx_text(path)
    pages = chunk_into_pages(paragraphs)
    return pages    

In [None]:
# def extract_pdf(file):
#     path = os.path.join(os.getcwd(),f'Documents/{file}')
#     doc = fitz.open(path)
#     pages = []
#     for page in doc:
#         text = page.get_text().strip()
#         if text:
#             pages.append(text)
    
#     doc.close()
#     return pages

In [40]:
def extract_pdf(file):
    path = os.path.join(os.getcwd(), f'Documents/{file}')
    doc = fitz.open(path)
    pages = [page.get_text().strip() for page in doc if page.get_text().strip()]
    doc.close()
    return pages

In [41]:
def hash_document_text(text):
    return hashlib.sha256(text.encode('utf-8')).hexdigest()

In [42]:
def extract_full_text(file_path):
    ext = file_path.split('.')[-1].lower()
    file_name = os.path.basename(file_path)
    if ext == 'pdf':
        return "\n".join(extract_pdf(file_name))
    elif ext == 'docx':
        return "\n".join(extract_docx(file_name))
    else:
        return None

In [45]:
def remove_duplicate_documents(doc_folder='Documents'):
    seen_hashes = set()
    doc_path = os.path.join(os.getcwd(), doc_folder)
    files = os.listdir(doc_path)

    # Optional: move duplicates to separate folder
    duplicates_folder = os.path.join(os.getcwd(), "Duplicate_Documents")
    os.makedirs(duplicates_folder, exist_ok=True)

    for file in files:
        full_path = os.path.join(doc_path, file)

        if not file.endswith(('.pdf', '.docx')):
            continue

        text = extract_full_text(full_path)
        if not text:
            continue

        doc_hash = hash_document_text(text.strip())

        if doc_hash in seen_hashes:
            # Duplicate found: move to duplicates folder
            shutil.move(full_path, os.path.join(duplicates_folder, file))
            print(f"❌ Removed duplicate: {file}")
        else:
            seen_hashes.add(doc_hash)

    print("✅ Duplicate removal complete.")

In [46]:
remove_duplicate_documents()

❌ Removed duplicate: 0685i00000MY7fPAAT.pdf
❌ Removed duplicate: 0685i00000CQRfBAAX.docx
❌ Removed duplicate: State of the Industry_Consumer Insights.pdf
❌ Removed duplicate: 0685i00000CPRH4AAP.pdf
❌ Removed duplicate: 0685i00000ED4lqAAD.docx
❌ Removed duplicate: 0685i00000IF0UJAA1.pdf
❌ Removed duplicate: 068Mo00000P7XH8IAN.pdf
❌ Removed duplicate: Trends Transforming Australasia_Simon Kuestenmacher.pdf
❌ Removed duplicate: 0685i00000CR1ZaAAL.docx
❌ Removed duplicate: 0685i00000CQHFBAA5.pdf
❌ Removed duplicate: 0685i00000ED3nMAAT.docx
❌ Removed duplicate: 0685i00000GKLklAAH.docx
❌ Removed duplicate: Member Update_Storage Agreements 2023_F.pdf
❌ Removed duplicate: NZ-Customer-Storage-Agreement-Guidelines.v1.04-January-2023.pdf
❌ Removed duplicate: 0685i00000F5pI6AAJ.docx
❌ Removed duplicate: 0685i00000CQFYFAA5.docx
❌ Removed duplicate: 0685i00000CRAnVAAX.docx
❌ Removed duplicate: 0685i00000GKS1AAAX.pdf
❌ Removed duplicate: 0685i00000ED4iPAAT.docx
❌ Removed duplicate: 0685i00000IGBPQAA5

In [60]:
files = os.listdir('Documents')
len(files)

364

In [53]:
def preprocessing():
    all_pages = []
    metadata = []
    seen_hashes = set()

    path = os.path.join(os.getcwd(), 'Documents')
    files = os.listdir(path)

    for file in files:
        if file.split('.')[-1] == 'docx':
            pages = extract_docx(file)
        elif file.split('.')[-1] == 'pdf':
            pages = extract_pdf(file)
        else:
            continue

        for i, page in enumerate(pages):
            content_hash = hash_text(page.strip())

            if content_hash in seen_hashes:
                continue  # Skip duplicate content

            seen_hashes.add(content_hash)
            all_pages.append(page)
            metadata.append({
                "document": file,
                "page_number": i + 1,
                "text": page
            })

    return all_pages, metadata

In [None]:
# def preprocessing():
#     all_pages = []
#     metadata = []
#     path = os.path.join(os.getcwd(),'Documents')
#     files = os.listdir(path)

#     for file in files:
#         if file.split('.')[-1] in ['docx']:
#             pages = extract_docx(file)
            
#         elif file.split('.')[-1] in ['pdf']:
#             pages = extract_pdf(file)
#         else:
#             continue

#         for i, page in enumerate(pages):
#                 all_pages.append(page)
#                 metadata.append({
#                     "document": file,
#                     "page_number": i + 1,
#                     "text": page
#                 })
#     return (all_pages,metadata)

In [55]:
all_pages, metadata = preprocessing()

In [56]:
len(metadata)

3650

# Data Indexer

In [20]:
def vectorize_pages(pages,model):
    embeddings = model.encode(pages, convert_to_numpy=True)
    return embeddings

In [21]:
def indexer_faiss():
    embeddings = vectorize_pages(all_pages,model)
    dim = embeddings.shape[1]
    index = faiss.IndexFlatL2(dim)
    index.add(embeddings)


    # 1. First time: after indexing
    faiss.write_index(index, "vector_index.faiss")
    with open("metadata.json", "w", encoding="utf-8") as f:
        json.dump(metadata, f)


In [57]:
indexer_faiss()

In [61]:
def search_index(query, index, metadata_list, model, top_k=3):
    query_vec = model.encode([query])
    D, I = index.search(query_vec, top_k)
    results = []
    for idx in I[0]:
        metadata = metadata_list[idx]
        results.append({
            "document": metadata["document"],
            "page_number": metadata["page_number"],
            "text": metadata["text"][:500],  # preview
            "score": D[0][list(I[0]).index(idx)]
        })
    return results

In [65]:

# Loading Indexer and metadata
index = faiss.read_index("vector_index.faiss")
with open("metadata.json", "r") as f:
    metadata_list = json.load(f)

# Example query
query = "What is the purpose of an Alternate Contact Person (ACP) and what rights do they have?"
results = search_index(query, index, metadata_list, model)

print(f'Query : {query}')
# Display results
for res in results:
    print(f"\n📄 Document: {res['document']} | Page: {res['page_number']} | Score: {res['score']:.2f}")
    print(f"📝 Content Preview:\n{res['text']}...")
    print('-' * 80)

Query : What is the purpose of an Alternate Contact Person (ACP) and what rights do they have?

📄 Document: 0685i00000GKKY5AAP.docx | Page: 4 | Score: 0.60
📝 Content Preview:
Alternative Contact Person or ACP means the alternative contact person that You specify in the Schedule. Associate means the ACP, Your agent and/or a third party: (a) who enters the Space (or the Facility) at Your request, invitation or direction; or (b) whose entry to the Space (or the Facility) was facilitated by any of Your acts or omissions, including, but not limited to, the provision of a key, access card, codes or Bluetooth-enabled device; or (c) who gains unauthorised entry to the Space ...
--------------------------------------------------------------------------------

📄 Document: 068Mo00000Oxz9hIAB.pdf | Page: 10 | Score: 0.63
📝 Content Preview:
Access to a space by others – 
alternate contact person
• ACP is an agent whose decisions in relation to the 
agreement are binding on the storer.
• Can be cont

In [66]:
results

[{'document': '0685i00000GKKY5AAP.docx',
  'page_number': 4,
  'text': 'Alternative Contact Person or ACP means the alternative contact person that You specify in the Schedule. Associate means the ACP, Your agent and/or a third party: (a) who enters the Space (or the Facility) at Your request, invitation or direction; or (b) whose entry to the Space (or the Facility) was facilitated by any of Your acts or omissions, including, but not limited to, the provision of a key, access card, codes or Bluetooth-enabled device; or (c) who gains unauthorised entry to the Space ',
  'score': np.float32(0.5995209)},
 {'document': '068Mo00000Oxz9hIAB.pdf',
  'page_number': 10,
  'text': 'Access to a space by others – \nalternate contact person\n• ACP is an agent whose decisions in relation to the \nagreement are binding on the storer.\n• Can be contacted in case the storer cannot be \nreached. Can discuss any matter relating to the \nagreement with them.\nACP can access the Space to remove the goods 

In [5]:
import openai
from dotenv import load_dotenv
load_dotenv()
# Set your OpenAI API key
api_key = os.getenv("OPENAI_API_KEY")

client = openai.OpenAI(api_key=api_key)  # Replace with your actual key

def get_answer_from_openai(query: str, content: str, model: str = "gpt-3.5-turbo-0125") -> str:
    """
    Uses OpenAI's ChatCompletion API to answer a question based on the provided context.

    Args:
        query (str): The user's question.
        content (str): The context/content from which the answer should be extracted.
        model (str): OpenAI model to use (default is gpt-3.5-turbo-0125).

    Returns:
        str: The generated answer from OpenAI.
    """
    try:
        response = client.chat.completions.create(
            model=model,
            messages=[
                {"role": "system", "content": "You are a helpful assistant that answers questions based on the given context."},
                {"role": "user", "content": f"Context: {content}\n\nQuestion: {query}"}
            ],
            temperature=0.2,
            max_tokens=300
        )
        return response.choices[0].message.content.strip()

    except Exception as e:
        return f"Error occurred: {e}"

In [7]:
def generate_answer(query):
    results = search_index(query, index, metadata_list, model)
    context=''
    for res in results:
        context += res['text']
    
    answer = get_answer_from_openai(query,context)
    return answer

In [8]:
query = "deceased storer"
generate_answer(query)

'When a Storer passes away, the Facility can only release the stored goods to the person who has been granted probate by the court. If the Storer dies without a will, it is referred to as "intestate." In such cases, the next of kin will need to provide proof of being the executor or administrator of the deceased\'s estate in order to access the stored items. The executor or administrator should sign the necessary authorization forms to gain access to the storage unit and its contents.'

In [6]:
# Example query
query = "deceased storer"
results = search_index(query, index, metadata_list, model)
context=''
for res in results:
    context += res['text']

print(f'Query : {query}')
# Display results
for res in results:
    print(f"\n📄 Document: {res['document']} | Page: {res['page_number']} | Score: {res['score']:.2f}")
    print(f"📝 Content Preview:\n{res['text']}...")
    print('-' * 80)

answer = get_answer_from_openai(query,context)
print(f'Answer from OpenAI model :\n{answer}')

Query : deceased storer

📄 Document: NZ-Customer-Storage-Agreement-Guidelines.v1.04-January-2023.pdf | Page: 47 | Score: 0.91
📝 Content Preview:
SSAA CUSTOMER STORAGE AGREEMENT GUIDELINES (CSA) 
© Self Storage Association of Australasia 2023 
47 
KJW - 449872/6 - 194743.3 
 
 
 
19. 
DECEASED STORER 
 
 
When a Storer dies, a Facility is not able to ‘release’ goods to any person other than the person to 
whom probate is granted. A probate document is an official court issued document. It will include a 
court stamp. 
 
19.1 Storer dies without a will 
Where a person dies without a will it is called “intestate”. The next of kin will need ...
--------------------------------------------------------------------------------

📄 Document: Self Storage Legal Landscape - Australia_Ritika Sardar and Ethan Holden.pdf | Page: 12 | Score: 0.95
📝 Content Preview:
Access to a space by others – relatives/executor 
when storer deceased (illustration cont.)
• On a storer’s death, the lock should not be