In [2]:
# !pip install -qU llama-index
# !pip install llama-cpp-python
#!conda install -c conda-forge gradio
#!pip install pycryptodome==3.15.0
#!pip install pymupdf
#!pip install gradio

In [11]:
import os
import re
import pandas as pd

import gradio as gr

from tqdm import tqdm
from llama_index import SimpleDirectoryReader
from pathlib import Path

In [12]:
from llama_index import download_loader
from llama_index import (KeywordTableIndex,
                         GPTVectorStoreIndex,
                         SimpleDirectoryReader,
                         load_index_from_storage,
                         LLMPredictor,
                         StorageContext)
from llama_index.llms import OpenAI

In [13]:
# pdf_reader = download_loader("PDFReader")
# loader = pdf_reader()

# all_documents = []

# # Assuming pdf_folder_path is the path to your folder of PDFs
# pdf_folder_path = "./pdf_files/"

# # Iterate through each file in the folder
# for filename in tqdm(os.listdir(pdf_folder_path)):
#     # Check if the file is a PDF
#     if filename.endswith(".pdf"):
#         # Construct the full path to the PDF file
#         pdf_path = os.path.join(pdf_folder_path, filename)
#         # Load and process the PDF file
#         documents = loader.load_data(file=pdf_path)
#         all_documents.extend(documents)


In [32]:
# Pick the first entry of the PDF folder as a quick smoke-test document.
# NOTE(review): pdf_folder_path is assigned in a later cell of this notebook;
# on a fresh kernel that cell has to run before this one.
first_entry = os.listdir(pdf_folder_path)[0]
test_doc = os.path.join(pdf_folder_path, first_entry)
test_doc

'./pdf_files/ncc-volume-three-20221222.pdf'

In [33]:
# Load the sample PDF through the PDF loader (presumably one document per page -- confirm).
# NOTE(review): `loader` is created in a later cell; on a fresh kernel run that cell first.
docs = loader.load_data(file=Path(test_doc))

## Loading Documents

In [49]:
# Folder that holds the source PDFs to load and index.
pdf_folder_path = "./pdf_files/"

In [50]:
# Fetch the PDFReader loader class via download_loader and create one reader
# instance that is reused for every PDF.
PDFReader = download_loader("PDFReader")
loader = PDFReader()

### Cleaning the documents
- Drop the last page of each document.
- Use a regex to find the first page containing "About the NCC" and discard every page before it.

In [91]:
def clip_non_content_in_doc(doc):
    """Trim the front matter and the final page from a loaded PDF.

    Pages are scanned in order for the first one whose text contains the
    heading "About the NCC" (as a whole phrase). Everything before that page
    is discarded, and so is the last page.

    Args:
        doc: Sequence of page objects, each exposing a ``.text`` string.

    Returns:
        ``doc[first_match:-1]`` when the heading is found; otherwise
        ``doc[1:-1]`` (only the first and last page removed). An empty
        input yields an empty result.
    """
    word_to_find = "About the NCC"
    # Only the presence of the heading matters, so the pattern stops at the
    # word boundary instead of lazily capturing the rest of the page.
    pattern = re.compile(rf'{re.escape(word_to_find)}\b')

    # enumerate() stops at the last page; the previous manual index loop
    # stepped one past the end and raised IndexError when nothing matched.
    for page_idx, page in enumerate(doc):
        if pattern.search(page.text):
            print('Match found...clipping...', end='')
            return doc[page_idx:-1]  # also removes the last page

    # No page carries the heading: remove the first and last page only.
    return doc[1:-1]

In [92]:
# Accumulator for the clipped pages of every PDF. The loading loop extends this
# list, so re-run this cell before re-running the loop to avoid duplicates.
all_documents = []

In [93]:
# Walk the PDF folder, trim the non-content pages of each file and pool the
# remaining pages into all_documents.
for filename in tqdm(os.listdir(pdf_folder_path)):
    if not filename.endswith(".pdf"):
        continue  # ignore anything that is not a PDF
    pdf_path = os.path.join(pdf_folder_path, filename)
    # load the file, then clip its pages
    documents = clip_non_content_in_doc(loader.load_data(file=Path(pdf_path)))
    all_documents += documents
    print('loaded')

 33%|████████▎                | 1/3 [00:12<00:25, 12.88s/it]

Match found...clipping...loaded


 67%|████████████████▋        | 2/3 [00:24<00:12, 12.04s/it]

Match found...clipping...loaded


100%|█████████████████████████| 3/3 [00:44<00:00, 14.71s/it]

Match found...clipping...loaded





## Loading essential components

In [7]:
# Credentials must never be stored in the notebook. Use the key already present
# in the environment; only when it is missing, prompt for it without echoing.
# NOTE: the key that used to be hardcoded in this cell has been exposed and
# should be revoked.
if not os.environ.get("OPENAI_API_KEY"):
    from getpass import getpass
    os.environ["OPENAI_API_KEY"] = getpass("OpenAI API key: ")

In [9]:
from llama_index.llms.openai import OpenAI  # Import the OpenAI class from Llama Index
from llama_index import ServiceContext, VectorStoreIndex
from llama_index.embeddings.openai import OpenAIEmbedding
import openai


# Read the API key from the environment instead of embedding it in the notebook.
# NOTE: the key that used to be hardcoded here has been exposed and should be revoked.
openai_api_key = os.environ.get("OPENAI_API_KEY")
if not openai_api_key:
    raise RuntimeError("OPENAI_API_KEY is not set; export it before running this cell.")

openai.api_key = openai_api_key

# Define the OpenAI model
llm = OpenAI(
    model="gpt-3.5-turbo",  # chat model used to synthesise answers
    api_key=openai_api_key,
)

# Define Embedding Model
embed_model = OpenAIEmbedding(model="text-embedding-ada-002")

# Bundle the LLM and the embedding model so index construction and querying
# share the same configuration.
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model,
)

# Create index
# index = VectorStoreIndex.from_documents(documents, service_context=service_context)


In [10]:
# Directory where the vector index is persisted and from which it is reloaded.
data_storage_index_dir = "ncc_index_storage"

In [11]:
# Reuse the persisted index when it exists; otherwise build it from the loaded
# documents and persist it so later runs skip the OpenAI embedding calls.
# os.path.isdir also works when the storage location is a nested path, which a
# name lookup in os.listdir() of the working directory does not.
if os.path.isdir(data_storage_index_dir):
    print(f'found vector_index at {data_storage_index_dir}. Loading...', end='')
    # rebuild the storage context from the persisted files
    storage_context = StorageContext.from_defaults(persist_dir=data_storage_index_dir)

    # load the index with the same service_context that is used when building it,
    # so the reloaded index does not fall back to default LLM/embedding settings
    index = load_index_from_storage(
        storage_context,
        index_id='vector_index',
        service_context=service_context,
    )

    print('done!')
else:
    # create the index --- takes a few minutes to run
    print(f'{data_storage_index_dir} vector index not found. Creating index...', end='')
    index = GPTVectorStoreIndex.from_documents(all_documents, service_context=service_context)
    # report completion only after the index has actually been built
    print('done!')
    print(f'storing vector_index at {data_storage_index_dir}...', end='')
    index.set_index_id('vector_index')
    index.storage_context.persist(data_storage_index_dir)
    print('done!')
    

found vector_index at ncc_index_storage. Loading...done!


In [12]:
# Create a query engine on top of the index; every question below goes through it.
query_engine = index.as_query_engine()

## Testing

In [14]:
# Run one sample question through the query engine (this can take a while to return).
response = query_engine.query('What are United buildings?')
# Show the answer text as the cell output.
response.response

'United buildings are buildings that are deemed as one when two or more buildings that are adjoining each other are connected and used as a single building. In order to be considered a united building, the buildings must be connected through openings in the walls dividing them and together comply with all the requirements of the NCC (Building Code of Australia) as if they were a single building. United buildings typically apply to Class 2 to 9 buildings and are not required to comply with additional NCC provisions. However, if an external wall becomes an internal wall as a result of the interconnection, it must comply with the requirements for an internal wall. If interconnected buildings do not jointly comply with all the requirements applicable to a single building, they remain as separate buildings.'

In [42]:
# List the source documents behind the answer; the argument presumably caps the
# length of each quoted excerpt (the output below is truncated accordingly).
response.get_formatted_sources(50)

'> Source (Doc id: 58f784ef-6344-4f3b-bd12-33298728d813): Governing requirements\nNCC 2022 Volume Two - Bu...\n\n> Source (Doc id: 570e6b71-3ddf-4dd7-8952-488d66072480): Governing requirements\nNCC 2022 Volume One - Bu...'

In [17]:
#print(documents[20].text)

In [18]:
# Evaluation prompts; trailing comments note the expected answer where known.
# NOTE(review): spelling mistakes inside the prompts are sent to the model
# verbatim and are deliberately left unchanged here.
queries = ['What are United buildings?',
           'Which class building provides long term accomodation for a no. of unrelated people. Explain in detail', # Class 3 buildings
           'Define the difference between Class 2 building and Class 4 building?',
           'What is JAS-ANZ?', # Joint Accreditation System of Australia and New Zealand
           'Explain the different type of Other legistlation affecting buildings!',
           'Whats the topic of section G, and explin in detail?', # Ancillary provisions
           'Give the content for Schedule 8 `South Australia` in the pattern: Section A - Governing requirements, Section B - Structure, ...',
           'Define Section D: Access and egress'
          ]

In [19]:
# Ask every evaluation question once and gather the answers column-wise,
# ready to be turned into a table.
answers = [query_engine.query(question).response for question in tqdm(queries)]

result = {
    'queries': list(queries),
    'responses': answers,
    # every row is tagged with the engine that produced it
    'index_engine': ['gpt3.5-index'] * len(answers),
}
    

100%|████████████████████████████████████████████████████████████████████████████████████| 8/8 [02:39<00:00, 19.95s/it]


In [21]:
# Turn the collected columns into a table: one row per question.
result_df = pd.DataFrame(result)

In [22]:
# Display the question/answer table.
result_df

Unnamed: 0,queries,responses,index_engine
0,What are United buildings?,United buildings are buildings that are deemed...,gpt3.5-index
1,Which class building provides long term accomo...,Class 3 buildings provide long-term accommodat...,gpt3.5-index
2,Define the difference between Class 2 building...,Class 2 buildings are buildings that contain t...,gpt3.5-index
3,What is JAS-ANZ?,JAS-ANZ is an accreditation body that accredit...,gpt3.5-index
4,Explain the different type of Other legistlati...,The other legislation affecting buildings can ...,gpt3.5-index
5,"Whats the topic of section G, and explin in de...","The topic of section G is ""Ancillary provision...",gpt3.5-index
6,Give the content for Schedule 8 `South Austral...,Section A - Governing requirements\nSection B ...,gpt3.5-index
7,Define Section D: Access and egress,Section D: Access and egress focuses on specif...,gpt3.5-index


In [23]:
# Export the table to an Excel workbook, leaving out the row index.
result_df.to_excel("answer.xlsx", index=False)

## Gradio demo

In [24]:
def pdf_bot_response(user_query, history):
    """Answer one chat message using the document query engine.

    Args:
        user_query: The message typed by the user.
        history: Earlier chat turns supplied by the chat UI; not used.

    Returns:
        The ``response`` attribute of the query engine's result.
    """
    answer = query_engine.query(user_query)
    return answer.response

# Wrap the answer function in a Gradio chat interface.
demo = gr.ChatInterface(pdf_bot_response)

# NOTE(review): share=True asks Gradio to create a public share link; anyone who
# has that link could send queries that are billed to this notebook's OpenAI key.
# Confirm this is intended, or use share=False for local-only access.
demo.launch(share=True)

Running on local URL:  http://127.0.0.1:7870

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.


