In [None]:
# Standard libraries
import os
import sys
import json
import time
import random
from collections import OrderedDict

# Network communication and web scraping
import requests
import urllib.request

# Progress bar
from tqdm import tqdm

# LangChain
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings.openai import OpenAIEmbeddings
from langchain.vectorstores import Chroma, FAISS
from langchain import OpenAI, VectorDBQA
from langchain.chat_models import AzureChatOpenAI, ChatOpenAI
from langchain.chains import RetrievalQAWithSourcesChain
from langchain.docstore.document import Document
from langchain.chains.question_answering import load_qa_chain
from langchain.chains.qa_with_sources import load_qa_with_sources_chain

# Azure AI
from azure.core.credentials import AzureKeyCredential
from azure.ai.formrecognizer import DocumentAnalysisClient, FormRecognizerClient, DocumentModelAdministrationClient

# HTML
import html

# Table formatting
from tabulate import tabulate

# Utilities for PDF parsing
from common.utils import parse_pdf, read_pdf_files, text_to_base64, get_search_results, model_tokens_limit, num_tokens_from_docs, num_tokens_from_string

# Prompt generation
from common.prompts import COMBINE_QUESTION_PROMPT, COMBINE_PROMPT, COMBINE_PROMPT_TEMPLATE

# Environment variables and loading
from dotenv import load_dotenv
load_dotenv("credentials.env")

# Directory and file management
os.makedirs("data/books/",
            exist_ok=True)

# Constants and global variables
BLOB_CONTAINER_NAME = "auflastung"
BASE_CONTAINER_URL = f"https://storagegenaiasinfo.blob.core.windows.net/{BLOB_CONTAINER_NAME}/"
LOCAL_FOLDER = "./data/books"
# Options: gpt-3.5-turbo, gpt-3.5-turbo-16k, gpt-4, or gpt-4-32k
MODEL = "gpt-35-turbo"

os.makedirs(LOCAL_FOLDER,
            exist_ok=True)

# Set ENV variables
os.environ["OPENAI_API_BASE"] = os.environ["AZURE_OPENAI_ENDPOINT"]
os.environ["OPENAI_API_KEY"] = os.environ["AZURE_OPENAI_API_KEY"]
os.environ["OPENAI_API_VERSION"] = os.environ["AZURE_OPENAI_API_VERSION"]
os.environ["OPENAI_API_TYPE"] = "azure"


# Display Python executable path and installed packages
print(sys.executable)
!conda list

In [None]:
# Embedding client; later cells use it to embed both the page text that is
# uploaded to the index and the question that is searched for.
# NOTE(review): chunk_size=1 presumably limits each embedding request to a
# single input - confirm against the deployment's limits.
embedder = OpenAIEmbeddings(deployment="text-embedding-ada-002",
                            chunk_size=1)

## 1 - Manual Document Cracking with Push to Vector-based Index

In [None]:
def list_files_in_folder(folder_path):
    """
    Recursively collect the paths of all PDF files below a folder.

    @param folder_path: str
    Directory at which the recursive search starts.

    @return: list
    One path per file whose name ends in ".pdf" (case-insensitive). Each path
    is the directory reported by os.walk joined with the file name, in the
    order os.walk yields them.
    """
    found = []
    for current_dir, _subdirs, filenames in os.walk(folder_path):
        for filename in filenames:
            # Compare on the lower-cased name so ".PDF" is accepted too
            if not filename.lower().endswith(".pdf"):
                continue
            found.append(os.path.join(current_dir, filename))
    return found



def table_to_html(table):
    """
    Render a table object as an HTML <table> string.

    @param table: object
    Table exposing row_count and cells, where each cell has row_index,
    column_index, column_span, row_span, kind and content.

    @return: str
    HTML markup with one <tr> per row index in range(row_count). Cells are
    ordered by column_index, header cells ("columnHeader"/"rowHeader") use
    <th> and all others <td>, spans greater than 1 become colSpan/rowSpan
    attributes, and the cell content is HTML-escaped.
    """
    # Bucket the cells by row; cells whose row_index lies outside
    # range(row_count) are left out.
    cells_by_row = {row_index: [] for row_index in range(table.row_count)}
    for cell in table.cells:
        if cell.row_index in cells_by_row:
            cells_by_row[cell.row_index].append(cell)

    parts = ["<table>"]
    for row_index in range(table.row_count):
        parts.append("<tr>")
        for cell in sorted(cells_by_row[row_index], key=lambda c: c.column_index):
            # Header cells are emitted as <th>, everything else as <td>
            if cell.kind in ("columnHeader", "rowHeader"):
                tag = "th"
            else:
                tag = "td"
            # Span attributes are only written when the cell spans more than one column/row
            attrs = ""
            if cell.column_span > 1:
                attrs += f' colSpan="{cell.column_span}"'
            if cell.row_span > 1:
                attrs += f' rowSpan="{cell.row_span}"'
            parts.append(f"<{tag}{attrs}>{html.escape(cell.content)}</{tag}>")
        parts.append("</tr>")
    parts.append("</table>")
    return "".join(parts)



# Collect every PDF under LOCAL_FOLDER and print how many were found
file_paths_list = list_files_in_folder(LOCAL_FOLDER)
print(len(file_paths_list))

In [None]:
def parse_custom_pdf(file,
                     form_recognizer=False,
                     formrecognizer_endpoint=None,
                     formrecognizerkey=None,
                     model="prebuilt-document",
                     from_url=False,
                     verbose=False):
    """
    Analyze a PDF document using Azure Form Recognizer and build a page map in
    which every table is replaced by its HTML rendering.

    @param file: Path to the local PDF file to be analyzed.
    @param form_recognizer: Accepted for interface compatibility; not used (Form Recognizer is always used).
    @param formrecognizer_endpoint: Form Recognizer endpoint. When None, the FORM_RECOGNIZER_ENDPOINT environment variable is used.
    @param formrecognizerkey: Form Recognizer key. When None, the FORM_RECOGNIZER_KEY environment variable is used.
    @param model: The model ID used for analysis. Default is "prebuilt-document".
    @param from_url: Accepted for interface compatibility; not used (file is always opened as a local path).
    @param verbose: Accepted for interface compatibility; currently no extra output is produced.

    @return: A list with one tuple per page: (zero-based page index, character
             offset of the page within the concatenated page texts, page text
             with tables as HTML and a trailing space, section headings of the
             page joined by spaces).
    @raise KeyError: If the endpoint or key is neither passed in nor set in the environment.
    """

    # Initialize Form Recognizer; explicit arguments win over the environment
    endpoint = (formrecognizer_endpoint if formrecognizer_endpoint is not None
                else os.environ["FORM_RECOGNIZER_ENDPOINT"])
    key = (formrecognizerkey if formrecognizerkey is not None
           else os.environ["FORM_RECOGNIZER_KEY"])
    form_recognizer_client = DocumentAnalysisClient(endpoint=endpoint,
                                                    credential=AzureKeyCredential(key))

    # Analyze document; the file stays open until the operation has finished
    with open(file, "rb") as document_file:
        poller = form_recognizer_client.begin_analyze_document(model_id=model,
                                                               document=document_file)
        results = poller.result()

    def _first_page_number(element):
        """Page number of the element's first bounding region, or None if it has none."""
        regions = element.bounding_regions
        return regions[0].page_number if regions else None

    # Group section headings and tables by page once, instead of rescanning
    # the whole document for every page. Missing collections count as empty.
    headings_by_page = {}
    for paragraph in (results.paragraphs or []):
        if paragraph.role == "sectionHeading":
            headings_by_page.setdefault(_first_page_number(paragraph), []).append(paragraph.content)

    tables_by_page = {}
    for table in (results.tables or []):
        tables_by_page.setdefault(_first_page_number(table), []).append(table)

    # Initialize variables for page mapping
    page_map = []
    offset = 0

    # Process each page; elements are matched to page_num + 1, as page numbers are 1-based
    for page_num, page in enumerate(results.pages):
        page_number = page_num + 1
        section_heading = " ".join(headings_by_page.get(page_number, []))
        tables_on_page = tables_by_page.get(page_number, [])

        # Character range of this page inside results.content
        if page.spans:
            page_offset = page.spans[0].offset
            page_length = page.spans[-1].offset + page.spans[-1].length - page_offset
        else:
            # A page without spans contributes no text
            page_offset = 0
            page_length = 0

        # For each character of the page: -1 = plain text, otherwise the index
        # (into tables_on_page) of the table that covers it
        table_chars = [-1] * page_length
        for table_id, table in enumerate(tables_on_page):
            for span in table.spans:
                start = max(span.offset - page_offset, 0)
                end = min(span.offset - page_offset + span.length, page_length)
                for idx in range(start, end):
                    table_chars[idx] = table_id

        # Build page text: copy plain characters, and emit each table's HTML
        # once, at the first character that the table covers
        parts = []
        added_tables = set()
        for idx, table_id in enumerate(table_chars):
            if table_id == -1:
                parts.append(results.content[page_offset + idx])
            elif table_id not in added_tables:
                parts.append(table_to_html(tables_on_page[table_id]))
                added_tables.add(table_id)

        # Append page information; every page text ends with a single space
        parts.append(" ")
        page_text = "".join(parts)
        page_map.append((page_num, offset, page_text, section_heading))
        offset += len(page_text)

    return page_map

In [None]:
# Page maps of all parsed books, keyed by the book's file path
book_pages_map = {}

for book in file_paths_list:
    start_time = time.time()

    # Run the layout model on the PDF and store its page map under the file path
    book_pages_map[book] = book_map = parse_custom_pdf(file=book, model="prebuilt-layout", verbose=True)

    # Despite its name, end_time holds the elapsed seconds for this book
    end_time = time.time() - start_time

    # Report the duration and the number of pages found
    print(f"Parsing took: {end_time:.6f} seconds\n{book} contained {len(book_map)} pages")

## Create Vector-based index

In [None]:
# Name of the Azure Search index that is created, filled and queried below.
# Choose the index name here; do not use uppercase letters.
book_index_name = "custom-auflastung-use-headings"

In [None]:
# HTTP headers and query parameters for the Azure Search REST requests
# (index creation and document upload) made in the cells below
headers = {
    'Content-Type': 'application/json',
    'api-key': os.environ['AZURE_SEARCH_KEY'],
}
params = {
    'api-version': os.environ['AZURE_SEARCH_API_VERSION'],
}

In [None]:
# Index definition: one document per page, with the page text in "chunk", its
# 1536-dimensional embedding in "chunkVector", and the page's section headings
# in "sectionheading".
index_payload = {
    "name": book_index_name,
    "fields": [
        {"name": "id", "type": "Edm.String", "key": "true", "filterable": "true" },
        {"name": "title","type": "Edm.String","searchable": "true","retrievable": "true"},
        {"name": "chunk","type": "Edm.String","searchable": "true","retrievable": "true"},
        {"name": "chunkVector","type": "Collection(Edm.Single)","searchable": "true","retrievable": "true","dimensions": 1536,"vectorSearchConfiguration": "vectorConfig"},
        {"name": "name", "type": "Edm.String", "searchable": "true", "retrievable": "true", "sortable": "false", "filterable": "false", "facetable": "false"},
        {"name": "location", "type": "Edm.String", "searchable": "false", "retrievable": "true", "sortable": "false", "filterable": "false", "facetable": "false"},
        {"name": "page_num","type": "Edm.Int32","searchable": "false","retrievable": "true"},
        {"name": "sectionheading","type": "Edm.String","searchable": "true","retrievable": "true"},
    ],
    # Vector search configuration referenced by the "chunkVector" field
    "vectorSearch": {
        "algorithmConfigurations": [
            {
                "name": "vectorConfig",
                "kind": "hnsw"
            }
        ]
    },
    # Semantic configuration: title, content and keyword fields
    "semantic": {
        "configurations": [
            {
                "name": "my-semantic-config",
                "prioritizedFields": {
                    "titleField": {
                        "fieldName": "title"
                    },
                    "prioritizedContentFields": [
                        {
                            "fieldName": "chunk"
                        }
                    ],
                    "prioritizedKeywordsFields": [
                        {
                            "fieldName": "sectionheading"
                        }
                    ]
                }
            }
        ]
    }
}

# Create (or update) the index with a PUT on /indexes/<name>
r = requests.put(os.environ['AZURE_SEARCH_ENDPOINT'] + "/indexes/" + book_index_name,
                 data=json.dumps(index_payload), headers=headers, params=params)
print(r.status_code)
print(r.ok)
if not r.ok:
    # Show the service's error body so a failed index creation can be
    # diagnosed, as the document upload cell does for failed uploads
    print(r.text)

In [None]:
# Embed every page of every book and upload it as its own search document
for bookname, bookmap in book_pages_map.items():
    for page in tqdm(bookmap):
        try:
            # Fields of a page entry used here: [0] zero-based page index,
            # [2] page text, [3] section heading
            page_num = page[0] + 1
            content = page[2]
            sectionheading = page[3]
            # NOTE(review): bookname is the key of book_pages_map; if it is a
            # local file path, the resulting URL contains that path - confirm
            # it matches the blob layout.
            book_url = BASE_CONTAINER_URL + bookname

            upload_payload = {
                "value": [
                    {
                        "id": text_to_base64(bookname + str(page_num)),
                        "title": f"{bookname}_page_{page_num}",
                        "chunk": content,
                        # An empty page is embedded as a placeholder string instead of ""
                        "chunkVector": embedder.embed_query("-------" if content == "" else content),
                        "name": bookname,
                        "location": book_url,
                        "page_num": page_num,
                        "sectionheading": sectionheading,
                        "@search.action": "upload"
                    },
                ]
            }

            r = requests.post(
                f"{os.environ['AZURE_SEARCH_ENDPOINT']}/indexes/{book_index_name}/docs/index",
                data=json.dumps(upload_payload),
                headers=headers,
                params=params,
            )
            # Anything other than HTTP 200 is reported with its response body
            if r.status_code != 200:
                print(r.status_code)
                print(r.text)
        except Exception as e:
            # Best effort: report the failure and move on to the next page
            print(f"Exception: {e}")

## Query the Index

In [None]:
# Alternative sample questions (uncomment one to use it instead):
# QUESTION = "what normally rich dad do that is different from poor dad?"
# QUESTION = "Tell me a summary of the book Boundaries"
# QUESTION = "Dime que significa la radiacion del cuerpo negro"
# QUESTION = "what is the acronym of the main point of Made to Stick book"
# QUESTION = "who won the soccer worldcup in 1994?" # this question should have no answer
# QUESTION = "Show the file with document number D1330781"
# NOTE(review): "standart" below looks like a typo for "standard". It is part
# of the query text passed to the search and to the chain, so it is left as is.
QUESTION = "Show all the compliance references for the standart EN 45502"

In [None]:
# Indexes to search: only the page-level index named by book_index_name
vector_indexes = [book_index_name]

# Search for QUESTION, passing its embedding as the query vector.
# NOTE(review): the exact meaning of k, reranker_threshold and similarity_k is
# defined by get_search_results in common/utils.py - confirm there.
ordered_results = get_search_results(QUESTION,
                                     vector_indexes,
                                     k=10,
                                     reranker_threshold=1,
                                     vector_search=True,
                                     similarity_k=2,
                                     query_vector=embedder.embed_query(QUESTION))

**Note**: we are picking a larger k=10 because these chunks are NOT 5000 characters each as in prior notebooks; instead, each page is a chunk.

In [None]:
# Maximum number of tokens the model may generate; a later cell also adds this
# value to the requested-token estimate
COMPLETION_TOKENS = 1000
# Chat model client for the deployment named by MODEL
llm = AzureChatOpenAI(deployment_name=MODEL,
                      temperature=0.5,
                      max_tokens=COMPLETION_TOKENS)

In [None]:
# Wrap each search result in a Document whose "source" is the result's
# location (empty when missing) followed by the blob SAS token
top_docs = []
for key, value in ordered_results.items():
    location = "" if value["location"] is None else value["location"]
    top_docs.append(
        Document(
            page_content=value["chunk"],
            metadata={"source": location + os.environ['BLOB_SAS_TOKEN']},
        )
    )

print(f"Number of chunks: {len(top_docs)}")

In [None]:
# Calculate number of tokens of our docs and choose the chain type from it.
# Start from None so that a run without search results leaves chain_type
# defined and cannot keep a value chosen by an earlier execution of this cell.
chain_type = None

if top_docs:
    tokens_limit = model_tokens_limit(MODEL) # this is a custom function we created in common/utils.py
    prompt_tokens = num_tokens_from_string(COMBINE_PROMPT_TEMPLATE) # this is a custom function we created in common/utils.py
    context_tokens = num_tokens_from_docs(top_docs) # this is a custom function we created in common/utils.py

    # Total budget needed: prompt + retrieved context + room for the answer
    requested_tokens = prompt_tokens + context_tokens + COMPLETION_TOKENS

    # Above 90% of the model's limit, use "map_reduce"; otherwise "stuff"
    chain_type = "map_reduce" if requested_tokens > 0.9 * tokens_limit else "stuff"

    print(f"""System prompt token count: {prompt_tokens}
    Max Completion Token count: {COMPLETION_TOKENS}
    Combined docs (context) token count: {context_tokens}
    --------
    Requested token count: {requested_tokens}
    Token limit for {MODEL}: {tokens_limit}
    Chain Type selected: {chain_type}""")

else:
    print("NO RESULTS FROM AZURE SEARCH")

In [None]:
# Build the question-answering chain that matches the selected chain type
if chain_type == "stuff":
    # "stuff": a single prompt (COMBINE_PROMPT) is used
    chain = load_qa_with_sources_chain(llm,
                                       chain_type=chain_type,
                                       prompt=COMBINE_PROMPT)
elif chain_type == "map_reduce":
    # "map_reduce": a per-question prompt plus a combine prompt; the
    # intermediate steps are returned as well
    chain = load_qa_with_sources_chain(llm,
                                       chain_type=chain_type,
                                       question_prompt=COMBINE_QUESTION_PROMPT,
                                       combine_prompt=COMBINE_PROMPT,
                                       return_intermediate_steps=True)
else:
    # Fail here with a clear message instead of leaving `chain` undefined (or
    # left over from an earlier run) for the next cell
    raise ValueError(f"Unsupported chain_type {chain_type!r}; expected 'stuff' or 'map_reduce'")

In [None]:
%%time
# Run the chain on the retrieved documents; the answer is read from
# response['output_text'] in the next cell.
# Try with other language as well
response = chain({"input_documents": top_docs,
                  "question": QUESTION,
                  "language": "English"})

In [None]:
# Markdown is not imported by the notebook's import cell, so bring it (and
# display) into scope here; otherwise this cell fails with NameError on a
# fresh kernel.
from IPython.display import Markdown, display

# Render the chain's answer text as Markdown
display(Markdown(response['output_text']))