## Imports

In [2]:
### Imports
# Import packages
from llama_index import VectorStoreIndex
from llama_index import SimpleDirectoryReader
import logging
import sys
from llama_index import ServiceContext, LLMPredictor, OpenAIEmbedding, PromptHelper
from llama_index.llms import OpenAI
from llama_index.text_splitter import TokenTextSplitter
from llama_index.node_parser import SimpleNodeParser
from oepul_chat.readers.custom_pdf_reader import CustomPDFReader
from oepul_chat.readers.custom_html_reader import CustomHTMLReader
from oepul_chat.readers.custom_full_pdf_reader import CustomFullPDFReader
from oepul_chat.rag_oepul_string_query_engine import RAGOEPULStringQueryEngine
from llama_index import SimpleDirectoryReader
import random
from llama_index.schema import MetadataMode
from langchain.embeddings.huggingface import HuggingFaceEmbeddings
from llama_index import LangchainEmbedding, ServiceContext
from llama_index.llms import LangChainLLM
from llama_index import download_loader
from llama_index.query_engine import RetrieverQueryEngine
from llama_index import DocumentSummaryIndex, get_response_synthesizer
from llama_index.prompts import PromptTemplate
# import QueryBundle
from llama_index import QueryBundle

# import NodeWithScore
from llama_index.schema import NodeWithScore

# Retrievers
from llama_index.retrievers import (
    BaseRetriever,
    VectorIndexRetriever,
    KeywordTableSimpleRetriever,
)

from llama_index import (
    VectorStoreIndex,
    SimpleKeywordTableIndex,
    SimpleDirectoryReader,
    ServiceContext,
    StorageContext,
)

from typing import List
import pickle


from IPython.core.display import display, HTML




PDFReader = download_loader("PDFReader")


# Import local library
import os
import sys
sys.path.append(os.path.join(os.getcwd(), '..'))

# Autoreload local library
%load_ext autoreload
%autoreload 2

### Utilities
def view_response(obj):
    export = """<div style="font-size: 14px;line-height: 1.5;"><strong>Antwort</strong>:<br>"""
    export += f"{obj.response}<br><br>"
    export += """<strong>Quellen:</strong><br>"""
    export += """<ul>"""

    for source_node in obj.source_nodes:
        export += f"<li>"
        export += f"<strong>{source_node.node.metadata['File Name']}</strong><br>"

        export += f"<strong>Header Path</strong>: {source_node.node.metadata['Header Path']}<br>"
        export += f"<strong>Score</strong>: {source_node.score}<br>"
        export += f"<strong>Text</strong>: <i>{source_node.node.text}</i><br>"
        # export += f"<strong>Metadata</strong>: {source_node.node.metadata}<br>"
        export += "<br></li>"

    export += """</ul></div>"""

    display(HTML(export))


def view_docs(docs):
    export = """<div style="font-size: 14px;line-height: 1.5;"><ul>"""
    for doc in docs:
        export += f"<li>"
        export += f"<strong>{doc.metadata['File Name']}</strong><br>"
        export += f"<strong>Header Path</strong>: {doc.metadata['Header Path']}<br>"
        export += f"<strong>Text</strong>: <i>{doc.text}</i><br>"
        # export += f"<strong>Metadata</strong>: {source_node.node.metadata}<br>"
        export += "<br></li>"
    export += """</ul></div>"""
    display(HTML(export))


def load_data(filepath, filetype, reader):
    """Load markdown docs from a directory, excluding all other file types."""
    print(f'loading data... {filepath}')
    loader = SimpleDirectoryReader(
        input_dir=filepath,
        file_extractor={filetype: reader},
        recursive=True
    )

    data = loader.load_data()

    # print short summary
    print("Loaded {} documents".format(len(data)))
    print("First document metadata: {}".format(data[1].metadata))
    print("First document text: {}".format(data[1].text[0:80]))

    return data

  from IPython.core.display import display, HTML


## Logging

In [None]:
### Logging setup
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG)
logging.getLogger().addHandler(logging.StreamHandler(stream=sys.stdout))

### Normal Embedding index for all docs
I built a CustomPDFReader which can be found in `oepul_chat/custom_pdf_reader.py` it extracts the structure out of the PDF and embeds it in the metadata field `header_path` i think this is one od the first crucial steps as it gives each text element a context in all of the files. With llama index we can then give this fiel to the retriever or the LLM or both. For the AMA Docs and Bio Austria Guide the files are loaded via the default loaders.

In [93]:
# load all official OEPUL docs with custom PDF reader
oepul_official_docs = load_data("data/OEPUL_PDF/", ".pdf", CustomPDFReader())
# # load html guide from BIO Austria
bio_austria_guide = load_data("data/BIO_AUSTRIA", ".html", CustomHTMLReader())
# # Load rest of AMA docs with simple pdf reader
ama_official_docs = load_data("data/AMA", ".pdf", PDFReader())

# merge documents lists
docs_list = [oepul_official_docs]# [oepul_official_docs, ama_official_docs, bio_austria_guide]  #
documents = [doc for docs in docs_list
             for doc in docs]

# Hide certain metadata form llm and embed
for doc in documents:
    doc.excluded_llm_metadata_keys = ["Content Type", "page_label"]
    doc.excluded_embed_metadata_keys = ["Content Type"]

# write docs to pickle
with open('data/documents.pickle', 'wb') as f:
    pickle.dump(documents, f)

loading data... data/OEPUL_PDF/
Loaded 442 documents
First document metadata: {'File Name': 'O6_14_Almbewirtschaftung_2023_04.pdf', 'Content Type': 'text', 'Header Path': 'Almbewirtschaftung/ÖPUL 2023'}
First document text: Almbewirtschaftung STAND April 2023
loading data... data/BIO_AUSTRIA
Loaded 38 documents
First document metadata: {'File Name': 'aktueller-planungsstand-zu-bio-im-oepul-2023.html', 'Content Type': 'text', 'Header Path': 'Bio-Maßnahme/Bio-Basisprämie:/Auflagen Bio-Basisprämie:/Anlage von Biodiversitäts-Flächen/Vier Möglichkeiten im Grünland:/Einzuhalten bei Biodiversitäts-Flächen am Acker:/Optionale Module (einjährig):/Acker/Grünland:/N2000/WRRL(Wasserrahmenrichtlinie):/Beitrag teilen/Bio-Maßnahme/Bio-Basisprämie:/Auflagen Bio-Basisprämie:/Anlage von Biodiversitäts-Flächen/Vier Möglichkeiten im Grünland:/Einzuhalten bei Biodiversitäts-Flächen am Acker:/Optionale Module (einjährig):/Acker/Grünland:/N2000/WRRL(Wasserrahmenrichtlinie):/Beitrag teilen', 'tag': 'p'}
First

In [94]:
# use cached docs instead
with open('data/documents.pickle', 'rb') as f:
    documents = pickle.load(f)

In [95]:
# Define node parser
node_parser = SimpleNodeParser.from_defaults(
    text_splitter=TokenTextSplitter(chunk_size=800, chunk_overlap=20)
)

# Parse nodes from docs 
nodes = node_parser.get_nodes_from_documents(documents)

# Create VectorStoreIndex
index = VectorStoreIndex(nodes)

# Persist it for later use
index.storage_context.persist(persist_dir="indices/vector_index/")

# Summary index

In [None]:
# Read in each doc as one file
full_docs = oepul_official_docs = load_data(
    "data/", ".pdf", CustomFullPDFReader())

# Use chatgpt model
chatgpt = OpenAI(temperature=0, model="gpt-3.5-turbo-1106")

# Create service context
service_context = ServiceContext.from_defaults(llm=chatgpt)

# Custom prompt for summarization
summary_prompt = PromptTemplate(
    "Du bist ein System welches Zusammenfassungen von Maßnahmen für Landwirte in Österreich aus dem Programm Österreichischen Programm für umweltgerechte Landwirtschaft kurz OEPUL erstellt.\n"
    "Hier die Informationen zu den ÖPUL Förderungen/ Maßnahmen:\n"
    "---------------------\n"
    "{context_str}\n"
    "---------------------\n"
    "Fasse die Maßnahme zusammen, achte besonders auf die Bedingungen und Förderhöhen.\n"
    "Der Landwirt sollte schnell erfassen können, ob die Maßnahme für ihn in Frage kommt.\n"
    "Zusammenfassung: "
)


# Create response synthesizer which summarizes the nodes with custom prompt
response_synthesizer = get_response_synthesizer(
    response_mode="tree_summarize",
    summary_template=summary_prompt
)

# Create document summary index
doc_summary_index = DocumentSummaryIndex.from_documents(
    documents=full_docs,
    service_context=service_context,
    response_synthesizer=response_synthesizer,
    show_progress=True,
)

# Persist it for later use
index.storage_context.persist(persist_dir="indices/summary_index/")

## Tree index

# Compare indices