## Imports

In [18]:
from llama_index import VectorStoreIndex
from llama_index import SimpleDirectoryReader
import logging
import sys
from llama_index import ServiceContext, LLMPredictor, OpenAIEmbedding, PromptHelper
from llama_index.llms import OpenAI
from llama_index.text_splitter import TokenTextSplitter
from llama_index.node_parser import SimpleNodeParser


## Logging setup

In [14]:
# Route all log records (DEBUG and above) to stdout so they appear in the cell output.
# basicConfig already attaches a stdout StreamHandler to the root logger, so no second
# handler is added (that would print every record twice). force=True replaces any
# handlers already on the root logger, which keeps this cell idempotent when re-run.
logging.basicConfig(stream=sys.stdout, level=logging.DEBUG, force=True)

## Data prep

In [None]:


from llama_docs_bot.markdown_docs_reader import MarkdownDocsReader
from llama_index import SimpleDirectoryReader


def load_markdown_docs(filepath: str) -> list:
    """Load the Markdown files under a directory as llama-index documents.

    The directory is searched recursively. Only files whose suffix is
    ``.md`` are read, and each of them is parsed with ``MarkdownDocsReader``.

    Args:
        filepath: Path of the directory to search.

    Returns:
        The documents produced by ``SimpleDirectoryReader.load_data()``.
    """
    loader = SimpleDirectoryReader(
        input_dir=filepath,
        # Allow-list the wanted suffix instead of deny-listing known unwanted
        # ones: a deny-list silently lets through any extension it forgot.
        required_exts=[".md"],
        file_extractor={".md": MarkdownDocsReader()},
        recursive=True,
    )

    return loader.load_data()


In [41]:
# Read every file under ./data/pdfs into llama-index documents.
documents = SimpleDirectoryReader(
    input_dir="./data/pdfs",
).load_data()

DEBUG:llama_index.readers.file.base:> [SimpleDirectoryReader] Total files added: 27
> [SimpleDirectoryReader] Total files added: 27
> [SimpleDirectoryReader] Total files added: 27


In [46]:
# Sanity check: document count, plus metadata and a 180-character preview of the first document.
print(f"Loaded {len(documents)} documents")
print(f"First document metadata: {documents[0].metadata}")
print(f"First document text: {documents[0].text[:180]}")

Loaded 283 documents
First document metadata: {'page_label': '1', 'file_name': 'O6_10_Erosionsschutz_Wein_Obst_Hopfen_2022_12.pdf'}
First document text: Informationsblatt ÖPUL 2023  
Erosionsschutz  Wein, Obst und Ho pfen Seite 1 von 6 www.eama.at  | www.ama.at   
  
 
ÖPUL 2023  
Erosionsschutz  Wein, Obst  und Hopfen  
STAND Deze


For now the documents are split by page. This is, in my opinion, suboptimal: a hierarchical split could be beneficial, as these documents are always structured similarly.

But for now we will accept this and move on to the next step, which is splitting the documents into chunks.

In [48]:
# Completion model used to generate answers (temperature 0, at most 256 output tokens).
llm = OpenAI(
    model="text-davinci-003",
    temperature=0,
    max_tokens=256,
)

# Embedding model, constructed with its default settings.
embed_model = OpenAIEmbedding()

# Token-based chunking: chunk size 1024 with an overlap of 20.
node_parser = SimpleNodeParser.from_defaults(
    text_splitter=TokenTextSplitter(chunk_overlap=20, chunk_size=1024),
)

# Prompt sizing; num_output is set to the same value as the LLM's max_tokens.
prompt_helper = PromptHelper(
    context_window=4096,
    num_output=256,
    chunk_overlap_ratio=0.1,
    chunk_size_limit=None,
)

# Bundle the components above into a single service context.
service_context = ServiceContext.from_defaults(
    llm=llm,
    embed_model=embed_model,
    node_parser=node_parser,
    prompt_helper=prompt_helper,
)



In [49]:

# Build the index with the same service context that is used for querying, so the
# configured node parser and embedding model are applied when the documents are
# chunked and embedded, not only at query time.
index = VectorStoreIndex.from_documents(documents, service_context=service_context)
query_engine = index.as_query_engine(service_context=service_context)


In [None]:
# Run a sample question (in German) against the index and print the answer.
response = query_engine.query("Welche Maßnahme trägt zur Verringerung der Treibhausgasemission bei?")
print(response)
