In [5]:
"""
Table QA - RAG approach with tables converted to markdown format.

See https://haystack.deepset.ai/tutorials/22_pipeline_with_promptnode
"""
import os
from pathlib import Path

from haystack import Document
from haystack.nodes import AzureConverter, EmbeddingRetriever, TableReader
from haystack.document_stores import InMemoryDocumentStore
from haystack.pipelines import Pipeline
from haystack.utils import print_answers

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
AZURE_CONVERTER_KEY = os.environ.get("AZURE_CONVERTER_KEY")

In [22]:
converter = AzureConverter(
    endpoint="https://azureconverter.cognitiveservices.azure.com/",
    credential_key=AZURE_CONVERTER_KEY,
    save_json=True
)

PDF_PATH = Path("/home/tomw/unifi-pdf-llm/data/test/Sasol Sustainability Report_2021_22Sep21_10h30_0_0 - short.pdf")

docs = converter.convert(file_path=PDF_PATH, meta=None)

In [26]:
def convert_table_to_markdown(doc: Document) -> None:
    """
    Convert table to markdown format in place.

    Parameters
    ----------
    doc : Document
        Document with `content_type` table.

    Raises
    ------
    ValueError
        If `doc.content_type` is not "table".
    """
    if doc.content_type != "table":
        raise ValueError(f"Document content_type must be 'table', not '{doc.content_type}'")

    table = doc.content
    markdown_table = table.to_markdown(tablefmt="github")

    doc.content = markdown_table
    doc.content_type = "text"


def convert_tables_to_markdown(docs: list[Document]) -> None:
    """
    Convert tables to markdown format in place.

    Parameters
    ----------
    docs : List[Document]
        List of Documents with `content_type` table.
    """
    for doc in docs:
        if doc.content_type == "table":
            convert_table_to_markdown(doc)

In [27]:
convert_tables_to_markdown(docs)

In [28]:
document_store = InMemoryDocumentStore()

document_store.write_documents(docs)

In [36]:
# TODO: I'm not sure what OpenAI embedding models are available. Is it possible to use
# their newest embedding models in Haystack v1?

retriever = EmbeddingRetriever(
    embedding_model="sentence-transformers/all-MiniLM-L6-v2", document_store=document_store
)

document_store.update_embeddings(retriever=retriever)

config_sentence_transformers.json: 100%|██████████| 116/116 [00:00<00:00, 702kB/s]
modules.json: 100%|██████████| 349/349 [00:00<00:00, 2.55MB/s]
README.md: 100%|██████████| 10.6k/10.6k [00:00<00:00, 52.4MB/s]
sentence_bert_config.json: 100%|██████████| 53.0/53.0 [00:00<00:00, 338kB/s]
config.json: 100%|██████████| 612/612 [00:00<00:00, 4.56MB/s]
pytorch_model.bin: 100%|██████████| 90.9M/90.9M [00:18<00:00, 4.95MB/s]
tokenizer_config.json: 100%|██████████| 350/350 [00:00<00:00, 2.43MB/s]
vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 2.69MB/s]
tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 1.75MB/s]
special_tokens_map.json: 100%|██████████| 112/112 [00:00<00:00, 823kB/s]
1_Pooling/config.json: 100%|██████████| 190/190 [00:00<00:00, 1.37MB/s]
Batches: 100%|██████████| 1/1 [00:01<00:00,  1.51s/it]ocs/s]
Updating Embedding:   0%|          | 0/4 [00:01<?, ? docs/s]


RuntimeError: Embedding dimensions of the model (384) don't match the embedding dimensions of the document store (768). Initiate InMemoryDocumentStore again with arg embedding_dim=384.