In [1]:
"""
Table QA - RAG approach with tables converted to markdown format.

See https://haystack.deepset.ai/tutorials/22_pipeline_with_promptnode
"""
import os
from pathlib import Path

from haystack import Document
from haystack.nodes import AzureConverter, EmbeddingRetriever, TableReader
from haystack.document_stores import InMemoryDocumentStore
from haystack.pipelines import Pipeline
from haystack.utils import print_answers

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
AZURE_CONVERTER_KEY = os.environ.get("AZURE_CONVERTER_KEY")

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [2]:
converter = AzureConverter(
    endpoint="https://azureconverter.cognitiveservices.azure.com/",
    credential_key=AZURE_CONVERTER_KEY,
    save_json=True
)

PDF_PATH = Path("/home/tomw/unifi-pdf-llm/data/test/Sasol Sustainability Report_2021_22Sep21_10h30_0_0 - short.pdf")

docs = converter.convert(file_path=PDF_PATH, meta=None)

In [3]:
len(docs)

4

In [4]:
def sliding_window(df, window_size):
    """
    Split a DataFrame into smaller DataFrames using a sliding window approach.

    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame to split.

    window_size : int
        The size of the sliding window.

    Returns
    -------
    list of pandas.DataFrame
        A list of DataFrames, each one representing a window of the original DataFrame.
    """
    tables = [df.iloc[i:i+window_size] for i in range(len(df) - window_size + 1)]
    tables = [table.reset_index(drop=True) for table in tables]

    return tables

In [5]:
def split_table(doc, window_size=5):
    """
    Split a table into smaller tables using a sliding window approach.

    Parameters
    ----------
    doc : Document
        The document containing the table to split.

    window_size : int
        The size of the sliding window.

    Returns
    -------
    docs : list[Document]
        A list of documents, each one containing a smaller table.

    Raises
    ------
    ValueError
        If the document does not contain a table.
    """
    if doc.content_type != "table":
        raise ValueError("The document does not contain a table.")

    tables = sliding_window(doc.content, window_size)
    docs = []
    for table in tables:
        new_doc = Document(content=table)
        for attr, value in doc.__dict__.items():
            if attr != 'content':
                setattr(new_doc, attr, value)
        docs.append(new_doc)

    return docs


def split_tables(docs: list[Document], window_size: int=5):
    """
    Return a list of documents, each containing text or a smaller table.

    Parameters
    ----------
    docs : list[Document]
        List of documents.

    window_size : int
        The size of the sliding window to use when splitting tables.

    Returns
    -------
    new_docs : list[Document]
        List of documents, each containing text or a smaller table.
    """
    new_docs = []

    for doc in docs:
        if doc.content_type == "table":
            new_docs.extend(split_table(doc, window_size))
        else:
            new_docs.append(doc)

    return new_docs


In [6]:
docs_split = split_tables(docs)

In [7]:
len(docs_split)

140

In [8]:
def convert_table_to_markdown(doc: Document) -> None:
    """
    Convert table to markdown format in place.

    Parameters
    ----------
    doc : Document
        Document with `content_type` table.

    Raises
    ------
    ValueError
        If `doc.content_type` is not "table".
    """
    if doc.content_type != "table":
        raise ValueError(f"Document content_type must be 'table', not '{doc.content_type}'")

    table = doc.content
    markdown_table = table.to_markdown(tablefmt="github")

    doc.content = markdown_table
    doc.content_type = "text"


def convert_tables_to_markdown(docs: list[Document]) -> None:
    """
    Convert tables to markdown format in place.

    Parameters
    ----------
    docs : List[Document]
        List of Documents with `content_type` table.
    """
    for doc in docs:
        if doc.content_type == "table":
            convert_table_to_markdown(doc)

In [9]:
convert_tables_to_markdown(docs_split)

In [11]:
# TODO: Move to utils.py module

import tiktoken

def num_tokens_from_string(string: str, encoding_name: str="cl100k_base") -> int:
    """Returns the number of tokens in a text string."""
    encoding = tiktoken.get_encoding(encoding_name)
    num_tokens = len(encoding.encode(string))
    return num_tokens

num_tokens_from_string(docs_split[1].content)

379

In [15]:
document_store = InMemoryDocumentStore(embedding_dim=384)

document_store.delete_documents()
document_store.write_documents(docs_split, duplicate_documents='fail')

In [16]:
document_store.get_document_count()

4

In [17]:
# TODO: I'm not sure what OpenAI embedding models are available. Is it possible to use
# their newest embedding models in Haystack v1?

retriever = EmbeddingRetriever(
    embedding_model="sentence-transformers/all-MiniLM-L6-v2", document_store=document_store
)

In [18]:
document_store.update_embeddings(retriever=retriever)

Batches: 100%|██████████| 1/1 [00:00<00:00,  3.39it/s]ocs/s]
Documents Processed: 10000 docs [00:00, 33328.81 docs/s]     


In [29]:
document_store.get_document_count()

4

It seems not all the documents are getting written to the document store...