In [110]:
"""
Table QA - RAG approach with tables converted to markdown format.

See https://haystack.deepset.ai/tutorials/22_pipeline_with_promptnode
"""
import os
from pathlib import Path

from haystack import Document
from haystack.nodes import AzureConverter, EmbeddingRetriever, PromptNode, PromptTemplate, AnswerParser
from haystack.document_stores import InMemoryDocumentStore
from haystack.pipelines import Pipeline
from haystack.utils import print_answers

# Credentials are read from the environment so no secret is stored in the notebook.
# Either value is None when the corresponding variable is not set.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
AZURE_CONVERTER_KEY = os.getenv("AZURE_CONVERTER_KEY")

In [93]:
# PDF used as the input document for the rest of the notebook.
PDF_PATH = Path("/home/tomw/unifi-pdf-llm/data/test/Sasol Sustainability Report_2021_22Sep21_10h30_0_0 - short.pdf")

# Converter backed by the Azure endpoint below; the key comes from the environment.
# NOTE(review): save_json=True presumably persists the raw service response - confirm.
converter = AzureConverter(
    endpoint="https://azureconverter.cognitiveservices.azure.com/",
    credential_key=AZURE_CONVERTER_KEY,
    save_json=True,
)

# Convert the PDF into documents without attaching extra metadata. Later cells
# expect table documents (content_type == "table") to hold a pandas DataFrame.
docs = converter.convert(file_path=PDF_PATH, meta=None)

In [94]:
def sliding_window(df, window_size):
    """
    Split a DataFrame into overlapping row windows using a sliding window approach.

    Consecutive windows are offset by one row, so a frame with ``n`` rows and
    ``n >= window_size`` yields ``n - window_size + 1`` windows.

    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame to split.

    window_size : int
        The number of rows in each window. Must be at least 1.

    Returns
    -------
    list of pandas.DataFrame
        A list of DataFrames, each one representing a window of the original DataFrame.
        A frame with fewer rows than `window_size` is returned as a single window
        holding all of its rows. An empty frame yields an empty list.

    Raises
    ------
    ValueError
        If `window_size` is smaller than 1.
    """
    if window_size < 1:
        raise ValueError(f"window_size must be at least 1, not {window_size}")

    n_rows = len(df)
    if n_rows == 0:
        return []

    # A table shorter than the window must still produce one (short) window;
    # otherwise it would silently vanish from the output.
    n_windows = max(n_rows - window_size + 1, 1)

    return [df.iloc[start:start + window_size] for start in range(n_windows)]

In [95]:
def split_table(doc, window_size=5):
    """
    Break one table document into several documents holding smaller tables.

    The table in `doc.content` is cut into row windows with `sliding_window`.
    Each window becomes the content of a new `Document`. Every other attribute
    of `doc` except `id` is assigned onto the new document as-is (the values
    are not copied), so each new document keeps the id generated at construction.

    Parameters
    ----------
    doc : Document
        The document containing the table to split.

    window_size : int
        The size of the sliding window.

    Returns
    -------
    docs : list[Document]
        A list of documents, each one containing a smaller table.

    Raises
    ------
    ValueError
        If the document does not contain a table.
    """
    if doc.content_type != "table":
        raise ValueError("The document does not contain a table.")

    # Attributes carried over unchanged; content and id are specific to each piece.
    inherited = {
        name: value
        for name, value in doc.__dict__.items()
        if name not in ("content", "id")
    }

    def _make_piece(window):
        piece = Document(content=window)
        for name, value in inherited.items():
            setattr(piece, name, value)
        return piece

    return [_make_piece(window) for window in sliding_window(doc.content, window_size)]


def split_tables(docs: list[Document], window_size: int=5):
    """
    Expand every table document in `docs` into smaller table documents.

    Documents whose `content_type` is "table" are replaced, in position, by the
    result of `split_table`. All other documents are passed through untouched.

    Parameters
    ----------
    docs : list[Document]
        List of documents.

    window_size : int
        The size of the sliding window to use when splitting tables.

    Returns
    -------
    new_docs : list[Document]
        List of documents, each containing text or a smaller table.
    """
    expanded = []

    for item in docs:
        is_table = item.content_type == "table"
        pieces = split_table(item, window_size) if is_table else [item]
        expanded += pieces

    return expanded


In [96]:
# Replace each table document with its sliding-window sub-tables; other documents are kept as-is.
docs_split = split_tables(docs)

In [97]:
# Number of documents after splitting the tables.
len(docs_split)

140

In [98]:
def convert_table_to_markdown(doc: Document) -> None:
    """
    Replace a table document's content with its markdown rendering, in place.

    After the call `doc.content` holds the markdown string (GitHub table
    format) and `doc.content_type` is set to "text". Nothing is returned.

    Parameters
    ----------
    doc : Document
        Document with `content_type` table.

    Raises
    ------
    ValueError
        If `doc.content_type` is not "table".
    """
    is_table = doc.content_type == "table"
    if not is_table:
        raise ValueError(f"Document content_type must be 'table', not '{doc.content_type}'")

    # Render first, then relabel, so the document is only marked as text once
    # its content really is a string.
    doc.content = doc.content.to_markdown(tablefmt="github")
    doc.content_type = "text"


def convert_tables_to_markdown(docs: list[Document]) -> None:
    """
    Convert every table document in `docs` to markdown, in place.

    Documents whose `content_type` is not "table" are left untouched.

    Parameters
    ----------
    docs : List[Document]
        List of Documents; only those with `content_type` table are converted.
    """
    # The generator filters lazily, so each document is checked right before
    # it is converted.
    for table_doc in (item for item in docs if item.content_type == "table"):
        convert_table_to_markdown(table_doc)

In [99]:
# Convert the table documents in docs_split to markdown text. This mutates the documents in place.
convert_tables_to_markdown(docs_split)

In [100]:
# TODO: Move to utils.py module

import tiktoken

def num_tokens_from_string(string: str, encoding_name: str="cl100k_base") -> int:
    """
    Count the tokens `string` encodes to under the named tiktoken encoding.

    Parameters
    ----------
    string : str
        Text to tokenize.

    encoding_name : str
        Name of the tiktoken encoding to look up.

    Returns
    -------
    int
        Number of tokens produced by encoding `string`.
    """
    tokens = tiktoken.get_encoding(encoding_name).encode(string)
    return len(tokens)

# Token count of the second document's content, using the default encoding.
num_tokens_from_string(docs_split[1].content)

379

In [101]:
# TODO: Try to use other document stores (e.g. FAISS).

# NOTE(review): embedding_dim=384 presumably matches the output size of the embedding
# model given to the retriever in a later cell - confirm if that model changes.
document_store = InMemoryDocumentStore(embedding_dim=384)

# Empty the store, then index the split documents.
document_store.delete_documents()
document_store.write_documents(docs_split)

In [102]:
# TODO: I'm not sure what OpenAI embedding models are available. Is it possible to use
# their newest embedding models in Haystack v1?

# TODO: Look into other (non-OpenAI) embedding models that can be used with Haystack v1.

# Embedding retriever using a sentence-transformers model, reading from document_store.
retriever = EmbeddingRetriever(
    embedding_model="sentence-transformers/all-MiniLM-L6-v2", document_store=document_store
)

In [103]:
# Have the store update its document embeddings using the retriever.
document_store.update_embeddings(retriever=retriever)

Batches: 100%|██████████| 5/5 [00:00<00:00, 10.28it/s] docs/s]
Documents Processed: 10000 docs [00:00, 19648.59 docs/s]       


In [136]:
# Test the retriever on its own, outside the pipeline.

# Retrieve the three best matches for the query.
retrieved_tables = retriever.retrieve("What was the GHG Scope 2 emissions in the year 2021?", top_k=3)

# NOTE(review): index 2 is the last of the three retrieved documents, not the first. Presumably results are ordered best-first, so this prints the lowest-ranked hit rather than the highest-scored one - confirm which was intended.
print(retrieved_tables[2].content)


Batches: 100%|██████████| 1/1 [00:00<00:00, 59.18it/s]

|    | Natural Capital - Our environment            | Footnote   | 2021   | 2020   | 2019   | 2018   | Level of assurance 2021   |
|----|----------------------------------------------|------------|--------|--------|--------|--------|---------------------------|
|  7 | Americas                                     |            | 1 225  | 1 695  | 688    | 707    |                           |
|  8 | Mozambique                                   |            | 42     | 46     | 53     | 54     |                           |
|  9 | Other strategic business units and Functions |            | 782    | 679    | 745    | 659    |                           |
| 10 | Greenhouse gases (GHG) (kilotons)            | 12         |        |        |        |        |                           |
| 11 | Direct methane (CH2)                         |            | 116,14 | 106,00 | 105,04 | 109,18 | Reasonable                |





In [114]:
# RAG prompt: tells the model to answer from the supplied context (text or a markdown
# table) and to reply 'None' when it does not know the answer.
# The leading spaces on the continuation lines are part of the prompt string.
# NOTE(review): {join(documents)} and {query} are template placeholders, presumably
# filled from the retrieved documents and the pipeline query - confirm against the
# PromptTemplate documentation.
rag_prompt = PromptTemplate(
    prompt="""Use the following pieces of context to answer the question at the end.
              The context may be text or a markdown table.
              If you don't know the answer, just say 'None', don't try to make up an answer.

              \n\n Context: {join(documents)} \n\n Question: {query} \n\n Answer:""",
    output_parser=AnswerParser(),
)


# Prompt node that sends rag_prompt (its default template) to the gpt-3.5-turbo model.
prompt_node = PromptNode(model_name_or_path="gpt-3.5-turbo", api_key=OPENAI_API_KEY, default_prompt_template=rag_prompt)

In [115]:
# TODO: How to specify the number of documents retrieved by the retriever to use in the prompt?
# NOTE(review): Haystack v1 pipelines appear to accept per-node parameters at run time,
# e.g. pipe.run(query=..., params={"retriever": {"top_k": 3}}) - verify against the
# Pipeline.run documentation.

# Two-node query pipeline: the query goes to the retriever, whose output feeds the prompt node.
pipe = Pipeline()
pipe.add_node(component=retriever, name="retriever", inputs=["Query"])
pipe.add_node(component=prompt_node, name="prompt_node", inputs=["retriever"])

In [131]:
# Run the full pipeline for the Scope 1 question and print the first answer.
output = pipe.run(query="What was the GHG Scope 1 emissions in the year 2021?")

print(output["answers"][0].answer)

Batches: 100%|██████████| 1/1 [00:00<00:00, 29.32it/s]


56,972 kilotons


In [132]:
# Same check for the Scope 2 question. An answer of 'None' is the reply the prompt
# asks for when the model does not know the answer.
output = pipe.run(query="What was the GHG Scope 2 emissions in the year 2021?")

print(output["answers"][0].answer)

Batches: 100%|██████████| 1/1 [00:00<00:00, 82.94it/s]


None


The pipeline is working very well. The only issue I have seen so far is that it cannot
answer "What was the GHG Scope 2 emissions in the year 2021?" (it returns 'None').