In [3]:
"""
Table QA - RAG approach with tables converted to markdown format.

See https://haystack.deepset.ai/tutorials/22_pipeline_with_promptnode
"""
import copy
import os
from pathlib import Path

from haystack import Document
from haystack.nodes import AzureConverter, EmbeddingRetriever, PromptNode, PromptTemplate, AnswerParser
from haystack.document_stores import InMemoryDocumentStore
from haystack.pipelines import Pipeline
from haystack.utils import print_answers

# API credentials are read from environment variables rather than hardcoded in the notebook.
# `os.environ.get` returns None when a variable is not set.
OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
AZURE_CONVERTER_KEY = os.environ.get("AZURE_CONVERTER_KEY")

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


In [4]:
def convert_validation_pdf(pdf_dir=None):
    """
    Convert every PDF in a directory to documents using the Azure converter.

    Parameters
    ----------
    pdf_dir : str or Path, optional
        Directory containing the PDF files to convert. When omitted, the
        split SASOL validation report directory is used.

    Returns
    -------
    converted_docs : list
        The documents returned by `AzureConverter.convert` for each PDF,
        concatenated in sorted file-name order.

    Raises
    ------
    FileNotFoundError
        If the directory does not exist.
    """
    if pdf_dir is None:
        file_path = Path("/home/tomw/unifi-pdf-llm/data/validate/SASOL Sustainability Report 2023 20-09_0_minimal_split")
    else:
        file_path = Path(pdf_dir)

    # Fail loudly: globbing a missing directory would silently yield no documents.
    if not file_path.is_dir():
        raise FileNotFoundError(f"PDF directory not found: {file_path}")

    converter = AzureConverter(
        endpoint="https://azureconverter.cognitiveservices.azure.com/",
        credential_key=AZURE_CONVERTER_KEY,
        model_id="prebuilt-layout",  # Was "prebuilt-document"
        save_json=False
    )

    converted_docs = []
    # `glob` yields files in arbitrary order; sort so the result is reproducible between runs.
    for fn in sorted(file_path.glob("*.pdf")):
        print(f"Converting {fn}")
        converted_docs.extend(converter.convert(file_path=fn, meta=None))

    return converted_docs

In [5]:
# Convert all validation PDFs; the function prints one "Converting ..." line per file.
docs = convert_validation_pdf()

Converting /home/tomw/unifi-pdf-llm/data/validate/SASOL Sustainability Report 2023 20-09_0_minimal_split/SASOL Sustainability Report 2023 20-09_0_minimal [1-2].pdf
Converting /home/tomw/unifi-pdf-llm/data/validate/SASOL Sustainability Report 2023 20-09_0_minimal_split/SASOL Sustainability Report 2023 20-09_0_minimal [11].pdf
Converting /home/tomw/unifi-pdf-llm/data/validate/SASOL Sustainability Report 2023 20-09_0_minimal_split/SASOL Sustainability Report 2023 20-09_0_minimal [5-6].pdf
Converting /home/tomw/unifi-pdf-llm/data/validate/SASOL Sustainability Report 2023 20-09_0_minimal_split/SASOL Sustainability Report 2023 20-09_0_minimal [7-8].pdf
Converting /home/tomw/unifi-pdf-llm/data/validate/SASOL Sustainability Report 2023 20-09_0_minimal_split/SASOL Sustainability Report 2023 20-09_0_minimal [3-4].pdf
Converting /home/tomw/unifi-pdf-llm/data/validate/SASOL Sustainability Report 2023 20-09_0_minimal_split/SASOL Sustainability Report 2023 20-09_0_minimal [9-10].pdf


In [6]:
# converter = AzureConverter(
#     endpoint="https://azureconverter.cognitiveservices.azure.com/",
#     credential_key=AZURE_CONVERTER_KEY,
#     model_id="prebuilt-layout",  # Was "prebuilt-document"
#     save_json=False
# )

# PDF_PATH = Path("/home/tomw/unifi-pdf-llm/data/validate/SASOL Sustainability Report 2023 20-09_0_minimal.pdf")

# docs = converter.convert(file_path=PDF_PATH, meta=None)

In [None]:
# TODO: Implement a way to save docs to disk and load them back in. It would be good to do this at
# this stage (before they are loaded into a document store), so that it's possible to experiment
# with the indexing pipeline without having to re-convert the PDFs.

In [8]:
# Sanity check: number of documents produced by the conversion step.
print(f'Number of documents: {len(docs)}')

Number of documents: 23


In [9]:
def split_tables(docs: list[Document], window_size: int=5):
    """
    Break every table document into smaller table documents.

    Documents whose `content_type` is not "table" are passed through
    unchanged, in their original position.

    Parameters
    ----------
    docs : list[Document]
        The documents to process.

    window_size : int
        Sliding-window size forwarded to `_split_table`.

    Returns
    -------
    new_docs : list[Document]
        The text documents together with the smaller tables that replace
        each original table document.
    """
    return [
        piece
        for original in docs
        for piece in (
            _split_table(original, window_size)
            if original.content_type == "table"
            else [original]
        )
    ]


def _split_table(doc, window_size=5):
    """
    Split a table into smaller tables using a sliding window approach.

    Each window becomes a new `Document`. All attributes of the source
    document except `content` and `id` are copied onto every new document.
    The copies are independent, so changing a mutable attribute on one
    resulting document does not affect its siblings or the source document.

    Parameters
    ----------
    doc : Document
        The document containing the table to split.

    window_size : int
        The size of the sliding window.

    Returns
    -------
    docs : list[Document]
        A list of documents, each one containing a smaller table.

    Raises
    ------
    ValueError
        If the document does not contain a table.
    """
    if doc.content_type != "table":
        raise ValueError("The document does not contain a table.")

    # `content` is replaced per window, and `id` is left as assigned by `Document(content=table)`.
    inherited = {
        attr: value
        for attr, value in doc.__dict__.items()
        if attr not in ("content", "id")
    }

    docs = []
    for table in _sliding_window(doc.content, window_size):
        new_doc = Document(content=table)
        for attr, value in inherited.items():
            # Deep-copy so mutable attribute values are not shared by reference
            # between the source document and all of its sub-tables.
            setattr(new_doc, attr, copy.deepcopy(value))
        docs.append(new_doc)

    return docs


def _sliding_window(df, window_size):
    """
    Split a DataFrame into smaller DataFrames using a sliding window approach.

    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame to split.

    window_size : int
        The size of the sliding window.

    Returns
    -------
    list of pandas.DataFrame
        A list of DataFrames, each one representing a window of the original DataFrame.
    """
    tables = [df.iloc[i:i+window_size] for i in range(len(df) - window_size + 1)]

    return tables


In [10]:
# Replace every table document with its sliding-window sub-tables; other documents are kept.
# NOTE(review): this rebinds `docs`, so the un-split conversion output is no longer
# available under that name after this cell has run.
docs = split_tables(docs)

print(f"Number of documents: {len(docs)}")

Number of documents: 738


In [11]:
def convert_tables_to_markdown(docs: list[Document]) -> None:
    """
    Rewrite every table document as markdown text, modifying it in place.

    Documents whose `content_type` is not "table" are left untouched.

    Parameters
    ----------
    docs : list[Document]
        Documents to process; only those with `content_type` "table" are
        converted.
    """
    table_docs = (candidate for candidate in docs if candidate.content_type == "table")
    for table_doc in table_docs:
        _convert_table_to_markdown(table_doc)


def _convert_table_to_markdown(doc: Document) -> None:
    """
    Replace a table document's content with its markdown rendering, in place.

    The content is rendered with `to_markdown(tablefmt="github")`, and the
    document's `content_type` is then switched to "text".

    Parameters
    ----------
    doc : Document
        Document whose `content_type` is "table".

    Raises
    ------
    ValueError
        If `doc.content_type` is not "table".
    """
    kind = doc.content_type
    if kind != "table":
        raise ValueError(f"Document content_type must be 'table', not '{kind}'")

    # Render first, then relabel the document as plain text.
    doc.content = doc.content.to_markdown(tablefmt="github")
    doc.content_type = "text"

In [12]:
# Modifies `docs` in place: table documents get markdown content and content_type "text".
convert_tables_to_markdown(docs)

In [13]:
# TODO: Try to use other document stores (e.g. FAISS).

# NOTE(review): embedding_dim=384 presumably has to match the output size of the
# embedding model configured for the retriever — confirm if the model is changed.
document_store = InMemoryDocumentStore(embedding_dim=384)

# NOTE(review): the store was created on the line above, so this delete looks like a
# safeguard only — confirm whether it is still needed.
document_store.delete_documents()
document_store.write_documents(docs)

In [14]:
# TODO: I'm not sure which OpenAI embedding models are available. Is it possible to use
# their newest embedding models in Haystack v1?

# TODO: Look into other (non-OpenAI) embedding models that can be used with Haystack v1.

# Embedding-based retriever over the in-memory store, using a sentence-transformers model.
retriever = EmbeddingRetriever(
    embedding_model="sentence-transformers/all-MiniLM-L6-v2", document_store=document_store
)

# Compute embeddings for the documents in the store using this retriever.
document_store.update_embeddings(retriever=retriever)

Batches: 100%|██████████| 24/24 [00:02<00:00, 11.96it/s]ocs/s]
Documents Processed: 10000 docs [00:02, 4899.03 docs/s]        


In [23]:
# Sanity-check the retriever on its own, before it is wired into the RAG pipeline.

# Retrieve the top 3 documents for a sample question
retrieved_tables = retriever.retrieve("What was the GHG Scope 1 emissions in the year 2021?", top_k=3)

# Print the document at index 2, i.e. the last of the 3 retrieved documents.
# NOTE(review): if results are ordered by descending score, the highest-scored document
# is at index 0, not 2 — confirm which one was intended here.
print(retrieved_tables[2].content)

Batches: 100%|██████████| 1/1 [00:00<00:00, 34.78it/s]

|    | Natural Capital - Our environment   | 2023   | 2022   | 2021   | 2020   | LoA 2023   | Footnote   |
|----|-------------------------------------|--------|--------|--------|--------|------------|------------|
|  2 | North America                       | 0.82   | 0,83   | 0,74   | 0,59   |            |            |
|  3 | Chemicals Africa                    |        |        |        |        |            |            |
|  4 | Atmospheric emissions (kilotons)    |        |        |        |        |            |            |
|  5 | Nitrogen oxides (NOx) (kilotons)    | 122,04 | 118,70 | 124,00 | 143,60 | Reasonable |            |
|  6 | ENERGY                              | 120,52 | 117,20 | 122,30 | 141,50 |            |            |





In [153]:
# Prompt template for the RAG step. It instructs the model to answer only from the supplied
# context and to reply in the fixed format 'Answer: <number or None>, Unit: <unit or None>'.
# `{join(documents)}` and `{query}` are template placeholders — presumably filled by Haystack
# with the retrieved documents and the user query; confirm against the PromptTemplate docs.
rag_prompt = PromptTemplate(
    prompt="""Use the following pieces of context to answer the question at the end.
              The context may be text or a markdown table.
              Just retrieve the answer from the context. Please don't do any unit conversion.
              If you don't know the answer, please return 'None' for the answer and unit.
              Please return the answer in the format 'Answer: <number or None>, Unit: <unit or None>'.

              \n\n Context: {join(documents)} \n\n Question: {query} \n\n Answer:""",
    output_parser=AnswerParser(),
)

# LLM node that uses `rag_prompt` as its default template.
prompt_node = PromptNode(
    model_name_or_path="gpt-3.5-turbo-1106",
    api_key=OPENAI_API_KEY,
    default_prompt_template=rag_prompt,
    model_kwargs={"temperature": 0}  # It doesn't seem that the `temperature` parameter is having any effect. Seems like a bug. Might work in haystack 2.0.
)

In [154]:
# TODO: How to specify the number of documents retrieved by the retriever to use in the prompt?
# NOTE(review): presumably via `top_k` on the retriever, e.g. passed per run as
# `params={"retriever": {"top_k": N}}` — confirm against the Haystack v1 docs.

# Two-node pipeline: the query goes to the retriever, whose output feeds the prompt node.
querying_pipeline = Pipeline()
querying_pipeline.add_node(component=retriever, name="retriever", inputs=["Query"])
querying_pipeline.add_node(component=prompt_node, name="prompt_node", inputs=["retriever"])

In [164]:
# Spot check: run one question through the full pipeline and print the first answer.
output = querying_pipeline.run(query="What was the GHG Scope 1 emissions in the year 2021?")

print(output["answers"][0].answer)

Batches: 100%|██████████| 1/1 [00:00<00:00, 29.87it/s]


Answer: 66 273, Unit: kilotons


In [161]:
# Spot check: same kind of question, but with the desired unit requested in the query itself.
output = querying_pipeline.run(query="What was the GHG Scope 1 emissions in the year 2023?  Please give your answer in the unit kilotons.")

print(output["answers"][0].answer)

Batches: 100%|██████████| 1/1 [00:00<00:00, 40.77it/s]


Answer: 643, Unit: kilotons


The `gpt-3.5-turbo` model has a context window of 4,096 tokens. As a result, my prompt is often
truncated so that the prompt length plus the answer length (100 tokens) fits within the max token
limit. The updated GPT-3.5 model (`gpt-3.5-turbo-0125`) has a larger context window of
16,385 tokens. It would be good to use this, if possible (it may require using Haystack 2.0).
The slightly older GPT-3.5 model `gpt-3.5-turbo-1106` also has a larger context window and is
available with Haystack v1, so I'll use this for now.

In [166]:
# Spot check: Scope 3 question, printing the first answer returned by the pipeline.
output = querying_pipeline.run(query="What was the GHG Scope 3 emissions in the year 2023?")

print(output["answers"][0].answer)

Batches: 100%|██████████| 1/1 [00:00<00:00, 65.52it/s]


Answer: 64 392, Unit: kilotons


Working very well. The only issue I have seen so far is that it cannot answer "What was the
GHG Scope 2 emissions in the year 2021?".

**TODO:** Add a step to the querying pipeline that parses the returned output
'Answer: <>, Unit: <>' into a single number. For cases where 'Unit' is not `None`, this will
likely involve another `PromptNode` to do the conversion.

## Validation

In [167]:
import pandas as pd

# CSV with the validation questions; `validate_rag` reads its "Year", "Metric" and "Unit" columns.
VALIDATION_FILE = Path("/home/tomw/unifi-pdf-llm/data/validate/rag_esg_metric_validation.csv")

In [174]:
def validate_rag(querying_pipeline: Pipeline):
    """
    Run every validation question through the pipeline and record its answer.

    Reads `VALIDATION_FILE` and, for each row, asks
    "What was the {Metric} in the year {Year}?". The first answer returned by
    the pipeline is stored in a new "Generated" column.

    Parameters
    ----------
    querying_pipeline : Pipeline
        The pipeline to validate. Its `run` output must contain an "answers"
        list whose items have an `answer` attribute.

    Returns
    -------
    results_df : pandas.DataFrame
        A copy of the validation data with an extra "Generated" column. The
        value is None for rows where the pipeline returned no answers.
    """
    validation_df = pd.read_csv(VALIDATION_FILE)
    results_df = validation_df.copy(deep=True)

    # Add a column to results_df for the generated answer
    results_df["Generated"] = None

    for idx, row in validation_df.iterrows():
        year = row["Year"]
        metric = row["Metric"]

        # The "Unit" column is currently not used when building the query.
        query = f"What was the {metric} in the year {year}?"

        output = querying_pipeline.run(query=query)
        answers = output["answers"]

        # An empty answer list is recorded as None instead of aborting the whole run.
        results_df.at[idx, "Generated"] = answers[0].answer if answers else None

    return results_df


In [175]:
# Runs one pipeline query per row of the validation CSV.
results = validate_rag(querying_pipeline)

Batches: 100%|██████████| 1/1 [00:00<00:00, 26.72it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 29.15it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 76.16it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 118.09it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 103.63it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 76.35it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 105.71it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 94.05it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 95.26it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 88.01it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 56.27it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 76.61it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 149.13it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 105.78it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 19.25it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 72.99it/s]


In [176]:
# Display the validation data with the "Generated" column added by `validate_rag`.
results

Unnamed: 0,Company,Year,Metric,Unit,Answer,Source,Content Type,Page,Notes,Generated
0,SASOL,2023,Number of permanent employees,,28657,SASOL Sustainability Report 2023 20-09_0.pdf,Table,17,May get confuesed with employee numbers in tab...,"Answer: 28 657, Unit: None"
1,SASOL,2023,Employee turnover,,1725,SASOL Sustainability Report 2023 20-09_0.pdf,Text,18,,"Answer: 1 725, Unit: None"
2,SASOL,2023,GHG Scope 1 emissions,kilotons,58644,SASOL Sustainability Report 2023 20-09_0.pdf,Table,40,,"Answer: 120.52, Unit: kilotons"
3,SASOL,2023,GHG Scope 2 emissions,kilotons,5748,SASOL Sustainability Report 2023 20-09_0.pdf,Table,40,,"Answer: None, Unit: None"
4,SASOL,2023,GHG Scope 3 emissions,kilotons,36664,SASOL Sustainability Report 2023 20-09_0.pdf,Table,40,,"Answer: None, Unit: None"
5,SASOL,2022,GHG Scope 1 emissions,kilotons,57284,SASOL Sustainability Report 2023 20-09_0.pdf,Table,40,,"Answer: 61 559, Unit: kilotons"
6,SASOL,2022,GHG Scope 2 emissions,kilotons,6607,SASOL Sustainability Report 2023 20-09_0.pdf,Table,40,,"Answer: None, Unit: None"
7,SASOL,2023,Air emissions of the following pollutants: (2)...,kilotons,12204,SASOL Sustainability Report 2023 20-09_0.pdf,Table,58,Located elsewhere in pdf,"Answer: 122.04, Unit: kilotons"
8,SASOL,2023,Air emissions of the following pollutants: (3)...,kilotons,16688,SASOL Sustainability Report 2023 20-09_0.pdf,Table,58,Located elsewhere in pdf,"Answer: None, Unit: None"
9,SASOL,2022,Air emissions of the following pollutants: (3)...,kilotons,16187,SASOL Sustainability Report 2023 20-09_0.pdf,Table,58,Located elsewhere in pdf,"Answer: None, Unit: None"
