In [2]:
"""
Table QA - RAG approach with tables converted to markdown format.

See https://haystack.deepset.ai/tutorials/22_pipeline_with_promptnode
"""
import os
from pathlib import Path

from haystack import Document
from haystack.nodes import AzureConverter, EmbeddingRetriever, PromptNode, PromptTemplate, AnswerParser
from haystack.document_stores import InMemoryDocumentStore
from haystack.pipelines import Pipeline
from haystack.utils import print_answers

# Service credentials are read from the environment; each is None when its variable is unset.
OPENAI_API_KEY = os.getenv("OPENAI_API_KEY")
AZURE_CONVERTER_KEY = os.getenv("AZURE_CONVERTER_KEY")

In [100]:
# Build the Haystack AzureConverter; the credential comes from the AZURE_CONVERTER_KEY
# environment variable read at the top of the notebook (None if it was not set).
converter = AzureConverter(
    endpoint="https://azureconverter.cognitiveservices.azure.com/",
    credential_key=AZURE_CONVERTER_KEY,
    model_id="prebuilt-layout",  # Was "prebuilt-document"
    save_json=False
)

# NOTE(review): absolute, machine-specific path; the notebook only runs where this file exists.
PDF_PATH = Path("/home/tomw/unifi-pdf-llm/data/validate/SASOL Sustainability Report 2023 20-09_0_minimal.pdf")

# Convert the PDF into a list of Documents; later cells branch on each document's
# `content_type` ("table" vs. anything else).
docs = converter.convert(file_path=PDF_PATH, meta=None)

In [None]:
# TODO: Implement a way to save docs to disk and load them back in. It would be good to do this
# at this stage (before they are loaded into a document store), so that it's possible to
# experiment with the indexing pipeline without having to re-convert the PDFs.

In [101]:
# Sanity check: how many documents the converter produced (before tables are split).
print(f'Number of documents: {len(docs)}')

Number of documents: 3


In [89]:
def split_tables(docs: list[Document], window_size: int=5):
    """
    Expand every table document into its sliding-window pieces.

    Documents whose `content_type` is not "table" are passed through unchanged,
    and the relative order of the input documents is preserved.

    Parameters
    ----------
    docs : list[Document]
        Documents to process.

    window_size : int
        Number of rows per window, forwarded to `_split_table`.

    Returns
    -------
    new_docs : list[Document]
        The non-table documents plus the smaller tables produced from each table.
    """
    new_docs = []

    for doc in docs:
        # A table contributes its windows; anything else contributes itself.
        pieces = _split_table(doc, window_size) if doc.content_type == "table" else [doc]
        new_docs += pieces

    return new_docs


def _split_table(doc, window_size=5):
    """
    Break one table document into several documents, one per sliding window.

    Every attribute found in ``doc.__dict__`` except `content` and `id` is
    assigned onto each new document. Values are assigned as-is (not copied),
    so mutable attributes are shared between the original and the pieces.

    Parameters
    ----------
    doc : Document
        The document whose `content` is the table to split.

    window_size : int
        The size of the sliding window.

    Returns
    -------
    docs : list[Document]
        One document per window produced by `_sliding_window`.

    Raises
    ------
    ValueError
        If `doc.content_type` is not "table".
    """
    if doc.content_type != "table":
        raise ValueError("The document does not contain a table.")

    # `content` is replaced by the window, and `id` is left as set by the constructor.
    not_copied = ("content", "id")

    pieces = []
    for window in _sliding_window(doc.content, window_size):
        piece = Document(content=window)
        for name, value in doc.__dict__.items():
            if name in not_copied:
                continue
            setattr(piece, name, value)
        pieces.append(piece)

    return pieces


def _sliding_window(df, window_size):
    """
    Split a DataFrame into smaller DataFrames using a sliding window approach.

    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame to split.

    window_size : int
        The size of the sliding window.

    Returns
    -------
    list of pandas.DataFrame
        A list of DataFrames, each one representing a window of the original DataFrame.
    """
    tables = [df.iloc[i:i+window_size] for i in range(len(df) - window_size + 1)]

    return tables


In [90]:
# Replace each table document with its sliding-window pieces (default window size).
# NOTE: this rebinds `docs`, so the converter's original output is no longer reachable by name.
docs = split_tables(docs)

print(f"Number of documents: {len(docs)}")

Number of documents: 140


In [91]:
def convert_tables_to_markdown(docs: list[Document]) -> None:
    """
    Rewrite every table document in `docs` as markdown text, in place.

    Documents whose `content_type` is not "table" are left untouched.

    Parameters
    ----------
    docs : List[Document]
        Documents to scan; only those with `content_type` "table" are converted.
    """
    # Lazy filter: each document's type is checked right before it is converted.
    table_docs = (doc for doc in docs if doc.content_type == "table")
    for table_doc in table_docs:
        _convert_table_to_markdown(table_doc)


def _convert_table_to_markdown(doc: Document) -> None:
    """
    Convert table to markdown format in place.

    Parameters
    ----------
    doc : Document
        Document with `content_type` table.

    Raises
    ------
    ValueError
        If `doc.content_type` is not "table".
    """
    if doc.content_type != "table":
        raise ValueError(f"Document content_type must be 'table', not '{doc.content_type}'")

    table = doc.content
    markdown_table = table.to_markdown(tablefmt="github")

    doc.content = markdown_table
    doc.content_type = "text"

In [92]:
# Mutates the documents in `docs` in place: table contents become markdown text.
convert_tables_to_markdown(docs)

In [93]:
# TODO: Try to use other document stores (e.g. FAISS).

# NOTE(review): embedding_dim presumably has to match the output size of the retriever's
# embedding model configured in a later cell; confirm whenever that model is changed.
document_store = InMemoryDocumentStore(embedding_dim=384)

# Empty the store before writing so the cell starts from a clean store.
document_store.delete_documents()
document_store.write_documents(docs)

In [94]:
# TODO: I'm not sure what OpenAI embedding models are available. Is it possible to use
# their newest embedding models in Haystack v1?

# TODO: Look into other (non-OpenAI) embedding models that can be used with Haystack v1.

# Retriever bound to the document store created above, using a sentence-transformers model.
retriever = EmbeddingRetriever(
    embedding_model="sentence-transformers/all-MiniLM-L6-v2", document_store=document_store
)

# Compute and store embeddings for the documents in the store using this retriever.
document_store.update_embeddings(retriever=retriever)

Batches: 100%|██████████| 5/5 [00:00<00:00, 12.47it/s] docs/s]
Documents Processed: 10000 docs [00:00, 24072.74 docs/s]       


In [95]:
# Smoke-test the retriever on its own, without the prompt node:
# fetch the top 3 documents for a sample question.
retrieved_tables = retriever.retrieve("What was the GHG Scope 2 emissions in the year 2021?", top_k=3)

# Show the content of the first returned document
# (presumably the highest-scored one; confirm the retriever's ordering).
print(retrieved_tables[0].content)

Batches: 100%|██████████| 1/1 [00:00<00:00, 140.97it/s]

|    | Natural Capital - Our environment            | Footnote   | 2021   | 2020          | 2019         | 2018   | Level of assurance 2021   |
|----|----------------------------------------------|------------|--------|---------------|--------------|--------|---------------------------|
| 46 |                                              |            |        | 313           | 223          | 162    | Restated                  |
| 47 | Americas                                     |            | 264    | 313           | 223          | 162    | 2018-2020                 |
| 48 | Mozambique                                   |            | -      |               |              | -      |                           |
| 49 | Other strategic business units and Functions |            | 25     | 28            | 36           | 37     |                           |
| 50 | Indirect carbon dioxide (CO2) Scope 3        |            | Refer  | to page 4, 30 | to 32 of the | CCR    | Limited             




In [96]:
# Prompt used for retrieval-augmented answering. `{join(documents)}` and `{query}` are template
# placeholders (presumably filled in by Haystack with the retrieved documents and the question).
# The model is told to answer 'None' when it does not know, which is what shows up as "None" in
# the outputs. The string is not raw, so each `\n` is a real newline, and the leading spaces on
# the continuation lines are part of the prompt text.
rag_prompt = PromptTemplate(
    prompt="""Use the following pieces of context to answer the question at the end.
              The context may be text or a markdown table.
              If you don't know the answer, just say 'None', don't try to make up an answer.

              \n\n Context: {join(documents)} \n\n Question: {query} \n\n Answer:""",
    output_parser=AnswerParser(),
)

# PromptNode calling gpt-3.5-turbo with the template above as its default;
# temperature 0.0 is passed through model_kwargs.
prompt_node = PromptNode(
    model_name_or_path="gpt-3.5-turbo",
    api_key=OPENAI_API_KEY,
    default_prompt_template=rag_prompt,
    model_kwargs={"temperature": 0.0}
)

In [97]:
# TODO: How to specify the number of documents retrieved by the retriever to use in the prompt?
# NOTE(review): `retriever.retrieve(..., top_k=3)` is used in an earlier cell; presumably the
# same setting can be passed per run via `params={"retriever": {"top_k": N}}` — confirm against
# the Haystack v1 Pipeline.run documentation.

# Two-node pipeline: the query feeds the retriever, whose output feeds the prompt node.
querying_pipeline = Pipeline()
querying_pipeline.add_node(component=retriever, name="retriever", inputs=["Query"])
querying_pipeline.add_node(component=prompt_node, name="prompt_node", inputs=["retriever"])

In [98]:
# End-to-end check: ask about Scope 1 emissions and print the first parsed answer.
output = querying_pipeline.run(query="What was the GHG Scope 1 emissions in the year 2021?")

print(output["answers"][0].answer)

Batches: 100%|██████████| 1/1 [00:00<00:00, 98.17it/s]


56,972


In [99]:
# Same check for Scope 2 emissions (rebinds `output` from the previous query).
output = querying_pipeline.run(query="What was the GHG Scope 2 emissions in the year 2021?")

print(output["answers"][0].answer)

Batches: 100%|██████████| 1/1 [00:00<00:00, 34.02it/s]


None


Working very well. The only issue I have seen so far is that the pipeline cannot answer "What was the
GHG Scope 2 emissions in the year 2021?" (it returns 'None').

## Validation

In [57]:
import pandas as pd


# CSV of validation questions read by the validation cells below.
# NOTE(review): absolute, machine-specific path.
VALIDATION_FILE = Path("/home/tomw/unifi-pdf-llm/data/validate/rag_esg_metric_validation.csv")

In [55]:
# Default CSV of validation questions. NOTE(review): absolute, machine-specific path.
VALIDATION_FILE = Path("/home/tomw/unifi-pdf-llm/data/validate/rag_esg_metric_validation.csv")
# NOTE(review): placeholder that validate_rag does not read.
VALIDATION_PDF = Path("")


def validate_rag(querying_pipeline: Pipeline, validation_file=None):
    """
    Run every validation question through the pipeline and record the generated answers.

    One query is built per CSV row from its "Metric" and "Year" values. When the
    row has a "Unit" value, the query also asks for the answer in that unit.

    Parameters
    ----------
    querying_pipeline : Pipeline
        Pipeline whose ``run(query=...)`` output contains an "answers" list.

    validation_file : str or Path, optional
        CSV file with at least the columns "Year", "Metric" and "Unit".
        Defaults to `VALIDATION_FILE`.

    Returns
    -------
    results_df : pandas.DataFrame
        Copy of the validation data with an extra "Generated" column holding the
        first answer for each row, or None when the pipeline returned no answers.
    """
    if validation_file is None:
        validation_file = VALIDATION_FILE

    validation_df = pd.read_csv(validation_file)
    results_df = validation_df.copy(deep=True)

    # Add column to results_df for the generated answer
    results_df["Generated"] = None

    for idx, row in validation_df.iterrows():
        query = f"What was the {row['Metric']} in the year {row['Year']}?"

        # pandas reads an empty CSV cell as NaN (never None), so test with pd.isna
        # and only mention the unit when one is actually present.
        unit = row["Unit"]
        if not pd.isna(unit):
            query += f" Please give your answer in the unit {unit}."

        output = querying_pipeline.run(query=query)

        # Guard against an empty answer list instead of raising IndexError.
        answers = output.get("answers") or []
        results_df.at[idx, "Generated"] = answers[0].answer if answers else None

    return results_df


In [58]:
# Load the validation questions into a DataFrame for inspection.
validation_df = pd.read_csv(VALIDATION_FILE)

In [60]:
# Preview the first rows of the validation set.
validation_df.head()

Unnamed: 0,Company,Year,Metric,Unit,Answer,Source,Content Type,Page,Notes
0,SASOL,2023,Number of permanent employees,,26294,SASOL Sustainability Report 2023 20-09_0.pdf,Table,17,May get confuesed with employee numbers in tab...
1,SASOL,2023,Employee turnover,,1725,SASOL Sustainability Report 2023 20-09_0.pdf,Text,18,
2,SASOL,2023,GHG Scope 1 emissions,kilotons,58644,SASOL Sustainability Report 2023 20-09_0.pdf,Table,40,
3,SASOL,2023,GHG Scope 2 emissions,kilotons,5748,SASOL Sustainability Report 2023 20-09_0.pdf,Table,40,
4,SASOL,2023,GHG Scope 3 emissions,kilotons,36664,SASOL Sustainability Report 2023 20-09_0.pdf,Table,40,
