In [51]:
"""
Table QA - RAG approach with tables converted to markdown format.

See https://haystack.deepset.ai/tutorials/22_pipeline_with_promptnode
"""
import os
from pathlib import Path

import pandas as pd
from haystack import Document
from haystack.nodes import AzureConverter, EmbeddingRetriever, PromptNode, PromptTemplate, AnswerParser
from haystack.document_stores import InMemoryDocumentStore
from haystack.pipelines import Pipeline
from haystack.utils import print_answers
from haystack.nodes import BaseComponent
from loguru import logger

pd.set_option('display.max_rows', 100)

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
AZURE_CONVERTER_KEY = os.environ.get("AZURE_CONVERTER_KEY")

AMKEY_TO_METRIC_PATH = "/home/tomw/unifi-pdf-llm/data/AMKEY_GoldenStandard.csv"
"""Path to csv file mapping AMKEY to metric description."""

AMKEY_TO_SYNONYM_PATH = "/home/tomw/unifi-pdf-llm/data/ActivityMetricsSynonyms.csv"
"""Path to csv file mapping AMKEY to company metric description."""

AMKEY_TO_UNIT_PATH = "/home/tomw/unifi-pdf-llm/data/AMKEY_unit_conversion.csv"
"""Path to csv file mapping AMKEY to required unit."""

'Path to csv file mapping AMKEY to required unit.'

## Convert PDF

In [2]:
# Temp

def convert_validation_pdf() -> list[Document]:
    """
    Returns a list of Documents from the validation PDF.

    Uses the AzureConverter to convert the PDF to tables and text documents.

    Returns
    -------
    converted_docs : list[Document]
        The list of Documents from the validation PDF.
    """
    converted_docs = []
    file_path = Path("/home/tomw/unifi-pdf-llm/data/validate/SASOL Sustainability Report 2023 20-09_0_minimal_split")

    converter = AzureConverter(
        endpoint="https://azureconverter.cognitiveservices.azure.com/",
        credential_key=AZURE_CONVERTER_KEY,
        model_id="prebuilt-layout",  # Was "prebuilt-document"
        save_json=True
    )

    for fn in file_path.glob("*.pdf"):
        print(f"Converting {fn}")
        docs = converter.convert(file_path=fn, meta=None)
        converted_docs.extend(docs)

    return converted_docs


def load_validation_pdf_from_json() -> list[Document]:
    """
    Return a list of Documents from the validation PDF, loaded from JSON files.

    Requires AzureConverter to have been run on the PDF and saved the JSON files.

    Returns
    -------
    converted_docs : list[Document]
        The list of Documents from the validation PDF.
    """
    converted_docs = []
    file_path = Path("/home/tomw/unifi-pdf-llm/data/validate/SASOL Sustainability Report 2023 20-09_0_minimal_split")

    converter = AzureConverter(
        endpoint="https://azureconverter.cognitiveservices.azure.com/",
        credential_key=AZURE_CONVERTER_KEY,
        model_id="prebuilt-layout",  # Was "prebuilt-document"
    )

    for fn in file_path.glob("*.json"):
        print(f"Loading {fn}")
        docs = converter.convert_azure_json(file_path=fn)
        converted_docs.extend(docs)

    return converted_docs

In [3]:
docs = load_validation_pdf_from_json()

Loading /home/tomw/unifi-pdf-llm/data/validate/SASOL Sustainability Report 2023 20-09_0_minimal_split/SASOL Sustainability Report 2023 20-09_0_minimal [7-8].json
Loading /home/tomw/unifi-pdf-llm/data/validate/SASOL Sustainability Report 2023 20-09_0_minimal_split/SASOL Sustainability Report 2023 20-09_0_minimal [5-6].json
Loading /home/tomw/unifi-pdf-llm/data/validate/SASOL Sustainability Report 2023 20-09_0_minimal_split/SASOL Sustainability Report 2023 20-09_0_minimal [11].json
Loading /home/tomw/unifi-pdf-llm/data/validate/SASOL Sustainability Report 2023 20-09_0_minimal_split/SASOL Sustainability Report 2023 20-09_0_minimal [3-4].json
Loading /home/tomw/unifi-pdf-llm/data/validate/SASOL Sustainability Report 2023 20-09_0_minimal_split/SASOL Sustainability Report 2023 20-09_0_minimal [1-2].json
Loading /home/tomw/unifi-pdf-llm/data/validate/SASOL Sustainability Report 2023 20-09_0_minimal_split/SASOL Sustainability Report 2023 20-09_0_minimal [9-10].json


## Preprocess Documents

In [4]:
def preprocess_documents(
        docs: list[Document],
        window_size: int=5
    ) -> list[Document]:
    """
    Preprocess the documents.

    Parameters
    ----------
    docs : list[Document]
        The documents to preprocess.

    window_size : int
        The size of the sliding window used to split the tables.

    Returns
    -------
    docs : list[Document]
        The preprocessed documents.
    """
    preprocessed_docs = []

    for doc in docs:
        if doc.content_type == "table":
            doc.content = clean_table_column_names(doc.content)
            sliced_table_docs = slice_table_document(doc, window_size)
            preprocessed_docs.extend(sliced_table_docs)
        else:
            preprocessed_docs.append(doc)

    convert_tables_to_markdown(preprocessed_docs)

    return preprocessed_docs


def clean_table_column_names(df: pd.DataFrame, replace: str=' - ') -> pd.DataFrame:
    """
    Return a DataFrame with newlines removed from column headers.

    Parameters
    ----------
    df : pd.Dataframe
        The DataFrame to clean.

    replace: str
        The string to replace newlines with.

    Returns
    -------
    df : pd.Dataframe
        The dataframe with newlines removed from column headers.
    """
    df.columns = df.columns.str.replace('\n', replace)
    return df


def slice_table_document(doc: Document, window_size: int=5) -> list[Document]:
    """
    Return a list of documents, each containing a table with `window_size` rows.

    A sliding window approach is used to split the table into smaller tables. The
    returned documents have the same metadata as the original document, except for
    the content and id.

    Parameters
    ----------
    doc : Document
        Document with content_type "table".

    window_size : int
        The size of the sliding window.

    Returns
    -------
    docs : list[Document]
        A list of documents, each one containing a table with `window_size` rows.

    Raises
    ------
    ValueError
        If the document does not contain a table.
    """
    if doc.content_type != "table":
        raise ValueError("The document does not contain a table.")

    tables = _sliding_window(doc.content, window_size)
    docs = []
    for table in tables:
        new_doc = Document(content=table)
        for attr, value in doc.__dict__.items():
            if attr not in ["content", "id"]:
                setattr(new_doc, attr, value)
        docs.append(new_doc)

    return docs


def _sliding_window(df: pd.DataFrame, window_size: int) -> list[pd.DataFrame]:
    """
    Return a list of DataFrames, each containing a window of the original DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame to split.

    window_size : int
        The size of the sliding window.

    Returns
    -------
    tables : list[pandas.DataFrame]
        A list of DataFrames, each containing a window of the original DataFrame.
    """
    tables = [df.iloc[i:i+window_size] for i in range(len(df) - window_size + 1)]

    return tables


def convert_tables_to_markdown(docs: list[Document]) -> None:
    """
    Convert tables to markdown format in place.

    Parameters
    ----------
    docs : List[Document]
        List of Documents, some of which may have `content_type` 'table'.
    """
    for doc in docs:
        if doc.content_type == "table":
            _convert_table_to_markdown(doc)


def _convert_table_to_markdown(doc: Document) -> None:
    """
    Convert table to markdown format in place.

    Parameters
    ----------
    doc : Document
        Document with `content_type` table.

    Raises
    ------
    ValueError
        If `doc.content_type` is not "table".
    """
    if doc.content_type != "table":
        raise ValueError(f"Document content_type must be 'table', not '{doc.content_type}'")

    table = doc.content
    markdown_table = table.to_markdown(index=False, tablefmt="github")

    doc.content = markdown_table
    doc.content_type = "text"

In [5]:
docs = preprocess_documents(docs, window_size=1)

print(f"Number of documents: {len(docs)}\n")

print(docs[24].content)

Number of documents: 806

| Sasol in Society - Spend   | 2023 - Rm   | 2022 - Rm   | 2021 - Rm   |   2020 - Rm | LOA 2023   | Footnote   |
|----------------------------|-------------|-------------|-------------|-------------|------------|------------|
| Sasolburg                  | 1 360       | 1 366       | 1 586       |        1440 |            |            |


## Retrieval Augmented Generation

### Document Store

In [40]:
# TODO: Try to use other document stores (e.g. FAISS).

document_store = InMemoryDocumentStore(embedding_dim=384)

document_store.delete_documents()
document_store.write_documents(docs)

### Retriever

In [56]:
# TODO: I'm not sure what OpenAI embedding models are available. Is it possible to use
# their newest embedding models in Haystack v1?

# TODO: Look into other (non-OpenAI) embedding models that can be used with Haystack v1.

retriever = EmbeddingRetriever(
    embedding_model="sentence-transformers/all-MiniLM-L6-v2",
    document_store=document_store,
    top_k=3
)

document_store.update_embeddings(retriever=retriever)

Batches: 100%|██████████| 24/24 [00:00<00:00, 38.67it/s]ocs/s]
Documents Processed: 10000 docs [00:00, 15564.22 docs/s]       


In [57]:
# Testing the retriever

retrieved_tables = retriever.retrieve("What was the number of permanent employees 2021?", top_k=3)

# Get highest scored table
print(retrieved_tables[0].content)

Batches: 100%|██████████| 1/1 [00:00<00:00, 208.81it/s]

|    | Human Capital - Our people   | 2023   | 2022   | 2021   | 2020   | LoA 2023   | Footnote   |
|----|------------------------------|--------|--------|--------|--------|------------|------------|
|  1 | Permanent employees          | 28 657 | 28 279 | 28 725 |        |            |            |





In [58]:
# Testing the retriever

retrieved_tables = retriever.retrieve("What was the Number of fatalities in the year 2023?", top_k=3)

# Get highest scored table
print(retrieved_tables[0].content)

Batches: 100%|██████████| 1/1 [00:00<00:00, 168.49it/s]

|    | Human Capital - Our people               | 2023   |   2022 | 2021   | 2020   | LoA 2023   | Footnote   |
|----|------------------------------------------|--------|--------|--------|--------|------------|------------|
| 43 | Employee and service provider fatalities | I      |      1 |        |        |            |            |





In [87]:
# Testing the retriever

retrieved_tables = retriever.retrieve("Black-owned spend", top_k=5)

# Get highest scored table
print(retrieved_tables[0].content)

Batches: 100%|██████████| 1/1 [00:00<00:00, 124.62it/s]

|    | Sasol in Society - Spend   | 2023 - Rm   | 2022 - Rm   | 2021 - Rm   | 2020 - Rm   | LOA 2023   | Footnote   |
|----|----------------------------|-------------|-------------|-------------|-------------|------------|------------|
| 16 | Black-owned spend          | 41 700      | 33 600      | 23 800      |             |            |            |





### LLM

In [92]:
rag_prompt = PromptTemplate(
    prompt="""Use the following pieces of context to answer the question at the end.
              The context may be text or a markdown table.
              Just retrieve the answer from the context. Please don't do any unit conversion.
              If you don't know the answer, please return 'None' for the answer and unit.
              Do not return any words other than 'Answer' and 'Unit' in the answer.
              Please return the answer in the format 'Answer: <number or None>, Unit: <unit or None>'.

              \n\n Context: {join(documents)} \n\n Question: {query}? {append} \n\n Answer:""",
    output_parser=AnswerParser(),
)

generation_node = PromptNode(
    model_name_or_path="gpt-3.5-turbo-1106",  # Using 'gpt-3.5-turbo-1106' as it has a larger context window.
    api_key=OPENAI_API_KEY,
    default_prompt_template=rag_prompt,
    output_variable="generated_answer",
    model_kwargs={"temperature": 0}  # It doesn't seem that the `temperature` parameter is having any effect. Seems like a bug. Might work in haystack 2.0.
)

In [103]:
generation_node_2 = PromptNode(
    model_name_or_path="gpt-3.5-turbo-1106",  # Using 'gpt-3.5-turbo-1106' as it has a larger context window.
    api_key=OPENAI_API_KEY,
    model_kwargs={"temperature": 0}  # It doesn't seem that the `temperature` parameter is having any effect. Seems like a bug. Might work in haystack 2.0.
)

In [104]:
generation_node_2("What is the capital of France?")

['The capital of France is Paris.']

In [60]:
class GeneratedAnswerParser(BaseComponent):
    """
    Parse the output returned by the generation node.

    The output is expected to be in the format "Answer: <number or None>, Unit: <unit or None>".
    """
    outgoing_edges = 1

    def run(self, generated_answer):
        """
        Parse the output returned by the generation node.

        The output is expected to be in the format "Answer: <number or None>, Unit: <unit or None>".

        Parameters
        ----------
        generated : list[Answer]
            The output returned by the generation node.

        Returns
        -------
        dict
            A dictionary containing the answer and unit.
        """
        output = generated_answer[0].answer

        answer, unit = output.split(", ")
        answer = answer.split(": ")[1]
        unit = unit.split(": ")[1]

        if answer == "None":
            answer = None
        else:
            answer = answer.replace(" ", "")
            answer = answer.replace(",", "")
            answer = int(answer)

        return {"answer": answer, "unit": unit}, "output_1"

    def run_batch(self, **kwargs):
        # TODO: Implement batch processing.
        pass


gen_parser = GeneratedAnswerParser()

In [61]:
# Unit conversion. TODO: Move

def create_unit_conversion_prompt(value, unit, target_unit):
    prompt=f"""You are an expert unit converter. You are aware of how to convert
    between different units within the same system of measurement.
    For example, 1236 million = 1236 * 1 million = 1236 * 1000000 = 1236000000.
    For example, to convert from Rm to R, you would multiply by 1000000. This is because
    1 Rm = 1000000 R.
    Please return a single number as your answer. Do not elaborate or give
    any context.\n\n

    What is {value} {unit} in {target_unit}? \n\n Answer:"""

    return prompt


unit_conversion_node = PromptNode(
    model_name_or_path="gpt-3.5-turbo",
    api_key=OPENAI_API_KEY,
    model_kwargs={"temperature": 0}  # It doesn't seem that the `temperature` parameter is having any effect. Seems like a bug. Might work in haystack 2.0.
)

query = create_unit_conversion_prompt(1.24, "Rm", "R")

unit_conversion_node(query)

['1240000']

### Querying Pipeline

In [93]:
querying_pipeline = Pipeline()
querying_pipeline.add_node(component=retriever, name="retriever", inputs=["Query"])
querying_pipeline.add_node(component=generation_node, name="prompt_node", inputs=["retriever"])
querying_pipeline.add_node(component=gen_parser, name="gen_parser", inputs=["prompt_node"])

In [94]:
output = querying_pipeline.run(
    query="What was the Black-owned spend in the year 2023?",
    params={
        "append": "Do not include the word 'Level' in the answer.",
        "retriever": {"debug": True, "top_k": 5},
        "prompt_node": {"debug": True},
        "gen_parser": {"debug": True},
    }
)

print(f"Answer: {output['answer']}, Unit: {output['unit']}")

ValueError: No node(s) or global parameter(s) named append found in pipeline.

In [76]:
# TODO: Write function to convert the answer to a more human-readable format.
# In particular, the markdown tables aren't very readable as a single line of text.

output["_debug"]

{'retriever': {'input': {'root_node': 'Query',
   'query': 'What was the Black-owned spend in the year 2023?',
   'top_k': 5,
   'debug': True},
  'output': {'documents': [<Document: {'content': '|    | Human Capital - Our people                | 2023   |   2022 |   2021 | 2020   | LoA 2023   |   Footnote |\n|----|-------------------------------------------|--------|--------|--------|--------|------------|------------|\n| 52 | Investment in black employees (R million) | 724,64 |    698 |    884 | 748,00 |            |          6 |', 'content_type': 'text', 'score': 0.501289664925134, 'meta': {'preceding_context': '- Significant fires, explosions and releases\n2\n6', 'following_context': '- Major fires, explosions and releases\n-\n-', 'page': 2}, 'id_hash_keys': ['content'], 'embedding': None, 'id': 'ad9e935dda72423edad23396d1eaa0d6'}>,
    <Document: {'content': '|    | Natural Capital - Our environment      |   2023 |   2022 |   2021 |   2020 | LoA 2023   | Footnote   |\n|----|-------

The `gpt-3.5-turbo` model has a context window of 4,096 tokens. As a result, my prompt is often
being truncated so that the prompt length and answer length (100 tokens) fit within the max token
limit. The updated GPT-3.5 model (`gpt-3.5-turbo-0125`) has a larger context window of 
16,385 tokens. Would be good to use this, if possible (may require using haystack 2.0)
The slightly older GPT-3.5 model `gpt-3.5-turbo-1106` has a larger context window and is available 
with haystack 1.0. I'll use this for now.

In [65]:
# Testing appending additional instructions to the query.

output = querying_pipeline.run(
    query="What was the B-BBEE status in the year 2021? Do not include the word 'Level' in the answer.",
    params={
        "retriever": {"debug": True},
        "prompt_node": {"debug": True},
        "gen_parser": {"debug": True},
    }
)

print(output['answer'])
print(output['unit'])

Batches: 100%|██████████| 1/1 [00:00<00:00, 55.06it/s]


4
None


In [66]:
output = querying_pipeline.run(
    query="What was the Employee turnover in the year 2021?",
    params={
        "retriever": {"debug": True},
        "prompt_node": {"debug": True},
        "gen_parser": {"debug": True},
    }
)

print(output['answer'])
print(output['unit'])

Batches: 100%|██████████| 1/1 [00:00<00:00, 39.46it/s]


3869
None


Should there be two pipelines - one for queries which may require unit conversion,
and another for queries that don't? For example, when retrieving the employee turnover,
the unit should always be 'None'. So why ask the model to try and retrieve this - want 
to make it easier for the model when possible.

In [67]:
querying_pipeline.draw()

Working very well. Only issue I have seen so far is not being able to answer "What was the 
GHG Scope 2 emissions in the year 2021?". 

## Class querying pipeline

I want more control over the pipeline. For example, I want the retriever to be called
with only the metric description (not the year), and for some AMKEYs I want to append 
additional instructions to the question. 

A class based solution might work well. Could be initilaised with the list of documents.

In [70]:
amkey_to_metric = pd.read_csv(AMKEY_TO_METRIC_PATH)
print(amkey_to_metric.head())

# Retrieve the metric for AMKEY=3
metric = amkey_to_metric[amkey_to_metric["AMKEY"] == 49]["ActivityMetric"].item()
print(metric)

   AMKEY                                     ActivityMetric
0      3              Advisory fees as per income statement
1      6  Air emissions of the following pollutants: (1) CO
2      7  Air emissions of the following pollutants: (2)...
3      8  Air emissions of the following pollutants: (3)...
4      9  Air emissions of the following pollutants: (4)...
B-BBEE Scorecard Level


In [73]:
amkey_to_synonym = pd.read_csv(AMKEY_TO_SYNONYM_PATH)
print(amkey_to_synonym.head())

# Retrieve the metric for AMKEY=7 and Group=Impala
metric = amkey_to_synonym[(amkey_to_synonym["AMKEY"] == 49) & (amkey_to_synonym["Group"] == "Sasol")]["ClientMetric"]

   AMKEY   Group                                     ActivityMetric  \
0      7  Impala  Air emissions of the following pollutants: (2)...   
1      7   Sasol  Air emissions of the following pollutants: (2)...   
2      8   Sasol  Air emissions of the following pollutants: (3)...   
3      8     Ssw  Air emissions of the following pollutants: (3)...   
4      8  Impala  Air emissions of the following pollutants: (3)...   

                            ClientMetric  
0                     Total indirect Nox  
1       Nitrogen oxides (NOx) (kilotons)  
2       Sulphur oxides (SOx ) (kilotons)  
3                          SO2 emissions  
4  Total direct SO2 + Total indirect SO2  


In [74]:
metric

17    B-BBEE verification certificate
Name: ClientMetric, dtype: object

In [160]:
# Check if metric is empty
if metric.empty:
    print("No metric found.")

No metric found.


In [171]:
amkey_to_unit = pd.read_csv(AMKEY_TO_UNIT_PATH)
print(amkey_to_unit.head())

unit = amkey_to_unit[amkey_to_unit["AMKEY"] == 6]["Unit"]

   AMKEY                                     ActivityMetric  Unit
0      3              Advisory fees as per income statement  rand
1      6  Air emissions of the following pollutants: (1) CO   NaN
2      7  Air emissions of the following pollutants: (2)...   NaN
3      8  Air emissions of the following pollutants: (3)...   NaN
4      9  Air emissions of the following pollutants: (4)...   NaN


In [151]:
class QueryPipeline2:
    def __init__(
            self,
            docs: list[Document]
        ):
        """
        Initalise the components of the query pipeline.

        Parameters
        ----------
        docs : list[Document]
            The documents to provide context for the queries.
        """
        self.docs = docs

        self.document_store = None
        self.retriever = None
        self.generation_llm = None

        self.initialise_document_store()
        self.initialise_retriever()
        self.initialise_generation_llm()

    def initialise_document_store(self):
        logger.info("Initialising document store")
        self.document_store = InMemoryDocumentStore(embedding_dim=384)
        self.document_store.delete_documents()
        self.document_store.write_documents(docs)

    def initialise_retriever(self, top_k=3):
        logger.info("Initialising retriever")
        self.retriever = EmbeddingRetriever(
            embedding_model="sentence-transformers/all-MiniLM-L6-v2",
            document_store=self.document_store,
            top_k=top_k
        )
        self.document_store.update_embeddings(retriever=self.retriever)

    def initialise_generation_llm(self):
        logger.info("Initialising generation LLM")
        self.generation_llm = PromptNode(
            model_name_or_path="gpt-3.5-turbo-1106",
            api_key=OPENAI_API_KEY,
            model_kwargs={"temperature": 0}
        )

    def query(self, metric: str, year: int):
        """
        Return the value of a metric for a given year.

        Uses retrieval augmented generation to answer the query.

        Parameters
        ----------
        metric : str
            The metric to retrieve.

        year : int
            The year to retrieve the metric for.
        """
        context_documents = self.retriever.retrieve(metric)

        prompt = self._create_generation_prompt(metric, year, context_documents)

        answer = self.generation_llm(prompt)[0]

        value, unit = self.parse_answer(answer)

        return value, unit

    def _create_generation_prompt(
            self,
            metric: str,
            year: int,
            docs: list[Document]
        )-> str:
        """
        Create a prompt for the generation LLM.

        Parameters
        ----------
        metric : str
            The metric to retrieve.

        year : int
            The year to retrieve the metric for.

        Returns
        -------
        prompt : str
            The prompt for the generation LLM.
        """
        query = f"What was the {metric} in the year {year}?"

        # TODO: Generalise this to other metrics.
        if metric in ["B-BBEE status", "B-BBEE scorecard level"]:
            query += " Do not include the word 'Level' in the answer."

        context = "\n\n".join([doc.content for doc in docs])

        prompt = f"""Use the following pieces of context to answer the question at the end.
                    The context may be text or a markdown table.
                    Just retrieve the answer from the context. Please don't do any unit conversion.
                    If you don't know the answer, please return 'None' for the answer and unit.
                    Do not return any words other than 'Answer' and 'Unit' in the answer.
                    Please return the answer in the format 'Answer: <number or None>, Unit: <unit or None>'.

                    \n\n Context: {context} \n\n Question: {query} \n\n Answer:"""

        return prompt

    def parse_answer(self, answer: str) -> tuple[int | None, str | None]:
        """
        Parse the answer returned by the generation LLM.

        Parameters
        ----------
        answer : str
            The answer returned by the generation LLM. This is expected to be in the
            format "Answer: <number or None>, Unit: <unit or None>".

        Returns
        -------
        value : int | None
            The value from the answer.

        unit : str | None
            The unit from the answer.
        """
        value, unit = answer.split(", ")
        value = value.split(": ")[1]
        unit = unit.split(": ")[1]

        if value == "None":
            value = None
        else:
            value = value.replace(" ", "")
            value = value.replace(",", "")
            value = int(value)

        return value, unit


In [152]:
query_class = QueryPipeline2(docs)

2024-02-18 22:03:20.107 | INFO     | __main__:initialise_document_store:25 - Initialising document store
2024-02-18 22:03:20.115 | INFO     | __main__:initialise_retriever:31 - Initialising retriever
Batches: 100%|██████████| 24/24 [00:00<00:00, 35.90it/s]ocs/s]
Documents Processed: 10000 docs [00:00, 14478.99 docs/s]       
2024-02-18 22:03:22.888 | INFO     | __main__:initialise_generation_llm:40 - Initialising generation LLM


In [153]:
query_class.query("Black-owned spend", 2023)

Batches: 100%|██████████| 1/1 [00:00<00:00, 208.05it/s]


(41700, 'Rm')

## Validation of querying pipeline

In [68]:
VALIDATION_FILE = Path("/home/tomw/unifi-pdf-llm/data/validate/rag_esg_metric_validation.csv")

In [154]:
def validate_rag(docs: list[Document]) -> pd.DataFrame:
    validation_df = pd.read_csv(VALIDATION_FILE)
    results_df = validation_df.copy(deep=True)
    query_class = QueryPipeline2(docs)

    # Add row to results_df for the generated answer
    results_df["Generated"] = None

    for idx, row in validation_df.iterrows():
        year = row["Year"]
        metric = row["Metric"]

        value, _ = query_class.query(metric, year)

        results_df.at[idx, "Generated"] = value

    return results_df


In [155]:
results = validate_rag(querying_pipeline)

2024-02-18 22:04:59.204 | INFO     | __main__:initialise_document_store:25 - Initialising document store
2024-02-18 22:04:59.211 | INFO     | __main__:initialise_retriever:31 - Initialising retriever
Batches: 100%|██████████| 24/24 [00:00<00:00, 31.38it/s]ocs/s]
Documents Processed: 10000 docs [00:00, 12705.90 docs/s]       
2024-02-18 22:05:02.079 | INFO     | __main__:initialise_generation_llm:40 - Initialising generation LLM
Batches: 100%|██████████| 1/1 [00:00<00:00, 217.39it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 85.72it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 88.78it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 170.85it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 200.06it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 208.46it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 98.09it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 122.86it/s]


In [157]:
results

Unnamed: 0,Company,Year,Metric,Unit,Source,Content Type,Page,Notes,Answer,Generated
0,SASOL,2023,Number of permanent employees,,SASOL Sustainability Report 2023 20-09_0.pdf,Table,17,May get confuesed with employee numbers in tab...,28657,28657
1,SASOL,2023,Employee turnover,,SASOL Sustainability Report 2023 20-09_0.pdf,Text,18,,1725,1725
2,SASOL,2023,B-BBEE status,,SASOL Sustainability Report 2023 20-09_0.pdf,Table,57,,3,3
3,SASOL,2023,B-BBEE scorecard level,,SASOL Sustainability Report 2023 20-09_0.pdf,Table,57,,3,3
4,SASOL,2023,Black women-owned spend,rand,SASOL Sustainability Report 2023 20-09_0.pdf,Text,11,May be retrieved from table. Requires conversi...,28500,28500
5,SASOL,2023,Black-owned spend,rand,SASOL Sustainability Report 2023 20-09_0.pdf,Text,11,Required conversion from million rand to rand,41700,41700
6,SASOL,2023,Number of fatalities,,SASOL Sustainability Report 2023 20-09_0.pdf,Text,11,,2,2
7,SASOL,2023,Number of undergraduate and postgraduate bursa...,,SASOL Sustainability Report 2023 20-09_0.pdf,Text,18,,544,544


## Query Class

In [81]:
class QueryPipeline:
    """Retrieve AMKEY values from documents using a RAG approach."""

    def __init__(
            self,
            docs: list[Document],
            company: str,
            amkey_to_metric_path: str,
            amkey_to_synonym_path: str,
            amkey_to_unit_path : str
        ):
        """
        Initalise the components of the query pipeline.

        Parameters
        ----------
        docs : list[Document]
            The documents to provide context for the queries.

        company : str
            The company the documents are for.

        amkey_to_metric_path : str
            Path to a csv file mapping AMKEY to metric.

        amkey_to_synonym_path : str
            Path to a csv file mapping AMKEY and company to metric synonym.

        amkey_to_unit_path : str
            Path to a csv file mapping AMKEY to desired unit.
        """
        self.docs = docs
        self.company = company
        self.amkey_to_metric_path = amkey_to_metric_path
        self.amkey_to_synonym_path = amkey_to_synonym_path
        self.amkey_to_unit_path = amkey_to_unit_path

        self.document_store = None
        self.retriever = None
        self.generation_llm = None
        self.unit_conversion_llm = None
        self.amkey_to_metric = None
        self.amkey_to_synonym = None
        self.amkey_to_unit = None

        self.initialise_document_store()
        self.initialise_retriever()
        self.initialise_generation_llm()
        self.initialise_unit_conversion_llm()
        self.initialise_mappings()

    def initialise_document_store(self):
        logger.info("Initialising document store")
        self.document_store = InMemoryDocumentStore(embedding_dim=384)
        self.document_store.delete_documents()
        self.document_store.write_documents(docs)

    def initialise_retriever(self, top_k=3):
        logger.info("Initialising retriever")
        self.retriever = EmbeddingRetriever(
            embedding_model="sentence-transformers/all-MiniLM-L6-v2",
            document_store=self.document_store,
            top_k=top_k
        )
        self.document_store.update_embeddings(retriever=self.retriever)

    def initialise_generation_llm(self):
        logger.info("Initialising generation LLM")
        self.generation_llm = PromptNode(
            model_name_or_path="gpt-3.5-turbo-1106",
            api_key=OPENAI_API_KEY,
            model_kwargs={"temperature": 0}
        )

    def initialise_unit_conversion_llm(self):
        logger.info("Initialising unit conversion LLM")
        self.unit_conversion_llm = PromptNode(
            model_name_or_path="gpt-3.5-turbo",
            api_key=OPENAI_API_KEY,
            model_kwargs={"temperature": 0}
        )

    def initialise_mappings(self):
        logger.info("Initialising mappings")
        self.amkey_to_metric = pd.read_csv(self.amkey_to_metric_path)
        self.amkey_to_synonym = pd.read_csv(self.amkey_to_synonym_path)
        self.amkey_to_unit = pd.read_csv(self.amkey_to_unit_path)

    def query(self, amkey: int, year: int):
        """
        Return the value associated with an AMKEY for a given year.

        Uses retrieval augmented generation to retrieve the value.

        Parameters
        ----------
        amkey : int
            The AMKEY of the metric to retrieve.

        year : int
            The year to retrieve the metric for.

        Returns
        -------
        value : int
            The value associated with the AMKEY for the given year.
        """
        logger.debug(f"Retrieving AMKEY: {amkey}")

        metric = self.retrieve_metric_description(amkey)
        logger.debug(f"Retrieving metric: {metric}")

        context_documents = self.retriever.retrieve(metric)
        context_str = "\n\n".join([doc.content for doc in context_documents])
        logger.debug(f"Retrieved context documents:\n {context_str}")

        append = self._retrieve_additional_appended_instructions(amkey)
        logger.debug(f"Appending: {append}")

        prompt = self._create_generation_prompt(metric, year, context_documents, append)

        answer = self.generation_llm(prompt)[0]
        logger.debug(f"Generated answer: {answer}")

        value, unit = self.parse_answer(answer)

        required_unit = self.retrieve_unit(amkey)
        logger.debug(f"Required unit: {required_unit}")

        if required_unit is not None and value is not None:
            if unit != required_unit:
                unit_conversion_prompt = self.create_unit_conversion_prompt(value, unit, required_unit)
                value = self.unit_conversion_llm(unit_conversion_prompt)

        return value

    def _retrieve_additional_appended_instructions(self, amkey: int) -> str:
        """
        Return additional instructions to append to the query.

        Parameters
        ----------
        amkey : int
            The AMKEY of the metric to retrieve.

        Returns
        -------
        append : str
            Additional instructions to append to the query.
        """
        if amkey in [47, 48, 49]:
            append = "Do not include the word 'Level' in the answer."
        else:
            append = ""

        return append

    def _create_generation_prompt(
            self,
            metric: str,
            year: int,
            docs: list[Document],
            append: str
        )-> str:
        """
        Create a prompt for the generation LLM.

        Parameters
        ----------
        metric : str
            The metric to retrieve.

        year : int
            The year to retrieve the metric for.

        docs : list[Document]
            The documents to provide context for the queries.

        append : str
            Additional instructions to append to the query.

        Returns
        -------
        prompt : str
            The prompt for the generation LLM.
        """
        query = f"What was the {metric} in the year {year}?"

        context = "\n\n".join([doc.content for doc in docs])

        prompt = f"""Use the following pieces of context to answer the question at the end.
                    The context may be text or a markdown table.
                    Just retrieve the answer from the context. Please don't do any unit conversion.
                    If you don't know the answer, please return 'None' for the answer and unit.
                    Do not return any words other than 'Answer' and 'Unit' in the answer.
                    Please return the answer in the format 'Answer: <number or None>, Unit: <unit or None>'.

                    \n\n Context: {context} \n\n Question: {query} {append}\n\n Answer:"""

        return prompt

    def create_unit_conversion_prompt(self, value: int, unit: str, target_unit: str) -> str:
        prompt=f"""You are an expert unit converter. You are aware of how to convert
                    between different units within the same system of measurement.
                    For example, 1236 million = 1236 * 1 million = 1236 * 1000000 = 1236000000.
                    For example, to convert from Rm to R, you would multiply by 1000000. This is because
                    1 Rm = 1000000 R.
                    Please return a single number as your answer. Do not elaborate or give
                    any context.\n\n

                    What is {value} {unit} in {target_unit}? \n\n Answer:"""

        return prompt

    def parse_answer(self, answer: str) -> tuple[float | None, str | None]:
        """
        Parse the answer returned by the generation LLM.

        Parameters
        ----------
        answer : str
            The answer returned by the generation LLM. This is expected to be in the
            format "Answer: <number or None>, Unit: <unit or None>".

        Returns
        -------
        value : float | None
            The value from the answer.

        unit : str | None
            The unit from the answer.
        """
        logger.debug(f"Parsing answer: {answer}")

        value, unit = answer.split(", ")
        value = value.split(": ")[1]
        unit = unit.split(": ")[1]

        if value.strip() == "None":
            value = None
        else:
            value = value.replace(" ", "")
            value = value.replace(",", "")
            value = float(value)

        return value, unit

    def retrieve_metric_description(self, amkey: int) -> str:
        """
        Return the description of a metric.

        If a company-specific description is available, it is returned. Otherwise, the
        generic description is returned.

        Parameters
        ----------
        amkey : int
            The AMKEY of the metric.

        Returns
        -------
        metric : str
            The description of the metric.
        """
        metric = self.retrieve_company_metric_description(amkey)
        if metric is None:
            metric = self.retrieve_generic_metric_description(amkey)

        return metric

    def retrieve_company_metric_description(self, amkey: int) -> str | None:
        """
        Return the company-specific description of a metric, if available.

        Parameters
        ----------
        amkey : int
            The AMKEY of the metric.

        Returns
        -------
        metric : str | None
            The company-specific description of the metric, if available.
            Otherwise, None.
        """
        metric = self.amkey_to_synonym[
            (self.amkey_to_synonym["AMKEY"] == amkey)
            & (self.amkey_to_synonym["Group"] == self.company)
        ]["ClientMetric"]

        if metric.empty:
            metric = None
        else:
            metric = metric.item()

        return metric

    def retrieve_generic_metric_description(self, amkey: int) -> str:
        """
        Return the generic description of a metric.

        Parameters
        ----------
        amkey : int
            The AMKEY of the metric.

        Returns
        -------
        metric : str
            The description of the metric.

        Raises
        ------
        ValueError
            If the AMKEY is invalid.
        """
        try:
            metric = self.amkey_to_metric[
                self.amkey_to_metric["AMKEY"] == amkey
            ]["ActivityMetric"].item()
        except Exception:
            raise ValueError(f"Invalid AMKEY {amkey}")

        return metric

    def retrieve_unit(self, amkey: int) -> str | None:
        """
        Return the required unit for a metric.

        Parameters
        ----------
        amkey : int
            The AMKEY of the metric.

        Returns
        -------
        unit : str | None
            The required unit for the metric, if specified. Otherwise, None.
        """
        try:
            unit = self.amkey_to_unit[self.amkey_to_unit["AMKEY"] == amkey]["Unit"][0]
        except KeyError:
            unit = None

        return unit


In [37]:
# TODO: Test the QueryPipeline class.
query_pipeline = QueryPipeline(
    docs=docs,
    company="Sasol",
    amkey_to_metric_path=AMKEY_TO_METRIC_PATH,
    amkey_to_synonym_path=AMKEY_TO_SYNONYM_PATH,
    amkey_to_unit_path=AMKEY_TO_UNIT_PATH
)

2024-02-22 21:42:22.953 | INFO     | __main__:initialise_document_store:53 - Initialising document store
2024-02-22 21:42:22.961 | INFO     | __main__:initialise_retriever:59 - Initialising retriever
Batches: 100%|██████████| 27/27 [00:00<00:00, 28.04it/s]ocs/s]
Documents Processed: 10000 docs [00:00, 10051.46 docs/s]       
2024-02-22 21:42:25.967 | INFO     | __main__:initialise_generation_llm:68 - Initialising generation LLM
2024-02-22 21:42:25.968 | INFO     | __main__:initialise_unit_conversion_llm:76 - Initialising unit conversion LLM
2024-02-22 21:42:25.968 | INFO     | __main__:initialise_mappings:84 - Initialising mappings


In [38]:
query_pipeline.query(
    amkey=49,
    year=2021
)

2024-02-22 21:42:27.039 | DEBUG    | __main__:query:108 - Retrieving AMKEY: 49
2024-02-22 21:42:27.041 | DEBUG    | __main__:query:111 - Retrieving metric: B-BBEE verification certificate
Batches: 100%|██████████| 1/1 [00:00<00:00, 192.07it/s]
2024-02-22 21:42:27.075 | DEBUG    | __main__:query:115 - Retrieved context documents:
 | Annual Financial Statements   | Deloitte & Touche         |
|-------------------------------|---------------------------|
| B-BBEE contributor Level      | 1 st Verification Networx |

|              | Unit of measure   |   2021 |   2020 |   2019 |   2018 |
|--------------|-------------------|--------|--------|--------|--------|
| B-BBEE Level |                   |      4 |      4 |      4 |      3 |

| Annual Financial Statements   | Deloitte & Touche                                                        |
|-------------------------------|--------------------------------------------------------------------------|
| Operational certification     | Includes 

4.0

### Validating Tongaat 2021

In [10]:
from mapping import COMPANY_YEAR_PDF_MAPPING

AZURE_CONVERTER_DIR = "/home/tomw/unifi-pdf-llm/data/azureconverter_outputs"
"""Path to directory with json outputs from AzureConverter."""

TRAIN_CSV_PATH = "/home/tomw/unifi-pdf-llm/data/Train.csv"
"""Path to the Train.csv file."""


def load_documents(company: str, year: int) -> list[Document]:
    """
    Load documents for a company and year.

    Requires the corresponding pdf file(s) to have been previously converted to json
    using the AzureConverter.

    Parameters
    ----------
    company : str
        The company to load documents for.

    year : int
        The year to load documents for.

    Returns
    -------
    company_docs : list[Document]
        The documents for the company and year.

    Raises
    ------
    ValueError
        If no documents are found for the company and year.
    """
    company_docs = []
    converter = AzureConverter(
        endpoint="https://azureconverter.cognitiveservices.azure.com/",
        credential_key=AZURE_CONVERTER_KEY,
        model_id="prebuilt-layout",  # Was "prebuilt-document"
    )

    try:
        file_name_list = COMPANY_YEAR_PDF_MAPPING[company][year]
    except KeyError:
        raise ValueError(f"No documents found for {company} in {year}")

    for file_name in file_name_list:
        file_name = file_name.replace(".pdf", ".json")
        file_path = Path(AZURE_CONVERTER_DIR) / file_name
        logger.info(f"Loading documents from {file_path}")
        docs = converter.convert_azure_json(file_path=file_path)
        company_docs.extend(docs)

    return company_docs


In [69]:
def validate_retrieval(company: str, year: int=2021) -> pd.DataFrame:
    """
    Returns a DataFrame with the results of the retrieval validation.
    """
    train_df = pd.read_csv(TRAIN_CSV_PATH)

    # Remove all rows with NaN in all of the columns [2021_Value, 2020_Value, 2019_Value]
    # We are validating only the AMKEYs that have a retrievable value in the documents.
    train_df = train_df.dropna(subset=["2021_Value", "2020_Value", "2019_Value"], how="all")

    # Restrict to the company
    train_df = train_df[train_df["ID"].str.contains(f"X_{company}")]
    train_df.reset_index(drop=True, inplace=True)

    # Load and preprocess the documents
    docs = load_documents(company, year)
    docs = preprocess_documents(docs, window_size=1)

    query_pipeline = QueryPipeline(
        docs=docs,
        company=company,
        amkey_to_metric_path=AMKEY_TO_METRIC_PATH,
        amkey_to_synonym_path=AMKEY_TO_SYNONYM_PATH,
        amkey_to_unit_path=AMKEY_TO_UNIT_PATH
    )

    results_df = train_df.copy(deep=True)
    results_df.drop(columns=["2020_Value", "2019_Value"], inplace=True)

    # Loop over the rows in the dataframe and retrieve the value for each AMKEY
    for idx, row in train_df.iterrows():
        amkey = int(row["ID"].split("_")[0])

        metric = query_pipeline.retrieve_metric_description(amkey)
        results_df.at[idx, "Metric"] = metric

        value = query_pipeline.query(amkey, year)
        results_df.at[idx, f"{year}_Generated"] = value

    results_df[f"2021_Value"] = results_df["2021_Value"].astype(float)
    results_df["Correct"] = results_df.apply(
        lambda row: row[f"{year}_Generated"] == row["2021_Value"] or
        (pd.isna(row[f"{year}_Generated"]) and pd.isna(row["2021_Value"])),
        axis=1
    )

    # Reordering the columns
    results_df = results_df[["ID", "Metric", "2021_Value", f"{year}_Generated", "Correct"]]

    accuracy = results_df["Correct"].sum() / len(results_df)

    logger.info(f"Accuracy: {accuracy}")

    return results_df, accuracy


In [70]:
results_df, accuracy = validate_retrieval("Tongaat", 2021)

2024-02-22 22:17:19.889 | INFO     | __main__:load_documents:50 - Loading documents from /home/tomw/unifi-pdf-llm/data/azureconverter_outputs/2021ESG.json
2024-02-22 22:17:21.726 | INFO     | __main__:initialise_document_store:53 - Initialising document store
2024-02-22 22:17:21.733 | INFO     | __main__:initialise_retriever:59 - Initialising retriever
Batches: 100%|██████████| 27/27 [00:01<00:00, 25.07it/s]ocs/s]
Documents Processed: 10000 docs [00:01, 9014.40 docs/s]        
2024-02-22 22:17:24.840 | INFO     | __main__:initialise_generation_llm:68 - Initialising generation LLM
2024-02-22 22:17:24.841 | INFO     | __main__:initialise_unit_conversion_llm:76 - Initialising unit conversion LLM
2024-02-22 22:17:24.842 | INFO     | __main__:initialise_mappings:84 - Initialising mappings
2024-02-22 22:17:24.847 | DEBUG    | __main__:query:108 - Retrieving AMKEY: 12
2024-02-22 22:17:24.848 | DEBUG    | __main__:query:111 - Retrieving metric: Total injury frequency rate (TIFR) – employees an

In [71]:
display(results_df)

Unnamed: 0,ID,Metric,2021_Value,2021_Generated,Correct
0,12_X_Tongaat,Total injury frequency rate (TIFR) – employees...,1.331,1.331,True
1,28_X_Tongaat,Total – company managed/farmed land (owned and...,60204.0,60204.0,True
2,49_X_Tongaat,B-BBEE Level,4.0,4.0,True
3,52_X_Tongaat,Overall Board and Committee meeting attendance,99.0,99.0,True
4,114_X_Tongaat,Energy efficiency: total direct and indirect e...,16.63,17.9,False
5,122_X_Tongaat,"Fatal injury frequency rate (FIFR, i.e. number...",0.005,0.005,True
6,128_X_Tongaat,Carbon emissions – Scope 1,505575.0,505575.0,True
7,129_X_Tongaat,Carbon emissions – Scope 2,51539.0,51539.0,True
8,138_X_Tongaat,Hazardous waste disposed of at appropriate fac...,184.0,186.0,False
9,151_X_Tongaat,"Lost time injury frequency rate (LTIFR, i.e. n...",0.093,0.093,True


The Tongaat 2021 report gives two tables - a comprehensive data table, and a 
supplemental environmental data table (inclusive of data from operations sold off, 
and not disposed of in financial year 2021). Some metrics are included in both.
The '2021_Value's are all from the comprehensive data table. Most of the errors
occur because the LLM generates the answer from the supplemental environmental 
data table. 

Most of the other errors are from two-line rows which have been split into 
individual rows in the pandas and markdown tables. The first row is retrieved,
but it's the second row that has the value. Wouldn't have this issue with a sliding
window > 1 probably. 

In [72]:
# Original Accuracy: 0.7377049180327869
# Accuracy after removing index: 0.819672131147541

## Debugging Individual Queries

In [72]:
COMPANY = "Tongaat"
YEAR = 2021

In [82]:
query_pipeline = QueryPipeline(
    docs=docs,
    company=COMPANY,
    amkey_to_metric_path=AMKEY_TO_METRIC_PATH,
    amkey_to_synonym_path=AMKEY_TO_SYNONYM_PATH,
    amkey_to_unit_path=AMKEY_TO_UNIT_PATH
)

2024-02-22 22:22:14.827 | INFO     | __main__:initialise_document_store:53 - Initialising document store
2024-02-22 22:22:14.834 | INFO     | __main__:initialise_retriever:59 - Initialising retriever
Batches: 100%|██████████| 27/27 [00:01<00:00, 26.23it/s]ocs/s]
Documents Processed: 10000 docs [00:01, 9424.25 docs/s]        
2024-02-22 22:22:17.928 | INFO     | __main__:initialise_generation_llm:68 - Initialising generation LLM
2024-02-22 22:22:17.929 | INFO     | __main__:initialise_unit_conversion_llm:76 - Initialising unit conversion LLM
2024-02-22 22:22:17.930 | INFO     | __main__:initialise_mappings:84 - Initialising mappings


In [83]:
# This is an example where additional context of the table would be useful (rows
# above and below the selected row).
# The description of the row is spread over two rows, with the values
# in the second row.

ans = query_pipeline.query(
    amkey=622,
    year=YEAR
)


2024-02-22 22:22:18.442 | DEBUG    | __main__:query:108 - Retrieving AMKEY: 622
2024-02-22 22:22:18.444 | DEBUG    | __main__:query:111 - Retrieving metric: Rand value of investments in COVID-19 avoidance, mitigation and treatment
Batches: 100%|██████████| 1/1 [00:00<00:00, 186.31it/s]
2024-02-22 22:22:18.482 | DEBUG    | __main__:query:115 - Retrieved context documents:
 |                                                  | Unit of measure   | 2021   | 2020   | 2019   | 2018   |
|--------------------------------------------------|-------------------|--------|--------|--------|--------|
| Rand value of investments in COVID-19 avoidance, |                   |        |        |        |        |

|                                   | Unit of measure   |   2021 | 2020   | 2019   | 2018   |
|-----------------------------------|-------------------|--------|--------|--------|--------|
| Number of COVID-19-related deaths | number            |     20 |        |        |        |

| BANKS   | St

In [84]:
ans = query_pipeline.query(
    amkey=550,
    year=YEAR
)

2024-02-22 22:22:19.211 | DEBUG    | __main__:query:108 - Retrieving AMKEY: 550
2024-02-22 22:22:19.212 | DEBUG    | __main__:query:111 - Retrieving metric: New cases of NIHL* – employees and contractors
Batches: 100%|██████████| 1/1 [00:00<00:00, 137.85it/s]
2024-02-22 22:22:19.248 | DEBUG    | __main__:query:115 - Retrieved context documents:
 |                                                | Unit of measure   | 2021   |   2020 |   2019 | 2018   |
|------------------------------------------------|-------------------|--------|--------|--------|--------|
| New cases of NIHL* - employees and contractors | number            |        |      8 |      1 |        |

|                                                     | Unit of measure   |   2021 |   2020 |   2019 | 2018   |
|-----------------------------------------------------|-------------------|--------|--------|--------|--------|
| Non-work related deaths - employees and contractors | number            |      7 |      3 |      3 |    

In [86]:
ans = query_pipeline.query(
    amkey=9,
    year=YEAR
)

print(f'Answer: {ans}')

2024-02-22 22:22:28.914 | DEBUG    | __main__:query:108 - Retrieving AMKEY: 9
2024-02-22 22:22:28.916 | DEBUG    | __main__:query:111 - Retrieving metric: Air emissions of the following pollutants: (4) particulate matter (PM10)
Batches: 100%|██████████| 1/1 [00:00<00:00, 53.39it/s]
2024-02-22 22:22:28.966 | DEBUG    | __main__:query:115 - Retrieved context documents:
 | NATURAL CAPITAL   | NATURAL CAPITAL                         | LAND   |
|-------------------|-----------------------------------------|--------|
| ·                 | Improve waste efficiency by 5% by 2025. | pollution of air through discharges of particulate matter,
or even via inadequate suppression of dust emanating        |

| CARBON EMISSIONS (TONS OF CARBON DIOXIDE EQUIVALENTS, CO2-e)   | 2021    | 2020    | 2019    | 2018    |
|----------------------------------------------------------------|---------|---------|---------|---------|
| Total scope 1 emissions                                        | 505 575 | 704 98

Answer: None


### Validating Uct 2021