In [1]:
"""
Table QA - RAG approach with tables converted to markdown format.

See https://haystack.deepset.ai/tutorials/22_pipeline_with_promptnode
"""
import os
from pathlib import Path

import pandas as pd
from haystack import Document
from haystack.nodes import AzureConverter, EmbeddingRetriever, PromptNode, PromptTemplate, AnswerParser
from haystack.document_stores import InMemoryDocumentStore
from haystack.pipelines import Pipeline
from haystack.utils import print_answers
from haystack.nodes import BaseComponent
from loguru import logger

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
AZURE_CONVERTER_KEY = os.environ.get("AZURE_CONVERTER_KEY")

AMKEY_TO_METRIC_PATH = "/home/tomw/unifi-pdf-llm/data/AMKEY_GoldenStandard.csv"
AMKEY_TO_SYNONYM_PATH = "/home/tomw/unifi-pdf-llm/data/ActivityMetricsSynonyms.csv"
AMKEY_TO_UNIT_PATH = "/home/tomw/unifi-pdf-llm/data/AMKEY_unit_conversion.csv"

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


## Convert PDF

In [2]:
def convert_validation_pdf() -> list[Document]:
    """
    Returns a list of Documents from the validation PDF.

    Uses the AzureConverter to convert the PDF to tables and text documents.

    Returns
    -------
    converted_docs : list[Document]
        The list of Documents from the validation PDF.
    """
    converted_docs = []
    file_path = Path("/home/tomw/unifi-pdf-llm/data/validate/SASOL Sustainability Report 2023 20-09_0_minimal_split")

    converter = AzureConverter(
        endpoint="https://azureconverter.cognitiveservices.azure.com/",
        credential_key=AZURE_CONVERTER_KEY,
        model_id="prebuilt-layout",  # Was "prebuilt-document"
        save_json=True
    )

    for fn in file_path.glob("*.pdf"):
        print(f"Converting {fn}")
        docs = converter.convert(file_path=fn, meta=None)
        converted_docs.extend(docs)

    return converted_docs

In [3]:
def load_validation_pdf_from_json() -> list[Document]:
    """
    Return a list of Documents from the validation PDF, loaded from JSON files.

    Requires AzureConverter to have been run on the PDF and saved the JSON files.

    Returns
    -------
    converted_docs : list[Document]
        The list of Documents from the validation PDF.
    """
    converted_docs = []
    file_path = Path("/home/tomw/unifi-pdf-llm/data/validate/SASOL Sustainability Report 2023 20-09_0_minimal_split")

    converter = AzureConverter(
        endpoint="https://azureconverter.cognitiveservices.azure.com/",
        credential_key=AZURE_CONVERTER_KEY,
        model_id="prebuilt-layout",  # Was "prebuilt-document"
    )

    for fn in file_path.glob("*.json"):
        print(f"Loading {fn}")
        docs = converter.convert_azure_json(file_path=fn)
        converted_docs.extend(docs)

    return converted_docs

In [4]:
docs = load_validation_pdf_from_json()

Loading /home/tomw/unifi-pdf-llm/data/validate/SASOL Sustainability Report 2023 20-09_0_minimal_split/SASOL Sustainability Report 2023 20-09_0_minimal [7-8].json
Loading /home/tomw/unifi-pdf-llm/data/validate/SASOL Sustainability Report 2023 20-09_0_minimal_split/SASOL Sustainability Report 2023 20-09_0_minimal [5-6].json
Loading /home/tomw/unifi-pdf-llm/data/validate/SASOL Sustainability Report 2023 20-09_0_minimal_split/SASOL Sustainability Report 2023 20-09_0_minimal [11].json
Loading /home/tomw/unifi-pdf-llm/data/validate/SASOL Sustainability Report 2023 20-09_0_minimal_split/SASOL Sustainability Report 2023 20-09_0_minimal [3-4].json
Loading /home/tomw/unifi-pdf-llm/data/validate/SASOL Sustainability Report 2023 20-09_0_minimal_split/SASOL Sustainability Report 2023 20-09_0_minimal [1-2].json
Loading /home/tomw/unifi-pdf-llm/data/validate/SASOL Sustainability Report 2023 20-09_0_minimal_split/SASOL Sustainability Report 2023 20-09_0_minimal [9-10].json


## Preprocess Documents

**TODO**: Test removing the index from the tables. I don't think it adds much. Could be 
added as context to each document, and then used to recreate context of row. 

In [5]:
def preprocess_documents(
        docs: list[Document],
        window_size: int=5
    ) -> list[Document]:
    """
    Preprocess the documents.

    Parameters
    ----------
    docs : list[Document]
        The documents to preprocess.

    window_size : int
        The size of the sliding window used to split the tables.

    Returns
    -------
    docs : list[Document]
        The preprocessed documents.
    """
    preprocessed_docs = []

    for doc in docs:
        if doc.content_type == "table":
            doc.content = clean_table_column_names(doc.content)
            sliced_table_docs = slice_table_document(doc, window_size)
            preprocessed_docs.extend(sliced_table_docs)
        else:
            preprocessed_docs.append(doc)

    convert_tables_to_markdown(preprocessed_docs)

    return preprocessed_docs


def clean_table_column_names(df: pd.DataFrame, replace: str=' - ') -> pd.DataFrame:
    """
    Return a DataFrame with newlines removed from column headers.

    Parameters
    ----------
    df : pd.Dataframe
        The DataFrame to clean.

    replace: str
        The string to replace newlines with.

    Returns
    -------
    df : pd.Dataframe
        The dataframe with newlines removed from column headers.
    """
    df.columns = df.columns.str.replace('\n', replace)
    return df


def slice_table_document(doc: Document, window_size: int=5) -> list[Document]:
    """
    Return a list of documents, each containing a table with `window_size` rows.

    A sliding window approach is used to split the table into smaller tables. The
    returned documents have the same metadata as the original document, except for
    the content and id.

    Parameters
    ----------
    doc : Document
        Document with content_type "table".

    window_size : int
        The size of the sliding window.

    Returns
    -------
    docs : list[Document]
        A list of documents, each one containing a table with `window_size` rows.

    Raises
    ------
    ValueError
        If the document does not contain a table.
    """
    if doc.content_type != "table":
        raise ValueError("The document does not contain a table.")

    tables = _sliding_window(doc.content, window_size)
    docs = []
    for table in tables:
        new_doc = Document(content=table)
        for attr, value in doc.__dict__.items():
            if attr not in ["content", "id"]:
                setattr(new_doc, attr, value)
        docs.append(new_doc)

    return docs


def _sliding_window(df: pd.DataFrame, window_size: int) -> list[pd.DataFrame]:
    """
    Return a list of DataFrames, each containing a window of the original DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame to split.

    window_size : int
        The size of the sliding window.

    Returns
    -------
    tables : list[pandas.DataFrame]
        A list of DataFrames, each containing a window of the original DataFrame.
    """
    tables = [df.iloc[i:i+window_size] for i in range(len(df) - window_size + 1)]

    return tables


def convert_tables_to_markdown(docs: list[Document]) -> None:
    """
    Convert tables to markdown format in place.

    Parameters
    ----------
    docs : List[Document]
        List of Documents, some of which may have `content_type` 'table'.
    """
    for doc in docs:
        if doc.content_type == "table":
            _convert_table_to_markdown(doc)


def _convert_table_to_markdown(doc: Document) -> None:
    """
    Convert table to markdown format in place.

    Parameters
    ----------
    doc : Document
        Document with `content_type` table.

    Raises
    ------
    ValueError
        If `doc.content_type` is not "table".
    """
    if doc.content_type != "table":
        raise ValueError(f"Document content_type must be 'table', not '{doc.content_type}'")

    table = doc.content
    markdown_table = table.to_markdown(tablefmt="github")

    doc.content = markdown_table
    doc.content_type = "text"

In [6]:
docs = preprocess_documents(docs, window_size=1)

print(f"Number of documents: {len(docs)}\n")

print(docs[24].content)

Number of documents: 806

|    | Sasol in Society - Spend   | 2023 - Rm   | 2022 - Rm   | 2021 - Rm   |   2020 - Rm | LOA 2023   | Footnote   |
|----|----------------------------|-------------|-------------|-------------|-------------|------------|------------|
| 24 | Sasolburg                  | 1 360       | 1 366       | 1 586       |        1440 |            |            |


## Retrieval Augmented Generation

### Document Store

In [40]:
# TODO: Try to use other document stores (e.g. FAISS).

document_store = InMemoryDocumentStore(embedding_dim=384)

document_store.delete_documents()
document_store.write_documents(docs)

### Retriever

In [56]:
# TODO: I'm not sure what OpenAI embedding models are available. Is it possible to use
# their newest embedding models in Haystack v1?

# TODO: Look into other (non-OpenAI) embedding models that can be used with Haystack v1.

retriever = EmbeddingRetriever(
    embedding_model="sentence-transformers/all-MiniLM-L6-v2",
    document_store=document_store,
    top_k=3
)

document_store.update_embeddings(retriever=retriever)

Batches: 100%|██████████| 24/24 [00:00<00:00, 38.67it/s]ocs/s]
Documents Processed: 10000 docs [00:00, 15564.22 docs/s]       


In [57]:
# Testing the retriever

retrieved_tables = retriever.retrieve("What was the number of permanent employees 2021?", top_k=3)

# Get highest scored table
print(retrieved_tables[0].content)

Batches: 100%|██████████| 1/1 [00:00<00:00, 208.81it/s]

|    | Human Capital - Our people   | 2023   | 2022   | 2021   | 2020   | LoA 2023   | Footnote   |
|----|------------------------------|--------|--------|--------|--------|------------|------------|
|  1 | Permanent employees          | 28 657 | 28 279 | 28 725 |        |            |            |





In [58]:
# Testing the retriever

retrieved_tables = retriever.retrieve("What was the Number of fatalities in the year 2023?", top_k=3)

# Get highest scored table
print(retrieved_tables[0].content)

Batches: 100%|██████████| 1/1 [00:00<00:00, 168.49it/s]

|    | Human Capital - Our people               | 2023   |   2022 | 2021   | 2020   | LoA 2023   | Footnote   |
|----|------------------------------------------|--------|--------|--------|--------|------------|------------|
| 43 | Employee and service provider fatalities | I      |      1 |        |        |            |            |





In [87]:
# Testing the retriever

retrieved_tables = retriever.retrieve("Black-owned spend", top_k=5)

# Get highest scored table
print(retrieved_tables[0].content)

Batches: 100%|██████████| 1/1 [00:00<00:00, 124.62it/s]

|    | Sasol in Society - Spend   | 2023 - Rm   | 2022 - Rm   | 2021 - Rm   | 2020 - Rm   | LOA 2023   | Footnote   |
|----|----------------------------|-------------|-------------|-------------|-------------|------------|------------|
| 16 | Black-owned spend          | 41 700      | 33 600      | 23 800      |             |            |            |





### LLM

In [92]:
rag_prompt = PromptTemplate(
    prompt="""Use the following pieces of context to answer the question at the end.
              The context may be text or a markdown table.
              Just retrieve the answer from the context. Please don't do any unit conversion.
              If you don't know the answer, please return 'None' for the answer and unit.
              Do not return any words other than 'Answer' and 'Unit' in the answer.
              Please return the answer in the format 'Answer: <number or None>, Unit: <unit or None>'.

              \n\n Context: {join(documents)} \n\n Question: {query}? {append} \n\n Answer:""",
    output_parser=AnswerParser(),
)

generation_node = PromptNode(
    model_name_or_path="gpt-3.5-turbo-1106",  # Using 'gpt-3.5-turbo-1106' as it has a larger context window.
    api_key=OPENAI_API_KEY,
    default_prompt_template=rag_prompt,
    output_variable="generated_answer",
    model_kwargs={"temperature": 0}  # It doesn't seem that the `temperature` parameter is having any effect. Seems like a bug. Might work in haystack 2.0.
)

In [103]:
generation_node_2 = PromptNode(
    model_name_or_path="gpt-3.5-turbo-1106",  # Using 'gpt-3.5-turbo-1106' as it has a larger context window.
    api_key=OPENAI_API_KEY,
    model_kwargs={"temperature": 0}  # It doesn't seem that the `temperature` parameter is having any effect. Seems like a bug. Might work in haystack 2.0.
)

In [104]:
generation_node_2("What is the capital of France?")

['The capital of France is Paris.']

In [60]:
class GeneratedAnswerParser(BaseComponent):
    """
    Parse the output returned by the generation node.

    The output is expected to be in the format "Answer: <number or None>, Unit: <unit or None>".
    """
    outgoing_edges = 1

    def run(self, generated_answer):
        """
        Parse the output returned by the generation node.

        The output is expected to be in the format "Answer: <number or None>, Unit: <unit or None>".

        Parameters
        ----------
        generated : list[Answer]
            The output returned by the generation node.

        Returns
        -------
        dict
            A dictionary containing the answer and unit.
        """
        output = generated_answer[0].answer

        answer, unit = output.split(", ")
        answer = answer.split(": ")[1]
        unit = unit.split(": ")[1]

        if answer == "None":
            answer = None
        else:
            answer = answer.replace(" ", "")
            answer = answer.replace(",", "")
            answer = int(answer)

        return {"answer": answer, "unit": unit}, "output_1"

    def run_batch(self, **kwargs):
        # TODO: Implement batch processing.
        pass


gen_parser = GeneratedAnswerParser()

In [61]:
# Unit conversion. TODO: Move

def create_unit_conversion_prompt(value, unit, target_unit):
    prompt=f"""You are an expert unit converter. You are aware of how to convert
    between different units within the same system of measurement.
    For example, 1236 million = 1236 * 1 million = 1236 * 1000000 = 1236000000.
    For example, to convert from Rm to R, you would multiply by 1000000. This is because
    1 Rm = 1000000 R.
    Please return a single number as your answer. Do not elaborate or give
    any context.\n\n

    What is {value} {unit} in {target_unit}? \n\n Answer:"""

    return prompt


unit_conversion_node = PromptNode(
    model_name_or_path="gpt-3.5-turbo",
    api_key=OPENAI_API_KEY,
    model_kwargs={"temperature": 0}  # It doesn't seem that the `temperature` parameter is having any effect. Seems like a bug. Might work in haystack 2.0.
)

query = create_unit_conversion_prompt(1.24, "Rm", "R")

unit_conversion_node(query)

['1240000']

### Querying Pipeline

In [93]:
querying_pipeline = Pipeline()
querying_pipeline.add_node(component=retriever, name="retriever", inputs=["Query"])
querying_pipeline.add_node(component=generation_node, name="prompt_node", inputs=["retriever"])
querying_pipeline.add_node(component=gen_parser, name="gen_parser", inputs=["prompt_node"])

In [94]:
output = querying_pipeline.run(
    query="What was the Black-owned spend in the year 2023?",
    params={
        "append": "Do not include the word 'Level' in the answer.",
        "retriever": {"debug": True, "top_k": 5},
        "prompt_node": {"debug": True},
        "gen_parser": {"debug": True},
    }
)

print(f"Answer: {output['answer']}, Unit: {output['unit']}")

ValueError: No node(s) or global parameter(s) named append found in pipeline.

In [76]:
# TODO: Write function to convert the answer to a more human-readable format.
# In particular, the markdown tables aren't very readable as a single line of text.

output["_debug"]

{'retriever': {'input': {'root_node': 'Query',
   'query': 'What was the Black-owned spend in the year 2023?',
   'top_k': 5,
   'debug': True},
  'output': {'documents': [<Document: {'content': '|    | Human Capital - Our people                | 2023   |   2022 |   2021 | 2020   | LoA 2023   |   Footnote |\n|----|-------------------------------------------|--------|--------|--------|--------|------------|------------|\n| 52 | Investment in black employees (R million) | 724,64 |    698 |    884 | 748,00 |            |          6 |', 'content_type': 'text', 'score': 0.501289664925134, 'meta': {'preceding_context': '- Significant fires, explosions and releases\n2\n6', 'following_context': '- Major fires, explosions and releases\n-\n-', 'page': 2}, 'id_hash_keys': ['content'], 'embedding': None, 'id': 'ad9e935dda72423edad23396d1eaa0d6'}>,
    <Document: {'content': '|    | Natural Capital - Our environment      |   2023 |   2022 |   2021 |   2020 | LoA 2023   | Footnote   |\n|----|-------

The `gpt-3.5-turbo` model has a context window of 4,096 tokens. As a result, my prompt is often
being truncated so that the prompt length and answer length (100 tokens) fit within the max token
limit. The updated GPT-3.5 model (`gpt-3.5-turbo-0125`) has a larger context window of 
16,385 tokens. Would be good to use this, if possible (may require using haystack 2.0)
The slightly older GPT-3.5 model `gpt-3.5-turbo-1106` has a larger context window and is available 
with haystack 1.0. I'll use this for now.

In [65]:
# Testing appending additional instructions to the query.

output = querying_pipeline.run(
    query="What was the B-BBEE status in the year 2021? Do not include the word 'Level' in the answer.",
    params={
        "retriever": {"debug": True},
        "prompt_node": {"debug": True},
        "gen_parser": {"debug": True},
    }
)

print(output['answer'])
print(output['unit'])

Batches: 100%|██████████| 1/1 [00:00<00:00, 55.06it/s]


4
None


In [66]:
output = querying_pipeline.run(
    query="What was the Employee turnover in the year 2021?",
    params={
        "retriever": {"debug": True},
        "prompt_node": {"debug": True},
        "gen_parser": {"debug": True},
    }
)

print(output['answer'])
print(output['unit'])

Batches: 100%|██████████| 1/1 [00:00<00:00, 39.46it/s]


3869
None


Should there be two pipelines - one for queries which may require unit conversion,
and another for queries that don't? For example, when retrieving the employee turnover,
the unit should always be 'None'. So why ask the model to try and retrieve this - want 
to make it easier for the model when possible.

In [67]:
querying_pipeline.draw()

Working very well. Only issue I have seen so far is not being able to answer "What was the 
GHG Scope 2 emissions in the year 2021?". 

## Function querying pipeline

I want more control over the pipeline. For example, I want the retriever to be called
with only the metric description (not the year), and for some AMKEYs I want to append 
additional instructions to the question. 

A class based solution might work well. Could be initilaised with the list of documents.

In [70]:
amkey_to_metric = pd.read_csv(AMKEY_TO_METRIC_PATH)
print(amkey_to_metric.head())

# Retrieve the metric for AMKEY=3
metric = amkey_to_metric[amkey_to_metric["AMKEY"] == 49]["ActivityMetric"].item()
print(metric)

   AMKEY                                     ActivityMetric
0      3              Advisory fees as per income statement
1      6  Air emissions of the following pollutants: (1) CO
2      7  Air emissions of the following pollutants: (2)...
3      8  Air emissions of the following pollutants: (3)...
4      9  Air emissions of the following pollutants: (4)...
B-BBEE Scorecard Level


In [73]:
amkey_to_synonym = pd.read_csv(AMKEY_TO_SYNONYM_PATH)
print(amkey_to_synonym.head())

# Retrieve the metric for AMKEY=7 and Group=Impala
metric = amkey_to_synonym[(amkey_to_synonym["AMKEY"] == 49) & (amkey_to_synonym["Group"] == "Sasol")]["ClientMetric"]

   AMKEY   Group                                     ActivityMetric  \
0      7  Impala  Air emissions of the following pollutants: (2)...   
1      7   Sasol  Air emissions of the following pollutants: (2)...   
2      8   Sasol  Air emissions of the following pollutants: (3)...   
3      8     Ssw  Air emissions of the following pollutants: (3)...   
4      8  Impala  Air emissions of the following pollutants: (3)...   

                            ClientMetric  
0                     Total indirect Nox  
1       Nitrogen oxides (NOx) (kilotons)  
2       Sulphur oxides (SOx ) (kilotons)  
3                          SO2 emissions  
4  Total direct SO2 + Total indirect SO2  


In [74]:
metric

17    B-BBEE verification certificate
Name: ClientMetric, dtype: object

In [160]:
# Check if metric is empty
if metric.empty:
    print("No metric found.")

No metric found.


In [171]:
amkey_to_unit = pd.read_csv(AMKEY_TO_UNIT_PATH)
print(amkey_to_unit.head())

unit = amkey_to_unit[amkey_to_unit["AMKEY"] == 6]["Unit"]

   AMKEY                                     ActivityMetric  Unit
0      3              Advisory fees as per income statement  rand
1      6  Air emissions of the following pollutants: (1) CO   NaN
2      7  Air emissions of the following pollutants: (2)...   NaN
3      8  Air emissions of the following pollutants: (3)...   NaN
4      9  Air emissions of the following pollutants: (4)...   NaN


In [179]:
unit[0]

KeyError: 0

In [174]:
unit == 'NaN'

1    False
Name: Unit, dtype: bool

In [151]:
class QueryPipeline2:
    def __init__(
            self,
            docs: list[Document]
        ):
        """
        Initalise the components of the query pipeline.

        Parameters
        ----------
        docs : list[Document]
            The documents to provide context for the queries.
        """
        self.docs = docs

        self.document_store = None
        self.retriever = None
        self.generation_llm = None

        self.initialise_document_store()
        self.initialise_retriever()
        self.initialise_generation_llm()

    def initialise_document_store(self):
        logger.info("Initialising document store")
        self.document_store = InMemoryDocumentStore(embedding_dim=384)
        self.document_store.delete_documents()
        self.document_store.write_documents(docs)

    def initialise_retriever(self, top_k=3):
        logger.info("Initialising retriever")
        self.retriever = EmbeddingRetriever(
            embedding_model="sentence-transformers/all-MiniLM-L6-v2",
            document_store=self.document_store,
            top_k=top_k
        )
        self.document_store.update_embeddings(retriever=self.retriever)

    def initialise_generation_llm(self):
        logger.info("Initialising generation LLM")
        self.generation_llm = PromptNode(
            model_name_or_path="gpt-3.5-turbo-1106",
            api_key=OPENAI_API_KEY,
            model_kwargs={"temperature": 0}
        )

    def query(self, metric: str, year: int):
        """
        Return the value of a metric for a given year.

        Uses retrieval augmented generation to answer the query.

        Parameters
        ----------
        metric : str
            The metric to retrieve.

        year : int
            The year to retrieve the metric for.
        """
        context_documents = self.retriever.retrieve(metric)

        prompt = self._create_generation_prompt(metric, year, context_documents)

        answer = self.generation_llm(prompt)[0]

        value, unit = self.parse_answer(answer)

        return value, unit

    def _create_generation_prompt(
            self,
            metric: str,
            year: int,
            docs: list[Document]
        )-> str:
        """
        Create a prompt for the generation LLM.

        Parameters
        ----------
        metric : str
            The metric to retrieve.

        year : int
            The year to retrieve the metric for.

        Returns
        -------
        prompt : str
            The prompt for the generation LLM.
        """
        query = f"What was the {metric} in the year {year}?"

        # TODO: Generalise this to other metrics.
        if metric in ["B-BBEE status", "B-BBEE scorecard level"]:
            query += " Do not include the word 'Level' in the answer."

        context = "\n\n".join([doc.content for doc in docs])

        prompt = f"""Use the following pieces of context to answer the question at the end.
                    The context may be text or a markdown table.
                    Just retrieve the answer from the context. Please don't do any unit conversion.
                    If you don't know the answer, please return 'None' for the answer and unit.
                    Do not return any words other than 'Answer' and 'Unit' in the answer.
                    Please return the answer in the format 'Answer: <number or None>, Unit: <unit or None>'.

                    \n\n Context: {context} \n\n Question: {query} \n\n Answer:"""

        return prompt

    def parse_answer(self, answer: str) -> tuple[int | None, str | None]:
        """
        Parse the answer returned by the generation LLM.

        Parameters
        ----------
        answer : str
            The answer returned by the generation LLM. This is expected to be in the
            format "Answer: <number or None>, Unit: <unit or None>".

        Returns
        -------
        value : int | None
            The value from the answer.

        unit : str | None
            The unit from the answer.
        """
        value, unit = answer.split(", ")
        value = value.split(": ")[1]
        unit = unit.split(": ")[1]

        if value == "None":
            value = None
        else:
            value = value.replace(" ", "")
            value = value.replace(",", "")
            value = int(value)

        return value, unit


In [152]:
query_class = QueryPipeline2(docs)

2024-02-18 22:03:20.107 | INFO     | __main__:initialise_document_store:25 - Initialising document store
2024-02-18 22:03:20.115 | INFO     | __main__:initialise_retriever:31 - Initialising retriever
Batches: 100%|██████████| 24/24 [00:00<00:00, 35.90it/s]ocs/s]
Documents Processed: 10000 docs [00:00, 14478.99 docs/s]       
2024-02-18 22:03:22.888 | INFO     | __main__:initialise_generation_llm:40 - Initialising generation LLM


In [153]:
query_class.query("Black-owned spend", 2023)

Batches: 100%|██████████| 1/1 [00:00<00:00, 208.05it/s]


(41700, 'Rm')

## Validation of querying pipeline

In [68]:
VALIDATION_FILE = Path("/home/tomw/unifi-pdf-llm/data/validate/rag_esg_metric_validation.csv")

In [154]:
def validate_rag(docs: list[Document]) -> pd.DataFrame:
    validation_df = pd.read_csv(VALIDATION_FILE)
    results_df = validation_df.copy(deep=True)
    query_class = QueryPipeline2(docs)

    # Add row to results_df for the generated answer
    results_df["Generated"] = None

    for idx, row in validation_df.iterrows():
        year = row["Year"]
        metric = row["Metric"]

        value, _ = query_class.query(metric, year)

        results_df.at[idx, "Generated"] = value

    return results_df


In [155]:
results = validate_rag(querying_pipeline)

2024-02-18 22:04:59.204 | INFO     | __main__:initialise_document_store:25 - Initialising document store
2024-02-18 22:04:59.211 | INFO     | __main__:initialise_retriever:31 - Initialising retriever
Batches: 100%|██████████| 24/24 [00:00<00:00, 31.38it/s]ocs/s]
Documents Processed: 10000 docs [00:00, 12705.90 docs/s]       
2024-02-18 22:05:02.079 | INFO     | __main__:initialise_generation_llm:40 - Initialising generation LLM
Batches: 100%|██████████| 1/1 [00:00<00:00, 217.39it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 85.72it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 88.78it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 170.85it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 200.06it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 208.46it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 98.09it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 122.86it/s]


In [157]:
results

Unnamed: 0,Company,Year,Metric,Unit,Source,Content Type,Page,Notes,Answer,Generated
0,SASOL,2023,Number of permanent employees,,SASOL Sustainability Report 2023 20-09_0.pdf,Table,17,May get confuesed with employee numbers in tab...,28657,28657
1,SASOL,2023,Employee turnover,,SASOL Sustainability Report 2023 20-09_0.pdf,Text,18,,1725,1725
2,SASOL,2023,B-BBEE status,,SASOL Sustainability Report 2023 20-09_0.pdf,Table,57,,3,3
3,SASOL,2023,B-BBEE scorecard level,,SASOL Sustainability Report 2023 20-09_0.pdf,Table,57,,3,3
4,SASOL,2023,Black women-owned spend,rand,SASOL Sustainability Report 2023 20-09_0.pdf,Text,11,May be retrieved from table. Requires conversi...,28500,28500
5,SASOL,2023,Black-owned spend,rand,SASOL Sustainability Report 2023 20-09_0.pdf,Text,11,Required conversion from million rand to rand,41700,41700
6,SASOL,2023,Number of fatalities,,SASOL Sustainability Report 2023 20-09_0.pdf,Text,11,,2,2
7,SASOL,2023,Number of undergraduate and postgraduate bursa...,,SASOL Sustainability Report 2023 20-09_0.pdf,Text,18,,544,544


## Query Class

In [7]:
class QueryPipeline:
    def __init__(
            self,
            docs: list[Document],
            company: str,
            amkey_to_metric_path: str,
            amkey_to_synonym_path: str,
            amkey_to_unit_path : str
        ):
        """
        Initalise the components of the query pipeline.

        Parameters
        ----------
        docs : list[Document]
            The documents to provide context for the queries.

        company : str
            The company the documents are for.

        amkey_to_metric_path : str
            Path to a csv file mapping AMKEY to metric.

        amkey_to_synonym_path : str
            Path to a csv file mapping AMKEY and company to metric synonym.

        amkey_to_unit_path : str
            Path to a csv file mapping AMKEY to desired unit.
        """
        self.docs = docs
        self.company = company
        self.amkey_to_metric_path = amkey_to_metric_path
        self.amkey_to_synonym_path = amkey_to_synonym_path
        self.amkey_to_unit_path = amkey_to_unit_path

        self.document_store = None
        self.retriever = None
        self.generation_llm = None
        self.unit_conversion_llm = None
        self.amkey_to_metric = None
        self.amkey_to_synonym = None
        self.amkey_to_unit = None

        self.initialise_document_store()
        self.initialise_retriever()
        self.initialise_generation_llm()
        self.initialise_unit_conversion_llm()
        self.initialise_mappings()

    def initialise_document_store(self):
        logger.info("Initialising document store")
        self.document_store = InMemoryDocumentStore(embedding_dim=384)
        self.document_store.delete_documents()
        self.document_store.write_documents(docs)

    def initialise_retriever(self, top_k=3):
        logger.info("Initialising retriever")
        self.retriever = EmbeddingRetriever(
            embedding_model="sentence-transformers/all-MiniLM-L6-v2",
            document_store=self.document_store,
            top_k=top_k
        )
        self.document_store.update_embeddings(retriever=self.retriever)

    def initialise_generation_llm(self):
        logger.info("Initialising generation LLM")
        self.generation_llm = PromptNode(
            model_name_or_path="gpt-3.5-turbo-1106",
            api_key=OPENAI_API_KEY,
            model_kwargs={"temperature": 0}
        )

    def initialise_unit_conversion_llm(self):
        logger.info("Initialising unit conversion LLM")
        self.unit_conversion_llm = PromptNode(
            model_name_or_path="gpt-3.5-turbo",
            api_key=OPENAI_API_KEY,
            model_kwargs={"temperature": 0}
        )

    def initialise_mappings(self):
        logger.info("Initialising mappings")
        self.amkey_to_metric = pd.read_csv(self.amkey_to_metric_path)
        self.amkey_to_synonym = pd.read_csv(self.amkey_to_synonym_path)
        self.amkey_to_unit = pd.read_csv(self.amkey_to_unit_path)

    def query(self, amkey: int, year: int):
        """
        Return the value associated with an AMKEY for a given year.

        Uses retrieval augmented generation to retrieve the value.

        Parameters
        ----------
        amkey : int
            The AMKEY of the metric to retrieve.

        year : int
            The year to retrieve the metric for.

        Returns
        -------
        value : int
            The value associated with the AMKEY for the given year.
        """
        metric = self.retrieve_company_metric_description(amkey)
        if metric is None:
            metric = self.retrieve_metric_description(amkey)
        logger.debug(f"Retrieving metric: {metric}")

        context_documents = self.retriever.retrieve(metric)

        append = self._retrieve_additional_appended_instructions(amkey)
        logger.debug(f"Appending: {append}")

        prompt = self._create_generation_prompt(metric, year, context_documents, append)

        answer = self.generation_llm(prompt)[0]
        logger.debug(f"Generated answer: {answer}")

        value, unit = self.parse_answer(answer)

        required_unit = self.retrieve_unit(amkey)
        logger.debug(f"Required unit: {required_unit}")

        if required_unit is not None:
            if unit != required_unit:
                unit_conversion_prompt = self.create_unit_conversion_prompt(value, unit, required_unit)
                value = self.unit_conversion_llm(unit_conversion_prompt)

        return value

    def _retrieve_additional_appended_instructions(self, amkey: int) -> str:
        """
        Return additional instructions to append to the query.

        Parameters
        ----------
        amkey : int
            The AMKEY of the metric to retrieve.

        Returns
        -------
        append : str
            Additional instructions to append to the query.
        """
        if amkey in [47, 48, 49]:
            append = "Do not include the word 'Level' in the answer."
        else:
            append = ""

        return append

    def _create_generation_prompt(
            self,
            metric: str,
            year: int,
            docs: list[Document],
            append: str
        )-> str:
        """
        Create a prompt for the generation LLM.

        Parameters
        ----------
        metric : str
            The metric to retrieve.

        year : int
            The year to retrieve the metric for.

        docs : list[Document]
            The documents to provide context for the queries.

        append : str
            Additional instructions to append to the query.

        Returns
        -------
        prompt : str
            The prompt for the generation LLM.
        """
        query = f"What was the {metric} in the year {year}?"

        context = "\n\n".join([doc.content for doc in docs])

        prompt = f"""Use the following pieces of context to answer the question at the end.
                    The context may be text or a markdown table.
                    Just retrieve the answer from the context. Please don't do any unit conversion.
                    If you don't know the answer, please return 'None' for the answer and unit.
                    Do not return any words other than 'Answer' and 'Unit' in the answer.
                    Please return the answer in the format 'Answer: <number or None>, Unit: <unit or None>'.

                    \n\n Context: {context} \n\n Question: {query} {append}\n\n Answer:"""

        return prompt

    def create_unit_conversion_prompt(self, value: int, unit: str, target_unit: str) -> str:
        prompt=f"""You are an expert unit converter. You are aware of how to convert
                    between different units within the same system of measurement.
                    For example, 1236 million = 1236 * 1 million = 1236 * 1000000 = 1236000000.
                    For example, to convert from Rm to R, you would multiply by 1000000. This is because
                    1 Rm = 1000000 R.
                    Please return a single number as your answer. Do not elaborate or give
                    any context.\n\n

                    What is {value} {unit} in {target_unit}? \n\n Answer:"""

        return prompt

    def parse_answer(self, answer: str) -> tuple[float | None, str | None]:
        """
        Parse the answer returned by the generation LLM.

        Parameters
        ----------
        answer : str
            The answer returned by the generation LLM. This is expected to be in the
            format "Answer: <number or None>, Unit: <unit or None>".

        Returns
        -------
        value : float | None
            The value from the answer.

        unit : str | None
            The unit from the answer.
        """
        value, unit = answer.split(", ")
        value = value.split(": ")[1]
        unit = unit.split(": ")[1]

        if value == "None":
            value = None
        else:
            value = value.replace(" ", "")
            value = value.replace(",", "")
            value = float(value)

        return value, unit

    def retrieve_company_metric_description(self, amkey: int) -> str | None:
        """
        Return the company-specific description of a metric, if available.

        Parameters
        ----------
        amkey : int
            The AMKEY of the metric.

        Returns
        -------
        metric : str | None
            The company-specific description of the metric, if available.
            Otherwise, None.
        """
        metric = self.amkey_to_synonym[
            (self.amkey_to_synonym["AMKEY"] == amkey)
            & (self.amkey_to_synonym["Group"] == self.company)
        ]["ClientMetric"]

        if metric.empty:
            metric = None
        else:
            metric = metric.item()

        return metric

    def retrieve_metric_description(self, amkey: int) -> str:
        """
        Return the description of a metric.

        Parameters
        ----------
        amkey : int
            The AMKEY of the metric.

        Returns
        -------
        metric : str
            The description of the metric.

        Raises
        ------
        ValueError
            If the AMKEY is invalid.
        """
        try:
            metric = self.amkey_to_metric[
                self.amkey_to_metric["AMKEY"] == amkey
            ]["ActivityMetric"].item()
        except Exception:
            raise ValueError(f"Invalid AMKEY {amkey}")

        return metric

    def retrieve_unit(self, amkey: int) -> str | None:
        """
        Return the required unit for a metric.

        Parameters
        ----------
        amkey : int
            The AMKEY of the metric.

        Returns
        -------
        unit : str | None
            The required unit for the metric, if specified. Otherwise, None.
        """
        try:
            unit = self.amkey_to_unit[self.amkey_to_unit["AMKEY"] == amkey]["Unit"][0]
        except KeyError:
            unit = None

        return unit


In [8]:
# TODO: Test the QueryPipeline class.
query_pipeline = QueryPipeline(
    docs=docs,
    company="Sasol",
    amkey_to_metric_path=AMKEY_TO_METRIC_PATH,
    amkey_to_synonym_path=AMKEY_TO_SYNONYM_PATH,
    amkey_to_unit_path=AMKEY_TO_UNIT_PATH
)

2024-02-20 21:23:04.484 | INFO     | __main__:initialise_document_store:51 - Initialising document store
2024-02-20 21:23:04.490 | INFO     | __main__:initialise_retriever:57 - Initialising retriever
Batches: 100%|██████████| 24/24 [00:01<00:00, 18.78it/s]ocs/s]
Documents Processed: 10000 docs [00:01, 7679.60 docs/s]        
2024-02-20 21:23:09.117 | INFO     | __main__:initialise_generation_llm:66 - Initialising generation LLM
2024-02-20 21:23:09.400 | INFO     | __main__:initialise_unit_conversion_llm:74 - Initialising unit conversion LLM
2024-02-20 21:23:09.401 | INFO     | __main__:initialise_mappings:82 - Initialising mappings


In [9]:
query_pipeline.query(
    amkey=49,
    year=2021
)

2024-02-20 21:23:09.411 | DEBUG    | __main__:query:109 - Retrieving metric: B-BBEE verification certificate
Batches: 100%|██████████| 1/1 [00:00<00:00, 110.36it/s]
2024-02-20 21:23:09.562 | DEBUG    | __main__:query:114 - Retrieved append: Do not include the word 'Level' in the answer.
2024-02-20 21:23:10.446 | DEBUG    | __main__:query:119 - Generated answer: Answer: 4, Unit: None
2024-02-20 21:23:10.448 | DEBUG    | __main__:query:124 - Required unit: None


4.0

In [10]:
query_pipeline.retrieve_metric_description(49)

'B-BBEE Scorecard Level'

In [11]:
query_pipeline.retrieve_company_metric_description(49)

'B-BBEE verification certificate'

In [12]:
PATH = "/home/tomw/unifi-pdf-llm/data/azureconverter_outputs/afs2021.json"

converter = AzureConverter(
    endpoint="https://azureconverter.cognitiveservices.azure.com/",
    credential_key=AZURE_CONVERTER_KEY,
    model_id="prebuilt-layout",  # Was "prebuilt-document"
)

uct_docs = converter.convert_azure_json(file_path=Path(PATH))

In [13]:
len(uct_docs)

124

### Retrieving from entire documents

In [28]:
from mapping import COMPANY_YEAR_PDF_MAPPING

AZURE_CONVERTER_DIR = "/home/tomw/unifi-pdf-llm/data/azureconverter_outputs"
"""Path to directory with json outputs from AzureConverter."""


def load_documents(company: str, year: int) -> list[Document]:
    """
    Load documents for a company and year.

    Parameters
    ----------
    company : str
        The company to load documents for.

    year : int
        The year to load documents for.

    Returns
    -------
    company_docs : list[Document]
        The documents for the company and year.

    Raises
    ------
    ValueError
        If no documents are found for the company and year.
    """
    company_docs = []
    converter = AzureConverter(
        endpoint="https://azureconverter.cognitiveservices.azure.com/",
        credential_key=AZURE_CONVERTER_KEY,
        model_id="prebuilt-layout",  # Was "prebuilt-document"
    )

    try:
        file_name_list = COMPANY_YEAR_PDF_MAPPING[company][year]
    except KeyError:
        raise ValueError(f"No documents found for {company} in {year}")

    for file_name in file_name_list:
        file_name = file_name.replace(".pdf", ".json")
        file_path = Path(AZURE_CONVERTER_DIR) / file_name
        logger.info(f"Loading documents from {file_path}")
        docs = converter.convert_azure_json(file_path=file_path)
        company_docs.extend(docs)

    return company_docs


In [39]:
TRAIN_CSV_PATH = "/home/tomw/unifi-pdf-llm/data/Train.csv"

train_df = pd.read_csv(TRAIN_CSV_PATH)

# Remove all rows with NaN in all of the columns 2021_Value	2020_Value	2019_Value
train_df = train_df.dropna(subset=["2021_Value", "2020_Value", "2019_Value"], how="all")
train_df.reset_index(drop=True, inplace=True)

In [40]:
COMPANY = "Tongaat"
YEAR = 2021

In [41]:
# Restrict to the rows associated with ID <AMKEY>_X_<COMPANY>
train_df = train_df[train_df["ID"].str.contains(f"X_{COMPANY}")]
train_df.reset_index(drop=True, inplace=True)

In [42]:
train_df

Unnamed: 0,ID,2021_Value,2020_Value,2019_Value
0,12_X_Tongaat,1.331,1.636,2.039000e+00
1,28_X_Tongaat,60204.0,63512.0,7.844200e+04
2,49_X_Tongaat,4.0,4.0,4.000000e+00
3,52_X_Tongaat,99.0,97.9,9.493000e+01
4,114_X_Tongaat,16.63,16.18,1.495000e+01
...,...,...,...,...
56,871_X_Tongaat,8760.0,9331.0,1.958500e+04
57,874_X_Tongaat,6769128.0,6210711.0,2.264179e+07
58,1041_X_Tongaat,70.0,70.0,7.000000e+01
59,1042_X_Tongaat,70.0,70.0,7.000000e+01


In [50]:
docs = load_documents(COMPANY, YEAR)
docs = preprocess_documents(docs, window_size=1)

2024-02-20 21:36:03.113 | INFO     | __main__:load_documents:39 - Loading /home/tomw/unifi-pdf-llm/data/azureconverter_outputs/2021ESG.json


In [51]:
query_pipeline = QueryPipeline(
    docs=docs,
    company=COMPANY,
    amkey_to_metric_path=AMKEY_TO_METRIC_PATH,
    amkey_to_synonym_path=AMKEY_TO_SYNONYM_PATH,
    amkey_to_unit_path=AMKEY_TO_UNIT_PATH
)

2024-02-20 21:37:32.131 | INFO     | __main__:initialise_document_store:51 - Initialising document store
2024-02-20 21:37:32.139 | INFO     | __main__:initialise_retriever:57 - Initialising retriever
Batches: 100%|██████████| 27/27 [00:01<00:00, 25.54it/s]ocs/s]
Documents Processed: 10000 docs [00:01, 9182.31 docs/s]        
2024-02-20 21:37:35.296 | INFO     | __main__:initialise_generation_llm:66 - Initialising generation LLM
2024-02-20 21:37:35.297 | INFO     | __main__:initialise_unit_conversion_llm:74 - Initialising unit conversion LLM
2024-02-20 21:37:35.298 | INFO     | __main__:initialise_mappings:82 - Initialising mappings


In [58]:
results_df = train_df.copy(deep=True)

In [59]:
# Loop over the rows in the dataframe and retrieve the value for each AMKEY
for idx, row in train_df.iterrows():
    amkey = int(row["ID"].split("_")[0])
    print(f"Retrieving value for AMKEY {amkey}")

    value = query_pipeline.query(amkey, YEAR)

    results_df.at[idx, "Generated"] = value

2024-02-20 21:41:38.534 | DEBUG    | __main__:query:109 - Retrieving metric: Total injury frequency rate (TIFR) – employees and contractors


Retrieving value for AMKEY 12


Batches: 100%|██████████| 1/1 [00:00<00:00, 51.90it/s]
2024-02-20 21:41:38.583 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:41:39.196 | DEBUG    | __main__:query:119 - Generated answer: Answer: 1.331, Unit: rate
2024-02-20 21:41:39.197 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:41:39.199 | DEBUG    | __main__:query:109 - Retrieving metric: Total – company managed/farmed land (owned and leased)


Retrieving value for AMKEY 28


Batches: 100%|██████████| 1/1 [00:00<00:00, 98.03it/s]
2024-02-20 21:41:39.238 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:41:39.970 | DEBUG    | __main__:query:119 - Generated answer: Answer: 60,204, Unit: hectares
2024-02-20 21:41:39.971 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:41:39.972 | DEBUG    | __main__:query:109 - Retrieving metric: B-BBEE Level


Retrieving value for AMKEY 49


Batches: 100%|██████████| 1/1 [00:00<00:00, 109.39it/s]
2024-02-20 21:41:40.010 | DEBUG    | __main__:query:114 - Retrieved append: Do not include the word 'Level' in the answer.
2024-02-20 21:41:40.645 | DEBUG    | __main__:query:119 - Generated answer: Answer: 4, Unit: None
2024-02-20 21:41:40.647 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:41:40.648 | DEBUG    | __main__:query:109 - Retrieving metric: Overall Board and Committee meeting attendance


Retrieving value for AMKEY 52


Batches: 100%|██████████| 1/1 [00:00<00:00, 116.21it/s]
2024-02-20 21:41:40.688 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:41:41.313 | DEBUG    | __main__:query:119 - Generated answer: Answer: 99, Unit: %
2024-02-20 21:41:41.315 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:41:41.316 | DEBUG    | __main__:query:109 - Retrieving metric: Energy efficiency: total direct and indirect energy consumed per ton of sugar produced


Retrieving value for AMKEY 114


Batches: 100%|██████████| 1/1 [00:00<00:00, 53.54it/s]
2024-02-20 21:41:41.366 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:41:42.041 | DEBUG    | __main__:query:119 - Generated answer: Answer: 16.63, Unit: GJ/t
2024-02-20 21:41:42.042 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:41:42.043 | DEBUG    | __main__:query:109 - Retrieving metric: Fatal injury frequency rate (FIFR, i.e. number of fatalities per 200 000 person hours worked) – employees and contractors


Retrieving value for AMKEY 122


Batches: 100%|██████████| 1/1 [00:00<00:00, 149.16it/s]
2024-02-20 21:41:42.084 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:41:42.729 | DEBUG    | __main__:query:119 - Generated answer: Answer: 0.005, Unit: rate
2024-02-20 21:41:42.730 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:41:42.731 | DEBUG    | __main__:query:109 - Retrieving metric: Carbon emissions – Scope 1


Retrieving value for AMKEY 128


Batches: 100%|██████████| 1/1 [00:00<00:00, 125.52it/s]
2024-02-20 21:41:42.769 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:41:43.435 | DEBUG    | __main__:query:119 - Generated answer: Answer: 557114, Unit: tCO2e
2024-02-20 21:41:43.436 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:41:43.438 | DEBUG    | __main__:query:109 - Retrieving metric: Carbon emissions – Scope 2


Retrieving value for AMKEY 129


Batches: 100%|██████████| 1/1 [00:00<00:00, 131.18it/s]
2024-02-20 21:41:43.475 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:41:44.164 | DEBUG    | __main__:query:119 - Generated answer: Answer: 51,539, Unit: tCO2e
2024-02-20 21:41:44.165 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:41:44.166 | DEBUG    | __main__:query:109 - Retrieving metric: Hazardous waste disposed of at appropriate facilities


Retrieving value for AMKEY 138


Batches: 100%|██████████| 1/1 [00:00<00:00, 122.32it/s]
2024-02-20 21:41:44.203 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:41:45.006 | DEBUG    | __main__:query:119 - Generated answer: Answer: 186, Unit: Tons
2024-02-20 21:41:45.008 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:41:45.009 | DEBUG    | __main__:query:109 - Retrieving metric: Lost time injury frequency rate (LTIFR, i.e. number of LTIs per 200 000 person hours worked) – employees and contractors


Retrieving value for AMKEY 151


Batches: 100%|██████████| 1/1 [00:00<00:00, 78.62it/s]
2024-02-20 21:41:45.051 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:41:45.734 | DEBUG    | __main__:query:119 - Generated answer: Answer: 0.093, Unit: None
2024-02-20 21:41:45.736 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:41:45.737 | DEBUG    | __main__:query:109 - Retrieving metric: Natural gas used in stationary combustion equipment


Retrieving value for AMKEY 163


Batches: 100%|██████████| 1/1 [00:00<00:00, 108.00it/s]
2024-02-20 21:41:45.776 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:41:46.540 | DEBUG    | __main__:query:119 - Generated answer: Answer: 0, Unit: m
2024-02-20 21:41:46.542 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:41:46.543 | DEBUG    | __main__:query:109 - Retrieving metric: Non-hazardous waste sent to landfill


Retrieving value for AMKEY 170


Batches: 100%|██████████| 1/1 [00:00<00:00, 147.55it/s]
2024-02-20 21:41:46.580 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:41:47.264 | DEBUG    | __main__:query:119 - Generated answer: Answer: 7 681, Unit: Tons
2024-02-20 21:41:47.266 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:41:47.267 | DEBUG    | __main__:query:109 - Retrieving metric: Number of employees with disabilities


Retrieving value for AMKEY 213


Batches: 100%|██████████| 1/1 [00:00<00:00, 63.66it/s]
2024-02-20 21:41:47.311 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:41:47.982 | DEBUG    | __main__:query:119 - Generated answer: Answer: 41, Unit: number
2024-02-20 21:41:47.983 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:41:47.985 | DEBUG    | __main__:query:109 - Retrieving metric: Environmental incidents – Level 3


Retrieving value for AMKEY 216


Batches: 100%|██████████| 1/1 [00:00<00:00, 94.91it/s]
2024-02-20 21:41:48.026 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:41:48.672 | DEBUG    | __main__:query:119 - Generated answer: Answer: 0, Unit: number
2024-02-20 21:41:48.673 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:41:48.674 | DEBUG    | __main__:query:109 - Retrieving metric: Fatalities (i.e. injuries on duty leading to death, excluding the deaths of workers not occurring “at work”) – employees and contractors


Retrieving value for AMKEY 219


Batches: 100%|██████████| 1/1 [00:00<00:00, 61.74it/s]
2024-02-20 21:41:48.719 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:41:49.405 | DEBUG    | __main__:query:119 - Generated answer: Answer: 7, Unit: None
2024-02-20 21:41:49.406 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:41:49.407 | DEBUG    | __main__:query:109 - Retrieving metric: Female Board members


Retrieving value for AMKEY 220


Batches: 100%|██████████| 1/1 [00:00<00:00, 156.78it/s]
2024-02-20 21:41:49.444 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:41:50.138 | DEBUG    | __main__:query:119 - Generated answer: Answer: 2, Unit: number
2024-02-20 21:41:50.139 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:41:50.140 | DEBUG    | __main__:query:109 - Retrieving metric: Employees currently receiving company-provided ART (anti-retroviral treatment)


Retrieving value for AMKEY 226


Batches: 100%|██████████| 1/1 [00:00<00:00, 78.73it/s]
2024-02-20 21:41:50.185 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:41:50.874 | DEBUG    | __main__:query:119 - Generated answer: Answer: 2 707, Unit: number
2024-02-20 21:41:50.876 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:41:50.877 | DEBUG    | __main__:query:109 - Retrieving metric: Lost time injuries (LTIs, i.e. injuries on duty leading to at least one lost day) – employees and contractors


Retrieving value for AMKEY 236


Batches: 100%|██████████| 1/1 [00:00<00:00, 70.05it/s]
2024-02-20 21:41:50.920 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:41:51.618 | DEBUG    | __main__:query:119 - Generated answer: Answer: 36, Unit: None
2024-02-20 21:41:51.620 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:41:51.621 | DEBUG    | __main__:query:109 - Retrieving metric: Total number of recordable Injuries, including MTCs, LTIs and Fatalities – employees and contractors


Retrieving value for AMKEY 244


Batches: 100%|██████████| 1/1 [00:00<00:00, 54.13it/s]
2024-02-20 21:41:51.668 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:41:52.486 | DEBUG    | __main__:query:119 - Generated answer: Answer: 365, Unit: number
2024-02-20 21:41:52.488 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:41:52.489 | DEBUG    | __main__:query:109 - Retrieving metric: Percentage of employees who are deemed HDSA (SA only)


Retrieving value for AMKEY 289


Batches: 100%|██████████| 1/1 [00:00<00:00, 147.75it/s]
2024-02-20 21:41:52.524 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:41:53.259 | DEBUG    | __main__:query:119 - Generated answer: Answer: 95.3, Unit: %
2024-02-20 21:41:53.260 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:41:53.261 | DEBUG    | __main__:query:109 - Retrieving metric: Percentage of employees covered by collective bargaining agreements


Retrieving value for AMKEY 295


Batches: 100%|██████████| 1/1 [00:00<00:00, 149.87it/s]
2024-02-20 21:41:53.298 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:41:53.971 | DEBUG    | __main__:query:119 - Generated answer: Answer: 80, Unit: %
2024-02-20 21:41:53.972 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:41:53.974 | DEBUG    | __main__:query:109 - Retrieving metric: Percentage of female Board members.


Retrieving value for AMKEY 298


Batches: 100%|██████████| 1/1 [00:00<00:00, 152.17it/s]
2024-02-20 21:41:54.009 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:41:54.707 | DEBUG    | __main__:query:119 - Generated answer: Answer: 20, Unit: %
2024-02-20 21:41:54.708 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:41:54.710 | DEBUG    | __main__:query:109 - Retrieving metric: Percentage of waste disposed of that is sent for recycling (or reuse)


Retrieving value for AMKEY 338


Batches: 100%|██████████| 1/1 [00:00<00:00, 152.81it/s]
2024-02-20 21:41:54.748 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:41:55.419 | DEBUG    | __main__:query:119 - Generated answer: Answer: 97.7, Unit: %
2024-02-20 21:41:55.420 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:41:55.421 | DEBUG    | __main__:query:109 - Retrieving metric: Percentage of employees who are women (SA only)


Retrieving value for AMKEY 349


Batches: 100%|██████████| 1/1 [00:00<00:00, 101.15it/s]
2024-02-20 21:41:55.462 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:41:56.184 | DEBUG    | __main__:query:119 - Generated answer: Answer: 25.9, Unit: %
2024-02-20 21:41:56.185 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:41:56.186 | DEBUG    | __main__:query:109 - Retrieving metric: Percentage of energy consumption sourced from renewable fuels


Retrieving value for AMKEY 353


Batches: 100%|██████████| 1/1 [00:00<00:00, 161.08it/s]
2024-02-20 21:41:56.221 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:41:56.925 | DEBUG    | __main__:query:119 - Generated answer: Answer: 79.41, Unit: %
2024-02-20 21:41:56.926 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:41:56.927 | DEBUG    | __main__:query:109 - Retrieving metric: Total recordable injury frequency rate (TRIFR) – employees and contractors


Retrieving value for AMKEY 379


Batches: 100%|██████████| 1/1 [00:00<00:00, 151.32it/s]
2024-02-20 21:41:56.963 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:41:57.647 | DEBUG    | __main__:query:119 - Generated answer: Answer: 1.037, Unit: rate
2024-02-20 21:41:57.648 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:41:57.649 | DEBUG    | __main__:query:109 - Retrieving metric: Corporate social investment (CSI)/Socio-economic development (SED) expenditures


Retrieving value for AMKEY 398


Batches: 100%|██████████| 1/1 [00:00<00:00, 81.96it/s]
2024-02-20 21:41:57.691 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:41:58.301 | DEBUG    | __main__:query:119 - Generated answer: Answer: None, Unit: None
2024-02-20 21:41:58.302 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:41:58.304 | DEBUG    | __main__:query:109 - Retrieving metric: Total Rand value of investments in projects to improve environmental efficiencies


Retrieving value for AMKEY 432


Batches: 100%|██████████| 1/1 [00:00<00:00, 104.34it/s]
2024-02-20 21:41:58.344 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:41:59.002 | DEBUG    | __main__:query:119 - Generated answer: Answer: 33 097 440, Unit: Rand
2024-02-20 21:41:59.004 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:41:59.005 | DEBUG    | __main__:query:109 - Retrieving metric: Value of employee training spend


Retrieving value for AMKEY 466


Batches: 100%|██████████| 1/1 [00:00<00:00, 112.25it/s]
2024-02-20 21:41:59.045 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:41:59.818 | DEBUG    | __main__:query:119 - Generated answer: Answer: 23,822,054, Unit: Rands
2024-02-20 21:41:59.819 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:41:59.820 | DEBUG    | __main__:query:109 - Retrieving metric: Total direct and indirect energy consumption


Retrieving value for AMKEY 468


Batches: 100%|██████████| 1/1 [00:00<00:00, 79.91it/s]
2024-02-20 21:41:59.864 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:42:00.595 | DEBUG    | __main__:query:119 - Generated answer: Answer: 29 162 679, Unit: GJ
2024-02-20 21:42:00.597 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:42:00.598 | DEBUG    | __main__:query:109 - Retrieving metric: Enterprise and supplier development spend (i.e. support for small business)


Retrieving value for AMKEY 477


Batches: 100%|██████████| 1/1 [00:00<00:00, 106.77it/s]
2024-02-20 21:42:00.640 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:42:01.275 | DEBUG    | __main__:query:119 - Generated answer: Answer: 35,721,923, Unit: Rands
2024-02-20 21:42:01.277 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:42:01.278 | DEBUG    | __main__:query:109 - Retrieving metric: Water abstracted – total


Retrieving value for AMKEY 488


Batches: 100%|██████████| 1/1 [00:00<00:00, 158.92it/s]
2024-02-20 21:42:01.313 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:42:02.044 | DEBUG    | __main__:query:119 - Generated answer: Answer: 586,251,099, Unit: m3
2024-02-20 21:42:02.046 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:42:02.047 | DEBUG    | __main__:query:109 - Retrieving metric: Net volume of water consumed


Retrieving value for AMKEY 489


Batches: 100%|██████████| 1/1 [00:00<00:00, 115.21it/s]
2024-02-20 21:42:02.084 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:42:02.781 | DEBUG    | __main__:query:119 - Generated answer: Answer: 834 868 840, Unit: m3
2024-02-20 21:42:02.782 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:42:02.783 | DEBUG    | __main__:query:109 - Retrieving metric: Total number of training hours


Retrieving value for AMKEY 500


Batches: 100%|██████████| 1/1 [00:00<00:00, 136.97it/s]
2024-02-20 21:42:02.820 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:42:03.445 | DEBUG    | __main__:query:119 - Generated answer: Answer: 66 986, Unit: number
2024-02-20 21:42:03.447 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:42:03.448 | DEBUG    | __main__:query:109 - Retrieving metric: Volume of electricity consumed – purchased and self-generated


Retrieving value for AMKEY 508


Batches: 100%|██████████| 1/1 [00:00<00:00, 100.88it/s]
2024-02-20 21:42:03.486 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:42:04.146 | DEBUG    | __main__:query:119 - Generated answer: Answer: 715 832, Unit: MWh
2024-02-20 21:42:04.147 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:42:04.148 | DEBUG    | __main__:query:109 - Retrieving metric: Number of contractor (vendors) employees (full time equivalent*)


Retrieving value for AMKEY 517


Batches: 100%|██████████| 1/1 [00:00<00:00, 110.65it/s]
2024-02-20 21:42:04.187 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:42:04.919 | DEBUG    | __main__:query:119 - Generated answer: Answer: 4,408, Unit: number
2024-02-20 21:42:04.920 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:42:04.921 | DEBUG    | __main__:query:109 - Retrieving metric: Number of employees (full time equivalent)


Retrieving value for AMKEY 523


Batches: 100%|██████████| 1/1 [00:00<00:00, 109.94it/s]
2024-02-20 21:42:04.961 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:42:05.658 | DEBUG    | __main__:query:119 - Generated answer: Answer: 32 949, Unit: number
2024-02-20 21:42:05.660 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:42:05.661 | DEBUG    | __main__:query:109 - Retrieving metric: Number of person hours worked – employees and contractors


Retrieving value for AMKEY 533


Batches: 100%|██████████| 1/1 [00:00<00:00, 103.97it/s]
2024-02-20 21:42:05.703 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:42:06.437 | DEBUG    | __main__:query:119 - Generated answer: Answer: 87 701 632, Unit: hours
2024-02-20 21:42:06.438 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:42:06.439 | DEBUG    | __main__:query:109 - Retrieving metric: New cases of NIHL* – employees and contractors


Retrieving value for AMKEY 550


Batches: 100%|██████████| 1/1 [00:00<00:00, 93.14it/s]
2024-02-20 21:42:06.480 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:42:07.150 | DEBUG    | __main__:query:119 - Generated answer: Answer: 20, Unit: number
2024-02-20 21:42:07.151 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:42:07.152 | DEBUG    | __main__:query:109 - Retrieving metric: Number of registered patents


Retrieving value for AMKEY 555


Batches: 100%|██████████| 1/1 [00:00<00:00, 93.37it/s]
2024-02-20 21:42:07.194 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:42:07.814 | DEBUG    | __main__:query:119 - Generated answer: Answer: 2, Unit: number
2024-02-20 21:42:07.815 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:42:07.816 | DEBUG    | __main__:query:109 - Retrieving metric: Water discharged (scheduled and/or non-scheduled effluent and/ or overflows) – total


Retrieving value for AMKEY 573


Batches: 100%|██████████| 1/1 [00:00<00:00, 132.69it/s]
2024-02-20 21:42:07.855 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:42:08.798 | DEBUG    | __main__:query:119 - Generated answer: Answer: 72,721,647, Unit: m3
2024-02-20 21:42:08.800 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:42:08.801 | DEBUG    | __main__:query:109 - Retrieving metric: CSI/SED spend on health, including HIV/AIDS


Retrieving value for AMKEY 587


Batches: 100%|██████████| 1/1 [00:00<00:00, 156.64it/s]
2024-02-20 21:42:08.838 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:42:09.570 | DEBUG    | __main__:query:119 - Generated answer: Answer: 99 893 760, Unit: Rands
2024-02-20 21:42:09.571 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:42:09.572 | DEBUG    | __main__:query:109 - Retrieving metric: Investments in projects to improve waste efficiency


Retrieving value for AMKEY 595


Batches: 100%|██████████| 1/1 [00:00<00:00, 119.41it/s]
2024-02-20 21:42:09.610 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:42:10.227 | DEBUG    | __main__:query:119 - Generated answer: Answer: None, Unit: Rand
2024-02-20 21:42:10.228 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:42:10.230 | DEBUG    | __main__:query:109 - Retrieving metric: CSI/SED spend on basic needs and social development, including nutrition and/or feeding programmes


Retrieving value for AMKEY 607


Batches: 100%|██████████| 1/1 [00:00<00:00, 94.54it/s]
2024-02-20 21:42:10.271 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:42:11.003 | DEBUG    | __main__:query:119 - Generated answer: Answer: 5 258 537, Unit: Rands
2024-02-20 21:42:11.005 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:42:11.006 | DEBUG    | __main__:query:109 - Retrieving metric: Rand value of investments in COVID-19 avoidance, mitigation and treatment


Retrieving value for AMKEY 622


Batches: 100%|██████████| 1/1 [00:00<00:00, 105.23it/s]
2024-02-20 21:42:11.043 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:42:11.711 | DEBUG    | __main__:query:119 - Generated answer: Answer: 15, Unit: None
2024-02-20 21:42:11.712 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:42:11.713 | DEBUG    | __main__:query:109 - Retrieving metric: Investments in projects to improve energy efficiency


Retrieving value for AMKEY 627


Batches: 100%|██████████| 1/1 [00:00<00:00, 114.50it/s]
2024-02-20 21:42:11.752 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:42:12.520 | DEBUG    | __main__:query:119 - Generated answer: Answer: 18 140 418, Unit: Rand
2024-02-20 21:42:12.522 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:42:12.523 | DEBUG    | __main__:query:109 - Retrieving metric: Cost of ART**


Retrieving value for AMKEY 630


Batches: 100%|██████████| 1/1 [00:00<00:00, 116.67it/s]
2024-02-20 21:42:12.560 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:42:13.268 | DEBUG    | __main__:query:119 - Generated answer: Answer: 867883, Unit: Rands
2024-02-20 21:42:13.269 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:42:13.271 | DEBUG    | __main__:query:109 - Retrieving metric: Investments in projects to improve water efficiency


Retrieving value for AMKEY 639


Batches: 100%|██████████| 1/1 [00:00<00:00, 123.89it/s]
2024-02-20 21:42:13.311 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:42:14.064 | DEBUG    | __main__:query:119 - Generated answer: Answer: 14 957 022, Unit: Rand
2024-02-20 21:42:14.065 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:42:14.066 | DEBUG    | __main__:query:109 - Retrieving metric: Employee turnover – South Africa


Retrieving value for AMKEY 732


Batches: 100%|██████████| 1/1 [00:00<00:00, 138.66it/s]
2024-02-20 21:42:14.102 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:42:14.888 | DEBUG    | __main__:query:119 - Generated answer: Answer: 20.4, Unit: %
2024-02-20 21:42:14.889 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:42:14.890 | DEBUG    | __main__:query:109 - Retrieving metric: Total volume of waste recycled (or reused)


Retrieving value for AMKEY 734


Batches: 100%|██████████| 1/1 [00:00<00:00, 130.69it/s]
2024-02-20 21:42:14.926 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:42:15.564 | DEBUG    | __main__:query:119 - Generated answer: Answer: 322986, Unit: Tons
2024-02-20 21:42:15.565 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:42:15.566 | DEBUG    | __main__:query:109 - Retrieving metric: Total carbon emissions – Scope 1 and 2


Retrieving value for AMKEY 749


Batches: 100%|██████████| 1/1 [00:00<00:00, 122.00it/s]
2024-02-20 21:42:15.605 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:42:16.322 | DEBUG    | __main__:query:119 - Generated answer: Answer: 833091, Unit: tCO2e
2024-02-20 21:42:16.324 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:42:16.325 | DEBUG    | __main__:query:109 - Retrieving metric: Board members


Retrieving value for AMKEY 750


Batches: 100%|██████████| 1/1 [00:00<00:00, 165.71it/s]
2024-02-20 21:42:16.362 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:42:17.085 | DEBUG    | __main__:query:119 - Generated answer: Answer: 10, Unit: number
2024-02-20 21:42:17.086 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:42:17.087 | DEBUG    | __main__:query:109 - Retrieving metric: Executive Board members


Retrieving value for AMKEY 794


Batches: 100%|██████████| 1/1 [00:00<00:00, 128.46it/s]
2024-02-20 21:42:17.123 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:42:17.815 | DEBUG    | __main__:query:119 - Generated answer: Answer: 3, Unit: number
2024-02-20 21:42:17.817 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:42:17.817 | DEBUG    | __main__:query:109 - Retrieving metric: Independent Board members


Retrieving value for AMKEY 795


Batches: 100%|██████████| 1/1 [00:00<00:00, 164.27it/s]
2024-02-20 21:42:17.852 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:42:19.892 | DEBUG    | __main__:query:119 - Generated answer: Answer: 7, Unit: number
2024-02-20 21:42:19.893 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:42:19.894 | DEBUG    | __main__:query:109 - Retrieving metric: Non-executive Board members


Retrieving value for AMKEY 796


Batches: 100%|██████████| 1/1 [00:00<00:00, 160.23it/s]
2024-02-20 21:42:19.932 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:42:20.651 | DEBUG    | __main__:query:119 - Generated answer: Answer: 7, Unit: number
2024-02-20 21:42:20.652 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:42:20.654 | DEBUG    | __main__:query:109 - Retrieving metric: Average length of full Board service


Retrieving value for AMKEY 799


Batches: 100%|██████████| 1/1 [00:00<00:00, 104.04it/s]
2024-02-20 21:42:20.692 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:42:21.375 | DEBUG    | __main__:query:119 - Generated answer: Answer: 1.5, Unit: years
2024-02-20 21:42:21.377 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:42:21.378 | DEBUG    | __main__:query:109 - Retrieving metric: Number of employees trained, including internal and external training interventions


Retrieving value for AMKEY 871


Batches: 100%|██████████| 1/1 [00:00<00:00, 126.63it/s]
2024-02-20 21:42:21.416 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:42:22.199 | DEBUG    | __main__:query:119 - Generated answer: Answer: 8 760, Unit: number
2024-02-20 21:42:22.200 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:42:22.202 | DEBUG    | __main__:query:109 - Retrieving metric: CSI/SED spend on education


Retrieving value for AMKEY 874


Batches: 100%|██████████| 1/1 [00:00<00:00, 155.49it/s]
2024-02-20 21:42:22.238 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:42:22.924 | DEBUG    | __main__:query:119 - Generated answer: Answer: 6 769 128, Unit: Rands
2024-02-20 21:42:22.926 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:42:22.927 | DEBUG    | __main__:query:109 - Retrieving metric: Percentage of non-executive Board members


Retrieving value for AMKEY 1041


Batches: 100%|██████████| 1/1 [00:00<00:00, 159.60it/s]
2024-02-20 21:42:22.963 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:42:23.604 | DEBUG    | __main__:query:119 - Generated answer: Answer: 70, Unit: %
2024-02-20 21:42:23.606 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:42:23.607 | DEBUG    | __main__:query:109 - Retrieving metric: Percentage of independent Board members


Retrieving value for AMKEY 1042


Batches: 100%|██████████| 1/1 [00:00<00:00, 106.98it/s]
2024-02-20 21:42:23.648 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:42:24.271 | DEBUG    | __main__:query:119 - Generated answer: Answer: 70, Unit: %
2024-02-20 21:42:24.272 | DEBUG    | __main__:query:124 - Required unit: None
2024-02-20 21:42:24.273 | DEBUG    | __main__:query:109 - Retrieving metric: Environmental fines and/or non-compliances


Retrieving value for AMKEY 1070


Batches: 100%|██████████| 1/1 [00:00<00:00, 131.59it/s]
2024-02-20 21:42:24.313 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:42:24.960 | DEBUG    | __main__:query:119 - Generated answer: Answer: 3, Unit: number
2024-02-20 21:42:24.962 | DEBUG    | __main__:query:124 - Required unit: None


In [56]:
query_pipeline.query(
    amkey=12,
    year=YEAR
)

2024-02-20 21:41:23.116 | DEBUG    | __main__:query:109 - Retrieving metric: Total injury frequency rate (TIFR) – employees and contractors
Batches: 100%|██████████| 1/1 [00:00<00:00, 30.21it/s]
2024-02-20 21:41:23.187 | DEBUG    | __main__:query:114 - Retrieved append: 
2024-02-20 21:41:23.969 | DEBUG    | __main__:query:119 - Generated answer: Answer: 1.331, Unit: rate
2024-02-20 21:41:23.970 | DEBUG    | __main__:query:124 - Required unit: None


1.331

In [62]:
results_df

Unnamed: 0,ID,2021_Value,2020_Value,2019_Value,Generated
0,12_X_Tongaat,1.331,1.636,2.039000e+00,1.331
1,28_X_Tongaat,60204.0,63512.0,7.844200e+04,60204.000
2,49_X_Tongaat,4.0,4.0,4.000000e+00,4.000
3,52_X_Tongaat,99.0,97.9,9.493000e+01,99.000
4,114_X_Tongaat,16.63,16.18,1.495000e+01,16.630
...,...,...,...,...,...
56,871_X_Tongaat,8760.0,9331.0,1.958500e+04,8760.000
57,874_X_Tongaat,6769128.0,6210711.0,2.264179e+07,6769128.000
58,1041_X_Tongaat,70.0,70.0,7.000000e+01,70.000
59,1042_X_Tongaat,70.0,70.0,7.000000e+01,70.000


In [64]:
results_df

Unnamed: 0,ID,2021_Value,2020_Value,2019_Value,Generated,Correct
0,12_X_Tongaat,1.331,1.636,2.039000e+00,1.331,False
1,28_X_Tongaat,60204.0,63512.0,7.844200e+04,60204.000,False
2,49_X_Tongaat,4.0,4.0,4.000000e+00,4.000,False
3,52_X_Tongaat,99.0,97.9,9.493000e+01,99.000,False
4,114_X_Tongaat,16.63,16.18,1.495000e+01,16.630,False
...,...,...,...,...,...,...
56,871_X_Tongaat,8760.0,9331.0,1.958500e+04,8760.000,False
57,874_X_Tongaat,6769128.0,6210711.0,2.264179e+07,6769128.000,False
58,1041_X_Tongaat,70.0,70.0,7.000000e+01,70.000,False
59,1042_X_Tongaat,70.0,70.0,7.000000e+01,70.000,False


In [68]:
results_df["2021_Value"] = results_df["2021_Value"].astype(float)

In [69]:
# Calculate accuracy. This is the percentage of values in 'Generated' that are
# equal to the corresponding value in '2021_Value'.

results_df["Correct"] = results_df["Generated"] == results_df["2021_Value"]

accuracy = results_df["Correct"].sum() / len(results_df)

accuracy

0.7377049180327869

In [70]:
results_df["Generated"]

0           1.331
1       60204.000
2           4.000
3          99.000
4          16.630
         ...     
56       8760.000
57    6769128.000
58         70.000
59         70.000
60          3.000
Name: Generated, Length: 61, dtype: float64

In [71]:
results_df["2021_Value"]

0           1.331
1       60204.000
2           4.000
3          99.000
4          16.630
         ...     
56       8760.000
57    6769128.000
58         70.000
59         70.000
60          3.000
Name: 2021_Value, Length: 61, dtype: float64

In [72]:
# TODO: Formalise this testing. We can use Tongaat 2021 as a test case, as well
# as UCT 2021.

In [74]:
results_df.to_csv("/home/tomw/unifi-pdf-llm/data/tongaat_2021_results.csv", index=False)