In [75]:
"""
Table QA - RAG approach with tables converted to markdown format.

See https://haystack.deepset.ai/tutorials/22_pipeline_with_promptnode
"""
import os
import json
from pathlib import Path

import pandas as pd
from haystack import Document
from haystack.nodes import AzureConverter, EmbeddingRetriever, PromptNode, PromptTemplate, AnswerParser
from haystack.document_stores import InMemoryDocumentStore
from haystack.pipelines import Pipeline
from haystack.nodes import BaseComponent, PreProcessor
from loguru import logger

pd.set_option('display.max_rows', 100)

OPENAI_API_KEY = os.environ.get("OPENAI_API_KEY")
AZURE_CONVERTER_KEY = os.environ.get("AZURE_CONVERTER_KEY")

AMKEY_TO_METRIC_PATH = "/home/tomw/unifi-pdf-llm/data/AMKEY_GoldenStandard.csv"
"""Path to csv file mapping AMKEY to metric description."""

AMKEY_TO_SYNONYM_PATH = "/home/tomw/unifi-pdf-llm/data/ActivityMetricsSynonyms.csv"
"""Path to csv file mapping AMKEY to company metric description."""

AMKEY_TO_UNIT_PATH = "/home/tomw/unifi-pdf-llm/data/AMKEY_unit_conversion.csv"
"""Path to csv file mapping AMKEY to required unit."""

'Path to csv file mapping AMKEY to required unit.'

## Convert PDF

In [76]:
# Temp

def convert_validation_pdf() -> list[Document]:
    """
    Returns a list of Documents from the validation PDF.

    Uses the AzureConverter to convert the PDF to tables and text documents.

    Returns
    -------
    converted_docs : list[Document]
        The list of Documents from the validation PDF.
    """
    converted_docs = []
    file_path = Path("/home/tomw/unifi-pdf-llm/data/validate/SASOL Sustainability Report 2023 20-09_0_minimal_split")

    converter = AzureConverter(
        endpoint="https://azureconverter.cognitiveservices.azure.com/",
        credential_key=AZURE_CONVERTER_KEY,
        model_id="prebuilt-layout",  # Was "prebuilt-document"
        save_json=True
    )

    for fn in file_path.glob("*.pdf"):
        print(f"Converting {fn}")
        docs = converter.convert(file_path=fn, meta=None)
        converted_docs.extend(docs)

    return converted_docs


def load_validation_pdf_from_json() -> list[Document]:
    """
    Return a list of Documents from the validation PDF, loaded from JSON files.

    Requires AzureConverter to have been run on the PDF and saved the JSON files.

    Returns
    -------
    converted_docs : list[Document]
        The list of Documents from the validation PDF.
    """
    converted_docs = []
    file_path = Path("/home/tomw/unifi-pdf-llm/data/validate/SASOL Sustainability Report 2023 20-09_0_minimal_split")

    converter = AzureConverter(
        endpoint="https://azureconverter.cognitiveservices.azure.com/",
        credential_key=AZURE_CONVERTER_KEY,
        model_id="prebuilt-layout",  # Was "prebuilt-document"
    )

    for fn in file_path.glob("*.json"):
        print(f"Loading {fn}")
        docs = converter.convert_azure_json(file_path=fn)
        converted_docs.extend(docs)

    return converted_docs

In [77]:
docs = load_validation_pdf_from_json()

Loading /home/tomw/unifi-pdf-llm/data/validate/SASOL Sustainability Report 2023 20-09_0_minimal_split/SASOL Sustainability Report 2023 20-09_0_minimal [7-8].json
Loading /home/tomw/unifi-pdf-llm/data/validate/SASOL Sustainability Report 2023 20-09_0_minimal_split/SASOL Sustainability Report 2023 20-09_0_minimal [5-6].json
Loading /home/tomw/unifi-pdf-llm/data/validate/SASOL Sustainability Report 2023 20-09_0_minimal_split/SASOL Sustainability Report 2023 20-09_0_minimal [11].json
Loading /home/tomw/unifi-pdf-llm/data/validate/SASOL Sustainability Report 2023 20-09_0_minimal_split/SASOL Sustainability Report 2023 20-09_0_minimal [3-4].json
Loading /home/tomw/unifi-pdf-llm/data/validate/SASOL Sustainability Report 2023 20-09_0_minimal_split/SASOL Sustainability Report 2023 20-09_0_minimal [1-2].json
Loading /home/tomw/unifi-pdf-llm/data/validate/SASOL Sustainability Report 2023 20-09_0_minimal_split/SASOL Sustainability Report 2023 20-09_0_minimal [9-10].json


## Preprocess Documents

In [78]:
def preprocess_documents(
        docs: list[Document],
        window_size: int=5,
        discard_text: bool=True
    ) -> list[Document]:
    """
    Preprocess the documents.

    Parameters
    ----------
    docs : list[Document]
        The documents to preprocess.

    window_size : int
        The size of the sliding window used to split the tables.

    discard_text : bool
        If True, discard text passages and only keep tables.

    Returns
    -------
    docs : list[Document]
        The preprocessed documents.
    """
    preprocessed_docs = []

    # Preprossor used to split text documents
    processor = PreProcessor(
        clean_empty_lines=True,
        clean_whitespace=True,
        clean_header_footer=True,
        remove_substrings=None,
        split_by="word",
        split_length=50,
        split_respect_sentence_boundary=True,
        split_overlap=0,
        max_chars_check=10_000
    )

    for doc in docs:
        if doc.content_type == "table":
            doc.content = clean_table_column_names(doc.content)
            doc.content = clean_table_values(doc.content)
            sliced_table_docs = slice_table_document(doc, window_size)
            preprocessed_docs.extend(sliced_table_docs)
        else:
            if discard_text:
                continue
            split_text_docs = processor.process([doc])
            preprocessed_docs.extend(split_text_docs)

    convert_tables_to_markdown(preprocessed_docs)

    return preprocessed_docs


def clean_table_column_names(df: pd.DataFrame, replace: str=' - ') -> pd.DataFrame:
    """
    Return a DataFrame with newlines removed from column headers.

    Parameters
    ----------
    df : pd.Dataframe
        The DataFrame to clean.

    replace: str
        The string to replace newlines with.

    Returns
    -------
    df : pd.Dataframe
        The dataframe with newlines removed from column headers.
    """
    df.columns = df.columns.str.replace('\n', replace)
    return df


def clean_table_values(df: pd.DataFrame) -> pd.DataFrame:
    """
    Return a DataFrame with commas and spaces replaced or removed from values.

    Commas are replaced with a decimal point, and spaces are removed.

    Parameters
    ----------
    df : pd.Dataframe
        The DataFrame to clean.

    Returns
    -------
    df : pd.Dataframe
        The dataframe with commas and spaces replaced or removed from values.
    """
    # TODO: Reconsider removing spaces. This could mess up unit columns, or other
    # text columns.

    for col in df.columns:
        df[col] = df[col].apply(
            lambda x: str(x).replace(',', '.').replace(' ', '') if _is_number(str(x)) else x
        )
    return df


def _is_number(string: str) -> bool:
    """
    Return True if the string is a number, False otherwise.

    Parameters
    ----------
    string : str
        The string to check.

    Returns
    -------
    is_number : bool
        True if the string is a number, False otherwise.
    """
    is_number = string.replace('.','').replace(',', '').replace(' ', '').isdigit()
    return is_number


def slice_table_document(doc: Document, window_size: int=5) -> list[Document]:
    """
    Return a list of documents, each containing a table with `window_size` rows.

    A sliding window approach is used to split the table into smaller tables. The
    returned documents have the same metadata as the original document, except for
    the content and id.

    Parameters
    ----------
    doc : Document
        Document with content_type "table".

    window_size : int
        The size of the sliding window.

    Returns
    -------
    docs : list[Document]
        A list of documents, each one containing a table with `window_size` rows.

    Raises
    ------
    ValueError
        If the document does not contain a table.
    """
    if doc.content_type != "table":
        raise ValueError("The document does not contain a table.")

    tables = _sliding_window(doc.content, window_size)
    docs = []
    for table in tables:
        new_doc = Document(content=table)
        for attr, value in doc.__dict__.items():
            if attr not in ["content", "id"]:
                setattr(new_doc, attr, value)
        docs.append(new_doc)

    return docs


def _sliding_window(df: pd.DataFrame, window_size: int) -> list[pd.DataFrame]:
    """
    Return a list of DataFrames, each containing a window of the original DataFrame.

    Parameters
    ----------
    df : pandas.DataFrame
        The DataFrame to split.

    window_size : int
        The size of the sliding window.

    Returns
    -------
    tables : list[pandas.DataFrame]
        A list of DataFrames, each containing a window of the original DataFrame.
    """
    tables = [df.iloc[i:i+window_size] for i in range(len(df) - window_size + 1)]

    return tables


def convert_tables_to_markdown(docs: list[Document]) -> None:
    """
    Convert tables to markdown format in place.

    Parameters
    ----------
    docs : List[Document]
        List of Documents, some of which may have `content_type` 'table'.
    """
    for doc in docs:
        if doc.content_type == "table":
            _convert_table_to_markdown(doc)


def _convert_table_to_markdown(doc: Document) -> None:
    """
    Convert table to markdown format in place.

    Parameters
    ----------
    doc : Document
        Document with `content_type` table.

    Raises
    ------
    ValueError
        If `doc.content_type` is not "table".
    """
    if doc.content_type != "table":
        raise ValueError(f"Document content_type must be 'table', not '{doc.content_type}'")

    table = doc.content
    markdown_table = table.to_markdown(index=False, tablefmt="github", intfmt='')

    doc.content = markdown_table
    doc.content_type = "text"

In [79]:
processed_docs = preprocess_documents(docs, window_size=1)

print(f"Number of documents: {len(processed_docs)}\n")

print(processed_docs[12].content)

Number of documents: 800

| Sasol in Society - Spend     |   2023 - Rm |   2022 - Rm |   2021 - Rm | 2020 - Rm   | LOA 2023   | Footnote   |
|------------------------------|-------------|-------------|-------------|-------------|------------|------------|
| Environment and biodiversity |        13.4 |        38.6 |        13.2 |             |            |            |


## Query Class

In [296]:
class QueryPipeline:
    """Retrieve AMKEY values from documents using a RAG approach."""

    def __init__(
            self,
            docs: list[Document],
            company: str,
            top_k: int=3,
            amkey_to_metric_path: str=AMKEY_TO_METRIC_PATH,
            amkey_to_synonym_path: str=AMKEY_TO_SYNONYM_PATH,
            amkey_to_unit_path : str=AMKEY_TO_UNIT_PATH,
        ):
        """
        Initalise the components of the query pipeline.

        Parameters
        ----------
        docs : list[Document]
            The documents to provide context for the queries.

        company : str
            The company the documents are for.

        amkey_to_metric_path : str
            Path to a csv file mapping AMKEY to metric.

        amkey_to_synonym_path : str
            Path to a csv file mapping AMKEY and company to metric synonym.

        amkey_to_unit_path : str
            Path to a csv file mapping AMKEY to desired unit.
        """
        self.docs = docs
        self.company = company
        self.top_k = top_k
        self.amkey_to_metric_path = amkey_to_metric_path
        self.amkey_to_synonym_path = amkey_to_synonym_path
        self.amkey_to_unit_path = amkey_to_unit_path

        self.document_store = None
        self.retriever = None
        self.generation_llm = None
        self.unit_conversion_llm = None
        self.json_conversion_llm = None
        self.confirm_relevant_context_llm = None
        self.amkey_to_metric = None
        self.amkey_to_synonym = None
        self.amkey_to_unit = None

        self.initialise_document_store()
        self.initialise_retriever()
        self.initialise_generation_llm()
        self.initialise_unit_conversion_llm()
        self.initialise_json_conversion_llm()
        self.initialise_relevant_context_llm()
        self.initialise_mappings()

    def initialise_document_store(self):
        # TODO: Try using other document stores (e.g. FAISS).
        logger.info("Initialising document store")
        self.document_store = InMemoryDocumentStore(embedding_dim=384)
        self.document_store.delete_documents()
        self.document_store.write_documents(self.docs)

    def initialise_retriever(self):
        # TODO: I'm not sure which OpenAI embedding models are available. Is it possible
        # to use their newest embedding models in Haystack v1?
        # TODO: Look into other (non-OpenAI) embedding models that can be used with
        # Haystack v1.
        logger.info("Initialising retriever")
        self.retriever = EmbeddingRetriever(
            embedding_model="sentence-transformers/all-MiniLM-L6-v2",
            document_store=self.document_store,
            top_k=self.top_k
        )
        self.document_store.update_embeddings(retriever=self.retriever)

    def initialise_generation_llm(self):
        # TODO: Temperature = 0 isn't giving deterministic results. Is this a bug?
        # TODO: Currently 'gpt-3.5-turbo-1106' as it has a larger context window.
        # There is a newer gpt-3.5 model (gpt-3.5-turbo-0125) which also has a large
        # context window. I should use that if it's available with Haystack v1.
        logger.info("Initialising generation LLM")
        self.generation_llm = PromptNode(
            model_name_or_path="gpt-3.5-turbo-1106",
            api_key=OPENAI_API_KEY,
            model_kwargs={"temperature": 0}
        )

    def initialise_unit_conversion_llm(self):
        logger.info("Initialising unit conversion LLM")
        self.unit_conversion_llm = PromptNode(
            model_name_or_path="gpt-3.5-turbo",
            api_key=OPENAI_API_KEY,
            model_kwargs={"temperature": 0}
        )

    def initialise_json_conversion_llm(self):
        logger.info("Initialising json conversion LLM")
        self.json_conversion_llm = PromptNode(
            model_name_or_path="gpt-3.5-turbo",
            api_key=OPENAI_API_KEY,
            model_kwargs={"temperature": 0}
        )

    def initialise_relevant_context_llm(self):
        logger.info("Initialising relevant context LLM")
        self.confirm_relevant_context_llm = PromptNode(
            model_name_or_path="gpt-3.5-turbo",
            api_key=OPENAI_API_KEY,
            model_kwargs={"temperature": 0.4}
        )

    def initialise_mappings(self):
        logger.info("Initialising mappings")
        self.amkey_to_metric = pd.read_csv(self.amkey_to_metric_path)
        self.amkey_to_synonym = pd.read_csv(self.amkey_to_synonym_path)
        self.amkey_to_unit = pd.read_csv(self.amkey_to_unit_path)

    def query(self, amkey: int, year: int):
        """
        Return the value associated with an AMKEY for a given year.

        Uses retrieval augmented generation to retrieve the value.

        Parameters
        ----------
        amkey : int
            The AMKEY of the metric to retrieve.

        year : int
            The year to retrieve the metric for.

        Returns
        -------
        value : int
            The value associated with the AMKEY for the given year.
        """
        logger.debug(f"Retrieving AMKEY: {amkey}")

        metric = self.retrieve_metric_description(amkey)
        logger.debug(f"Retrieving metric: {metric}")

        context_documents = self.retriever.retrieve(metric)
        context_str = "\n\n".join([doc.content for doc in context_documents])
        logger.debug(f"Retrieved context documents:\n\n {context_str}\n")

        append = self._retrieve_additional_appended_instructions(amkey)
        logger.debug(f"Appending: {append}")

        prompt = self._create_generation_prompt(metric, year, context_documents, append)
        # logger.debug(f"Generation prompt: \n{prompt}")

        answer = self.generation_llm(prompt)[0]
        logger.debug(f"Generated answer: {answer}")

        try:
            value, unit = self.parse_answer(answer)
        except json.JSONDecodeError:
            logger.error(f"Error parsing answer: {answer}")
            json_conversion_prompt = self.create_json_generation_prompt(answer)
            answer = self.json_conversion_llm(json_conversion_prompt)[0]
            logger.debug(f"JSON converted answer: {answer}")
            value, unit = self.parse_answer(answer)
        except Exception as err:
            logger.error(f"Non-JSONDecodeError exception when parsing answer: {err}")
            value, unit = None, None

        logger.debug(f"Parsed answer: {value}, {unit}")

        if value is not None:
            # Confirm that the context was sufficient to answer the question.
            relevant_context_prompt = self.create_relevant_context_prompt(metric, year, context_documents)
            relevant_context_answer = self.confirm_relevant_context_llm(relevant_context_prompt)[0]
            # Strip '.' and whitespace from the answer.
            relevant_context_answer = relevant_context_answer.replace(".", "").strip()
            logger.debug(f"Relevant context answer: {relevant_context_answer}")
            if relevant_context_answer.lower() != "yes":
                value = -1
            else:
                # Double check
                # TODO: Consider rephrasing this. Could give the previous prompt (with answer),
                # and ask 'Are you sure the context is not sufficient to answer the question?'
                relevant_context_answer = self.confirm_relevant_context_llm(relevant_context_prompt)[0]
                logger.debug(f"DOUBLE CHECK Relevant context answer: {relevant_context_answer}")
                if relevant_context_answer.lower() != "yes":
                    value = -1

        required_unit = self.retrieve_unit(amkey)
        logger.debug(f"Required unit: {required_unit}")

        if required_unit is not None and value is not None and value != -1:
            if unit != required_unit:
                unit_conversion_prompt = self.create_unit_conversion_prompt(value, unit, required_unit)
                value = self.unit_conversion_llm(unit_conversion_prompt)[0]
                logger.debug(f"Unit converted value: {value}")

        return value

    def _retrieve_additional_appended_instructions(self, amkey: int) -> str:
        """
        Return additional instructions to append to the query.

        Parameters
        ----------
        amkey : int
            The AMKEY of the metric to retrieve.

        Returns
        -------
        append : str
            Additional instructions to append to the query.
        """
        if amkey in [47, 48, 49]:
            append = "Do not include the word 'Level' in the answer."
        else:
            append = ""

        return append

    def _create_generation_prompt(
            self,
            metric: str,
            year: int,
            docs: list[Document],
            append: str
        )-> str:
        """
        Create a prompt for the generation LLM.

        Parameters
        ----------
        metric : str
            The metric to retrieve.

        year : int
            The year to retrieve the metric for.

        docs : list[Document]
            The documents to provide context for the queries.

        append : str
            Additional instructions to append to the query.

        Returns
        -------
        prompt : str
            The prompt for the generation LLM.
        """
        query = f"What was the {metric} in the year {year}?"

        context = "\n\n".join([doc.content for doc in docs])

        prompt = f"""
        Use the following pieces of context to answer the question at the end.
        The answer must be a value from the context.
        The context may be text or a markdown table.
        Just retrieve the answer from the context. Please don't do any unit conversion.
        If you don't know the answer, please return 'null' for the answer and unit.
        Do not return any words other than 'Answer' and 'Unit' in the answer.
        Please return the answer in the format of a python dictionary / JSON object:
        {{"Answer": <number or null>, "Unit": <unit or null>}}
        Please always use double quotes for the keys and values.
        If the requested value is not present in the context, please return 'null' for the answer and unit.

        \nContext:\n{context} \n\n Question: {query} {append}\n\n Answer:
        """

        return prompt

    def create_unit_conversion_prompt(self, value: int, unit: str, target_unit: str) -> str:
        prompt=f"""
        You are an expert unit converter. You are aware of how to convert
        between different units within the same system of measurement.
        For example, 1236 million = 1236 * 1 million = 1236 * 1000000 = 1236000000.
        For example, to convert from Rm to R, you would multiply by 1000000. This is because
        1 Rm = 1000000 R.
        Do not do any unit conversion if it is not necessary. That is, if the
        unit is already in the required unit, do not convert it.
        For example, 'What is 242353 Rands in rand? Answer: 242353' is the correct answer.
        Please return a single number as your answer. Do not elaborate or give
        any context.\n\n

        What is {value} {unit} in {target_unit}? \n\n Answer:"""

        return prompt

    def create_json_generation_prompt(self, answer: str) -> str:
        prompt=f"""The following answer was generated by a large language model: {answer}.
                   Please convert this answer to follow the python dictionary / JSON object format:
                   {{"Answer": <number or null>, "Unit": <unit or null>'}}
                   \n\n Answer:"""

        return prompt

    def create_relevant_context_prompt(
            self,
            metric: str,
            year: int,
            docs: list[Document],
    ) -> str:
        query = f"What was the {metric} in the year {year}?"
        context = "\n\n".join([doc.content for doc in docs])

        prompt = f"""
        You are an expert in determining whether there is sufficient information in a given context to answer a specific question.
        Is there sufficient information in the following context to answer the specific question: '{query}'?

        Context:\n{context}

        Think cleary and think step by step. Please return 'Yes' or 'No' as your answer. Answer:
        """

        return prompt

    def parse_answer(self, answer: str) -> tuple[float | None, str | None]:
        """
        Parse the answer returned by the generation LLM.

        Parameters
        ----------
        answer : str
            The answer returned by the generation LLM. This is expected to be in
            the format of a python dictionary / JSON object:
            {"Answer": <number or null>, "Unit": <unit or null>}

        Returns
        -------
        value : float | None
            The value from the answer.

        unit : str | None
            The unit from the answer.
        """
        logger.debug(f"Parsing answer: {answer}")

        answer_dict = json.loads(answer)

        logger.info(f"answer_dict: {answer_dict}")

        value = answer_dict["Answer"]
        unit = answer_dict["Unit"]

        if value is None:
            pass
        elif isinstance(value, str) and value.lower() in ["null", "n/a"]:
            value = None
        elif value == "nil":
            value = 0
        elif isinstance(value, str):
            value = value.replace(" ", "").replace(",", "")
            value = "".join(filter(lambda x: x.isdigit() or x == ".", value))
            value = float(value)
        else:
            value = float(value)

        if unit == "null": unit = None

        return value, unit

    def retrieve_metric_description(self, amkey: int) -> str:
        """
        Return the description of a metric.

        If a company-specific description is available, it is returned. Otherwise, the
        generic description is returned.

        Parameters
        ----------
        amkey : int
            The AMKEY of the metric.

        Returns
        -------
        metric : str
            The description of the metric.
        """
        metric = self.retrieve_company_metric_description(amkey)
        if metric is None:
            metric = self.retrieve_generic_metric_description(amkey)

        return metric

    def retrieve_company_metric_description(self, amkey: int) -> str | None:
        """
        Return the company-specific description of a metric, if available.

        Parameters
        ----------
        amkey : int
            The AMKEY of the metric.

        Returns
        -------
        metric : str | None
            The company-specific description of the metric, if available.
            Otherwise, None.
        """
        metric = self.amkey_to_synonym[
            (self.amkey_to_synonym["AMKEY"] == amkey)
            & (self.amkey_to_synonym["Group"] == self.company)
        ]["ClientMetric"]

        if metric.empty:
            metric = None
        else:
            metric = metric.item()

        return metric

    def retrieve_generic_metric_description(self, amkey: int) -> str:
        """
        Return the generic description of a metric.

        Parameters
        ----------
        amkey : int
            The AMKEY of the metric.

        Returns
        -------
        metric : str
            The description of the metric.

        Raises
        ------
        ValueError
            If the AMKEY is invalid.
        """
        try:
            metric = self.amkey_to_metric[
                self.amkey_to_metric["AMKEY"] == amkey
            ]["ActivityMetric"].item()
        except Exception:
            raise ValueError(f"Invalid AMKEY {amkey}")

        return metric

    def retrieve_unit(self, amkey: int) -> str | None:
        """
        Return the required unit for a metric.

        Parameters
        ----------
        amkey : int
            The AMKEY of the metric.

        Returns
        -------
        unit : str | None
            The required unit for the metric, if specified. Otherwise, None.
        """
        try:
            unit = self.amkey_to_unit[
                self.amkey_to_unit["AMKEY"] == amkey
            ]["Unit"].item()
        except KeyError:
            unit = None

        if pd.isna(unit): unit = None

        return unit


## Validation

In [81]:
from mapping import COMPANY_YEAR_PDF_MAPPING

AZURE_CONVERTER_DIR = "/home/tomw/unifi-pdf-llm/data/azureconverter_outputs"
"""Path to directory with json outputs from AzureConverter."""

TRAIN_CSV_PATH = "/home/tomw/unifi-pdf-llm/data/Train.csv"
"""Path to the Train.csv file."""


def load_documents(company: str, year: int) -> list[Document]:
    """
    Load documents for a company and year.

    Requires the corresponding pdf file(s) to have been previously converted to json
    using the AzureConverter.

    Parameters
    ----------
    company : str
        The company to load documents for.

    year : int
        The year to load documents for.

    Returns
    -------
    company_docs : list[Document]
        The documents for the company and year.

    Raises
    ------
    ValueError
        If no documents are found for the company and year.
    """
    company_docs = []
    converter = AzureConverter(
        endpoint="https://azureconverter.cognitiveservices.azure.com/",
        credential_key=AZURE_CONVERTER_KEY,
        model_id="prebuilt-layout",  # Was "prebuilt-document"
    )

    try:
        file_name_list = COMPANY_YEAR_PDF_MAPPING[company][year]
    except KeyError:
        raise ValueError(f"No documents found for {company} in {year}")

    for file_name in file_name_list:
        file_name = file_name.replace(".pdf", ".json")
        file_path = Path(AZURE_CONVERTER_DIR) / file_name
        logger.info(f"Loading documents from {file_path}")
        docs = converter.convert_azure_json(file_path=file_path)
        company_docs.extend(docs)

    return company_docs

In [88]:
train_df = pd.read_csv(TRAIN_CSV_PATH)
train_df

Unnamed: 0,ID,2021_Value,2020_Value,2019_Value
0,3_X_Absa,,,
1,6_X_Absa,,,
2,7_X_Absa,,,
3,8_X_Absa,,,
4,9_X_Absa,,,
...,...,...,...,...
5105,1052_X_Uct1&2,,,
5106,1053_X_Uct1&2,,,
5107,1054_X_Uct1&2,,,
5108,1055_X_Uct1&2,,,


In [90]:
# Filter train_df for rows with NaN in the 2021_Value column
nan_2021_value = train_df["2021_Value"].isna()
nan_2021_value_df = train_df[nan_2021_value]
nan_2021_value_df

Unnamed: 0,ID,2021_Value,2020_Value,2019_Value
0,3_X_Absa,,,
1,6_X_Absa,,,
2,7_X_Absa,,,
3,8_X_Absa,,,
4,9_X_Absa,,,
...,...,...,...,...
5105,1052_X_Uct1&2,,,
5106,1053_X_Uct1&2,,,
5107,1054_X_Uct1&2,,,
5108,1055_X_Uct1&2,,,


In [258]:
def validate_retrieval(
        company: str,
        year: int,
        type: str="retrieval",
        window_size: int=1,
        discard_text: bool=True
    ) -> tuple[pd.DataFrame, float]:
    """
    Returns a DataFrame with the results of the retrieval validation.

    Parameters
    ----------
    company : str
        The company to validate.

    year : int
        The year to validate.

    type : str
        The type of validation test to run. Options are "retrieval" or "nan".
        The "retrieval" test checks the retrieval of values that are present in the
        documents. The "nan" test checks the retrieval of values that are not present
        in the documents (i.e. testing the ability to return 'None' when the value is
        not present).

    window_size : int
        The size of the sliding window to use when slicing tables.

    discard_text : bool
        If True, discard text passages when preprocessing the documents. Only tables
        are kept.

    Returns
    -------
    results_df : pd.DataFrame
        The results of the retrieval validation.

    accuracy : float
        The accuracy of the retrieval validation.

    Raises
    ------
    ValueError
        If the year is not 2019, 2020, or 2021.
    """
    if year not in [2019, 2020, 2021]:
        raise ValueError(f"Unable to validate year: {year}")

    train_df = pd.read_csv(TRAIN_CSV_PATH)

    # Restrict to the company
    train_df = train_df[train_df["ID"].str.contains(f"X_{company}")]
    train_df.reset_index(drop=True, inplace=True)

    # Drop the two columns that we are not interested in
    all_years = ["2021", "2020", "2019"]
    all_years.remove(str(year))
    for _year in all_years:
        train_df.drop(columns=[f"{_year}_Value"], inplace=True)

    if type == "retrieval":
        train_df = train_df.dropna(subset=[f"{year}_Value"], how="all")
    elif type == "nan":
        train_df = train_df[train_df[f"{year}_Value"].isna()]
        # Keep a random sample rows
        train_df = train_df.sample(n=50)
    else:
        raise ValueError(f"Invalid validation type: {type}")

    # Load and preprocess the documents
    docs = load_documents(company, year)
    docs = preprocess_documents(
        docs, window_size=window_size, discard_text=discard_text
    )

    logger.debug(f"Number of documents: {len(docs)}")

    query_pipeline = QueryPipeline(
        docs=docs,
        company=company,
        amkey_to_metric_path=AMKEY_TO_METRIC_PATH,
        amkey_to_synonym_path=AMKEY_TO_SYNONYM_PATH,
        amkey_to_unit_path=AMKEY_TO_UNIT_PATH
    )

    results_df = train_df.copy(deep=True)

    # Loop over the rows in the dataframe and retrieve the value for each AMKEY
    for idx, row in train_df.iterrows():
        amkey = int(row["ID"].split("_")[0])

        metric = query_pipeline.retrieve_metric_description(amkey)
        results_df.at[idx, "Metric"] = metric

        value = query_pipeline.query(amkey, year)
        results_df.at[idx, f"{year}_Generated"] = value

    results_df[f"{year}_Value"] = results_df[f"{year}_Value"].astype(float)
    results_df[f"{year}_Generated"] = results_df[f"{year}_Generated"].astype(float)
    results_df["Correct"] = results_df.apply(
        lambda row: (row[f"{year}_Generated"] == row[f"{year}_Value"]) or
        (pd.isna(row[f"{year}_Generated"]) and pd.isna(row[f"{year}_Value"])) or
        (row[f"{year}_Generated"] == -1 and pd.isna(row[f"{year}_Value"])),
        axis=1
    )

    # Reordering the columns
    results_df = results_df[["ID", "Metric", f"{year}_Value", f"{year}_Generated", "Correct"]]

    accuracy = results_df["Correct"].sum() / len(results_df)

    logger.info(f"Accuracy: {accuracy}")

    return results_df, accuracy


### Validating Tongaat 2021

In [302]:
results_df, accuracy = validate_retrieval("Tongaat", 2021, type="retrieval", window_size=2)

2024-02-25 19:32:19.956 | INFO     | __main__:load_documents:50 - Loading documents from /home/tomw/unifi-pdf-llm/data/azureconverter_outputs/2021ESG_removed_sup_table.json
2024-02-25 19:32:21.747 | DEBUG    | __main__:validate_retrieval:76 - Number of documents: 678
2024-02-25 19:32:21.748 | INFO     | __main__:initialise_document_store:60 - Initialising document store
2024-02-25 19:32:21.753 | INFO     | __main__:initialise_retriever:70 - Initialising retriever
Batches: 100%|██████████| 22/22 [00:01<00:00, 21.97it/s]ocs/s]
Documents Processed: 10000 docs [00:01, 9709.72 docs/s]        
2024-02-25 19:32:24.221 | INFO     | __main__:initialise_generation_llm:83 - Initialising generation LLM
2024-02-25 19:32:24.222 | INFO     | __main__:initialise_unit_conversion_llm:91 - Initialising unit conversion LLM
2024-02-25 19:32:24.223 | INFO     | __main__:initialise_json_conversion_llm:99 - Initialising json conversion LLM
2024-02-25 19:32:24.223 | INFO     | __main__:initialise_relevant_cont

In [303]:
display(results_df)

Unnamed: 0,ID,Metric,2021_Value,2021_Generated,Correct
7,12_X_Tongaat,Total injury frequency rate (TIFR) – employees...,1.331,1.331,True
18,28_X_Tongaat,Total – company managed/farmed land (owned and...,60204.0,52883.0,False
30,49_X_Tongaat,B-BBEE Level,4.0,-1.0,False
33,52_X_Tongaat,Overall Board and Committee meeting attendance,99.0,99.0,True
64,114_X_Tongaat,Energy efficiency: total direct and indirect e...,16.63,-1.0,False
71,122_X_Tongaat,"Fatal injury frequency rate (FIFR, i.e. number...",0.005,0.005,True
76,128_X_Tongaat,Carbon emissions – Scope 1,505575.0,505575.0,True
77,129_X_Tongaat,Carbon emissions – Scope 2,51539.0,51539.0,True
85,138_X_Tongaat,Hazardous waste disposed of at appropriate fac...,184.0,184.0,True
94,151_X_Tongaat,"Lost time injury frequency rate (LTIFR, i.e. n...",0.093,0.093,True


The Tongaat 2021 report gives two tables - a comprehensive data table, and a 
supplemental environmental data table (inclusive of data from operations sold off, 
and not disposed of in financial year 2021). Some metrics are included in both.
The '2021_Value's are all from the comprehensive data table. Most of the errors
occur because the LLM generates the answer from the supplemental environmental 
data table. 

Most of the other errors are from two-line rows which have been split into 
individual rows in the pandas and markdown tables. The first row is retrieved,
but it's the second row that has the value. Wouldn't have this issue with a sliding
window > 1 probably. 

In [None]:
# Original Accuracy: 0.7377049180327869
# Accuracy after removing index: 0.819672131147541
# Accuracy after splitting text documents: ~0.88
# Accuracy after enforcing json scheme: ~0.77 (seems to making some values up, instead
# of retrieving them from the documents)
# Accuracy after cleaning table values: ~0.77

In [299]:
COMPANY = "Tongaat"
YEAR = 2021

docs = load_documents(COMPANY, YEAR)
docs = preprocess_documents(docs, window_size=2)

2024-02-25 19:28:27.852 | INFO     | __main__:load_documents:50 - Loading documents from /home/tomw/unifi-pdf-llm/data/azureconverter_outputs/2021ESG_removed_sup_table.json


In [300]:
query_pipeline = QueryPipeline(
    docs=docs,
    company=COMPANY,
)

2024-02-25 19:28:29.962 | INFO     | __main__:initialise_document_store:60 - Initialising document store
2024-02-25 19:28:29.968 | INFO     | __main__:initialise_retriever:70 - Initialising retriever
Batches: 100%|██████████| 22/22 [00:00<00:00, 30.98it/s]ocs/s]
Documents Processed: 10000 docs [00:00, 13529.84 docs/s]       
2024-02-25 19:28:31.909 | INFO     | __main__:initialise_generation_llm:83 - Initialising generation LLM
2024-02-25 19:28:31.909 | INFO     | __main__:initialise_unit_conversion_llm:91 - Initialising unit conversion LLM
2024-02-25 19:28:31.910 | INFO     | __main__:initialise_json_conversion_llm:99 - Initialising json conversion LLM
2024-02-25 19:28:31.910 | INFO     | __main__:initialise_relevant_context_llm:107 - Initialising relevant context LLM
2024-02-25 19:28:31.911 | INFO     | __main__:initialise_mappings:115 - Initialising mappings


In [301]:
# Example where the context isn't relevant.
# TODO: Consider removing the text documents. For structured pdfs, the values seem
# to be always in the tables. Otherwise, think about prioritising the tables over
# the text documents. Could have two seperate document stores, one for text and one
# for tables. Only use the text document store if using the table document store
# doesn't return any results.
ans = query_pipeline.query(
    amkey=198,
    year=YEAR
)

2024-02-25 19:28:37.921 | DEBUG    | __main__:query:139 - Retrieving AMKEY: 198
2024-02-25 19:28:37.923 | DEBUG    | __main__:query:142 - Retrieving metric: Number of Compensation for Occupational Injuries and Diseases (COID) lost time for employees
Batches: 100%|██████████| 1/1 [00:00<00:00, 76.43it/s]
2024-02-25 19:28:37.963 | DEBUG    | __main__:query:146 - Retrieved context documents:

 |                                                                                                                                  | Unit of measure   | 2021   | 2020   | 2019   | 2018   |
|----------------------------------------------------------------------------------------------------------------------------------|-------------------|--------|--------|--------|--------|
| Medical treatment cases (MTCs, i.e. injuries on duty leading to medical treatment, but no lost days) - employees and contractors | number            | 365    | 405    | 544    | 608    |
| Lost time injuries (LTIs, i.e. injuri

In [272]:
# Example where the context doesn't contain the value, but a non-None value is returned.
# TODO: Consider adding a step to check if the context is sufficient to answer the query.
ans = query_pipeline.query(
    amkey=766,
    year=YEAR
)

2024-02-25 19:17:03.091 | DEBUG    | __main__:query:139 - Retrieving AMKEY: 766
2024-02-25 19:17:03.092 | DEBUG    | __main__:query:142 - Retrieving metric: Average hours of training provided to male employees during the reporting period
Batches: 100%|██████████| 1/1 [00:00<00:00, 33.19it/s]
2024-02-25 19:17:03.147 | DEBUG    | __main__:query:146 - Retrieved context documents:

 | IMPROVING LIVES   | EMPLOYEES      |
|-------------------|----------------|
|                   | (2020: 9 331)  |
|                   | TRAINING HOURS |

| IMPROVING LIVES   | EMPLOYEES      |
|-------------------|----------------|
|                   | TRAINING HOURS |
|                   | 66986          |

|                                  | Unit of measure   |     2021 |     2020 |     2019 |     2018 |
|----------------------------------|-------------------|----------|----------|----------|----------|
| Value of employee training spend | Rands             | 23822054 | 34770862 | 59687346 | 57050167 |
|

In [212]:
ans = query_pipeline.query(
    amkey=298,
    year=YEAR
)

2024-02-25 18:47:56.940 | DEBUG    | __main__:query:139 - Retrieving AMKEY: 298
2024-02-25 18:47:56.942 | DEBUG    | __main__:query:142 - Retrieving metric: Percentage of female Board members.
Batches: 100%|██████████| 1/1 [00:00<00:00, 41.52it/s]
2024-02-25 18:47:56.990 | DEBUG    | __main__:query:146 - Retrieved context score: 0.5020582346587903
2024-02-25 18:47:56.991 | DEBUG    | __main__:query:146 - Retrieved context score: 0.5016005293001237
2024-02-25 18:47:56.991 | DEBUG    | __main__:query:146 - Retrieved context score: 0.5015509566094587
2024-02-25 18:47:56.991 | DEBUG    | __main__:query:148 - Retrieved context documents:

 |                                    | Unit of measure   |   2021 |   2020 |   2019 |   2018 |
|------------------------------------|-------------------|--------|--------|--------|--------|
| Female Board members               | number            |      2 |      1 |      2 |    5   |
| Percentage of female Board members | %                 |     20 |     

In [30]:
ans = query_pipeline.query(
    amkey=13,
    year=YEAR
)

2024-02-25 12:30:47.215 | DEBUG    | __main__:query:118 - Retrieving AMKEY: 13
2024-02-25 12:30:47.217 | DEBUG    | __main__:query:121 - Retrieving metric: Amount of assets under management, by asset class, that employ governance issues
Batches: 100%|██████████| 1/1 [00:00<00:00, 43.33it/s]
2024-02-25 12:30:47.296 | DEBUG    | __main__:query:125 - Retrieved context documents:

 |            | Unit of measure   | 2021   | 2020   | 2019   | 2018   |
|------------|-------------------|--------|--------|--------|--------|
| GOVERNANCE |                   |        |        |        |        |

| CAPITALS - Manufactured Capital   |  - Our plants, farmers and infrastructure utilised in production   | INPUTS - Capital expenditure in 2021 R505 million (2020: R195 million)   |
|-----------------------------------|--------------------------------------------------------------------|--------------------------------------------------------------------------|
| Financial Capital                 | Sha

In [20]:
# This is an example where additional context of the table would be useful (rows
# above and below the selected row).
# The description of the row is spread over two rows, with the values
# in the second row.

ans = query_pipeline.query(
    amkey=622,
    year=YEAR
)


2024-02-25 15:39:54.910 | DEBUG    | __main__:query:120 - Retrieving AMKEY: 622
2024-02-25 15:39:54.911 | DEBUG    | __main__:query:123 - Retrieving metric: Rand value of investments in COVID-19 avoidance, mitigation and treatment
Batches: 100%|██████████| 1/1 [00:00<00:00, 41.99it/s]
2024-02-25 15:39:54.984 | DEBUG    | __main__:query:127 - Retrieved context documents:

 |                                                  | Unit of measure   | 2021     | 2020   | 2019   | 2018   |
|--------------------------------------------------|-------------------|----------|--------|--------|--------|
| Rand value of investments in COVID-19 avoidance, |                   |          |        |        |        |
| mitigation and treatment                         | Rands             | 84210673 |        |        |        |

|                                                  | Unit of measure   | 2021   | 2020   | 2019   | 2018   |
|--------------------------------------------------|-------------------

In [32]:
ans = query_pipeline.query(
    amkey=550,
    year=YEAR
)

2024-02-25 12:31:09.875 | DEBUG    | __main__:query:118 - Retrieving AMKEY: 550
2024-02-25 12:31:09.876 | DEBUG    | __main__:query:121 - Retrieving metric: New cases of NIHL* – employees and contractors
Batches: 100%|██████████| 1/1 [00:00<00:00, 153.41it/s]
2024-02-25 12:31:09.933 | DEBUG    | __main__:query:125 - Retrieved context documents:

 |                                                | Unit of measure   | 2021   |   2020 |   2019 | 2018   |
|------------------------------------------------|-------------------|--------|--------|--------|--------|
| New cases of NIHL* - employees and contractors | number            |        |      8 |      1 |        |

|                                                     | Unit of measure   |   2021 |   2020 |   2019 | 2018   |
|-----------------------------------------------------|-------------------|--------|--------|--------|--------|
| Non-work related deaths - employees and contractors | number            |      7 |      3 |      3 |   

In [217]:
ans = query_pipeline.query(
    amkey=9,
    year=YEAR
)

print(f'Answer: {ans}')

2024-02-24 16:54:01.570 | DEBUG    | __main__:query:118 - Retrieving AMKEY: 9
2024-02-24 16:54:01.571 | DEBUG    | __main__:query:121 - Retrieving metric: Air emissions of the following pollutants: (4) particulate matter (PM10)
Batches: 100%|██████████| 1/1 [00:00<00:00, 82.80it/s]
2024-02-24 16:54:01.634 | DEBUG    | __main__:query:125 - Retrieved context documents:

 | NATURAL CAPITAL   | NATURAL CAPITAL                         | LAND   |
|-------------------|-----------------------------------------|--------|
| ·                 | Improve waste efficiency by 5% by 2025. | pollution of air through discharges of particulate matter,
or even via inadequate suppression of dust emanating        |

| CARBON EMISSIONS (TONS OF CARBON DIOXIDE EQUIVALENTS, CO2-e)   | 2021    | 2020    | 2019    | 2018    |
|----------------------------------------------------------------|---------|---------|---------|---------|
| Total scope 1 emissions                                        | 505 575 | 704 9

Answer: None


### Validating Uct 2021

In [420]:
results_df, accuracy = validate_retrieval("Uct", 2021)

2024-02-24 23:10:46.345 | INFO     | __main__:load_documents:50 - Loading documents from /home/tomw/unifi-pdf-llm/data/azureconverter_outputs/UCT_Carbon_Footprint_Report_2020-2021.json
2024-02-24 23:10:46.504 | INFO     | __main__:load_documents:50 - Loading documents from /home/tomw/unifi-pdf-llm/data/azureconverter_outputs/afs2021.json
Preprocessing:   0%|          | 0/1 [00:00<?, ?docs/s]We found one or more sentences whose split count is higher than the split length.
Preprocessing: 100%|██████████| 1/1 [00:00<00:00, 22.05docs/s]
Preprocessing: 100%|██████████| 1/1 [00:00<00:00,  4.96docs/s]
2024-02-24 23:10:50.222 | DEBUG    | __main__:validate_retrieval:19 - Number of documents: 2806
2024-02-24 23:10:50.223 | INFO     | __main__:initialise_document_store:55 - Initialising document store
2024-02-24 23:10:50.247 | INFO     | __main__:initialise_retriever:61 - Initialising retriever
Batches: 100%|██████████| 88/88 [00:01<00:00, 45.50it/s]docs/s]
Documents Processed: 10000 docs [00:02

In [421]:
results_df

Unnamed: 0,ID,Metric,2021_Value,2021_Generated,Correct
0,67_X_Uct1&2,Depreciation as per income statement,329028000.0,,False
1,109_X_Uct1&2,Employee wages and benefits,4029001000.0,842427.0,False
2,124_X_Uct1&2,Finance cost per income statement,39359000.0,81900.0,False
3,128_X_Uct1&2,GHG Scope 1 emissions,2340.0,1.2,False
4,129_X_Uct1&2,GHG Scope 2 emissions,58166.0,1.2,False
5,130_X_Uct1&2,GHG Scope 3 emissions,39696.0,,False
6,268_X_Uct1&2,Other Income as per income statement,1922763000.0,26200.0,False
7,385_X_Uct1&2,Revenue,7053925000.0,,False
8,448_X_Uct1&2,Total assets as per balance sheet,15634720000.0,10951.0,False
9,523_X_Uct1&2,Total number of employees,6265.0,842427.0,False


In [422]:
COMPANY = "Uct"
YEAR = 2021

docs = load_documents(COMPANY, YEAR)
docs = preprocess_documents(docs, window_size=1)

query_pipeline = QueryPipeline(
    docs=docs,
    company=COMPANY,
    amkey_to_metric_path=AMKEY_TO_METRIC_PATH,
    amkey_to_synonym_path=AMKEY_TO_SYNONYM_PATH,
    amkey_to_unit_path=AMKEY_TO_UNIT_PATH
)

2024-02-24 23:11:39.678 | INFO     | __main__:load_documents:50 - Loading documents from /home/tomw/unifi-pdf-llm/data/azureconverter_outputs/UCT_Carbon_Footprint_Report_2020-2021.json
2024-02-24 23:11:40.104 | INFO     | __main__:load_documents:50 - Loading documents from /home/tomw/unifi-pdf-llm/data/azureconverter_outputs/afs2021.json
Preprocessing:   0%|          | 0/1 [00:00<?, ?docs/s]We found one or more sentences whose split count is higher than the split length.
Preprocessing: 100%|██████████| 1/1 [00:00<00:00, 23.12docs/s]
Preprocessing: 100%|██████████| 1/1 [00:00<00:00,  5.16docs/s]
2024-02-24 23:11:43.895 | INFO     | __main__:initialise_document_store:55 - Initialising document store
2024-02-24 23:11:43.920 | INFO     | __main__:initialise_retriever:61 - Initialising retriever
Batches: 100%|██████████| 88/88 [00:02<00:00, 42.10it/s]docs/s]
Documents Processed: 10000 docs [00:02, 4640.89 docs/s]         
2024-02-24 23:11:47.334 | INFO     | __main__:initialise_generation_l

In [423]:
ans = query_pipeline.query(
    amkey=128,
    year=YEAR
)

print(f'Answer: {ans}')

2024-02-24 23:11:47.349 | DEBUG    | __main__:query:118 - Retrieving AMKEY: 128
2024-02-24 23:11:47.351 | DEBUG    | __main__:query:121 - Retrieving metric: GHG Scope 1 emissions
Batches: 100%|██████████| 1/1 [00:00<00:00, 215.02it/s]
2024-02-24 23:11:47.445 | DEBUG    | __main__:query:125 - Retrieved context documents:

 · Scope 2 emissions were calculated and reported using both the location-based and market-based methods.
This is in accordance with GHG Protocol Corporate Standard's "Scope 2 Guidance" (January 2015). 

Scope 1, 2 and Other Direct Emissions
0,12
2,40
0,11
2,20
0,10
tCO2e per m2
0,09
2,00
1,80
tCO2e per capita
1,60
0,08
1,40
0,07
2012
2013
2014
2015
2016
-Tonnes of CO2e per m2
2017
2018
2019
2020
-Tonne of CO2e per capita
Figure 6: Annual Fluctuation in GHG Emissions Intensity Metrics
1,20
202113
5 Scope 1
The figure below provides a breakdown of UCT's Scope 1 emissions by emissions source. 

Unlike Scope 1 and 2 sources, reporting Scope 3 emissions is optional. Howev

Answer: 1.2


TODO: Add different modes to the validation - want to test retrieving values that
exist in the documents, and test that None is returned for values that don't
exist in the documents. 

### Validating Absa 2021

In [69]:
results_df, accuracy = validate_retrieval("Absa", 2021, window_size=2)

2024-02-25 16:29:01.737 | INFO     | __main__:load_documents:50 - Loading documents from /home/tomw/unifi-pdf-llm/data/azureconverter_outputs/2022-Absa-Group-limited-Environmental-Social-and-Governance-Data-sheet.json
2024-02-25 16:29:02.973 | DEBUG    | __main__:validate_retrieval:50 - Number of documents: 536
2024-02-25 16:29:02.974 | INFO     | __main__:initialise_document_store:57 - Initialising document store
2024-02-25 16:29:02.978 | INFO     | __main__:initialise_retriever:63 - Initialising retriever
Batches: 100%|██████████| 17/17 [00:00<00:00, 24.61it/s]ocs/s]
Documents Processed: 10000 docs [00:00, 14070.70 docs/s]       
2024-02-25 16:29:04.955 | INFO     | __main__:initialise_generation_llm:72 - Initialising generation LLM
2024-02-25 16:29:04.955 | INFO     | __main__:initialise_unit_conversion_llm:80 - Initialising unit conversion LLM
2024-02-25 16:29:04.956 | INFO     | __main__:initialise_json_conversion_llm:88 - Initialising json conversion LLM
2024-02-25 16:29:04.957 |

In [72]:
display(results_df)

Unnamed: 0,ID,Metric,2021_Value,2021_Generated,Correct
0,46_X_Absa,Total procurement spend on qualifying small en...,4400000000.0,4400000000.0,True
1,49_X_Absa,B-BBEE level (South Africa),1.0,1.0,True
2,52_X_Absa,Board meeting attendance (%),98.0,98.0,True
3,53_X_Absa,Average age 40-49 years,3.0,3.0,True
4,54_X_Absa,Average age 50+,12.0,3.0,False
5,109_X_Absa,Staff costs and benefits (Rbn),26133000000.0,26133.0,False
6,122_X_Absa,Fatal-injury frequency rate (number of fatalit...,0.0,,False
7,128_X_Absa,Scope 1,12276.0,,False
8,129_X_Absa,Scope 2,158756.0,,False
9,130_X_Absa,Scope 3,16205.0,16205.0,True


In [40]:
COMPANY = "Absa"
YEAR = 2021

docs = load_documents(COMPANY, YEAR)
docs = preprocess_documents(docs, window_size=2)

2024-02-25 16:19:22.472 | INFO     | __main__:load_documents:50 - Loading documents from /home/tomw/unifi-pdf-llm/data/azureconverter_outputs/2022-Absa-Group-limited-Environmental-Social-and-Governance-Data-sheet.json
Preprocessing:   0%|          | 0/1 [00:00<?, ?docs/s]We found one or more sentences whose split count is higher than the split length.
Preprocessing: 100%|██████████| 1/1 [00:00<00:00, 33.84docs/s]


In [41]:
query_pipeline = QueryPipeline(
    docs=docs,
    company=COMPANY,
    top_k=3,
)

2024-02-25 16:19:23.852 | INFO     | __main__:initialise_document_store:57 - Initialising document store
2024-02-25 16:19:23.857 | INFO     | __main__:initialise_retriever:63 - Initialising retriever
Batches: 100%|██████████| 18/18 [00:00<00:00, 22.20it/s]ocs/s]
Documents Processed: 10000 docs [00:00, 12036.73 docs/s]       
2024-02-25 16:19:25.948 | INFO     | __main__:initialise_generation_llm:72 - Initialising generation LLM
2024-02-25 16:19:25.949 | INFO     | __main__:initialise_unit_conversion_llm:80 - Initialising unit conversion LLM
2024-02-25 16:19:25.949 | INFO     | __main__:initialise_json_conversion_llm:88 - Initialising json conversion LLM
2024-02-25 16:19:25.950 | INFO     | __main__:initialise_mappings:96 - Initialising mappings


In [45]:
query_pipeline.query(
    amkey=246,
    year=YEAR
)

2024-02-25 16:21:17.353 | DEBUG    | __main__:query:120 - Retrieving AMKEY: 246
2024-02-25 16:21:17.354 | DEBUG    | __main__:query:123 - Retrieving metric: External learning programmes - Leadership and management
Batches: 100%|██████████| 1/1 [00:00<00:00, 42.81it/s]
2024-02-25 16:21:17.398 | DEBUG    | __main__:query:127 - Retrieved context documents:

 Programme initiatives
14Environment
Social
Tax
Governance
Shareholders
Social data - Talent development continued
Training delivery type
hours
Number of
utilised
Training programmes
hours
Number of
utilised
Training
hours
Number of
programmes
utilised
Training
programmes
hours
Number of
utilised
Face-to-face (workshops)
–
–
–
–
102 445.1
326
703 803.3
1 178
Virtual (online)
834 208.2
5 131
961 864.8
3 961
765 348.7
2 908
793 220.4
3 094
Virtual (classroom)
816 658.7
973
856 976.5
903
407 319.2
631
–
–
Material (self-study)
21 540.4
361
63 479.8
1 987
13 919.7
708
5 693.7
270
Tests/assessments
7 388.7
70
26 267.3
59
2 472.0
88
5 746.8

145923.8

In [35]:
# An example where the retrieved context is not relevant.
query_pipeline.query(
    amkey=490,
    year=YEAR
)

2024-02-25 16:03:35.395 | DEBUG    | __main__:query:120 - Retrieving AMKEY: 490
2024-02-25 16:03:35.397 | DEBUG    | __main__:query:123 - Retrieving metric: Non-renewable – Diesel (kWh)
Batches: 100%|██████████| 1/1 [00:00<00:00, 86.01it/s]
2024-02-25 16:03:35.440 | DEBUG    | __main__:query:127 - Retrieved context documents:

 | Indicator              | Trend   | 2022      | 2021      | 2020      | 2019      | 2018      | Notes                                                 |
|------------------------|---------|-----------|-----------|-----------|-----------|-----------|-------------------------------------------------------|
| Energy consumption     |         |           |           |           |           |           |                                                       |
| Total energy use (kWh) | 1       | 215963015 | 225659620 | 224593325 | 279837708 | 323133101 | Targeting 30% reduction in energy consumption by 2030 |

| Indicator                  |   Trend |      2022 |     

In [98]:
query_pipeline.query(
    amkey=46,
    year=YEAR
)

2024-02-25 13:56:52.354 | DEBUG    | __main__:query:120 - Retrieving AMKEY: 46
2024-02-25 13:56:52.356 | DEBUG    | __main__:query:123 - Retrieving metric: Total procurement spend on qualifying small enterprises and exempt micro enterprises(Rbn)
Batches: 100%|██████████| 1/1 [00:00<00:00, 183.08it/s]
2024-02-25 13:56:52.385 | DEBUG    | __main__:query:127 - Retrieved context documents:

 | Indicator                                                   | Trend   | 2022   | 2021   | 2020   | 2019   | Targets/Comments                 |
|-------------------------------------------------------------|---------|--------|--------|--------|--------|----------------------------------|
| Total procurement spend on qualifying small enterprises and |         |        |        |        |        |                                  |
| exempt micro enterprises(Rbn)                               | 1       | 2.6    | 4.4    | 3.5    | 2.7    | Limited to B-BBEE (South Africa) |

| Indicator                 

'4400000000'

In [83]:
query_pipeline.retrieve_unit(122)

In [112]:
# Convert to scientific notation
val = 6210711000.

f"{val:.2e}"

'6.21e+09'

In [102]:
4.400000e+09 == 4400000000

True

In [75]:
unit = query_pipeline.amkey_to_unit[query_pipeline.amkey_to_unit["AMKEY"] == 122]["Unit"].item()

In [78]:
# Check if unit is nan
if pd.isna(unit):
    print("Unit is nan")

Unit is nan
