In [1]:
# TODO: Package
import sys
sys.path.append('/home/tomw/unifi-pdf-llm/')

import pandas as pd
from loguru import logger

from load import load_documents
from preprocess import preprocess_documents
from rag import ModularRAG


# Path to the Train.csv file.
# NOTE: this is an absolute path on the author's machine; adjust it when running elsewhere.
TRAIN_CSV_PATH = "/home/tomw/unifi-pdf-llm/data/Train.csv"

  from .autonotebook import tqdm as notebook_tqdm
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(
  _torch_pytree._register_pytree_node(


'Path to the Train.csv file.'

In [9]:
# Set log level: drop the existing handlers and log to stderr at DEBUG.
logger.remove()
# Bind the returned handler id so the cell does not echo it as its output.
_handler_id = logger.add(sys.stderr, level="DEBUG")

2

In [3]:
def validate_retrieval(
        company: str,
        year: int,
        type: str="retrieval",
        num: int=50,
        window_size: int=1,
        discard_text: bool=True
    ) -> tuple[pd.DataFrame, float, float]:
    """
    Run the RAG pipeline against rows of Train.csv and score the retrieved values.

    TODO: The accuracy with validation also includes unit conversion. Which means
    with type 'retrieval', the accuracy with validation can actually be higher than
    the accuracy without validation (shouldn't happen).

    Parameters
    ----------
    company : str
        The company to validate. Rows are selected when their "ID" contains the
        literal text ``X_<company>``.

    year : int
        The year to validate.

    type : str
        The type of validation test to run. Options are "retrieval" or "nan".
        The "retrieval" test checks the retrieval of values that are present in the
        documents. The "nan" test checks the retrieval of values that are not present
        in the documents (i.e. testing the ability to return 'None' when the value is
        not present).
        (The name shadows the builtin; it is kept for backward compatibility.)

    num : int
        The maximum number of rows to validate. The first ``num`` rows that remain
        after filtering are used.

    window_size : int
        The size of the sliding window to use when slicing tables.

    discard_text : bool
        If True, discard text passages when preprocessing the documents. Only tables
        are kept.

    Returns
    -------
    results_df : pd.DataFrame
        One row per validated metric with the columns "ID", "Metric",
        "<year>_Value", "<year>_Gen_Unvalidated", "<year>_Generated" and "Correct".

    accuracy_w_validation : float
        Fraction of rows whose validated value matches the expected value.

    accuracy_wo_validation : float
        Fraction of rows whose unvalidated value matches the expected value.

    Both accuracies are NaN (and ``results_df`` is empty) when no rows are selected.

    Raises
    ------
    ValueError
        If the year is not 2019, 2020, or 2021, or if ``type`` is not "retrieval"
        or "nan".
    """
    supported_years = (2019, 2020, 2021)
    if year not in supported_years:
        raise ValueError(f"Unable to validate year: {year}")
    # Fail fast on a bad test type, before any file is read.
    if type not in ("retrieval", "nan"):
        raise ValueError(f"Invalid validation type: {type}")

    value_col = f"{year}_Value"
    generated_col = f"{year}_Generated"
    unvalidated_col = f"{year}_Gen_Unvalidated"
    column_order = ["ID", "Metric", value_col, unvalidated_col, generated_col, "Correct"]

    train_df = pd.read_csv(TRAIN_CSV_PATH)

    # Restrict to the company. The name is matched literally (not as a regex) and
    # rows with a missing ID are treated as non-matching.
    company_mask = train_df["ID"].str.contains(f"X_{company}", regex=False, na=False)

    # Drop the value columns of the two years that we are not interested in.
    # Non-inplace operations avoid mutating a filtered view of the original frame.
    other_value_cols = [f"{_year}_Value" for _year in supported_years if _year != year]
    train_df = (
        train_df[company_mask]
        .reset_index(drop=True)
        .drop(columns=other_value_cols)
    )

    if type == "retrieval":
        # Keep only the rows that have an expected value.
        train_df = train_df.dropna(subset=[value_col], how="all")
    else:
        # "nan" test: keep only the rows without an expected value.
        train_df = train_df[train_df[value_col].isna()]

    train_df = train_df.head(n=num)

    if train_df.empty:
        # Nothing to score: skip the (expensive) pipeline setup and avoid both the
        # missing-column lookup and the division by zero further down.
        logger.warning(
            f"No rows selected for company={company!r}, year={year}, type={type!r}; "
            "nothing to validate."
        )
        return pd.DataFrame(columns=column_order), float("nan"), float("nan")

    # Load and preprocess the documents
    docs = load_documents(company, year)
    docs = preprocess_documents(
        docs, window_size=window_size, discard_text=discard_text
    )

    logger.debug(f"Number of documents: {len(docs)}")

    query_pipeline = ModularRAG(
        docs=docs,
        company=company,
    )

    # Query the pipeline once per row; the AMKEY is the integer prefix of the ID.
    metrics, generated, unvalidated = [], [], []
    for row_id in train_df["ID"]:
        amkey = int(row_id.split("_")[0])
        metrics.append(query_pipeline.retrieve_metric_description(amkey))

        value, unvalidated_value = query_pipeline.query(amkey, year)
        generated.append(value)
        unvalidated.append(unvalidated_value)

    # Assign whole columns at once instead of growing them cell by cell.
    results_df = train_df.copy(deep=True)
    results_df["Metric"] = metrics
    results_df[generated_col] = generated
    results_df[unvalidated_col] = unvalidated

    results_df[value_col] = results_df[value_col].astype(float)
    results_df[generated_col] = results_df[generated_col].astype(float)

    def _values_match(candidate, expected) -> bool:
        """Return True when both values are equal or both are missing."""
        return bool(candidate == expected or (pd.isna(candidate) and pd.isna(expected)))

    expected_values = list(results_df[value_col])

    # A validated value of -1 also counts as correct when no value is expected.
    results_df["Correct"] = [
        _values_match(candidate, expected) or bool(candidate == -1 and pd.isna(expected))
        for candidate, expected in zip(results_df[generated_col], expected_values)
    ]

    # Reordering the columns
    results_df = results_df[column_order]

    accuracy_w_validation = float(results_df["Correct"].mean())

    logger.info(f"Accuracy w/ validation: {accuracy_w_validation}")

    unvalidated_matches = [
        _values_match(candidate, expected)
        for candidate, expected in zip(results_df[unvalidated_col], expected_values)
    ]
    accuracy_wo_validation = sum(unvalidated_matches) / len(unvalidated_matches)

    logger.info(f"Accuracy w/o validation: {accuracy_wo_validation}")

    return results_df, accuracy_w_validation, accuracy_wo_validation


## Tongaat 2021

In [7]:
# "retrieval" test for Tongaat 2021: up to 10 metrics that have an expected value.
results_df, accuracy, unvalidated_accuracy = validate_retrieval(
    company="Tongaat",
    year=2021,
    type="retrieval",
    num=10,
    window_size=2,
)

2024-03-02 14:08:03.442 | INFO     | load:load_documents:62 - Loading documents from /home/tomw/unifi-pdf-llm/data/azureconverter_outputs/2021ESG_removed_sup_table.json
2024-03-02 14:08:05.360 | INFO     | rag:_initialise_document_store:131 - Initialising document store
2024-03-02 14:08:05.367 | INFO     | rag:_initialise_retriever:149 - Initialising retriever
Batches: 100%|██████████| 22/22 [00:00<00:00, 22.81it/s]ocs/s]
Documents Processed: 10000 docs [00:00, 10067.20 docs/s]       
2024-03-02 14:08:08.450 | INFO     | rag:_initialise_mappings:177 - Initialising mappings
Batches: 100%|██████████| 1/1 [00:00<00:00, 201.02it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 197.99it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 200.44it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 191.99it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 194.03it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 209.57it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 126.33it/s]
Batches: 100%|██████████| 1/

In [9]:
# Display whatever `results_df` currently holds (set by the last executed validate_retrieval call).
results_df

Unnamed: 0,ID,Metric,2021_Value,2021_Gen_Unvalidated,2021_Generated,Correct
7,12_X_Tongaat,Total injury frequency rate (TIFR) – employees...,1.331,1.331,1.331,True
18,28_X_Tongaat,Total – company managed/farmed land (owned and...,60204.0,52883.0,,False
30,49_X_Tongaat,B-BBEE Level,4.0,4.0,,False
33,52_X_Tongaat,Overall Board and Committee meeting attendance,99.0,99.0,99.0,True
64,114_X_Tongaat,Energy efficiency: total direct and indirect e...,16.63,16.63,16.63,True
71,122_X_Tongaat,"Fatal injury frequency rate (FIFR, i.e. number...",0.005,0.005,0.005,True
76,128_X_Tongaat,Carbon emissions – Scope 1,505575.0,505575.0,505575.0,True
77,129_X_Tongaat,Carbon emissions – Scope 2,51539.0,51539.0,51539.0,True
85,138_X_Tongaat,Hazardous waste disposed of at appropriate fac...,184.0,184.0,184.0,True
94,151_X_Tongaat,"Lost time injury frequency rate (LTIFR, i.e. n...",0.093,0.093,0.093,True


In [10]:
# "nan" test for Tongaat 2021: up to 20 metrics that have no expected value.
results_df, accuracy, unvalidated_accuracy = validate_retrieval(
    company="Tongaat",
    year=2021,
    type="nan",
    num=20,
    window_size=2,
)

2024-02-29 22:34:31.653 | INFO     | load:load_documents:62 - Loading documents from /home/tomw/unifi-pdf-llm/data/azureconverter_outputs/2021ESG_removed_sup_table.json


2024-02-29 22:34:33.588 | DEBUG    | __main__:validate_retrieval:79 - Number of documents: 678
2024-02-29 22:34:33.589 | INFO     | rag:_initialise_document_store:131 - Initialising document store
2024-02-29 22:34:33.594 | INFO     | rag:_initialise_retriever:149 - Initialising retriever
Batches: 100%|██████████| 22/22 [00:01<00:00, 21.57it/s]ocs/s]
Documents Processed: 10000 docs [00:01, 9555.98 docs/s]        
2024-02-29 22:34:36.513 | INFO     | rag:_initialise_mappings:177 - Initialising mappings
2024-02-29 22:34:36.518 | DEBUG    | rag:query:202 - Retrieving AMKEY: 3
2024-02-29 22:34:36.519 | DEBUG    | rag:query:204 - Retrieving metric: Advisory fees as per income statement
Batches: 100%|██████████| 1/1 [00:00<00:00, 216.72it/s]
2024-02-29 22:34:36.545 | DEBUG    | rag:retrieve_value:266 - Retrieval prompt:

Use the following markdown tables to as context to answer the question at the end.
The answer must be a value retrieved directly from the context. Please don't do any unit co

In [11]:
# Display whatever `results_df` currently holds (set by the last executed validate_retrieval call).
results_df

Unnamed: 0,ID,Metric,2021_Value,2021_Gen_Unvalidated,2021_Generated,Correct
0,3_X_Tongaat,Advisory fees as per income statement,,,,True
1,6_X_Tongaat,Air emissions of the following pollutants: (1) CO,,,,True
2,7_X_Tongaat,Air emissions of the following pollutants: (2)...,,,,True
3,8_X_Tongaat,Air emissions of the following pollutants: (3)...,,,,True
4,9_X_Tongaat,Air emissions of the following pollutants: (4)...,,,,True
5,10_X_Tongaat,Air emissions of the following pollutants: (5)...,,,,True
6,11_X_Tongaat,ALL Administration expenses per income statement,,,,True
8,13_X_Tongaat,"Amount of assets under management, by asset cl...",,,,True
9,14_X_Tongaat,"Amount of assets under management, by asset cl...",,,,True
10,15_X_Tongaat,"Amount of assets under management, by asset cl...",,,,True


In [4]:
# Single-metric debug run: query AMKEY 49 for Tongaat 2021 on table-only documents.
COMPANY, YEAR, AMKEY = "Tongaat", 2021, 49

docs = preprocess_documents(
    load_documents(COMPANY, YEAR),
    window_size=2,
    discard_text=True,
)

query_pipeline = ModularRAG(docs=docs, company=COMPANY)

value, unvalidated_value = query_pipeline.query(AMKEY, YEAR)

2024-03-02 15:39:55.318 | INFO     | load:load_documents:62 - Loading documents from /home/tomw/unifi-pdf-llm/data/azureconverter_outputs/2021ESG_removed_sup_table.json
2024-03-02 15:39:57.227 | INFO     | rag:_initialise_document_store:131 - Initialising document store
2024-03-02 15:39:57.232 | INFO     | rag:_initialise_retriever:149 - Initialising retriever
Batches: 100%|██████████| 22/22 [00:01<00:00, 14.86it/s]ocs/s]
Documents Processed: 10000 docs [00:01, 6612.31 docs/s]        
2024-03-02 15:40:01.993 | INFO     | rag:_initialise_mappings:177 - Initialising mappings
2024-03-02 15:40:01.997 | DEBUG    | rag:query:202 - Retrieving AMKEY: 49
2024-03-02 15:40:01.998 | DEBUG    | rag:query:204 - Retrieving metric: B-BBEE Level
Batches: 100%|██████████| 1/1 [00:00<00:00, 76.44it/s]
2024-03-02 15:40:02.155 | DEBUG    | rag:retrieve_value:263 - Retrieval prompt:

Use the following markdown tables to as context to answer the question at the end.
The answer must be a value retrieved direc

## ABSA 2021

In [8]:
# "retrieval" test for Absa 2021: up to 50 metrics that have an expected value.
results_df, accuracy, unvalidated_accuracy = validate_retrieval(
    company="Absa",
    year=2021,
    type="retrieval",
    num=50,
    window_size=2,
)

2024-03-02 15:42:18.117 | INFO     | load:load_documents:62 - Loading documents from /home/tomw/unifi-pdf-llm/data/azureconverter_outputs/2022-Absa-Group-limited-Environmental-Social-and-Governance-Data-sheet.json
2024-03-02 15:42:19.373 | DEBUG    | __main__:validate_retrieval:79 - Number of documents: 536
2024-03-02 15:42:19.374 | INFO     | rag:_initialise_document_store:131 - Initialising document store
2024-03-02 15:42:19.379 | INFO     | rag:_initialise_retriever:149 - Initialising retriever
Batches: 100%|██████████| 17/17 [00:00<00:00, 29.68it/s]ocs/s]
Documents Processed: 10000 docs [00:00, 16895.32 docs/s]       
2024-03-02 15:42:22.065 | INFO     | rag:_initialise_mappings:177 - Initialising mappings
2024-03-02 15:42:22.069 | DEBUG    | rag:query:202 - Retrieving AMKEY: 46
2024-03-02 15:42:22.070 | DEBUG    | rag:query:204 - Retrieving metric: Total procurement spend on qualifying small enterprises and exempt micro enterprises(Rbn)
Batches: 100%|██████████| 1/1 [00:00<00:00, 

In [13]:
# Display whatever `results_df` currently holds (set by the last executed validate_retrieval call).
results_df

Unnamed: 0,ID,Metric,2021_Value,2021_Gen_Unvalidated,2021_Generated,Correct
27,46_X_Absa,Total procurement spend on qualifying small en...,4400000000.0,4.4,4400000000.0,True
30,49_X_Absa,B-BBEE level (South Africa),1.0,1.0,1.0,True
33,52_X_Absa,Board meeting attendance (%),98.0,98.0,98.0,True
34,53_X_Absa,Average age 40-49 years,3.0,3.0,,False
35,54_X_Absa,Average age 50+,12.0,61.0,61.0,False
59,109_X_Absa,Staff costs and benefits (Rbn),26133000000.0,26133.0,26133.0,False
71,122_X_Absa,Fatal-injury frequency rate (number of fatalit...,0.0,0.0,,False
76,128_X_Absa,Scope 1,12276.0,,,False
77,129_X_Absa,Scope 2,158756.0,12.24,12.24,False
78,130_X_Absa,Scope 3,16205.0,16205.0,16205.0,True


In [14]:
# Load and preprocess the Absa 2021 documents (tables only) into `docs`.
COMPANY, YEAR = "Absa", 2021

docs = load_documents(COMPANY, YEAR)
docs = preprocess_documents(docs, window_size=2, discard_text=True)

2024-02-29 22:38:07.201 | INFO     | load:load_documents:62 - Loading documents from /home/tomw/unifi-pdf-llm/data/azureconverter_outputs/2022-Absa-Group-limited-Environmental-Social-and-Governance-Data-sheet.json


In [15]:
# Query a single metric (AMKEY 575); relies on `docs`, `COMPANY` and `YEAR`
# already being defined in the kernel.
AMKEY = 575

query_pipeline = ModularRAG(docs=docs, company=COMPANY)
validated_value, unvalidated_value = query_pipeline.query(AMKEY, YEAR)

print(f'Retrieved values: {validated_value}, {unvalidated_value}')

2024-02-29 22:38:08.489 | INFO     | rag:_initialise_document_store:131 - Initialising document store
2024-02-29 22:38:08.494 | INFO     | rag:_initialise_retriever:149 - Initialising retriever
Batches: 100%|██████████| 17/17 [00:00<00:00, 21.98it/s]ocs/s]
Documents Processed: 10000 docs [00:00, 12623.27 docs/s]       
2024-02-29 22:38:11.418 | INFO     | rag:_initialise_mappings:177 - Initialising mappings
2024-02-29 22:38:11.424 | DEBUG    | rag:query:202 - Retrieving AMKEY: 575
2024-02-29 22:38:11.425 | DEBUG    | rag:query:204 - Retrieving metric: Total procurement spend in South Africa (Rbn)
Batches: 100%|██████████| 1/1 [00:00<00:00, 213.83it/s]
2024-02-29 22:38:11.451 | DEBUG    | rag:retrieve_value:266 - Retrieval prompt:

Use the following markdown tables to as context to answer the question at the end.
The answer must be a value retrieved directly from the context. Please don't do any unit conversion.

It is possible that the answer is not explicitly stated in the context.
If

Retrieved values: 18900000000.0, 18.9


In [16]:
# "nan" test for Absa 2021: up to 50 metrics that have no expected value.
# The third return value is the accuracy *without* validation, so it is bound to
# `unvalidated_accuracy` (it was previously mislabelled `validated_accuracy`).
results_df, accuracy, unvalidated_accuracy = validate_retrieval(
    "Absa", 2021, type="nan", num=50, window_size=2
)

2024-02-29 22:38:17.447 | INFO     | load:load_documents:62 - Loading documents from /home/tomw/unifi-pdf-llm/data/azureconverter_outputs/2022-Absa-Group-limited-Environmental-Social-and-Governance-Data-sheet.json
2024-02-29 22:38:18.899 | DEBUG    | __main__:validate_retrieval:79 - Number of documents: 536
2024-02-29 22:38:18.899 | INFO     | rag:_initialise_document_store:131 - Initialising document store
2024-02-29 22:38:18.903 | INFO     | rag:_initialise_retriever:149 - Initialising retriever
Batches: 100%|██████████| 17/17 [00:00<00:00, 23.22it/s]ocs/s]
Documents Processed: 10000 docs [00:00, 13326.97 docs/s]       
2024-02-29 22:38:21.612 | INFO     | rag:_initialise_mappings:177 - Initialising mappings
2024-02-29 22:38:21.617 | DEBUG    | rag:query:202 - Retrieving AMKEY: 3
2024-02-29 22:38:21.618 | DEBUG    | rag:query:204 - Retrieving metric: Advisory fees as per income statement
Batches: 100%|██████████| 1/1 [00:00<00:00, 207.00it/s]
2024-02-29 22:38:21.647 | DEBUG    | rag:

In [17]:
# Display whatever `results_df` currently holds (set by the last executed validate_retrieval call).
results_df

Unnamed: 0,ID,Metric,2021_Value,2021_Gen_Unvalidated,2021_Generated,Correct
0,3_X_Absa,Advisory fees as per income statement,,,,True
1,6_X_Absa,Air emissions of the following pollutants: (1) CO,,187237.0,,True
2,7_X_Absa,Air emissions of the following pollutants: (2)...,,,,True
3,8_X_Absa,Air emissions of the following pollutants: (3)...,,,,True
4,9_X_Absa,Air emissions of the following pollutants: (4)...,,,,True
5,10_X_Absa,Air emissions of the following pollutants: (5)...,,,,True
6,11_X_Absa,ALL Administration expenses per income statement,,7407.0,,True
7,12_X_Absa,All Inury Frequency Rate (Injuries/1m hrs worked),,0.0,,True
8,13_X_Absa,"Amount of assets under management, by asset cl...",,3.35,,True
9,14_X_Absa,"Amount of assets under management, by asset cl...",,3.75,,True


In [7]:
# Single-metric debug run: query AMKEY 49 for Absa 2021 on table-only documents.
COMPANY, YEAR, AMKEY = "Absa", 2021, 49

docs = preprocess_documents(
    load_documents(COMPANY, YEAR),
    window_size=2,
    discard_text=True,
)

query_pipeline = ModularRAG(docs=docs, company=COMPANY)

value, unvalidated_value = query_pipeline.query(AMKEY, YEAR)

2024-03-02 15:41:46.288 | INFO     | load:load_documents:62 - Loading documents from /home/tomw/unifi-pdf-llm/data/azureconverter_outputs/2022-Absa-Group-limited-Environmental-Social-and-Governance-Data-sheet.json
2024-03-02 15:41:47.743 | INFO     | rag:_initialise_document_store:131 - Initialising document store
2024-03-02 15:41:47.747 | INFO     | rag:_initialise_retriever:149 - Initialising retriever
Batches: 100%|██████████| 17/17 [00:00<00:00, 21.87it/s]ocs/s]
Documents Processed: 10000 docs [00:00, 12555.75 docs/s]       
2024-03-02 15:41:50.680 | INFO     | rag:_initialise_mappings:177 - Initialising mappings
2024-03-02 15:41:50.687 | DEBUG    | rag:query:202 - Retrieving AMKEY: 49
2024-03-02 15:41:50.688 | DEBUG    | rag:query:204 - Retrieving metric: B-BBEE level (South Africa)
Batches: 100%|██████████| 1/1 [00:00<00:00, 205.76it/s]
2024-03-02 15:41:50.711 | DEBUG    | rag:retrieve_value:263 - Retrieval prompt:

Use the following markdown tables to as context to answer the qu

## Distell 2021

In [12]:
# "retrieval" test for Distell 2021: up to 50 metrics that have an expected value.
results_df, accuracy, unvalidated_accuracy = validate_retrieval(
    company="Distell",
    year=2021,
    type="retrieval",
    num=50,
    window_size=2,
)

2024-03-02 14:31:02.870 | INFO     | load:load_documents:62 - Loading documents from /home/tomw/unifi-pdf-llm/data/azureconverter_outputs/DISTELL ESG Appendix 2022.json
2024-03-02 14:31:03.361 | INFO     | rag:_initialise_document_store:131 - Initialising document store
2024-03-02 14:31:03.363 | INFO     | rag:_initialise_retriever:149 - Initialising retriever
Batches: 100%|██████████| 6/6 [00:00<00:00, 17.50it/s] docs/s]
Documents Processed: 10000 docs [00:00, 28455.10 docs/s]       
2024-03-02 14:31:05.806 | INFO     | rag:_initialise_mappings:177 - Initialising mappings
Batches: 100%|██████████| 1/1 [00:00<00:00, 223.86it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 160.38it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 40.45it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 57.66it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 126.35it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 79.70it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 43.63it/s]
Batches: 100%|██████████| 1/1 [0

In [13]:
# "nan" test for Distell 2021: up to 50 metrics that have no expected value.
results_df, accuracy, unvalidated_accuracy = validate_retrieval(
    company="Distell",
    year=2021,
    type="nan",
    num=50,
    window_size=2,
)

2024-03-02 14:33:36.683 | INFO     | load:load_documents:62 - Loading documents from /home/tomw/unifi-pdf-llm/data/azureconverter_outputs/DISTELL ESG Appendix 2022.json
2024-03-02 14:33:36.972 | INFO     | rag:_initialise_document_store:131 - Initialising document store
2024-03-02 14:33:36.974 | INFO     | rag:_initialise_retriever:149 - Initialising retriever
Batches: 100%|██████████| 6/6 [00:00<00:00, 21.74it/s] docs/s]
Documents Processed: 10000 docs [00:00, 35168.97 docs/s]       
2024-03-02 14:33:39.561 | INFO     | rag:_initialise_mappings:177 - Initialising mappings
Batches: 100%|██████████| 1/1 [00:00<00:00, 120.40it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 154.67it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 209.22it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 160.99it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 104.18it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 58.75it/s]
Batches: 100%|██████████| 1/1 [00:00<00:00, 99.53it/s]
Batches: 100%|██████████| 1/1 

## Pick n Pay 2021

In [10]:
# "retrieval" test for Picknpay 2021: up to 50 metrics that have an expected value.
results_df, accuracy, unvalidated_accuracy = validate_retrieval(
    company="Picknpay",
    year=2021,
    type="retrieval",
    num=50,
    window_size=2,
)

2024-03-02 18:46:55.552 | INFO     | load:load_documents:62 - Loading documents from /home/tomw/unifi-pdf-llm/data/azureconverter_outputs/picknpay-esg-report-spreads-2023.json
2024-03-02 18:46:56.696 | DEBUG    | __main__:validate_retrieval:79 - Number of documents: 275
2024-03-02 18:46:56.697 | INFO     | rag:_initialise_document_store:131 - Initialising document store
2024-03-02 18:46:56.699 | INFO     | rag:_initialise_retriever:149 - Initialising retriever
Batches: 100%|██████████| 9/9 [00:00<00:00, 23.57it/s] docs/s]
Documents Processed: 10000 docs [00:00, 25216.81 docs/s]       
2024-03-02 18:46:59.486 | INFO     | rag:_initialise_mappings:177 - Initialising mappings
2024-03-02 18:46:59.491 | DEBUG    | rag:query:202 - Retrieving AMKEY: 46
2024-03-02 18:46:59.492 | DEBUG    | rag:query:204 - Retrieving metric: BBBEE procurement spend from Exempt Micro Enterprises (EMEs), Qualifyimg Small Enterprises (QSEs) & Empowering Suppliers that are >51 black owned
Batches: 100%|██████████| 

In [11]:
# Display whatever `results_df` currently holds (set by the last executed validate_retrieval call).
results_df

Unnamed: 0,ID,Metric,2021_Value,2021_Gen_Unvalidated,2021_Generated,Correct
27,46_X_Picknpay,BBBEE procurement spend from Exempt Micro Ente...,4500000000.0,,,False
33,52_X_Picknpay,Board meeting attendance rate,100.0,100.0,100.0,True
71,122_X_Picknpay,Fatal injury frequency rate (FIFR),0.0,0.0,0.0,True
76,128_X_Picknpay,GHG Scope 1 emissions,123421.0,3.6,,False
77,129_X_Picknpay,GHG Scope 2 emissions,766174.0,7.1,,False
78,130_X_Picknpay,GHG Scope 3 emissions,482615.0,4.2,,False
94,151_X_Picknpay,Lost-time injury frequency rate (LTIFR),2.3,0.75,,False
97,156_X_Picknpay,"Number of Medical Treatment Cases (MTCs, i.e. ...",102.0,102.0,102.0,True
124,216_X_Picknpay,Number of environmental incidents with a negat...,0.0,0.0,0.0,True
125,219_X_Picknpay,Number of work-related fatalities,0.0,0.0,,False


In [12]:
# "nan" test for Picknpay 2021: up to 50 metrics that have no expected value.
results_df, accuracy, unvalidated_accuracy = validate_retrieval(
    company="Picknpay",
    year=2021,
    type="nan",
    num=50,
    window_size=2,
)

2024-03-02 18:49:08.909 | INFO     | load:load_documents:62 - Loading documents from /home/tomw/unifi-pdf-llm/data/azureconverter_outputs/picknpay-esg-report-spreads-2023.json
2024-03-02 18:49:10.154 | DEBUG    | __main__:validate_retrieval:79 - Number of documents: 275
2024-03-02 18:49:10.155 | INFO     | rag:_initialise_document_store:131 - Initialising document store
2024-03-02 18:49:10.157 | INFO     | rag:_initialise_retriever:149 - Initialising retriever
Batches: 100%|██████████| 9/9 [00:00<00:00, 13.90it/s] docs/s]
Documents Processed: 10000 docs [00:00, 15115.88 docs/s]       
2024-03-02 18:49:12.900 | INFO     | rag:_initialise_mappings:177 - Initialising mappings
2024-03-02 18:49:12.904 | DEBUG    | rag:query:202 - Retrieving AMKEY: 3
2024-03-02 18:49:12.905 | DEBUG    | rag:query:204 - Retrieving metric: Advisory fees as per income statement
Batches: 100%|██████████| 1/1 [00:00<00:00, 218.40it/s]
2024-03-02 18:49:12.922 | DEBUG    | rag:retrieve_value:263 - Retrieval prompt:

In [13]:
# Display whatever `results_df` currently holds (set by the last executed validate_retrieval call).
results_df

Unnamed: 0,ID,Metric,2021_Value,2021_Gen_Unvalidated,2021_Generated,Correct
0,3_X_Picknpay,Advisory fees as per income statement,,,,True
1,6_X_Picknpay,Air emissions of the following pollutants: (1) CO,,,,True
2,7_X_Picknpay,Air emissions of the following pollutants: (2)...,,,,True
3,8_X_Picknpay,Air emissions of the following pollutants: (3)...,,,,True
4,9_X_Picknpay,Air emissions of the following pollutants: (4)...,,,,True
5,10_X_Picknpay,Air emissions of the following pollutants: (5)...,,1372210.0,,True
6,11_X_Picknpay,ALL Administration expenses per income statement,,,,True
7,12_X_Picknpay,All Inury Frequency Rate (Injuries/1m hrs worked),,2.5,,True
8,13_X_Picknpay,"Amount of assets under management, by asset cl...",,15.0,,True
9,14_X_Picknpay,"Amount of assets under management, by asset cl...",,1.5,,True


## Sasol 2021

In [14]:
# "retrieval" test for Sasol 2021: up to 50 metrics that have an expected value.
results_df, accuracy, unvalidated_accuracy = validate_retrieval(
    company="Sasol",
    year=2021,
    type="retrieval",
    num=50,
    window_size=2,
)

2024-03-02 18:51:11.752 | INFO     | load:load_documents:62 - Loading documents from /home/tomw/unifi-pdf-llm/data/azureconverter_outputs/SASOL Sustainability Report 2023 20-09_0.json
2024-03-02 18:51:14.323 | DEBUG    | __main__:validate_retrieval:79 - Number of documents: 1017
2024-03-02 18:51:14.324 | INFO     | rag:_initialise_document_store:131 - Initialising document store
2024-03-02 18:51:14.332 | INFO     | rag:_initialise_retriever:149 - Initialising retriever
Batches: 100%|██████████| 32/32 [00:01<00:00, 31.74it/s]docs/s]
Documents Processed: 10000 docs [00:01, 9601.22 docs/s]         
2024-03-02 18:51:17.642 | INFO     | rag:_initialise_mappings:177 - Initialising mappings
2024-03-02 18:51:17.647 | DEBUG    | rag:query:202 - Retrieving AMKEY: 7
2024-03-02 18:51:17.648 | DEBUG    | rag:query:204 - Retrieving metric: Nitrogen oxides (NOx) (kilotons)
Batches: 100%|██████████| 1/1 [00:00<00:00, 215.11it/s]
2024-03-02 18:51:17.686 | DEBUG    | rag:retrieve_value:263 - Retrieval p

In [15]:
# Display whatever `results_df` currently holds (set by the last executed validate_retrieval call).
results_df

Unnamed: 0,ID,Metric,2021_Value,2021_Gen_Unvalidated,2021_Generated,Correct
2,7_X_Sasol,Nitrogen oxides (NOx) (kilotons),122300.0,124.0,124.0,False
3,8_X_Sasol,Sulphur oxides (SOx ) (kilotons),181100.0,181.1,181.1,False
4,9_X_Sasol,Particulates (fly ash) (kilotons),8100.0,8.2,8.2,False
5,10_X_Sasol,Volatile Organic Compounds (VOC) Indicator of ...,27800.0,27.8,,False
20,31_X_Sasol,Area dedicated to biodiversity and conservatio...,5525.0,6147.0,6147.0,False
30,49_X_Sasol,B-BBEE verification certificate,4.0,4.0,4.0,True
59,109_X_Sasol,Value added statement (unaudited) - Employees,33000000.0,0.31,,False
76,128_X_Sasol,Direct carbon dioxide (CO2 ) Scope 1 (kilotons),57903.0,,,False
77,129_X_Sasol,Indirect carbon dioxide (CO2 ) Scope 2 (kilotons),5099000.0,63.0,,False
82,135_X_Sasol,Hazardous waste (kilotons),302000.0,320.0,320.0,False


In [16]:
# "nan" test for Sasol 2021: up to 50 metrics that have no expected value.
results_df, accuracy, unvalidated_accuracy = validate_retrieval(
    company="Sasol",
    year=2021,
    type="nan",
    num=50,
    window_size=2,
)

2024-03-02 18:53:33.316 | INFO     | load:load_documents:62 - Loading documents from /home/tomw/unifi-pdf-llm/data/azureconverter_outputs/SASOL Sustainability Report 2023 20-09_0.json
2024-03-02 18:53:35.962 | DEBUG    | __main__:validate_retrieval:79 - Number of documents: 1017
2024-03-02 18:53:35.963 | INFO     | rag:_initialise_document_store:131 - Initialising document store
2024-03-02 18:53:35.971 | INFO     | rag:_initialise_retriever:149 - Initialising retriever
Batches: 100%|██████████| 32/32 [00:01<00:00, 31.92it/s]docs/s]
Documents Processed: 10000 docs [00:01, 9649.71 docs/s]         
2024-03-02 18:53:39.321 | INFO     | rag:_initialise_mappings:177 - Initialising mappings
2024-03-02 18:53:39.326 | DEBUG    | rag:query:202 - Retrieving AMKEY: 3
2024-03-02 18:53:39.328 | DEBUG    | rag:query:204 - Retrieving metric: Advisory fees as per income statement
Batches: 100%|██████████| 1/1 [00:00<00:00, 177.69it/s]
2024-03-02 18:53:39.369 | DEBUG    | rag:retrieve_value:263 - Retrie

In [17]:
# Display whatever `results_df` currently holds (set by the last executed validate_retrieval call).
results_df

Unnamed: 0,ID,Metric,2021_Value,2021_Gen_Unvalidated,2021_Generated,Correct
0,3_X_Sasol,Advisory fees as per income statement,,,,True
1,6_X_Sasol,Air emissions of the following pollutants: (1) CO,,,,True
6,11_X_Sasol,ALL Administration expenses per income statement,,,,True
7,12_X_Sasol,All Inury Frequency Rate (Injuries/1m hrs worked),,0.59,,True
8,13_X_Sasol,"Amount of assets under management, by asset cl...",,,,True
9,14_X_Sasol,"Amount of assets under management, by asset cl...",,1.7,1.7,False
10,15_X_Sasol,"Amount of assets under management, by asset cl...",,147.09,147.09,False
11,16_X_Sasol,"Amount of assets under management, by asset cl...",,526.2,,True
12,17_X_Sasol,"Amount of assets under management, by asset cl...",,,,True
13,18_X_Sasol,"Amount of assets under management, by asset cl...",,,,True


## SSW 2021

In [18]:
# "retrieval" test for Ssw 2021: up to 50 metrics that have an expected value.
results_df, accuracy, unvalidated_accuracy = validate_retrieval(
    company="Ssw",
    year=2021,
    type="retrieval",
    num=50,
    window_size=2,
)

2024-03-02 18:55:26.189 | INFO     | load:load_documents:62 - Loading documents from /home/tomw/unifi-pdf-llm/data/azureconverter_outputs/ssw-IR22.json
2024-03-02 18:55:35.259 | DEBUG    | __main__:validate_retrieval:79 - Number of documents: 2751
2024-03-02 18:55:35.260 | INFO     | rag:_initialise_document_store:131 - Initialising document store
2024-03-02 18:55:35.281 | INFO     | rag:_initialise_retriever:149 - Initialising retriever
Batches: 100%|██████████| 86/86 [00:03<00:00, 23.24it/s]docs/s]
Documents Processed: 10000 docs [00:03, 2627.44 docs/s]         
2024-03-02 18:55:41.309 | INFO     | rag:_initialise_mappings:177 - Initialising mappings
2024-03-02 18:55:41.314 | DEBUG    | rag:query:202 - Retrieving AMKEY: 7
2024-03-02 18:55:41.316 | DEBUG    | rag:query:204 - Retrieving metric: Air emissions of the following pollutants: (2) NOx (excluding N2O)
Batches: 100%|██████████| 1/1 [00:00<00:00, 132.76it/s]
2024-03-02 18:55:41.410 | DEBUG    | rag:retrieve_value:263 - Retrieval

In [None]:
# Display whatever `results_df` currently holds (set by the last executed validate_retrieval call).
results_df

In [None]:
# "nan" test for Ssw 2021: up to 50 metrics that have no expected value.
results_df, accuracy, unvalidated_accuracy = validate_retrieval(
    company="Ssw",
    year=2021,
    type="nan",
    num=50,
    window_size=2,
)

In [None]:
# Display whatever `results_df` currently holds (set by the last executed validate_retrieval call).
results_df

## UCT 2021

In [None]:
# "retrieval" test for Uct 2021: up to 50 metrics that have an expected value.
results_df, accuracy, unvalidated_accuracy = validate_retrieval(
    company="Uct",
    year=2021,
    type="retrieval",
    num=50,
    window_size=2,
)

In [None]:
# Display whatever `results_df` currently holds (set by the last executed validate_retrieval call).
results_df

In [None]:
# "nan" test for Uct 2021: up to 50 metrics that have no expected value.
results_df, accuracy, unvalidated_accuracy = validate_retrieval(
    company="Uct",
    year=2021,
    type="nan",
    num=50,
    window_size=2,
)

In [None]:
# Display whatever `results_df` currently holds (set by the last executed validate_retrieval call).
results_df