In [None]:
import functools
import os
import shutil
import textwrap

import numpy as np
import pandas as pd

import qut01

# Project-provided helper; presumably configures logging output for analysis scripts/notebooks such
# as this one (inferred from its name only -- see `qut01.utils.logging` to confirm what it sets up).
qut01.utils.logging.setup_logging_for_analysis_script()

In [None]:
# Open the annotated branch of the deeplake dataset and wrap it in the project's data parser.
dataset_path = qut01.data.dataset_parser.get_default_deeplake_dataset_path()
dataset = qut01.data.dataset_parser.get_deeplake_dataset(  # this will load the deeplake dataset itself
    dataset_path=dataset_path,
    checkout_branch=qut01.data.dataset_parser.dataset_annotated_branch_name,  # NOTE: not validated data!
)
data_parser = qut01.data.dataset_parser.DataParser(  # this will give us an easy-to-use parser for the dataset
    dataset_path_or_object=dataset,
    use_processed_data_cache=False,  # we will iterate over the entire dataset below, caching might go out of memory
)

# Keep only the statement IDs that are listed under *every* key returned by the parser, i.e. the
# intersection of all the per-key ID collections.
potentially_annotated_statement_ids = data_parser.get_potentially_annotated_statement_ids()
annotated_sid_sets = [set(sids) for sids in potentially_annotated_statement_ids.values()]
# `functools.reduce` without an initial value raises an opaque TypeError on an empty sequence, so
# fall back to an empty set when the parser returns no ID collections at all.
fully_annot_sids = functools.reduce(set.intersection, annotated_sid_sets) if annotated_sid_sets else set()

# Map the statement IDs back to their positions in the parser. Sets have no guaranteed iteration
# order, so the indices are sorted to give the seeded shuffle in the next cell a canonical starting
# order. NOTE: this may select different statements than runs that relied on raw set ordering.
target_sidxs = sorted(data_parser.statement_ids.index(sid) for sid in fully_annot_sids)

In [None]:
# Export a small random sample of fully annotated statements to `output_dir`:
#   - abbyy/<statement_id>.txt and fitz/<statement_id>.txt: the text stored under each extractor's key
#   - pdf/<statement_id>.pdf: the PDF data stored for the statement
#   - annotations.csv: three rows per statement (one for the "a-s-c1" columns, two for "c2-c3-c4-c5-c6")
#   - metadata.csv: one row per statement
num_samples = 5
output_dir = "/tmp/raw_data_sample/"
abbyy_data_subdir = os.path.join(output_dir, "abbyy")
fitz_data_subdir = os.path.join(output_dir, "fitz")
pdf_data_subdir = os.path.join(output_dir, "pdf")
annotation_csv_path = os.path.join(output_dir, "annotations.csv")
metadata_csv_path = os.path.join(output_dir, "metadata.csv")
readme_path = os.path.join(output_dir, "README.txt")

# Shuffle a *copy* of the index list. Shuffling `target_sidxs` in place would make this cell
# non-idempotent: re-running it would reshuffle the already-shuffled list and pick other statements.
# The global seed is still set so the selection (and the global RNG state) match a first run of the
# in-place version.
np.random.seed(1)
shuffled_sidxs = list(target_sidxs)
np.random.shuffle(shuffled_sidxs)
sampled_sidxs = shuffled_sidxs[:num_samples]

# Fail early, with a readable message, and *before* wiping any previously exported sample.
if not sampled_sidxs:
    raise RuntimeError("no fully annotated statements are available; nothing to export")

# Start from a clean output directory, and create all sub-directories once up front.
if os.path.isdir(output_dir):
    shutil.rmtree(output_dir)
for required_dir in (output_dir, abbyy_data_subdir, fitz_data_subdir, pdf_data_subdir):
    os.makedirs(required_dir, exist_ok=True)

# Per-statement results are collected in lists and concatenated once after the loop.
annots_frames, metadata_rows = [], []

for target_sidx in sampled_sidxs:
    tensor_data = data_parser.get_tensor_data(target_sidx)
    statement_id = int(tensor_data["statement_id"])

    # Split the tensor keys into annotation columns (excluding any key containing "annotated") and
    # metadata columns (excluding any key containing "LocalLink").
    annotation_cols = [k for k in tensor_data.keys() if k.startswith("annotations/") and "annotated" not in k]
    single_annot_cols = [k for k in annotation_cols if "/a-s-c1/" in k]
    double_annot_cols = [k for k in annotation_cols if "/c2-c3-c4-c5-c6/" in k]
    metadata_cols = [k for k in tensor_data.keys() if k.startswith("metadata/") and "LocalLink" not in k]

    # One row with the "a-s-c1" values...
    single_annots = pd.Series(
        {
            "statement_id": str(statement_id),
            **{k.split("/")[-1]: tensor_data[k].item() for k in single_annot_cols},
        }
    )
    # ...and two rows with the "c2-c3-c4-c5-c6" values, which are indexed by annotator (0 and 1).
    # NOTE(review): the fixed range(2) assumes exactly two annotators for these columns -- confirm.
    double_annots = [
        pd.Series(
            {
                "statement_id": str(statement_id),
                **{k.split("/")[-1]: tensor_data[k][annotator_idx].item() for k in double_annot_cols},
            }
        )
        for annotator_idx in range(2)
    ]

    # Columns that a given row does not provide end up as missing values after the concat.
    annots_df = pd.concat([single_annots, *double_annots], axis=1).transpose()
    # In-place on a frame that is local to this iteration; kept as-is so the CSV content is unchanged.
    annots_df.fillna(value=np.nan, inplace=True)
    annots_frames.append(annots_df)

    metadata_rows.append(
        pd.Series(
            {
                "statement_id": str(statement_id),
                **{k.split("/")[-1]: tensor_data[k].item() for k in metadata_cols},
            }
        )
    )

    # Write the text explicitly as UTF-8: without an explicit encoding `open` falls back to the
    # locale's default, which may fail on (or mangle) non-ASCII characters in the extracted text.
    abbyy_text_path = os.path.join(abbyy_data_subdir, f"{statement_id}.txt")
    with open(abbyy_text_path, "w", encoding="utf-8") as fd:
        fd.write(tensor_data["abbyy/text"].item())

    fitz_text_path = os.path.join(fitz_data_subdir, f"{statement_id}.txt")
    with open(fitz_text_path, "w", encoding="utf-8") as fd:
        fd.write(tensor_data["fitz/text"].item())

    # NOTE(review): assumes `pdf_data` is a bytes-like object that can be written as-is -- confirm.
    pdf_path = os.path.join(pdf_data_subdir, f"{statement_id}.pdf")
    with open(pdf_path, "wb") as fd:
        fd.write(tensor_data["pdf_data"])

# Single concat instead of growing the frame inside the loop; the per-statement row labels (0, 1, 2)
# are kept, as before.
output_annots_df = pd.concat(annots_frames, axis=0)
output_annots_df.to_csv(annotation_csv_path)

# Concatenating all metadata rows in one call always yields a DataFrame with one row per statement
# and a 0..n-1 index after the transpose. Accumulating pairwise left a bare Series when only one
# statement was exported (`Series.transpose()` is a no-op, so the CSV came out in column layout), and
# could repeat row labels once three or more unnamed Series had been appended.
output_metadata_df = pd.concat(metadata_rows, axis=1).transpose()
output_metadata_df.to_csv(metadata_csv_path)

# Write a short README next to the exported sample.
# Report the number of statements actually exported: the export takes at most `num_samples` entries
# from `target_sidxs`, so fewer are written when fewer are available.
num_exported = min(num_samples, len(target_sidxs))

readme_content = f"""\
This data sample contains the original PDF, extracted text (using ABBYY FineReader and fitz), and annotations for {num_exported} modern slavery statements published on the Australian Modern Slavery Register.

The data provided here is formatted in a way to simplify exploration and parsing directly from a file browser. The real dataset contains more metadata and files (e.g. the auxiliary outputs from ABBYY FineReader and fitz), and is packaged in deeplake/HDF5 format.

NeurIPS 2024 Datasets and Benchmarks Track, Submission #1041 --- do not redistribute.
"""
# Re-wrap each paragraph (separated by a blank line) to 100 columns, keeping the blank lines between them.
wrapped_readme_content = "\n\n".join(
    "\n".join(textwrap.wrap(paragraph, width=100)) for paragraph in readme_content.split("\n\n")
)

# Explicit encoding so the file content does not depend on the locale's default.
with open(readme_path, "w", encoding="utf-8") as fd:
    fd.write(wrapped_readme_content)

print("all done")