# AIMS.au Statement Text Extraction Quality Analysis

This notebook shows how to parse statement text that has already been extracted using [ABBYY
FineReader](https://pdf.abbyy.com/) and [PyMuPDF (fitz)](https://pymupdf.readthedocs.io/en/latest/).
We also examine the main differences between the text produced by these two sources.

This notebook was last updated on 2024-04-19 for framework v0.5.1.

In [None]:
import collections

import matplotlib.pyplot as plt
import numpy as np
import tqdm

import qut01

# Call the framework's logging setup helper for analysis scripts before any data is loaded
# (the exact handlers/levels it configures are defined inside qut01 and are not shown here).
qut01.utils.logging.setup_logging_for_analysis_script()

In [None]:
# Names of the two text extraction sources compared throughout this notebook.
parser_types = ["fitz", "abbyy"]

# Build one dataset parser per extraction source. Every parser is configured identically
# except for the tensor used as the sentence source text, which is "<parser_type>/text".
parsers = {}
for parser_type in parser_types:
    parsers[parser_type] = qut01.data.dataset_parser.DataParser(
        dataset_path_or_object=qut01.data.dataset_parser.get_default_deeplake_dataset_path(),
        dataset_branch=qut01.data.dataset_parser.dataset_annotated_branch_name,
        add_processed_data_to_batch=True,
        use_processed_data_cache=False,
        sentence_source_text_tensor=f"{parser_type}/text",
    )

# Both parsers must expose the same number of statements, otherwise the
# per-statement comparisons made later would not line up.
assert len(parsers["abbyy"]) == len(parsers["fitz"])
statement_count = len(parsers["abbyy"])
print(f"ready to analyze processed text for {statement_count} statements")

In [None]:
# Result tables, each indexed first by parser type and then by statement id:
#   raw_text_lens       -> length of the statement text
#   extracted_sentences -> the sentences of the processed statement
#   matched_sentences   -> for every extracted sentence, the list of annotation chunk
#                          sentences (with their match score) that were matched to it
raw_text_lens, extracted_sentences, matched_sentences = {}, {}, {}
MatchInfo = collections.namedtuple("MatchInfo", ["text", "score"])  # noqa
statement_idxs = list(range(statement_count))
for parser_type in parser_types:
    raw_text_lens[parser_type], extracted_sentences[parser_type], matched_sentences[parser_type] = {}, {}, {}
    for statement_idx in tqdm.tqdm(statement_idxs, desc=f"parsing {parser_type} data"):
        statement_id = parsers[parser_type].statement_ids[statement_idx]
        statement_data = parsers[parser_type].get_processed_data(statement_idx)
        raw_text_lens[parser_type][statement_id] = len(statement_data.text)
        extracted_sentences[parser_type][statement_id] = statement_data.sentences
        # start with one empty bucket per extracted sentence, then fill the buckets
        # using the original-sentence index recorded for each annotation chunk sentence
        curr_matched_sentences = [[] for _ in statement_data.sentences]
        for chunk in statement_data.annotation_chunks:
            for chunk_sentence_idx, chunk_sentence in enumerate(chunk.sentences):
                orig_sentence_idx = chunk.matched_sentences_orig_idxs[chunk_sentence_idx]
                match_info = MatchInfo(
                    text=chunk_sentence,
                    score=chunk.matched_sentences_scores[chunk_sentence_idx],
                )
                curr_matched_sentences[orig_sentence_idx].append(match_info)
        matched_sentences[parser_type][statement_id] = curr_matched_sentences

In [None]:
def plot_bars(
    ax,
    abbyy_val,
    fitz_val,
    val_type_str,
    float_val=False,
):
    """Draw a two-bar comparison (abbyy vs. fitz) of a single averaged metric.

    Args:
        ax: axes object to draw on; it must provide `bar`, `set_ylabel`, `set_title`,
            `set_xticks`, `set_xticklabels` and `annotate`.
        abbyy_val: height of the first (red) bar, labelled "abbyy".
        fitz_val: height of the second (blue) bar, labelled "fitz".
        val_type_str: metric name, used as the y-axis label and inside the title.
        float_val: if true, the value written above each bar keeps two decimals;
            otherwise it is rounded to an integer.
    """
    bar_positions = [0, 1]
    bars = ax.bar(
        bar_positions,
        [abbyy_val, fitz_val],
        0.8,
        color=["red", "blue"],
    )

    ax.set_ylabel(val_type_str)
    ax.set_title(f"Average {val_type_str} by Extraction Approach")
    ax.set_xticks(bar_positions)
    ax.set_xticklabels(["abbyy", "fitz"])

    # write each bar's value just above its top edge, centered horizontally
    for bar in bars:
        bar_height = bar.get_height()
        if float_val:
            bar_label = f"{bar_height:.2f}"
        else:
            bar_label = f"{int(round(bar_height))}"
        bar_center_x = bar.get_x() + bar.get_width() / 2
        ax.annotate(
            bar_label,
            xy=(bar_center_x, bar_height),
            xytext=(0, 3),  # 3 points vertical offset
            textcoords="offset points",
            ha="center",
            va="bottom",
        )


# Summary figure: a 2x2 grid with one panel per metric, each comparing the average value
# obtained from the abbyy-extracted text against the one obtained from the fitz-extracted text.
fig, ax = plt.subplots(nrows=2, ncols=2, figsize=(12, 12))  # noqa

# --- Panel (0, 0): length of the raw statement text, one value per statement ---
raw_text_lens_abbyy = list(raw_text_lens["abbyy"].values())
raw_text_lens_fitz = list(raw_text_lens["fitz"].values())
assert len(raw_text_lens_abbyy) == len(raw_text_lens_fitz)
avg_text_len_abbyy = np.mean(raw_text_lens_abbyy)
avg_text_len_fitz = np.mean(raw_text_lens_fitz)
std_text_len_abbyy = np.std(raw_text_lens_abbyy)
std_text_len_fitz = np.std(raw_text_lens_fitz)
print(f"abbyy text len mean={int(avg_text_len_abbyy)}, std={int(std_text_len_abbyy)}")
print(f"fitz text len mean={int(avg_text_len_fitz)}, std={int(std_text_len_fitz)}")
plot_bars(
    ax=ax[0, 0],
    abbyy_val=avg_text_len_abbyy,
    fitz_val=avg_text_len_fitz,
    val_type_str="PDF Text Length",
)

# --- Panel (0, 1): number of extracted sentences, one value per statement ---
raw_seq_count_abbyy = [len(sentences) for sentences in extracted_sentences["abbyy"].values()]
raw_seq_count_fitz = [len(sentences) for sentences in extracted_sentences["fitz"].values()]
avg_seq_count_abbyy = np.mean(raw_seq_count_abbyy)
avg_seq_count_fitz = np.mean(raw_seq_count_fitz)
std_seq_count_abbyy = np.std(raw_seq_count_abbyy)
std_seq_count_fitz = np.std(raw_seq_count_fitz)
print(f"abbyy seq count mean={int(avg_seq_count_abbyy)}, std={int(std_seq_count_abbyy)}")
print(f"fitz seq count mean={int(avg_seq_count_fitz)}, std={int(std_seq_count_fitz)}")
plot_bars(
    ax=ax[0, 1],
    abbyy_val=avg_seq_count_abbyy,
    fitz_val=avg_seq_count_fitz,
    val_type_str="Sentence Count",
)

# --- Panel (1, 0): length of each extracted sentence, pooled over all statements ---
raw_seq_lens_abbyy = [len(s) for sentences in extracted_sentences["abbyy"].values() for s in sentences]
raw_seq_lens_fitz = [len(s) for sentences in extracted_sentences["fitz"].values() for s in sentences]
avg_seq_lens_abbyy = np.mean(raw_seq_lens_abbyy)
avg_seq_lens_fitz = np.mean(raw_seq_lens_fitz)
std_seq_lens_abbyy = np.std(raw_seq_lens_abbyy)
std_seq_lens_fitz = np.std(raw_seq_lens_fitz)
print(f"abbyy seq len mean={int(avg_seq_lens_abbyy)}, std={int(std_seq_lens_abbyy)}")
print(f"fitz seq len mean={int(avg_seq_lens_fitz)}, std={int(std_seq_lens_fitz)}")
plot_bars(
    ax=ax[1, 0],
    abbyy_val=avg_seq_lens_abbyy,
    fitz_val=avg_seq_lens_fitz,
    val_type_str="Sentence Length",
)

# --- Panel (1, 1): match score of every matched sentence, pooled over all statements ---
# Sentences without any match contribute nothing (their match list is empty), so no
# explicit emptiness filter is needed while flattening.
raw_scores_abbyy = [m.score for s in matched_sentences["abbyy"].values() for matches in s for m in matches]
raw_scores_fitz = [m.score for s in matched_sentences["fitz"].values() for matches in s for m in matches]
avg_scores_abbyy = np.mean(raw_scores_abbyy)
avg_scores_fitz = np.mean(raw_scores_fitz)
std_scores_abbyy = np.std(raw_scores_abbyy)
std_scores_fitz = np.std(raw_scores_fitz)
# Scores are reported with two decimals (matching the float labels used on the plot below);
# truncating them with int() would throw away the fractional part of the statistics.
print(f"abbyy scores mean={avg_scores_abbyy:.2f}, std={std_scores_abbyy:.2f}")
print(f"fitz scores mean={avg_scores_fitz:.2f}, std={std_scores_fitz:.2f}")
plot_bars(
    ax=ax[1, 1],
    abbyy_val=avg_scores_abbyy,
    fitz_val=avg_scores_fitz,
    val_type_str="Match Score",
    float_val=True,
)

fig.tight_layout()
# Write the image to disk before displaying it, so the saved file does not depend on how
# the active backend treats the figure once it has been shown.
fig.savefig("analysis_results.png")
plt.show()