In [None]:
import copy

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import tqdm

import qut01

In [None]:
# Cell purpose: set up logging through the project helper, open the deeplake dataset on the
# validated branch, and wrap it in a `DataParser` that all following cells index into.
qut01.utils.logging.setup_logging_for_analysis_script()
dataset_path = qut01.data.dataset_parser.get_default_deeplake_dataset_path()

dataset = qut01.data.dataset_parser.get_deeplake_dataset(  # this will load the deeplake dataset itself
    dataset_path=dataset_path,
    checkout_branch=qut01.data.dataset_parser.dataset_validated_branch_name,  # to load all annotations
    # in order to load only the annotations provided by hired workers, use the `dataset_annotated_branch_name` branch
)
data_parser = qut01.data.dataset_parser.DataParser(  # this will give us an easy-to-use parser for the dataset
    dataset_path_or_object=dataset,
    # later cells read the `processed_data` entry of each loaded statement (sentences, annotation counts)
    add_processed_data_to_batch=True,
    use_processed_data_cache=False,  # we will iterate over the entire dataset below, caching might go out of memory
)

In [None]:
# Cell purpose: find every statement that may carry annotations and translate its id into
# the positional index used to fetch it from `data_parser`.
# The parser returns a mapping; its keys are the "annot groups" named in the printout below,
# and each value is a collection of statement ids (possibly with repeats, hence `np.unique`).
potentially_annotated_statement_ids = data_parser.get_potentially_annotated_statement_ids()
potentially_annotated_statement_counts = {k: len(np.unique(v)) for k, v in potentially_annotated_statement_ids.items()}
print(f"\npotentially annotated statement counts across annot groups:\n\t{potentially_annotated_statement_counts}")
# union of the ids across all groups (a statement listed in several groups is kept once)
all_annot_sids = {sid for annot_sids in potentially_annotated_statement_ids.values() for sid in annot_sids}

print(f"\ndataset contains {len(data_parser)} statements (and {len(all_annot_sids)} with potential annotations)")

# Build the id -> index table once instead of calling `list.index` (a linear scan) for every
# annotated id, which made this step quadratic in the dataset size. `setdefault` keeps the
# FIRST position of an id, matching what `list.index` returned.
# NOTE: an id missing from `data_parser.statement_ids` now raises KeyError (was ValueError).
statement_idx_by_sid = {}
for idx, sid in enumerate(data_parser.statement_ids):
    statement_idx_by_sid.setdefault(sid, idx)
statement_idxs = sorted(statement_idx_by_sid[sid] for sid in all_annot_sids)
print(f"will extract metadata from {len(statement_idxs)} statements")

In [None]:
def relevant_sentence_counter(statement_data):
    """Counts, per annotation class, how many sentences of a statement carry that annotation.

    Args:
        statement_data: mapping with a "processed_data" entry exposing `annotation_counts`
            (class name -> count for the whole statement) and `sentence_annotation_counts`
            (one class name -> count mapping per sentence).

    Returns:
        A dict mapping class name -> number of sentences with a non-zero count for that class.
        Classes whose statement-level count is missing or zero are left out entirely, so that
        unannotated statements do not get aggregated in the downstream statistics.
    """
    processed = statement_data["processed_data"]
    sentence_totals = {}
    for cname in qut01.data.classif_utils.ANNOT_CLASS_NAMES:
        if processed.annotation_counts.get(cname, 0):
            # the statement is annotated for this class: tally the sentences that were flagged
            sentence_totals[cname] = sum(
                1 for per_sentence in processed.sentence_annotation_counts if per_sentence.get(cname, 0)
            )
    return sentence_totals


def _scalar_field(key):
    """Builds a getter that wraps the `.item()` value stored under `key` in a one-element list."""
    return lambda x: [x[key].item()]


def _list_field(key):
    """Builds a getter that returns the `.tolist()` expansion of the value stored under `key`."""
    return lambda x: x[key].tolist()


# Each getter receives one loaded statement and returns a LIST of values, which is appended to
# the matching entry of `extracted_data`. Single-valued fields therefore contribute one item
# per statement, while the `.tolist()` fields contribute as many items as the field holds.
# (presumably the raw values are numpy arrays, given the `.item()`/`.tolist()` calls -- TODO confirm)
fields_to_extract = {
    "text_lengths": lambda x: [len(x["fitz/text"].item())],
    "word_counts": _scalar_field("metadata/WordCount"),
    "sentence_counts": lambda x: [len(x["processed_data"].sentences)],
    "annual_revenues": _scalar_field("metadata/AnnualRevenue"),
    "countries": _list_field("metadata/Countries"),
    "entities": _list_field("metadata/Entities"),
    "industry_sectors": _list_field("metadata/IndustrySectors"),
    "page_counts": _scalar_field("metadata/PageCount"),
    "timestamps": _scalar_field("metadata/PeriodEnd"),
    "trademarks": _list_field("metadata/Trademarks"),
    "relevant_sentence_counts": lambda x: [relevant_sentence_counter(x)],
}

extracted_data = {key: list() for key in fields_to_extract}

# single pass over the statements selected above, feeding every getter from the same loaded item
for sidx in tqdm.tqdm(statement_idxs, desc="extracting metadata"):
    statement_data = data_parser[sidx]
    for field, getter in fields_to_extract.items():
        extracted_data[field] += getter(statement_data)

In [None]:
def _count_unique_tokens(joined_values, separator):
    """Counts the distinct tokens found across a list of separator-joined strings.

    Args:
        joined_values: iterable of strings, each holding zero or more tokens joined by `separator`.
        separator: the delimiter to split each string on.

    Returns:
        The number of distinct tokens after stripping surrounding whitespace. Blank tokens are
        ignored: `"".split(sep)` yields `[""]`, so an empty field (or a trailing separator) would
        otherwise be counted as one extra "unique" value.
    """
    tokens = {token.strip() for joined in joined_values for token in joined.split(separator)}
    tokens.discard("")
    return len(tokens)


# unique-value counts; sectors are newline-separated while trademarks/entities are comma-separated
sector_count = _count_unique_tokens(extracted_data["industry_sectors"], "\n")
print(f"industrial sectors: {sector_count}")
trademarks = _count_unique_tokens(extracted_data["trademarks"], ",")
print(f"trademarks: {trademarks}")
entities = _count_unique_tokens(extracted_data["entities"], ",")
print(f"entities: {entities}")

# per-statement averages
avg_word_count = np.mean(extracted_data["word_counts"])
avg_sentence_count = np.mean(extracted_data["sentence_counts"])
avg_page_count = np.mean(extracted_data["page_counts"])
print(f"avg word count: {avg_word_count:.2f}")
print(f"avg sentence count: {avg_sentence_count:.2f}")
print(f"avg page count: {avg_page_count:.2f}")
# totals over all extracted statements
tot_word_count = sum(extracted_data["word_counts"])
tot_sentence_count = sum(extracted_data["sentence_counts"])
tot_page_count = sum(extracted_data["page_counts"])
print(f"total word count: {tot_word_count}")
print(f"total sentence count: {tot_sentence_count}")
print(f"total page count: {tot_page_count}")

In [None]:
# draw the pie chart for page counts
# A page count `n` lands in slice `i` when `bins[i] < n <= bins[i + 1]`; a value matching no
# slice (e.g. 0) is left out of the chart.
# NOTE(review): the labels share their boundaries ("1-5", "5-10", ...) while the bins are
# right-inclusive, so "5-10" actually holds 6..10 pages and "30+" holds 31 and up.
bins = [0, 5, 10, 15, 30, float("inf")]
bin_labels = ["1-5", "5-10", "10-15", "15-30", "30+"]
bin_edges = list(zip(bins[:-1], bins[1:]))
bin_counts = [0 for _ in bin_edges]
# optional fixed palette, kept disabled (matplotlib's default color cycle is used instead):
# bin_colors = ["#1f77b4", "#9467bd", "#ff7f0e", "#d62728", "#ff0000"]
page_counts = copy.deepcopy(extracted_data["page_counts"])
for count in page_counts:
    slot = next((i for i, (low, high) in enumerate(bin_edges) if low < count <= high), None)
    if slot is not None:
        bin_counts[slot] += 1

plt.figure(figsize=(4.5, 4.5))
pie_style = dict(
    autopct="%1.1f%%",
    startangle=140,
    # colors=bin_colors,
    wedgeprops={"edgecolor": "black", "linewidth": 1.25, "antialiased": True},
    textprops={"fontsize": 14, "weight": "bold"},
)
wedges, texts, autotexts = plt.pie(bin_counts, labels=bin_labels, **pie_style)
# slice labels (outside the wedges) are drawn in black, percentages (inside the wedges) in white
for text_group, text_color in ((texts, "black"), (autotexts, "white")):
    for text in text_group:
        text.set_color(text_color)
        text.set_fontsize(12)
        text.set_fontweight("bold")
plt.savefig("page_count.pdf", format="pdf", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Per annotation class, collect one entry per annotated statement: the number of relevant
# sentences, the number of remaining (irrelevant) sentences, and the relevant/total ratio.
relevant_sentence_counts, irrelevant_sentence_counts, relevant_sentence_ratios = (
    {cname: [] for cname in qut01.data.classif_utils.ANNOT_CLASS_NAMES} for _ in range(3)
)

per_statement_counts = zip(extracted_data["sentence_counts"], extracted_data["relevant_sentence_counts"])
for sen_count, annot_counts in per_statement_counts:
    if sen_count <= 0:
        # statements without any sentence are skipped (the ratio would be undefined)
        continue
    # `annot_counts` only holds the classes the statement is annotated for
    for cname, relevant_count in annot_counts.items():
        assert sen_count >= relevant_count
        relevant_sentence_counts[cname].append(relevant_count)
        irrelevant_sentence_counts[cname].append(sen_count - relevant_count)
        relevant_sentence_ratios[cname].append(relevant_count / sen_count)

In [None]:
# Box plot of the per-statement relevant sentence ratio, one box per annotation class.
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(5.5, 4.0), dpi=300)

# order the classes alphabetically so that the boxes always appear in the same order
relevant_sentence_ratios = dict(sorted(relevant_sentence_ratios.items(), key=lambda item: item[0]))
labels = list(relevant_sentence_ratios)
distributions = [relevant_sentence_ratios[name] for name in labels]
palette = sns.color_palette("husl", len(labels))

# NOTE(review): newer Matplotlib releases rename the `labels` keyword of `boxplot` to
# `tick_labels`; it is kept as-is here to stay compatible with the installed version.
bxplot = ax.boxplot(
    distributions,
    labels=labels,
    showfliers=False,
    patch_artist=True,  # needed so that the boxes are patches whose face color can be set below
    boxprops=dict(linewidth=1, edgecolor="black", facecolor="w"),
    whiskerprops=dict(linewidth=1, color="black"),
    medianprops=dict(linewidth=1, color="black"),
)

# give each box its own color from the palette (overrides the white face color set above)
for box_patch, box_color in zip(bxplot["boxes"], palette):
    box_patch.set_facecolor(box_color)

# tick label styling: slanted, bold class names on x; plain numbers on y
for tick_label in ax.get_yticklabels():
    tick_label.set_fontsize(11)
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(30)
    tick_label.set_horizontalalignment("right")
    tick_label.set_fontsize(11)
    tick_label.set_weight("bold")

ax.set_ylabel("Relevant sentence ratio", fontsize=12, weight="bold")
ax.get_xaxis().tick_bottom()
ax.get_yaxis().tick_left()
plt.tight_layout()
plt.savefig("relevant_ratios.pdf", dpi=300, format="pdf", bbox_inches="tight")
plt.show()