In [None]:
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import tqdm

import qut01

# Configure logging through the project's helper for analysis scripts; the handlers and
# levels it installs are defined in `qut01.utils.logging` (not shown in this notebook).
qut01.utils.logging.setup_logging_for_analysis_script()

In [None]:
# Resolve the default location of the deeplake dataset and load the dataset itself,
# checking out the validated branch to load all annotations (train-valid-test).
dataset_path = qut01.data.dataset_parser.get_default_deeplake_dataset_path()
dataset = qut01.data.dataset_parser.get_deeplake_dataset(
    dataset_path=dataset_path,
    checkout_branch=qut01.data.dataset_parser.dataset_validated_branch_name,
)

# Wrap the dataset in an easy-to-use parser. The processed-data cache is disabled because
# the cells below iterate over the entire dataset and caching might run out of memory.
data_parser = qut01.data.dataset_parser.DataParser(
    dataset_path_or_object=dataset,
    use_processed_data_cache=False,
)

# Flatten every group of statement ids returned by the parser into one set of unique ids.
potentially_annotated_statement_ids = data_parser.get_potentially_annotated_statement_ids()
all_annot_sids = {
    statement_id
    for group_sids in potentially_annotated_statement_ids.values()
    for statement_id in group_sids
}

In [None]:
# Gather per-statement sentence counts and per-sentence word counts over every
# potentially annotated statement, then print the two totals.
sentence_counts = []  # overall: one entry per parsed statement
relevant_sentence_counts = {}  # per annot type (not populated in this cell)
irrelevant_sentence_counts = {}  # per annot type (not populated in this cell)
word_counts = []  # overall: one entry per sentence

# Build the statement-id -> index lookup once rather than calling `.index()` for every
# statement, which rescans the id sequence each time (quadratic overall). `setdefault`
# keeps the first occurrence of an id, which is what `.index()` would have returned.
statement_idx_by_sid = {}
for statement_idx, statement_id in enumerate(data_parser.statement_ids):
    statement_idx_by_sid.setdefault(statement_id, statement_idx)

for target_sid in tqdm.tqdm(all_annot_sids, desc="parsing all annotated statements"):
    target_idx = statement_idx_by_sid[target_sid]
    statement_processed_data = data_parser.get_processed_data(target_idx)
    sentence_counts.append(len(statement_processed_data.sentences))
    for sentence in statement_processed_data.sentences:
        # `split()` without a separator splits on any whitespace run and drops empty
        # tokens, so repeated spaces, tabs/newlines, or an empty sentence are not
        # miscounted as extra words (which `split(" ")` would do).
        word_counts.append(len(sentence.split()))

print(f"\ttotal sentence count: {sum(sentence_counts)}")
print(f"\ttotal word count: {sum(word_counts)}")

In [None]:
# Print the extrema first, then convert the counts to an array for masking below.
print(f"{min(sentence_counts)=}")
print(f"{max(sentence_counts)=}")
sentence_counts = np.asarray(sentence_counts)
mean_sentence_count = sentence_counts.mean()
print(f"{mean_sentence_count=}")

# Only statements at or below the cutoff are drawn, to eliminate outliers from the plot;
# the mean marked on the figure is still computed over all statements.
max_sentence_count = 600
is_within_cutoff = sentence_counts <= max_sentence_count
filtered_sentence_counts = sentence_counts[is_within_cutoff]

# Histogram of sentences per statement, with a dashed line marking the mean.
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(4.0, 4.0), dpi=300)
ax.hist(filtered_sentence_counts, bins=30, color="skyblue", edgecolor="black")
ax.axvline(
    mean_sentence_count,
    color="red",
    linestyle="--",
    linewidth=2,
    label=f"Mean: {mean_sentence_count:.1f}",
)
ax.set_xlim(0, max_sentence_count)
ax.set_ylim(0, 800)
ax.set_xlabel("Number of sentences", fontsize=12, weight="bold")
ax.set_ylabel("Number of statements", fontsize=12, weight="bold")
ax.grid(axis="y", linestyle="--", alpha=0.7)
ax.legend()

# Save the figure as a PDF in the current working directory, then display it.
fig.tight_layout()
fig.savefig("sentence_count_distrib.pdf", format="pdf", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Print the extrema first, then convert the per-sentence word counts to an array.
print(f"{min(word_counts)=}")
print(f"{max(word_counts)=}")
word_counts = np.asarray(word_counts)
mean_word_count = np.mean(word_counts)
print(f"{mean_word_count=}")

# Cutoff used to eliminate outliers from the plot. It has its own name so that it does not
# overwrite `max_sentence_count`, the unrelated cutoff of the sentence-count histogram.
# The mean marked on the figure is still computed over all sentences.
max_word_count = 100
filtered_word_counts = word_counts[word_counts <= max_word_count]

# Histogram of words per sentence, with a dashed line marking the mean.
sns.set(style="whitegrid")
fig, ax = plt.subplots(figsize=(4.0, 4.0), dpi=300)
ax.hist(filtered_word_counts, color="#D8BFD8", bins=30, edgecolor="black")
ax.axvline(mean_word_count, color="red", linestyle="--", linewidth=2, label=f"Mean: {mean_word_count:.1f}")
ax.set_xlim([0, max_word_count])
ax.set_ylim([0, 100000])
ax.set_xlabel("Number of words", fontsize=12, weight="bold")
ax.set_ylabel("Number of sentences", fontsize=12, weight="bold")
ax.grid(axis="y", linestyle="--", alpha=0.7)
ax.legend()

# Save the figure as a PDF in the current working directory, then display it.
fig.tight_layout()
fig.savefig("word_count_distrib.pdf", format="pdf", dpi=300, bbox_inches="tight")
plt.show()