In [21]:
from functools import partialmethod
from pathlib import Path
from typing import Tuple

from tqdm import tqdm

from src.data.utils import read_dataset
from src.data.preprocess import documents_to_sentence_annotation

# Silence all tqdm progress bars for the rest of the session: rebind
# tqdm.__init__ so that every bar is constructed with disable=True.
tqdm.__init__ = partialmethod(tqdm.__init__, disable=True)  # disable progress bar

# Project root: the directory above the one that holds this file.
# A plain Jupyter kernel does not define `__file__`, so fall back to the
# parent of the working directory in that case.
# NOTE(review): the fallback assumes the kernel's working directory is the
# notebook's own folder — confirm for your launcher.
try:
    root_path = Path(__file__).parent.parent
except NameError:
    root_path = Path.cwd().parent

In [22]:
# Corpus identifiers grouped by language. Each identifier is later handed to
# `read_dataset`; insertion order determines the row order of the printed table.
corpora = {
    "english": ["tempeval_3", "meantime_english", "tcr", "ancient_time_english", "wikiwars", "ph_english"],
    "portuguese": ["timebankpt", "ph_portuguese"],
    "spanish": [
        "spanish_timebank", "tempeval_2_spanish", "meantime_spanish",
        "traint3", "ancient_time_spanish", "ph_spanish",
    ],
    "italian": ["tempeval_2_italian", "meantime_italian", "narrative_container", "ancient_time_italian", "ph_italian"],
    "french": ["fr_timebank", "tempeval_2_french", "ancient_time_french", "ph_french"],
    "german": ["krauts", "wikiwars_de", "ancient_time_german", "ph_german"],
}

In [23]:
def get_sentences_timexs_count(documents) -> Tuple[int, int]:
    """Count timex-bearing sentences and their timexs across ``documents``.

    Each document is passed to ``documents_to_sentence_annotation`` and the
    result is iterated as ``(sentence, timexs)`` pairs. Pairs whose ``timexs``
    is falsy (e.g. empty) are ignored, so the sentence count covers only
    sentences that carry at least one timex.

    Returns:
        ``(n_sentences, n_timexs)`` summed over all documents.
    """
    timexs_per_sentence = [
        len(timexs)
        for document in documents
        for _, timexs in documents_to_sentence_annotation(document)
        if timexs
    ]
    return len(timexs_per_sentence), sum(timexs_per_sentence)

In [24]:
def _table_row(label, cells) -> str:
    """Render one ` & `-separated row: a 30-wide label, then 8-wide cells, all left-aligned."""
    return " & ".join([f"{label:<30}"] + [f"{cell:<8}" for cell in cells])


# Two header lines: the split names first, then the per-split column names.
header = [" ", "Train", " ", " ", "Validation", " ", " ", "Test", " ", " "]
print(_table_row(header[0], header[1:]))
header = [" ", "#Docs", "#Sents", "#Timexs", "#Docs", "#Sents", "#Timexs", "#Docs", "#Sents", "#Timexs"]
print(_table_row(header[0], header[1:]))

# One row per corpus, in the order the corpora are listed per language.
for language, corpus_names in corpora.items():
    for corpus in corpus_names:
        train_docs, val_docs, test_docs = read_dataset(corpus, root_path / "data" / "raw")

        n_train_docs = len(train_docs)
        n_train_sents, n_train_tmxs = get_sentences_timexs_count(train_docs)

        n_val_docs = len(val_docs)
        n_val_sents, n_val_tmxs = get_sentences_timexs_count(val_docs)

        n_test_docs = len(test_docs)
        n_test_sents, n_test_tmxs = get_sentences_timexs_count(test_docs)

        print(_table_row(corpus, [
            n_train_docs, n_train_sents, n_train_tmxs,
            n_val_docs, n_val_sents, n_val_tmxs,
            n_test_docs, n_test_sents, n_test_tmxs,
        ]))

                               & Train    &          &          & Validation &          &          & Test     &          &         
                               & #Docs    & #Sents   & #Timexs  & #Docs    & #Sents   & #Timexs  & #Docs    & #Sents   & #Timexs 
tempeval_3                     & 204      & 1149     & 1427     & 51       & 307      & 383      & 20       & 106      & 138     
meantime_english               & 86       & 177      & 258      & 10       & 19       & 28       & 24       & 48       & 63      
tcr                            & 16       & 97       & 123      & 4        & 29       & 32       & 5        & 50       & 62      
ancient_time_english           & 4        & 195      & 256      & 0        & 0        & 0        & 1        & 22       & 50      
wikiwars                       & 16       & 1547     & 2111     & 2        & 100      & 136      & 4        & 302      & 393     
ph_english                     & 17743    & 129065   & 165907   & 1971     & 14049    & 