# Dataset sizes

## In subword-tokens

Statistics about the datasets used for training and evaluation.

* Dataset size in examples (i.e. paragraphs/sentences)
* Statistics on the length of the examples per dataset (mean and outlier boundary, i.e. the upper outer fence Q3 + 3·IQR)


In [1]:
import datasets
import pandas as pd 

import numpy as np
from transformers import AutoTokenizer

from transnormer.preprocess import translit

  from .autonotebook import tqdm as notebook_tqdm


In [None]:

# Every corpus has its own directory below data/interim and provides one JSONL
# file per split, named "<directory>-<split>.jsonl".
_INTERIM_DIR = "../../data/interim"

# (dataset name used in the statistics table, directory name / file stem on disk)
_DATASETS = [
    ("dtaeval", "dtaeval"),
    ("ridges", "ridges_bollmann"),
    ("deu_news_2020", "deu_news_2020"),
    ("dtak-1600-1699", "dtak-1600-1699"),
    ("dtak-1700-1799", "dtak-1700-1799"),
    ("dtak-1800-1899", "dtak-1800-1899"),
]

# split name -> list of (dataset name, path to that split's JSONL file)
paths = {
    split_name: [
        (name, f"{_INTERIM_DIR}/{stem}/{stem}-{split_name}.jsonl")
        for name, stem in _DATASETS
    ]
    for split_name in ("train", "validation", "test")
}

# Checkpoint names whose tokenizers are loaded below. The tokenizer of the
# encoder checkpoint is applied to the "orig" text and the tokenizer of the
# decoder checkpoint to the "norm" text (see tokenize_input_and_output).
checkpoint_encoder = "dbmdz/bert-base-historic-multilingual-cased"
checkpoint_decoder = "bert-base-multilingual-cased"

# Load tokenizers
tokenizer_input = AutoTokenizer.from_pretrained(checkpoint_encoder)
tokenizer_output = AutoTokenizer.from_pretrained(checkpoint_decoder)
# Replace input tokenizer's normalization component with a custom transliterator
# NOTE(review): whether exchange_transliterator modifies the tokenizer in place
# or builds a new one is not visible from here; only its return value is used.
transliterator = translit.Transliterator1()
tokenizer_input = translit.exchange_transliterator(tokenizer_input, transliterator)

def tokenize_input_and_output(batch, tokenizer_input, tokenizer_output):
    """Add token ids for a batch of original/normalized text pairs.

    Args:
        batch: Mapping with the columns ``"orig"`` and ``"norm"``.
        tokenizer_input: Callable tokenizer applied to ``batch["orig"]``.
        tokenizer_output: Callable tokenizer applied to ``batch["norm"]``.

    Returns:
        The same ``batch`` object, extended with ``"input_ids"``,
        ``"attention_mask"`` (both from the input tokenizer) and ``"labels"``
        (a copy of the output tokenizer's ids).
    """
    # Neither padding nor truncation, so the ids keep their full length
    encode_options = {"padding": False, "truncation": False}
    source_encoding = tokenizer_input(batch["orig"], **encode_options)
    target_encoding = tokenizer_output(batch["norm"], **encode_options)

    batch["input_ids"] = source_encoding.input_ids
    batch["attention_mask"] = source_encoding.attention_mask
    # Store a copy so the labels do not alias the tokenizer's own id list
    batch["labels"] = target_encoding.input_ids.copy()
    return batch

# Compute the token length for each paragraph
def get_length(example):
    """Annotate one tokenized example with its token counts.

    Reads ``example["input_ids"]`` and ``example["labels"]`` and stores their
    lengths under ``"input_length"`` and ``"output_length"``. The example is
    modified in place and returned.
    """
    for length_key, ids_key in (("input_length", "input_ids"), ("output_length", "labels")):
        example[length_key] = len(example[ids_key])
    return example

def get_upper_outer_fence(lengths):
    """Return the upper outer fence of ``lengths``, i.e. Q3 + 3 * IQR.

    Q1 and Q3 are the 25th and 75th percentiles as computed by
    ``np.percentile``; IQR is their difference.
    """
    first_quartile, third_quartile = np.percentile(lengths, [25, 75])
    interquartile_range = third_quartile - first_quartile
    return third_quartile + 3 * interquartile_range



# Keyword arguments that ds.map forwards to tokenize_input_and_output
tokenization_kwargs = {
    "tokenizer_input": tokenizer_input,
    "tokenizer_output": tokenizer_output,
}

# Column-wise accumulator for the statistics table: every (dataset, split)
# pair processed below appends exactly one value to each list.
df_data = {
    "name" : [], "split" : [], "examples" : [],
    "input_len (mean)" : [], "output_len (mean)" : [],
    "input_len (uof)" : [], "output_len (uof)" : []
}

for split in paths:
    for dname, path in paths[split]:
        # Load the one JSONL file of this dataset for the current split.
        # split="train" is the name under which the json loader exposes a
        # single data file; it is unrelated to the loop variable `split`.
        ds = datasets.load_dataset("json", data_files=path, split="train")
        df_data["name"].append(dname)
        df_data["split"].append(split)
        df_data["examples"].append(ds.num_rows)
        # Tokenize by applying a mapping
        ds_tok = ds.map(
            tokenize_input_and_output,
            fn_kwargs=tokenization_kwargs,
            remove_columns=["orig", "norm"],
            batched=True,
            batch_size=64,
        )
        # Second pass adds the per-example token counts
        ds_tok = ds_tok.map(get_length)
        # Pull each length column out of the dataset only once; it is needed
        # for both the mean and the upper outer fence.
        input_lengths = np.asarray(ds_tok["input_length"])
        output_lengths = np.asarray(ds_tok["output_length"])
        df_data["input_len (mean)"].append(np.mean(input_lengths))
        df_data["output_len (mean)"].append(np.mean(output_lengths))
        df_data["input_len (uof)"].append(get_upper_outer_fence(input_lengths))
        df_data["output_len (uof)"].append(get_upper_outer_fence(output_lengths))

In [38]:
# Turn the collected per-dataset statistics into a table and show its first rows
df = pd.DataFrame(df_data)
print("m = mean; uof = upper outer fence (i.e. 3*IQR above Q3)")
df.head(n=20)

m = mean; uof = upper outer fence (i.e. 3*IQR above Q3)


Unnamed: 0,name,split,examples,input_len (mean),output_len (mean),input_len (uof),output_len (uof)
0,dtaeval,train,200524,33.682143,34.116061,141.0,145.0
1,ridges,train,2921,30.264635,27.36152,125.0,108.0
2,deu_news_2020,train,800000,29.79974,28.026419,92.0,83.0
3,dtak-1600-1699,train,754435,41.986521,39.78389,194.0,185.0
4,dtak-1700-1799,train,1714657,38.80428,37.652118,165.0,161.0
5,dtak-1800-1899,train,2174712,39.749745,39.734626,168.0,168.0
6,dtaeval,validation,18278,31.56609,32.407649,126.0,130.0
7,ridges,validation,671,30.385991,27.424739,121.0,112.0
8,deu_news_2020,validation,100000,29.81356,28.04331,92.0,87.0
9,dtak-1600-1699,validation,157748,39.111108,37.02019,172.0,163.0


In [11]:
# Add more datasets to existing stats file

df_new_datasets = df # TODO: compute df with cell above, change paths there
df_previously = pd.read_csv('./dataset-sizes.tsv', delimiter='\t')
# to_csv (below) writes the row index as an unnamed first column, which
# read_csv returns as a data column called "Unnamed: 0". Drop such columns,
# otherwise every round of appending adds another junk column and the new
# rows get NaN in it.
is_index_artifact = df_previously.columns.str.startswith("Unnamed:")
df_previously = df_previously.loc[:, ~is_index_artifact]
df_added = pd.concat([df_previously, df_new_datasets], ignore_index=True)

df_added.to_csv("./dataset-sizes-new.tsv", sep="\t")
# TODO : check dataset-sizes-new.tsv and rename it to dataset-sizes.tsv
#        if everything is alright

# Last expression of the cell, so the merged table is actually displayed
df_added

## In bytes

In [None]:

# Only dtaeval is measured in this section. The other corpora of the
# subword-token section are disabled here: ridges (directory ridges_bollmann),
# deu_news_2020, dtak-1600-1699, dtak-1700-1799 and dtak-1800-1899. To include
# one, add a (name, "../../data/interim/<dir>/<dir>-<split>.jsonl") tuple to
# the list of each split.
paths = {
    split_name: [
        ("dtaeval", f"../../data/interim/dtaeval/dtaeval-{split_name}.jsonl"),
    ]
    for split_name in ("train", "validation", "test")
}

def utf8len(s: str) -> int:
    """Return the number of bytes ``s`` occupies when encoded as UTF-8."""
    encoded = s.encode("utf-8")
    return len(encoded)

# Compute the UTF-8 byte length of each paragraph.
# Note: this rebinds the name get_length, which the subword-token section
# defines with a token-counting implementation.
def get_length(example):
    """Annotate one example with the byte lengths of its texts.

    Stores ``utf8len(example["orig"])`` under ``"input_length"`` and
    ``utf8len(example["norm"])`` under ``"output_length"``. The example is
    modified in place and returned.
    """
    for length_key, text_key in (("input_length", "orig"), ("output_length", "norm")):
        example[length_key] = utf8len(example[text_key])
    return example

def get_upper_outer_fence(lengths):
    """Return the upper outer fence of ``lengths``, i.e. Q3 + 3 * IQR.

    Same definition as in the subword-token section: Q1 and Q3 are the 25th
    and 75th percentiles from ``np.percentile``, IQR is their difference.
    """
    first_quartile, third_quartile = np.percentile(lengths, [25, 75])
    interquartile_range = third_quartile - first_quartile
    return third_quartile + 3 * interquartile_range


# Column-wise accumulator for the statistics table: every (dataset, split)
# pair processed below appends exactly one value to each list.
df_data = {
    "name" : [], "split" : [], "examples" : [],
    "input_len (mean)" : [], "output_len (mean)" : [],
    "input_len (uof)" : [], "output_len (uof)" : []
}

for split in paths:
    for dname, path in paths[split]:
        # Load the one JSONL file of this dataset for the current split.
        # split="train" is the name under which the json loader exposes a
        # single data file; it is unrelated to the loop variable `split`.
        ds = datasets.load_dataset("json", data_files=path, split="train")
        df_data["name"].append(dname)
        df_data["split"].append(split)
        df_data["examples"].append(ds.num_rows)
        # No tokenization in this section: get_length measures the raw
        # "orig" / "norm" strings in UTF-8 bytes.
        ds_tok = ds.map(get_length)
        # Pull each length column out of the dataset only once; it is needed
        # for both the mean and the upper outer fence.
        input_lengths = np.asarray(ds_tok["input_length"])
        output_lengths = np.asarray(ds_tok["output_length"])
        df_data["input_len (mean)"].append(np.mean(input_lengths))
        df_data["output_len (mean)"].append(np.mean(output_lengths))
        df_data["input_len (uof)"].append(get_upper_outer_fence(input_lengths))
        df_data["output_len (uof)"].append(get_upper_outer_fence(output_lengths))

In [4]:
# Turn the collected per-dataset statistics into a table and show its first rows
df = pd.DataFrame(df_data)
print("m = mean; uof = upper outer fence (i.e. 3*IQR above Q3)")
df.head(n=20)

m = mean; uof = upper outer fence (i.e. 3*IQR above Q3)


Unnamed: 0,name,split,examples,input_len (mean),output_len (mean),input_len (uof),output_len (uof)
0,dtaeval,train,200524,127.380952,123.001975,578.0,560.0
1,dtaeval,validation,18278,121.044972,116.952621,533.0,519.0
2,dtaeval,test,21916,97.155959,93.110787,439.0,422.0
