# Inspecting normalizations

Note: This notebook should only be used to generate predictions on git branches created from a dvc experiment. On the main/dev branch this notebook will only be updated when changes to the code are necessary, but it will not be used to generate predictions there.

## Prepare data and get normalizations

In [1]:
import os
import random
import tomli

import datasets
import numpy as np
import pandas as pd
import torch
import transformers
from transnormer.models.train_model import tokenize_datasets
from transnormer.evaluation.analysis import get_spans_of_unknown_tokens
from transnormer.visualization.formatting import markup_spans

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Number of examples to generate predictions for
N = 10

In [3]:
# Load configs
ROOT = "../../"
CONFIGFILE = os.path.join(ROOT, "models/models_2023-03-16_15-24/training_config.toml")
with open(CONFIGFILE, mode="rb") as fp:
    CONFIGS = tomli.load(fp)

# OR: Use custom configs (if so: uncomment the following)
# CONFIGS = {
#     "gpu": "cuda:0",
#     "random_seed": 42,
#     "tokenizer": {
#         "max_length_input": 128,
#         "max_length_output": 128,
#         "input_transliterator": "Transliterator1",
#     },
#     "language_models": {
#         "checkpoint_encoder": "prajjwal1/bert-tiny",
#         "checkpoint_decoder": "prajjwal1/bert-tiny",
#     },
#     "beam_search_decoding": {
#         "no_repeat_ngram_size": 3,
#         "early_stopping": True,
#         "length_penalty": 2.0,
#         "num_beams": 4,
#     },
# }

In [4]:
# Fix seeds for reproducibilty
random.seed(CONFIGS["random_seed"])
np.random.seed(CONFIGS["random_seed"])
torch.manual_seed(CONFIGS["random_seed"])

# GPU set-up
device = torch.device(CONFIGS["gpu"] if torch.cuda.is_available() else "cpu")

In [5]:
# Load data 
data_files = {
    "1600to1699": os.path.join(ROOT, "data/interim/dtak-v03-1600-1699/dtak-v03-1600-1699-train.jsonl"),
    # "1700to1799": os.path.join(ROOT, "data/interim/dtak-1700-1799/dtak-1700-1799-validation.jsonl"),
    # "1800to1899": os.path.join(ROOT, "data/interim/dtak-1800-1899/dtak-1800-1899-validation.jsonl"),
}
ds = datasets.load_dataset("json", data_files=data_files)

ds["1600to1699"] = ds["1600to1699"].shuffle().select(range(N))
# ds["1700to1799"] = ds["1700to1799"].shuffle().select(range(N))
# ds["1800to1899"] = ds["1800to1899"].shuffle().select(range(N))

Using custom data configuration default-0e8e502d3435a016
Found cached dataset json (/home/bracke/.cache/huggingface/datasets/json/default-0e8e502d3435a016/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51)
100%|██████████| 1/1 [00:00<00:00, 30.77it/s]


In [6]:
# Tokenize data 

# In case we use locally saved models for the tokenizers
# the relative path must be completed. Uncomment the respective line.
# CONFIGS["language_models"]["checkpoint_encoder"] = os.path.join(ROOT, CONFIGS["language_models"]["checkpoint_encoder"])
# CONFIGS["language_models"]["checkpoint_decoder"] = os.path.join(ROOT, CONFIGS["language_models"]["checkpoint_decoder"])

prepared_dataset, tokenizer_input, tokenizer_output = tokenize_datasets(ds, CONFIGS)

100%|██████████| 1/1 [00:00<00:00, 42.57ba/s]


In [7]:
# Load model
checkpoint = os.path.join(ROOT, "models/models_2023-03-16_15-24/checkpoint-100000") # TODO
model = transformers.EncoderDecoderModel.from_pretrained(checkpoint).to(device)

In [8]:
# Generate normalizations
# TODO: Do we have to include a configuration for beam search decoding here?
def generate_normalization(batch):
    inputs = tokenizer_input(batch["orig"], padding="max_length", truncation=True, max_length=128, return_tensors="pt")
    input_ids = inputs.input_ids.to(device)
    attention_mask = inputs.attention_mask.to(device)

    outputs = model.generate(input_ids, attention_mask=attention_mask)
    output_str = tokenizer_output.batch_decode(outputs, skip_special_tokens=True)

    batch["norm_pred_str"] = output_str

    return batch


ds = ds.map(
    generate_normalization, 
    batched=True, 
    batch_size=8, 
    load_from_cache_file=False,
    )


100%|██████████| 2/2 [00:09<00:00,  4.75s/ba]


### Apply visual modifications 



1. Markup unknown tokens in red
2. Add token-separator ("|") in input text
2. Add token-separator ("|") in output text

In [9]:
# Apply HTML markup to unknown tokens in original text
def markup_unknown_tokens(batch):
    spans = get_spans_of_unknown_tokens(batch["orig"], tokenizer_input)
    text_marked_up = markup_spans(batch["orig"], spans, opening_tag="<span style='color:#FF0000'>")

    batch["orig_marked_up"] = text_marked_up
    return batch

ds = ds.map(markup_unknown_tokens, batched=False, )

100%|██████████| 10/10 [00:00<00:00, 2021.84ex/s]


In [10]:
# Add token-separator 
def separate_tokens(batch, column, tokenizer):
    # We have to do the normalization explicitly before getting the encoding
    # to avoid mismatches in case the normalization changes the string length, e.g. "æ -> ae"
    norm_str = tokenizer.backend_tokenizer.normalizer.normalize_str(batch[column])
    encoding = tokenizer(norm_str, add_special_tokens=False)
    spans = [
        # map a token index to a pair of character indices
        encoding.token_to_chars(token_index)[:] for token_index in range(len(encoding["input_ids"]))
    ]
    text_marked_up = markup_spans(
        tokenizer.backend_tokenizer.normalizer.normalize_str(batch[column]),
        spans,
        opening_tag="",
        closing_tag="<span style='color:#FFA500'>|</span>",
        )

    batch[f"{column}_xlit_tok"] = text_marked_up
    return batch

ds = ds.map(separate_tokens, fn_kwargs={"tokenizer":tokenizer_input, "column" : "orig"}, batched=False, load_from_cache_file=False)
ds = ds.map(separate_tokens, fn_kwargs={"tokenizer":tokenizer_output, "column" : "norm_pred_str"}, batched=False, load_from_cache_file=False)

100%|██████████| 10/10 [00:00<00:00, 735.33ex/s]
100%|██████████| 10/10 [00:00<00:00, 1271.77ex/s]


In [11]:
# Create pandas dataframes from predictions

# Do no truncate cells with long text
pd.set_option('display.max_colwidth', None)

part = "1600to1699"
df1600to1699 = pd.DataFrame(
    data={
        "orig_xlit" : ds[part]["orig_xlit_tok"], 
        "norm_tok" : ds[part]["norm_pred_str_xlit_tok"], 
        }
    )

# part = "1700to1799"
# df1700to1799 = pd.DataFrame(
#     data={
#         "orig_xlit" : ds[part]["orig_xlit_tok"], 
#         "norm_tok" : ds[part]["norm_pred_str_xlit_tok"], 
#         }
#     )

# part = "1800to1899"
# df1800to1899 = pd.DataFrame(
#     data={
#         "orig_xlit" : ds[part]["orig_xlit_tok"], 
#         "norm_tok" : ds[part]["norm_pred_str_xlit_tok"], 
#         }
#     )

---

## Look at the dataframes

### 1600 to 1699

In [12]:
from IPython.core.display import HTML
display(HTML(df1600to1699.head(N).to_html(escape=False)))

Unnamed: 0,orig_xlit,norm_tok
0,Wie| sie| j|hre| Freunde| so| sie| bes|uch| en| empfangen|.|,Wie| sie| ihre| Freunde| so| sie| be|su|che| en| em|pf|angen|.|
1,XXX|VI|.|,XXX|VI|.|
2,Nach| et|lichen| Ta|entz|en| kam| vn|ser| Pra|esi|dent|in| wider| daher|/| vn|d| verme|ld|et| vn|s| wie| das| sich| die| Ku|ens|tl|er| vn|d| Stud|ios|i| gegen| j|hrer| Ko|eni|g|.| Majest|.| erb|otten|/| deren| zu| ehr|en|/| vn|d| gefallen| vor| der|o| abzu|g|/| ein| fro|eli|che| Com|œ|di|am| zu| ag|ieren|/| wol|ten| nu|hn| wir| derselben| auch| bey|woh|nen|/| vn|d| Ko|eni|g|.| Ma|.| auff| der| Sonn|en| Hau|ß| begle|iten|/| daß| were| der|o| Lieb|/| vn|d| wol|te| solches| in| allen| gn|aden| erkennen|:|,Nach| et|lichen| T|än|zen| kam| uns|er| Präsident|in| wider| daher| /| und| vermeld|et| uns| wie| das| sich| die| Künstler| und| Studios|i| gegen| ihrer| König|.| Maj|est|.| er|bot|en| /| deren| zu| ehr|en| /| und| ge|fallen| vor| deren| Ab|zug| /| ein| fr|ö|hlich|e| Como|edia|m| zu| ag|ieren| /| wollten| nun| wir| derselben| auch| bei|wo|hnen| /| und| König|.| Ma|.| auf| der| Sonne|n| Haus| be|gle|iten| /| daß| wäre| deren| Liebe| /| und| wollte| solche|s| in| allen| G|naden| erkennen| :|
3,Ich| bin| wie| Stu|m| und| Tau|b| dab|e|y|/| bege|hr|’| auch| nicht| ein| Wort| zus|agen|:| wil| mit| ged|ult| und| oh|n|’| gesch|rey|/| Ihr| sch|arf|fe|s| la|ester|-|Mau|l| vertra|gen|.|,Ich| bin| wie| St|umm| und| Tau|b| dabei| /| be|ge|hre| auch| nicht| ein| Wort| zu|sa|gen| :| will| mit| G|edu|ld| und| ohne|'|G|esch|rei| /| Ihr| s|char|fes| Läst|er| -| Mau|l| vert|ragen|.|
4,Es| ist| mir| von| Hert|zen| lieb|/| sprach| Kli|ng|en|feld|/| daß| ich| das| Gl|ue|ck| habe|/| dieses| hoch|gel|eh|rt|e| Muse|n|-|Kind| zu| sehen|.|,Es| ist| mir| von| Herz|en| lie|b| /| sprach| K|lingen|feld| /| daß| ich| das| Glück| habe| /| dieses| hoch|gel|ehrt|e| Muse|n| -| Kind| zu| sehen|.|
5,mel|ck|,me|lc|k|
6,Jn| Summa| sie| Rac|ket|lein| lies|sn|/| Au|ß| j|hren| Ha|enden| aus|ser| spr|ies|sn|/| Eins| v|bers| ander|/| daß| mich| wu|nder|t|/| Dann| es| off|t| flo|gen| et|lich| hundert|/| Ja| taus|ent| waren| au|ß| gest|ob|n|/| Das| Feu|r|wer|ck| kan| man| nicht| gn|ug| lo|bn|/| Dann| es| v|ber| die| masse|n| war|/| Zi|erl|ich| vn|d| wu|nder|bar|lich| gar|/| Vn|d| glau|b| nicht| daß| in| Wu|erte|nb|erg|/| Ein| mal| se|y| g|wes|n| ein| sol|ch| Feu|r|wer|ck|.|,In| Su|mma| sie| Ra|ket|lein| ließen| /| Aus| ihren| Hän|den| außer| sp|rie|ßen| /| Ein|s| über|s| ander| /| daß| mich| w|unde|rt| /| Dann| es| oft| fl|ogen| et|lich| hundert| /| Ja| tau|send| waren| aus| ge|sto|ben| /| Das| Feuer|werk| kann| man| nicht| gen|ug| lo|ben| /| Dann| er| die| ma|ßen| war| /| Zie|rli|ch| und| w|unde|rbar|lich| gar| /| Und| g|laub|e| nicht| daß| in| Württemberg| /| Ein| Mal| sei| ge|wusst| ein| sol|ch| Feuer|werk|.|
7,Pl|in|.| l|.| 2|.| c|.| 5|.| daher| ist| get|ichtet|;| daß| Prom|eth|eus| seine| Fac|kel| an| der| Sonn|e| ange|zu|endet|/| und| das| Feuer| auf| die| Erde| bra|cht| habe|.|,Pl|in|.| l|.| 2|.| c|.| 5|.| daher| ist| get|ichte|t| ;| daß| Prometheus| seine| Fa|ckel| an| der| Sonne| ang|ez|ünde|t| /| und| das| Feuer| auf| die| Erde| brachte| habe|.|
8,Ein| Un|weise|r| ist| auch| bey| Schmer|tz|haf|ften| Kra|nc|k|heiten| ele|nder| daran| als| ein| wel|ser| Mann|.| n|.| 123|.| Weiß|heit| und| Tu|gend| sind| wesent|liche| Stu|ec|ke| der| Gem|uet|hs|-|Ruhe| n|.| 124|.| und| der|er| Mangel| mache|t| den| Menschen| ho|ech|st| ele|nde|.| n|.| 125|.| welches| man| aber| nicht| von| der| Sche|in|-|Weiß|heit| und| von| der| Sche|in| Tu|gend| verste|hen| muß|.| n|.| 126|.| 127|.| War|umb| man| der| Wohl|lust| des| Leib|es| nicht| erw|eh|net| n|.| 128|.| 129|.|,Ein| Un|weise|r| ist| auch| bei| Sc|hme|rz|haften| Krankheit|en| ele|nder| daran| als| ein| wel|ser| Mann|.| n|.| 123|.| Wei|she|it| und| Tu|gend| sind| wesentlich|e| Stücke| der| G|em|üt|sr|uh|e| n|.| 124|.| und| der|er| Mange|l| macht| den| Menschen| h|ö|chs|t| ele|nde|.| ann|.| 125|.| welches| man| aber| nicht| von| der| Sc|hei|n| -| Weiß|heit| und| von| der| Lei|n| Tu|gend| vers|tehen| mu|ß|.| n| sehr| 126|.| 127|.| War|um| man| der| Wo|llus|t| des| Lei|bes| nicht| erwähnt| n|.| 128|.| 129|.|
9,§|.| 13|.|,§|.| 13|.|


### 1700 to 1799

In [13]:
from IPython.core.display import HTML
display(HTML(df1700to1799.head(N).to_html(escape=False)))

NameError: name 'df1700to1799' is not defined

### 1800 to 1899

In [None]:
from IPython.core.display import HTML
display(HTML(df1800to1899.head(N).to_html(escape=False)))

---

In [23]:
s_orig = "Nach etlichen Taͤntzen kam vnſer Præſidentin wider daher/ vnd vermeldet vns wie das ſich die Kuͤnſtler vnd Studioſi gegen jhrer Koͤnig. Majeſt. erbotten/ deren zu ehren/ vnd gefallen vor dero abzug/ ein froͤliche Comœdiam zu agieren/ wolten nuhn wir derſelben auch beywohnen/ vnd Koͤnig. Ma. auff der Sonnen Hauß begleiten/ daß were dero Lieb/ vnd wolte ſolches in allen gnaden erkennen:"
# s_orig = "Groß-Fuͤrſten in Finland/ Hertzogen zu Schonen/ Eheſten/ Lieffland/ Carelen/ Bremen/ Vehrden/ Stettin/ Pommern/ der Caſſuben und Wenden/"
s_norm = "Nach etlichen Tänzen kam unser Präsidentin wider daher/ und vermeldet uns wie das sich die Künstler und Studiosi gegen ihrer König. Majest. erboten/ deren zu ehren/ und gefallen vor deren Abzug/ ein fröhliche Comoediam zu agieren/ wollten nun wir derselben auch beiwohnen/ und König. Ma. auf der Sonnen Haus begleiten/ dass wäre deren Liebe/ und wollte solches in allen Gnaden erkennen:" 

# s_orig = "ſolen " #* 10
# s_norm = "Solen " #* 150
s_orig = "Weilen des menschen Lebent ist kürtz seine gedechtnüs Schwag, und die Sinne zum Misverstendtnüs stets geneiget (...)" 

inputs = tokenizer_input(s_orig, padding="max_length", truncation=True, max_length=256, return_tensors="pt")
outputs = tokenizer_output(s_norm, padding="max_length", truncation=True, max_length=256, return_tensors="pt")
input_ids = inputs.input_ids.to(device)
output_ids = outputs.input_ids.to(device)
output_ids[output_ids==0] = -100 # replace padding token with -100
attention_mask = inputs.attention_mask.to(device)
example = {
    "input_ids":input_ids,
    "attention_mask":attention_mask,
    "labels":output_ids
}

# Loss
output_forward = model(**example)

print(output_forward.loss)

# Generation
output_gen = model.generate(
    input_ids, 
    attention_mask=attention_mask, 
    max_length=256,
    no_repeat_ngram_size=3, # hier 0 zu setzen führt zu leerer Ausgabe (warum?)
    early_stopping=True,
    length_penalty=2.0,
    num_beams=3,
    )

outputs_as_str = tokenizer_output.batch_decode(output_gen, skip_special_tokens=True)
# print(outputs_as_str[0])

# Display output
from IPython.core.display import HTML
df_example = pd.DataFrame(data=outputs_as_str)
display(HTML(df_example.head(1).to_html(escape=False)))



tensor(10.1295, device='cuda:0', grad_fn=<NllLossBackward0>)


Unnamed: 0,0
0,"Weilen des Menschen Lebent ist kürz seine Gedächtnis Schwag, und die Sinne zum Missverstendtnis stets geneigt (... )"
