In [1]:
import os
import random
import tomli

import datasets
import numpy as np
import pandas as pd
import torch
import transformers
from transnormer.models.train_model import tokenize_datasets
from transnormer.evaluation.analysis import get_spans_of_unknown_tokens
from transnormer.visualization.formatting import markup_spans


In [6]:
# Load configs
ROOT = os.path.dirname(os.path.dirname(os.path.abspath(''))) # "../../"
CONFIGFILE = os.path.join(ROOT, "training_config.toml")
with open(CONFIGFILE, mode="rb") as fp:
    CONFIGS = tomli.load(fp)

# Fix seeds for reproducibilty
random.seed(CONFIGS["random_seed"])
np.random.seed(CONFIGS["random_seed"])
torch.manual_seed(CONFIGS["random_seed"])

# GPU set-up
device = torch.device(CONFIGS["gpu"] if torch.cuda.is_available() else "cpu")

In [None]:

tokenizer_input = transformers.AutoTokenizer.from_pretrained(
    CONFIGS["language_models"]["checkpoint_encoder"]
)
tokenizer_output = transformers.AutoTokenizer.from_pretrained(
    CONFIGS["language_models"]["checkpoint_decoder"]
)

# Load model
checkpoint = os.path.join(ROOT, "models/model/checkpoint-300") # TODO
model = transformers.EncoderDecoderModel.from_pretrained(checkpoint).to(device)


s= "Groß-Fuͤrſten in Finland/ Hertzogen zu Schonen/ Eheſten/ Lieffland/ Carelen/ Bremen/ Vehrden/ Stettin/ Pommern/ der Caſſuben und Wenden/"
inputs = tokenizer_input(s, padding="max_length", truncation=True, max_length=512, return_tensors="pt")
input_ids = inputs.input_ids.to(device)
attention_mask = inputs.attention_mask.to(device)

outputs = model.generate(input_ids, attention_mask=attention_mask)
del input_ids, attention_mask # free memory
output_str = tokenizer_output.batch_decode(outputs, skip_special_tokens=True)

from IPython.core.display import HTML
df_example = pd.DataFrame(data=output_str)
display(HTML(df_example.head(1).to_html(escape=False)))



In [11]:
from transformers import T5ForConditionalGeneration, AutoTokenizer

model = T5ForConditionalGeneration.from_pretrained("google/byt5-small")
tokenizer = AutoTokenizer.from_pretrained("google/byt5-small")

model_inputs = tokenizer(
    ["Groß-Fuͤrſten in Finland/ Hertzogen zu Schonen/ Eheſten/ Lieffland/ Carelen/ Bremen/ Vehrden/ Stettin/ Pommern/ der Caſſuben und Wenden/"], padding="longest", return_tensors="pt"
)
labels_dict = tokenizer(
    ["Groß-Fürsten in Finnland/ Herzogs zu Schonen/ Ehesten/ Livland/ Carelen/ Bremen/ Vehrden/ Stettin/ Pommern/ der Kasuben und Wenden/"], padding="longest", return_tensors="pt"
)
labels = labels_dict.input_ids

loss = model(**model_inputs, labels=labels).loss
loss.item()


1.0134801864624023

In [16]:
output_ids = model.generate(**model_inputs, max_length=100)[0].tolist()

In [17]:

# Now we need to split on the sentinel tokens, let's write a short loop for this

output_ids_list = []

start_token = 0

sentinel_token = 258

while sentinel_token in output_ids:
    split_idx = output_ids.index(sentinel_token)
    output_ids_list.append(output_ids[start_token:split_idx])
    start_token = split_idx
    sentinel_token -= 1

output_ids_list.append(output_ids[start_token:])
output_string = tokenizer.batch_decode(output_ids_list)
print(output_string)

['<pad>', 'en/ Stettin/ Pommern/ Pommern/ Pommern/ Pommern/ Pommern/ Pommern/ Pommern/ Pommern/ Pommern/ Pomm']


---

---

---

In [8]:
model = transformers.T5ForConditionalGeneration.from_pretrained("../../tests/testdata/saved-tmp/checkpoint-600").to(device)



In [12]:
output_ids = model.generate(**model_inputs, max_length=100)[0].tolist()

In [22]:
for output_id in output_ids:
    try:    
        print(chr(output_id - 3), end="")
    except ValueError:
        print(output_id)

print([for output_id in output_ids])
# print(output_ids)
# print(chr(101))

ValueError: chr() arg not in range(0x110000)