# LongT5 pretraining with chopped dataset

This notebook repeats an earlier experiment (`20241203-chopped-training-longt5-untrained`) using slightly modernized code.

In [None]:
import shutil
!pip install -U git+https://github.com/huggingface/transformers.git@13493215abceafc1653af88b045120014fb4c1fc
!pip install -U git+https://github.com/huggingface/accelerate.git@8ade23cc6aec7c3bd3d80fef6378cafaade75bbe
!pip install -U git+https://github.com/huggingface/datasets.git@01f91bae037c98f2e05456287bab21470adb8f07
!pip install -U git+https://github.com/huggingface/evaluate.git@55f1bc6e072b05c2d9db1589a07e20f38902b1ec
!pip install -U git+https://github.com/huggingface/safetensors.git@f5839b6aee407652aa3078d91206b618dd84e3c2

!pip install sacrebleu
!pip install pytest-playwright
!playwright install
!pip install matplotlib
!pip install pillow
!pip install torchvision
!pip install lpips

!playwright install-deps  

!pip install -U numpy
!pip install tensorboard

Collecting git+https://github.com/huggingface/transformers.git@13493215abceafc1653af88b045120014fb4c1fc
  Cloning https://github.com/huggingface/transformers.git (to revision 13493215abceafc1653af88b045120014fb4c1fc) to /tmp/pip-req-build-7n_0oce3
  Running command git clone --filter=blob:none --quiet https://github.com/huggingface/transformers.git /tmp/pip-req-build-7n_0oce3


In [1]:
# Make sure the zip/unzip tools are available before extracting the archives.
!apt install zip -y
# Start from an empty data-rb-chopped/ directory, then download and extract the
# chopped training dataset into it.
# NOTE: the archive is saved locally as model.zip even though it contains the dataset.
!rm -rf data-rb-chopped
!mkdir -p data-rb-chopped
!wget "https://www.dropbox.com/scl/fi/hsqsp79okuob4u63oj7j3/data-rb-chopped.zip?rlkey=ey3a4ap5h6v9mcaava1bps52n&dl=1" -O model.zip
!unzip model.zip -d data-rb-chopped

# The tokenizer definition is stored next to the dataset; it is loaded from this path later.
!wget "https://www.dropbox.com/scl/fi/bx38ass59j6my2g2m01pb/tokenizer.json?rlkey=62zub5qk6ow7xa2vpa9gyglho&dl=1" -O data-rb-chopped/tokenizer.json

# Same procedure for the separate validation dataset, extracted into data-rb-validate/.
!rm -rf data-rb-validate
!mkdir -p data-rb-validate
!wget "https://www.dropbox.com/scl/fi/5szml8y5l248mcabj9rqg/verify-dataset.zip?rlkey=se33rwtxgngn0ts1i0pc8f6wk&st=1d68x9zt&dl=1" -O validate.zip
!unzip validate.zip -d data-rb-validate


Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  unzip
The following NEW packages will be installed:
  unzip zip
0 upgraded, 2 newly installed, 0 to remove and 52 not upgraded.
Need to get 350 kB of archives.
After this operation, 930 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 unzip amd64 6.0-26ubuntu3.2 [175 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/main amd64 zip amd64 3.0-12build2 [176 kB]
Fetched 350 kB in 2s (160 kB/s)m
debconf: delaying package configuration, since apt-utils is not installed

Selecting previously unselected package unzip.
(Reading database ... 16754 files and directories currently installed.)
Preparing to unpack .../unzip_6.0-26ubuntu3.2_amd64.deb ...
7[24;0f[42m[30mProgress: [  0%][49m[39m [..........................................................] 87[24;0f[42m[30mProgress: [ 11%]

In [1]:
from datasets import load_from_disk, DatasetDict, concatenate_datasets

# Number of rows held out of the chopped dataset for validation and for testing.
validation_size = 4
test_size = 100
# Number of rows taken from the separately downloaded validation dataset.
curated_validation_rows = 4
# Fixed seed so the random hold-out rows are the same on every run.
split_seed = 42

raw_dataset = load_from_disk('data-rb-chopped')
validation_dataset = load_from_disk('data-rb-validate')

# Integer test_size values are absolute row counts. The previous version passed
# fractions computed against the full dataset length, which is the wrong base for
# the second split (it is applied to the already-reduced train split).
train_test_split = raw_dataset.train_test_split(test_size=test_size, seed=split_seed)
train_valid_split = train_test_split['train'].train_test_split(test_size=validation_size, seed=split_seed)

# "valid" = rows from the dedicated validation dataset followed by randomly held-out rows.
dataset = DatasetDict({
    "train": train_valid_split["train"],
    "valid": concatenate_datasets([
        validation_dataset.select(range(curated_validation_rows)),
        train_valid_split["test"],
    ]),
    "test": train_test_split["test"],
})

print(dataset)

DatasetDict({
    train: Dataset({
        features: ['svg', 'html'],
        num_rows: 214708
    })
    valid: Dataset({
        features: ['svg', 'html'],
        num_rows: 8
    })
    test: Dataset({
        features: ['svg', 'html'],
        num_rows: 100
    })
})


In [2]:
# Peek at the first training record to see what the raw svg/html fields look like.
first_train_example = dataset['train'][0]
print(first_train_example)

{'svg': '<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="393" height="852" viewBox="0 0 393 852"><g id="html1"><g data-tag="head" id="head1" data-z-index="auto" data-stacking-context="true" aria-owns="script1"><g data-tag="script" id="script1" data-z-index="auto" data-stacking-context="true"/></g><g data-tag="body" id="body1" data-z-index="auto" data-stacking-context="true" role="document" mask="url(#mask-for-body11)" aria-owns="TNRewc1 style1"><mask id="mask-for-body11"><rect width="393" height="852" x="0" y="0" fill="#ffffff"/></mask><g data-tag="div" id="TNRewc1" class="TNRewc" data-z-index="auto" data-stacking-context="true" aria-owns="TUWFt61"><g data-tag="button" id="TUWFt61" class="TUWFt6" data-z-index="auto" data-stacking-context="true" role="button" aria-labelledby=""><text color="rgb(0, 0, 0)" dominant-baseline="text-after-edge" font-family="Arial" font-size="13.3333px" font-size-adjust="none" font-stretch="100%" font-style="normal" f

In [3]:
from transformers import T5TokenizerFast
from tokenizers.processors import TemplateProcessing

# Load the custom vocabulary shipped with the dataset and declare its special tokens.
tokenizer = T5TokenizerFast(
    tokenizer_file="data-rb-chopped/tokenizer.json",
    padding=True,
    eos_token="</s>",
    unk_token="<unk>",
    pad_token="<pad>",
)

# Template that appends the end-of-sequence token after each encoded sequence
# (and after both halves of a pair).
eos_entry = ("</s>", tokenizer.eos_token_id)
post_processor = TemplateProcessing(
    single="$A </s>",
    pair="$A </s> $B </s>",
    special_tokens=[eos_entry],
)

# Install the template on the underlying `tokenizers` object.
tokenizer._tokenizer.post_processor = post_processor

In [4]:
def report_unknown_tokens(text, kind):
    """Tokenize ``text`` and print whether the result contains the unknown token.

    Args:
        text: String to encode (special tokens are not added).
        kind: Label used in the printed message, e.g. "SVG" or "HTML".

    Returns:
        The list of token ids produced by the tokenizer.
    """
    token_ids = tokenizer.encode(text, add_special_tokens=False)
    if tokenizer.unk_token_id in token_ids:
        print(f"Alert: The tokenized {kind} contains unknown tokens.")
    else:
        print(f"No unknown tokens found in the {kind}. Length: " + str(len(token_ids)))
    return token_ids


# One helper replaces the two copy-pasted checks; the example row is fetched once.
unk_token_id = tokenizer.unk_token_id
first_example = dataset['train'][0]
report_unknown_tokens(first_example['svg'], "SVG")
tokenized_output = report_unknown_tokens(first_example['html'], "HTML")

No unknown tokens found in the SVG. Length: 224
No unknown tokens found in the HTML. Length: 5200


In [5]:
# Round-trip the first HTML target through encode/decode to eyeball what the tokenizer gives back.
html_token_ids = tokenizer.encode(dataset['train'][0]['html'], add_special_tokens=False)
print(tokenizer.decode(html_token_ids))

<body><div class="TNRewc" style="display:var(--showQuickView-display, inherit)"><button class="TUWFt6" data-hook="product-item-quick-view-button" type="button">Quick View</button></div></body> <style> @keyframes slide-horizontal-new{0%{transform:translateX(100%)}}@keyframes slide-horizontal-old{80%{opacity:1}to{opacity:0;transform:translateX(-100%)}}@keyframes slide-vertical-new{0%{transform:translateY(-100%)}}@keyframes slide-vertical-old{80%{opacity:1}to{opacity:0;transform:translateY(100%)}}@keyframes out-in-new{0%{opacity:0}}@keyframes out-in-old{to{opacity:0}}@media(prefers-reduced-motion:reduce){::view-transition-group(*),::view-transition-new(*),::view-transition-old(*){animation:none!important}}body{background:transparent;border:0;margin:0;outline:0;padding:0;vertical-align:baseline}body{font-family:Arial,Helvetica,sans-serif;font-size:10px}body{height:100%}body{overflow-x:auto;overflow-y:scroll}:root{}@supports(-webkit-appearance:none) and (stroke-color:transparent){:root{}}@s

In [6]:
# Token budget for both inputs and labels. Floor division keeps this an int (2048);
# true division produced the float 2048.0, which is not a valid tokenizer max_length.
MAX_SEQUENCE_LENGTH = 16_384 // 8

def get_token_lengths(examples):
    """Tokenize a batch of svg/html pairs as-is and record how many tokens each side has.

    Nothing is truncated or padded here. The svg encoding is returned with three
    extra entries: ``input_length`` (svg token counts), ``labels`` (html token
    ids) and ``label_length`` (html token counts). The tokenizer's generic
    ``length`` entry is dropped.
    """
    encode_options = dict(truncation=False, padding=False, return_length=True)
    source = tokenizer(examples['svg'], **encode_options)
    target = tokenizer(examples['html'], **encode_options)

    # Replace the generic "length" entry with explicitly named counts for each side.
    source_lengths = source.pop("length")
    source["input_length"] = source_lengths
    source["labels"] = target["input_ids"]
    source["label_length"] = target["length"]

    return source

# First pass over every split: tokenize without truncation or padding so that
# per-example token counts are available for the length filter.
tokenized_data = dataset.map(get_token_lengths, batched=True)

def filter_function(example):
    """Return True only when both the input and the label fit within MAX_SEQUENCE_LENGTH tokens."""
    longest_side = max(example['input_length'], example['label_length'])
    return longest_side <= MAX_SEQUENCE_LENGTH

# Drop every example whose input or label token count exceeds the length budget.
filtered_data = tokenized_data.filter(filter_function)

def tokenize_examples(examples):
    """Tokenize a batch of svg/html pairs, padded/truncated to MAX_SEQUENCE_LENGTH.

    Args:
        examples: Batch mapping with list-valued ``svg`` and ``html`` entries.

    Returns:
        The svg encoding (``input_ids``, ``attention_mask``) with an added
        ``labels`` entry holding the html token ids, where every padding
        position is replaced by -100.
    """
    shared_options = dict(
        # Coerced to int because a float is not a valid max_length.
        max_length=int(MAX_SEQUENCE_LENGTH),
        padding='max_length',
        truncation=True,
        return_attention_mask=True,
        add_special_tokens=True,
    )
    inputs = tokenizer(examples['svg'], **shared_options)
    labels = tokenizer(examples['html'], **shared_options)

    # The label ids are plain nested lists, so padding must be masked element by
    # element. The previous `labels[labels == pad_id] = -100` compared a list
    # with an int (always False) and therefore overwrote labels[0] with -100.
    pad_id = tokenizer.pad_token_id
    inputs["labels"] = [
        [-100 if token_id == pad_id else token_id for token_id in sequence]
        for sequence in labels['input_ids']
    ]

    return inputs

# The filtered splits already hold input_ids / attention_mask / labels and the two
# length columns from the first tokenization pass. Only the raw text columns need
# dropping; mapping get_token_lengths over every example again would just recompute
# the same values.
filtered_data = filtered_data.remove_columns(dataset["train"].column_names)

print(filtered_data)

Map:   0%|          | 0/214708 [00:00<?, ? examples/s]

Map:   0%|          | 0/8 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Filter:   0%|          | 0/214708 [00:00<?, ? examples/s]

Filter:   0%|          | 0/8 [00:00<?, ? examples/s]

Filter:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/93081 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/39 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'input_length', 'labels', 'label_length'],
        num_rows: 93081
    })
    valid: Dataset({
        features: ['input_ids', 'attention_mask', 'input_length', 'labels', 'label_length'],
        num_rows: 5
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'input_length', 'labels', 'label_length'],
        num_rows: 39
    })
})


In [7]:
import numpy as np
from utils.similarity import calculate_metrics
from torch.utils.tensorboard import SummaryWriter
from PIL import Image
import torch
from types import SimpleNamespace
import shutil

# Width/height for each named viewport, passed to calculate_metrics when scoring.
VIEWPORT_SIZES = {
    viewport_name: {'width': width, 'height': height}
    for viewport_name, (width, height) in (
        ('DESKTOP', (1440, 900)),
        ('TABLET', (834, 1210)),
        ('MOBILE', (393, 852)),
    )
}

# Viewports that are actually evaluated.
VIEWPORTS = ['MOBILE']

def add_image_to_tensorboard(name, step, img_path):
    """Log the image file at ``img_path`` to TensorBoard.

    Args:
        name: Tag under which the image is stored.
        step: Global step the image is associated with.
        img_path: Path of the image file to load.

    The image is converted to RGB, rearranged to channels-first order and scaled
    to floats in [0, 1]. Events are written to the module-level ``log_dir``.
    """
    # `with` releases the image file handle once the pixels are copied out.
    with Image.open(img_path) as image:
        image_array = np.array(image.convert('RGB'))

    image_tensor = torch.from_numpy(image_array).permute(2, 0, 1).float() / 255.0

    # A writer is opened per call; closing it flushes the event to disk and avoids
    # accumulating open writers over a long training run.
    with SummaryWriter(log_dir=log_dir) as writer:
        writer.add_image(name, image_tensor, step)
    
def add_text_to_tensorboard(name, step, text):
    """Log ``text`` to TensorBoard under tag ``name`` at ``step``.

    Events are written to the module-level ``log_dir``.
    """
    # Closing the writer flushes the event to disk and avoids accumulating open
    # writers, since a new one is created on every call.
    with SummaryWriter(log_dir=log_dir) as writer:
        writer.add_text(name, text, step)

def postprocess_text(preds, labels):
    """Clean decoded predictions and labels before scoring.

    Each string is stripped of surrounding whitespace and has every literal
    '<unk>' removed. Predictions come back as a flat list; each label is wrapped
    in its own single-element list.
    """
    def _clean(text):
        return text.strip().replace('<unk>', '')

    cleaned_preds = list(map(_clean, preds))
    wrapped_labels = [[_clean(label)] for label in labels]
    return cleaned_preds, wrapped_labels

def create_compute_metrics(trainer, save_best_model_dir = None):
    """Build the ``compute_metrics`` callback that scores generated HTML against reference HTML.

    Args:
        trainer: Object exposing ``state.global_step``. When ``save_best_model_dir``
            is given it must also provide ``save_model(path)``.
        save_best_model_dir: Optional directory. Each time an evaluation produces a
            lower perceptual loss than every earlier one, the directory is replaced
            with the current model, the tokenizer and a ``best_results.txt`` summary.

    Returns:
        ``compute_metrics(eval_predictions, to_tensorboard=True)``. It returns a dict
        with ``similarity`` and ``perceptual_loss``: the means over all scored
        examples, rounded to 4 places, or NaN when no example could be scored.
    """
    # Imported locally: the imports next to this definition do not include os.
    import os

    best_perceptual_loss = None

    def compute_metrics(eval_predictions, to_tensorboard = True):
        nonlocal best_perceptual_loss

        predictions, labels = eval_predictions

        if isinstance(predictions, tuple):
            predictions = predictions[0]

        # -100 marks ignored/padded positions and is not a decodable token id, so
        # it is swapped for the pad token in predictions as well as labels.
        predictions = np.where(predictions != -100, predictions, tokenizer.pad_token_id)
        labels = np.where(labels != -100, labels, tokenizer.pad_token_id)
        decoded_predictions = tokenizer.batch_decode(predictions, skip_special_tokens=True)
        decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)

        decoded_predictions, decoded_labels = postprocess_text(decoded_predictions, decoded_labels)

        step = trainer.state.global_step
        similarity_scores = []
        perceptual_losses = []
        for index, (prediction, label) in enumerate(zip(decoded_predictions, decoded_labels), start=1):
            if to_tensorboard:
                add_text_to_tensorboard(f'valid_{index}_label_text', step, label[0])
                add_text_to_tensorboard(f'valid_{index}_prediction_text', step, prediction)

            for viewport in VIEWPORTS:
                size = VIEWPORT_SIZES[viewport]
                metrics = calculate_metrics(prediction, label[0], size['width'], size['height'])

                # calculate_metrics may return None; such examples are left out of the averages.
                if metrics is None:
                    continue

                similarity_scores.append(metrics['similarity'])
                perceptual_losses.append(metrics['perceptual_loss'])

                if to_tensorboard:
                    add_image_to_tensorboard(f'valid_{index}_expectation_' + viewport.lower(), step, metrics['expected_screenshot_path'])
                    add_image_to_tensorboard(f'valid_{index}_prediction_' + viewport.lower(), step, metrics['predicted_screenshot_path'])

        # With nothing scored, report NaN explicitly rather than taking the mean of an empty list.
        nothing_scored = float('nan')
        result = {
            'similarity': float(np.mean(similarity_scores)) if similarity_scores else nothing_scored,
            'perceptual_loss': float(np.mean(perceptual_losses)) if perceptual_losses else nothing_scored,
        }

        # A NaN loss must never become the stored "best": every later `x < nan`
        # comparison is False, which would disable best-model saving for the rest of the run.
        loss_is_usable = bool(np.isfinite(result['perceptual_loss']))
        if save_best_model_dir is not None and loss_is_usable:
            if best_perceptual_loss is None or result['perceptual_loss'] < best_perceptual_loss:
                if os.path.exists(save_best_model_dir):
                    shutil.rmtree(save_best_model_dir)
                trainer.save_model(save_best_model_dir)
                tokenizer.save_pretrained(save_best_model_dir)
                with open(os.path.join(save_best_model_dir, 'best_results.txt'), 'w') as f:
                    f.write(str(result) + '\nStep: ' + str(step))

                best_perceptual_loss = result['perceptual_loss']

        result = {k: round(v, 4) for k, v in result.items()}
        return result

    return compute_metrics

def eval_outside_of_training(model, dataset, to_tensorboard = False, step = None):
    """Generate a prediction for every row of ``dataset`` and score it without a Trainer.

    Args:
        model: Model exposing ``eval()``, ``to(device)`` and ``generate(input_ids)``.
        dataset: Iterable of rows with raw ``svg`` (input) and ``html`` (reference) strings.
        to_tensorboard: Forwarded to the metrics callback to toggle text/image logging.
        step: Global step reported to the metrics callback through a stand-in trainer.

    Returns:
        The metrics dict produced by the callback from ``create_compute_metrics``.

    Side effects:
        Puts the model in eval mode and moves it to ``cuda:0`` when CUDA is
        available (otherwise it stays on / is moved to the CPU).
    """
    def pad_to_matrix(seqs, pad_value):
        # Left-align each sequence and fill the remainder of its row with pad_value.
        max_len = max((len(s) for s in seqs), default=0)
        matrix = np.full((len(seqs), max_len), pad_value, dtype=np.int64)
        for i, s in enumerate(seqs):
            matrix[i, :len(s)] = s
        return matrix

    device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
    model.eval()
    model.to(device)

    predictions = []
    labels = []
    for row in dataset:
        input_ids = tokenizer(row['svg'], return_tensors="pt").input_ids.to(device)
        label_ids = tokenizer(row['html'], return_tensors="pt").input_ids
        labels.append(label_ids.squeeze(0).cpu().numpy())
        # Copy each result to host memory right away so nothing accumulates on the device.
        predictions.append(model.generate(input_ids)[0].cpu().numpy())

    # The callback only reads trainer.state.global_step when no save directory is given.
    mock_trainer = SimpleNamespace(state=SimpleNamespace(global_step=step))
    compute_metrics = create_compute_metrics(mock_trainer)

    # Pad with the tokenizer's own pad id; padding with 0 would inject whichever
    # vocabulary entry has id 0 into the text that gets decoded.
    pad_id = tokenizer.pad_token_id
    return compute_metrics((
            pad_to_matrix(predictions, pad_id),
            pad_to_matrix(labels, pad_id)
        ), to_tensorboard=to_tensorboard)



In [8]:
from transformers import DataCollatorForSeq2Seq

# Pads inputs and labels per batch; labels are padded with -100 (stated explicitly
# here, it is also the collator's default).
# The collator's `model` argument expects a model instance, not a checkpoint name.
# The string passed before was never usable for preparing decoder inputs, so it is
# dropped rather than left in as a misleading no-op.
data_collator = DataCollatorForSeq2Seq(tokenizer=tokenizer, label_pad_token_id=-100)

In [9]:
from transformers import AutoModelForSeq2SeqLM, AutoConfig

# Take the architecture settings of the published checkpoint, but size the
# vocabulary for the custom tokenizer and point the special-token ids at it.
# The pad token doubles as the decoder start token.
base_checkpoint = "google/long-t5-tglobal-base"
vocab_size = len(tokenizer)
pad_id = tokenizer.pad_token_id

config = AutoConfig.from_pretrained(
    base_checkpoint,
    vocab_size=vocab_size,
    pad_token_id=pad_id,
    eos_token_id=tokenizer.eos_token_id,
    decoder_start_token_id=pad_id,
)

# from_config builds the model from the configuration only; no pretrained weights are loaded.
model = AutoModelForSeq2SeqLM.from_config(config)
model.resize_token_embeddings(vocab_size)

config.json:   0%|          | 0.00/851 [00:00<?, ?B/s]

Embedding(32228, 768)

In [10]:
from transformers import GenerationConfig

# Derive the generation settings (start / eos / pad token ids) from the model config.
generation_config = GenerationConfig.from_model_config(model.config)
# Clear the private "derived from model config" marker before customising the settings.
# NOTE(review): presumably this keeps transformers from treating the object as
# auto-generated and overriding it - confirm against the pinned transformers commit.
generation_config._from_model_config = False
# NOTE(review): labels may be up to MAX_SEQUENCE_LENGTH (2048) tokens long, so a
# 1024-token cap can cut predictions short of the reference - confirm this is intended.
generation_config.max_new_tokens = 1024

# Displayed as the cell output for inspection.
generation_config

GenerationConfig {
  "decoder_start_token_id": 3,
  "eos_token_id": 1,
  "max_new_tokens": 1024,
  "pad_token_id": 3
}

In [11]:
from datetime import datetime
# Each run gets its own timestamped directory for checkpoints; TensorBoard event
# files go into its runs/ subdirectory.
run_started_at = datetime.now()
timestamp = run_started_at.strftime("%Y-%m-%d_%H-%M-%S")
model_output_dir = f"model-{timestamp}"
log_dir = f'{model_output_dir}/runs'

In [12]:
# Baseline: score the still-untrained model on the validation rows and log it as step 0.
# NOTE(review): this uses dataset["valid"] (raw text rows) while the Trainer later
# evaluates filtered_data["valid"], so the two may cover different examples.
results = eval_outside_of_training(model, dataset["valid"], True, 0)

# The context manager closes the writer, which flushes the scalars to disk; an
# unclosed writer may leave them buffered.
with SummaryWriter(log_dir=log_dir) as writer:
    for k, v in results.items():
        writer.add_scalar(f"eval/{k}", v, 0)

In [13]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments
import os

# Training configuration. Evaluation and checkpointing both happen every 1000 steps;
# at most three checkpoints are kept. With a per-device batch of 8 and 2 accumulation
# steps, each optimizer step sees 16 examples per device.
training_args = Seq2SeqTrainingArguments(
    output_dir=model_output_dir,
    eval_strategy="steps",
    eval_steps=1000,
    save_strategy="steps",
    save_steps=1000,
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    weight_decay=0.01,
    save_total_limit=3,
    num_train_epochs=2,
    logging_steps=25,
    logging_dir=log_dir,
    # Evaluate by generating sequences, so the metrics callback receives token ids it can decode.
    predict_with_generate=True,
    # load_best_model_at_end=True,
    bf16=True,
    generation_config=generation_config,
    report_to="tensorboard",
    save_safetensors=False # Fixing missing encoder.embed_tokens.weight and decoder.embed_tokens.weight due to a bug.
)

# The metrics callback needs a reference to the trainer (for the global step and for
# saving the best model), so the trainer is created without one first.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=filtered_data["train"],
    eval_dataset=filtered_data["valid"],
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=None,
)

# Attach the callback; the best-scoring model is written to <model_output_dir>/best.
trainer.compute_metrics = create_compute_metrics(trainer, os.path.join(model_output_dir, "best"))

trainer.train()

Detected kernel version 5.4.0, which is below the recommended minimum of 5.5.0; this can cause the process to hang. It is recommended to upgrade the kernel to the minimum version or higher.
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]


Step,Training Loss,Validation Loss,Similarity,Perceptual Loss,Runtime,Samples Per Second,Steps Per Second
1000,34.5008,31.811798,0.524,0.6208,123.1573,0.041,0.008
2000,24.5703,21.587921,0.524,0.6208,119.9176,0.042,0.008
3000,21.0675,18.419531,0.524,0.6208,119.0915,0.042,0.008
4000,17.409,15.980848,0.524,0.6208,118.9101,0.042,0.008
5000,16.056,13.955285,0.524,0.6208,118.3949,0.042,0.008
6000,14.0557,12.818359,0.524,0.6208,119.2105,0.042,0.008
7000,13.5465,11.733341,0.524,0.6208,118.5205,0.042,0.008
8000,12.5396,11.367616,0.524,0.6208,118.8942,0.042,0.008


Passing a tuple of `past_key_values` is deprecated and will be removed in Transformers v4.48.0. You should pass an instance of `EncoderDecoderCache` instead, e.g. `past_key_values=EncoderDecoderCache.from_legacy_cache(past_key_values)`.
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
  with torch.enable_grad(), device_autocast_ctx, torch.cpu.amp.autocast(**ctx.cpu_autocast_kwargs):  # type: ignore[attr-defined]
IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopu

In [14]:
import os

# Store the end-of-training weights and the tokenizer together in <model_output_dir>/final.
final_dir = os.path.join(model_output_dir, "final")
trainer.save_model(final_dir)
tokenizer.save_pretrained(final_dir)

('model-2025-04-28_21-42-34/final/tokenizer_config.json',
 'model-2025-04-28_21-42-34/final/special_tokens_map.json',
 'model-2025-04-28_21-42-34/final/tokenizer.json')

In [15]:
from transformers import LongT5ForConditionalGeneration

# Qualitative check: reload the "best" checkpoint and generate HTML for the first test row.
sample = dataset['test'][0]
text = sample['svg']
best_checkpoint_dir = os.path.join(model_output_dir, "best")

# `model` is rebound to the reloaded checkpoint here.
model = LongT5ForConditionalGeneration.from_pretrained(best_checkpoint_dir)
model.to('cuda:0')
model.generation_config = generation_config

inputs = tokenizer(text, return_tensors="pt").input_ids.to('cuda:0')
outputs = model.generate(inputs)
# Sampling alternative that was tried:
# outputs = model.generate(inputs, max_new_tokens=16_384, do_sample=True, top_k=30, top_p=0.95)

translation = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Reference first, prediction second.
print(sample['html'])
print(translation)


<body><div class="" data-mesh-id="comp-l9x50dsoinlineContent" data-testid="inline-content"><div data-mesh-id="comp-l9x50dsoinlineContent-gridContainer" data-testid="mesh-container-content"><div class="FubTgk comp-l9x50dsx1" aria-disabled="false" id="comp-l9x50dsx1"><a aria-disabled="false" class="PlZyDq uDW_Qe wixui-button" data-testid="linkElement" href="https://www.fournituresdebeauce.com/accessoires" target="_self"><span class="l7_2fn wixui-button__label">ACCESSOIRES</span></a></div></div></div></body>

<style>

@keyframes slide-horizontal-new{0%{transform:translateX(100%)}}@keyframes slide-horizontal-old{80%{opacity:1}to{opacity:0;transform:translateX(-100%)}}@keyframes slide-vertical-new{0%{transform:translateY(-100%)}}@keyframes slide-vertical-old{80%{opacity:1}to{opacity:0;transform:translateY(100%)}}@keyframes out-in-new{0%{opacity:0}}@keyframes out-in-old{to{opacity:0}}@media(prefers-reduced-motion:reduce){::view-transition-group(*),::view-transition-new(*),::view-transition-o

In [17]:
# Score the currently loaded model on the held-out test rows, without TensorBoard logging.
test_results = eval_outside_of_training(model, dataset["test"], to_tensorboard=False)

print(test_results)

{'similarity': 0.9592, 'perceptual_loss': 0.1043}
