 ## Perform the Span-MLM training step of Vocabulary Transfer

Load the new and old tokenizers as well as the mT5 model

In [None]:
from transformers import AutoTokenizer
from transformers import AutoModelForSeq2SeqLM

# Hub checkpoint that supplies both the original (old) tokenizer and the model weights.
model_id = "google/mT5-small"

# New tokenizer, loaded from "german_tokenizer".
tokenizer = AutoTokenizer.from_pretrained("german_tokenizer")

# Original tokenizer of the checkpoint. `device_map` was dropped here: it is a
# model-placement argument and has no meaning for a tokenizer.
# NOTE(review): force_download=True re-fetches the tokenizer files on every run;
# remove it if the cached copy is known to be good.
tokenizer_old = AutoTokenizer.from_pretrained(model_id, legacy=False, force_download=True)

# Pretrained seq2seq model whose embeddings are re-initialized in the next step.
model = AutoModelForSeq2SeqLM.from_pretrained(model_id)

Initialize the embeddings for the new vocabulary using the Vocabulary Transfer method proposed by Mosin et al. (https://doi.org/10.1016/j.artint.2023.103860)

In [None]:
from vocabulary_transfer import initialize_embeddings

# Rebind `model` to the result of the vocabulary-transfer initialization, which
# receives the model together with its old tokenizer and the new tokenizer.
model = initialize_embeddings(
    model=model,
    tokenizer_old=tokenizer_old,
    tokenizer_new=tokenizer,
)

Load the pre-tokenized noised dataset for span-MLM

In [None]:
from datasets import load_from_disk

# Location of the dataset previously saved to disk (pre-tokenized and noised
# for span-MLM, per the notebook text above).
data_path = "german_ds_smlm_noised"
# Load the saved dataset; the "train" split of this object is used for training below.
span_mlm_ds = load_from_disk(data_path)

In [None]:
# Display the loaded dataset object as the cell output for a quick sanity check.
span_mlm_ds

Set up the DataCollator, optimizer, and training arguments

In [None]:
from transformers import Seq2SeqTrainer, Seq2SeqTrainingArguments, Adafactor
from transformers import DataCollatorForSeq2Seq

# Collator that batches and pads the seq2seq examples; padded lengths are
# rounded up to a multiple of 8.
data_collator = DataCollatorForSeq2Seq(
    tokenizer=tokenizer,
    model=model,
    pad_to_multiple_of=8,
)

# Adafactor with an explicit, fixed learning rate (relative_step and
# scale_parameter are disabled, so lr=1e-3 is used as given).
optimizer = Adafactor(
    model.parameters(),
    lr=1e-3,
    scale_parameter=False,
    relative_step=False,
    clip_threshold=1.0,
    decay_rate=0.0,
)

# Also reused later in the notebook, so keep the name stable.
output_dir = "mT5-small_vt_smlm"
training_args = Seq2SeqTrainingArguments(
    per_device_train_batch_size=16,
    # NOTE(review): the "constant" schedule has no warmup phase, so this value
    # looks unused; switch lr_scheduler_type to "constant_with_warmup" if a
    # warmup was intended.
    warmup_ratio=0.1,
    output_dir=output_dir,
    num_train_epochs=1,
    logging_dir="logs",
    logging_strategy="steps",
    logging_steps=10,
    # No checkpoints are written during training.
    save_strategy="no",
    # NOTE(review): presumably superseded by the explicit optimizer handed to
    # the trainer below - confirm.
    optim="adafactor",
    lr_scheduler_type="constant",
)

# Train on at most the first 100,000 examples to shorten training time.
# Capping at the split length avoids an out-of-range selection when the
# dataset is smaller than that.
train_split = span_mlm_ds["train"]
train_subset = train_split.select(range(min(100_000, len(train_split))))

# Create the Trainer instance. Only the optimizer is supplied; the scheduler
# slot is None so the trainer builds it from training_args.
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=train_subset,
    optimizers=(optimizer, None),
)

Start training

In [None]:
# Run the training loop configured above; the return value is shown as the cell output.
trainer.train()

In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Turn the trainer's logged records into a table, one row per log entry.
log_records = trainer.state.log_history
history = pd.DataFrame(log_records)

# Plot the logged loss values against the log-entry index.
loss_curve = history['loss']
plt.plot(loss_curve)
plt.show()

Push the model to a repository in the Hugging Face Hub

In [None]:
# Set your Hugging Face Hub token here before pushing to the Hub.
token = ''

# Push under the same name as the training output directory. The previous code
# formatted the builtin `dir` function into the repository name, which yields
# "<built-in function dir>" instead of a usable repo id.
repo_id = output_dir

# Upload the trained model and the new tokenizer to the same private repository.
trainer.model.push_to_hub(repo_id, save_embedding_layers=True, token=token, private=True)
tokenizer.push_to_hub(repo_id, token=token, private=True)

The resulting model can be fine-tuned with LoRA using the [finetuning_evaluation_pipeline.ipynb](finetuning_evaluation_pipeline.ipynb) notebook.