### In-Context Cross-lingual Transfer.
Training example notebook.

In [3]:
## import libraries
import pandas as pd
from transformers import AutoTokenizer, MT5ForConditionalGeneration, TrainingArguments
from datasets import Dataset

from src.data_handling import get_class_objects
from src.ic_xlt_utils import train_lora, preprocess_function

In [4]:
## language whose train/test splits are used as the source
source_language = 'english'

## root folder of the dataset stored on disk
data_dir = 'data/massive' ## or 'data/acd'

## load the train and test splits of the source language from disk
dataset_train = Dataset.load_from_disk(f'{data_dir}/train/{source_language}')
dataset_test = Dataset.load_from_disk(f'{data_dir}/test/{source_language}')

## class objects built from both splits (presumably the label set and the
## label<->id mappings; see src.data_handling.get_class_objects)
class_set, lbl2id_class, id2lbl_class = get_class_objects(dataset_train, dataset_test)

We employ an mT5 model (`google/mt5-large`) as the base model.

In [5]:
## import model and tokenizer

## single checkpoint id shared by both calls, so the tokenizer and the model
## cannot end up pointing at different checkpoints
model_checkpoint = 'google/mt5-large'

tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)
base_model = MT5ForConditionalGeneration.from_pretrained(model_checkpoint)

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


$M$ is the number of examples prepended to the context.<br>
If $M=0$ or $M$ is set to `None`, training is done as Prompt-based Fine-Tuning (PFT), with input–output pairs $x_i\to y_i$.<br>
If $M\geq1$, training is done through In-Context Tuning (ICT), with $X^{src},x_i\to y_i$, where $X^{src}$ are the context examples drawn from the training dataset.<br>

In [6]:
## preprocess and tokenize text

## number of context examples prepended to each training input
## (0 or None -> prompt-based fine-tuning, >= 1 -> in-context tuning)
M = 10

def preprocess_wrapper_icl(sample):
    '''
    Adapter handed to `Dataset.map`: forwards `sample` to `preprocess_function`
    together with the notebook-level `tokenizer`, passing the notebook-level
    `M` as `ict_n` (the number of context examples to add).

    Both `tokenizer` and `M` are read from the notebook namespace when the
    function is called, not when it is defined.
    '''
    return preprocess_function(sample, tokenizer, ict_n=M)

## preprocess the training split in batches and drop the raw columns
tokenized_dataset_train = dataset_train.map(
    preprocess_wrapper_icl,
    batched=True,
    remove_columns=['text', 'label'],
)

Map:   0%|          | 0/11514 [00:00<?, ? examples/s]

In [7]:
print('Training data sample:')

## index the row first: `dataset[0]['input_ids']` reads a single example,
## whereas `dataset['input_ids'][0]` selects the whole column before
## picking its first element
tokenizer.decode(tokenized_dataset_train[0]['input_ids'], skip_special_tokens=True)

Training data sample:


'Text: could you please recommend a mexican restaurant for take out Labels: takeaway Text: please pause yourself Labels: audio Text: please make the room brighter Labels: iot Text: change the color of the lights in my bedroom to red Labels: iot Text: remind me to play this song tonight Labels: play Text: please turn off my alarm for this evening Labels: alarm Text: change the volume of the speakers Labels: audio Text: how is the weather today Labels: weather Text: when is my alarm set for Labels: alarm Text: show me the alarms i set Labels: alarm Text: wake me up at nine am on friday Labels: '

In [8]:
training_args = TrainingArguments(
    # checkpointing
    output_dir='checkpoints_trained',  # directory to save the checkpoint
    save_strategy='epoch',
    # optimisation schedule
    num_train_epochs=10,
    learning_rate=4e-4,
    # batch sizes; auto_find_batch_size is enabled alongside the explicit sizes
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    auto_find_batch_size=True,
    # reproducibility
    seed=1,
    data_seed=1,
    # distributed training
    ddp_find_unused_parameters=False,
)

In [10]:
## train the LoRA on the tokenized training split
model = train_lora(
    base_model=base_model,
    peft_training_args=training_args,
    dataset_train=tokenized_dataset_train,
    ## pass a LoraConfig object here to load a LoRA with custom parameters
    lora_config=None,
    ## pass a checkpoint here to continue fine-tuning an already trained LoRA
    lora_checkpoint=None,
)


 > New LoRA loaded




Step,Training Loss


KeyboardInterrupt: 