In [1]:
# if running on colab, install the following packages
# !pip install -U transformers datasets peft accelerate trl jupyter-black

In [2]:
# helps make your notebook pretty
import jupyter_black

jupyter_black.load()

In [3]:
from copy import copy
from time import time
from typing import Dict, List, Optional

import pandas as pd
import torch
from datasets import Dataset
from peft import LoraConfig, get_peft_model
from peft.peft_model import PeftModelForCausalLM
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BatchEncoding,
    PreTrainedTokenizerBase,
    Trainer,
    TrainingArguments,
)
from trl import DataCollatorForCompletionOnlyLM, SFTTrainer

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
INSTRUCTION_TEMPLATE_BASE = "\n\n### Human:"
RESPONSE_TEMPLATE_BASE = "\n\n### Assistant:"


def load_model(
    model_name: str, peft_kwargs: Optional[Dict] = None
) -> PeftModelForCausalLM:
    """Load a causal language model and wrap it with a LoRA adapter.

    Args:
        model_name: Identifier handed to ``AutoModelForCausalLM.from_pretrained``.
        peft_kwargs: Extra keyword arguments forwarded to ``LoraConfig``.
            ``None`` means no extra arguments.

    Returns:
        The value returned by ``get_peft_model`` for the freshly loaded model.
    """
    base_model = AutoModelForCausalLM.from_pretrained(model_name)
    lora_options = {} if peft_kwargs is None else peft_kwargs
    lora_config = LoraConfig(task_type="CAUSAL_LM", **lora_options)
    # Alternatively, the model could be loaded through
    # PeftModelForCausalLM.from_pretrained instead of being wrapped here.
    return get_peft_model(base_model, lora_config)


def add_special_tokens(
    example: Dict,
    tokenizer: PreTrainedTokenizerBase,
    instruction_template: Optional[str] = None,
    response_template: Optional[str] = None,
) -> Dict:
    """Mark the turn boundaries of ``example["text"]`` with bos/eos tokens.

    The eos token is inserted before every instruction (human) template and
    the bos token right after every response (assistant) template. The text
    is then made to end with exactly the eos token it needs, and any eos
    token that ended up at the very start of the text is removed.

    Args:
        example: Mapping with a ``"text"`` entry; it is updated in place.
        tokenizer: Object exposing ``eos_token`` and ``bos_token`` strings.
        instruction_template: Text that introduces a human turn. Defaults to
            the module-level ``INSTRUCTION_TEMPLATE_BASE``.
        response_template: Text that introduces an assistant turn. Defaults
            to the module-level ``RESPONSE_TEMPLATE_BASE``.

    Returns:
        The same ``example`` mapping with its ``"text"`` entry rewritten.

    Raises:
        ValueError: If the tokenizer has no (or an empty) bos or eos token.
    """
    eos_token = tokenizer.eos_token
    bos_token = tokenizer.bos_token
    # Fail early and clearly: a None token would raise an opaque TypeError
    # below, and an empty eos token would make the stripping loop never end.
    if not eos_token or not bos_token:
        raise ValueError(
            "tokenizer must define non-empty bos_token and eos_token to mark turns"
        )
    # Resolve the defaults at call time so the function follows the notebook's
    # constants without freezing them into the signature.
    if instruction_template is None:
        instruction_template = INSTRUCTION_TEMPLATE_BASE
    if response_template is None:
        response_template = RESPONSE_TEMPLATE_BASE

    # add eos_token before human text and bos_token before assistant text
    text = (
        example["text"]
        .replace(instruction_template, eos_token + instruction_template)
        .replace(response_template, response_template + bos_token)
    )
    if not text.endswith(eos_token):
        text += eos_token
    # Remove leading EOS tokens (the first human turn has nothing to close)
    while text.startswith(eos_token):
        text = text[len(eos_token) :]

    example["text"] = text
    return example

We're going to use bloom-560m, a small multilingual model that we can reasonably finetune without needing a GPU. 


In [5]:
# Preprocessing: load the tokenizer and build a tiny, hand-written training set.
model_name = "bigscience/bloom-560m"
# NOTE(review): trust_remote_code=True permits running code shipped with the
# checkpoint; presumably not needed for this model - confirm and consider dropping it.
tokenizer = AutoTokenizer.from_pretrained(
    model_name, trust_remote_code=True, padding_side="right"
)  # padding side should be right for causal lm

# overfit to 5 examples
# each example is a single human question followed by the assistant's answer
str1 = '\n\n### Human: How do you say "dog" in Spanish?\n\n### Assistant: perro'
str2 = '\n\n### Human: How do you say "water" in Spanish?\n\n### Assistant: agua'
str3 = '\n\n### Human: How do you say "mother" in Spanish?\n\n### Assistant: madre'
str4 = '\n\n### Human: How do you say "hello" in Spanish?\n\n### Assistant: hola'
str5 = '\n\n### Human: How do you say "tree" in Spanish?\n\n### Assistant: árbol'
train_data = {
    "text": [str1, str2, str3, str4, str5],
}
dataset_text = Dataset.from_dict(train_data)
# insert the tokenizer's bos/eos markers into every "text" entry
dataset_text = dataset_text.map(lambda x: add_special_tokens(x, tokenizer))
print(f"{dataset_text=}")
print(f"{dataset_text[0]=}")

Map: 100%|██████████| 5/5 [00:00<?, ? examples/s]

dataset_text=Dataset({
    features: ['text'],
    num_rows: 5
})
dataset_text[0]={'text': '\n\n### Human: How do you say "dog" in Spanish?\n\n### Assistant:<s> perro</s>'}





In [6]:
# tokenize the text in batches; the raw "text" column is dropped so that only
# the tokenizer outputs remain in the dataset
dataset = dataset_text.map(
    lambda example: tokenizer(example["text"]), batched=True, remove_columns=["text"]
)
# copy the input_ids to labels
# (identical labels mean every token, prompt included, is a training target)
dataset = dataset.map(lambda x: {"labels": x["input_ids"]}, batched=True)
# show the resulting columns and the first example for a visual check
print(f"{dataset=}")
print(f"{dataset[0]['input_ids']=}")
print(f"{dataset[0]['labels']=}")
print(f"{dataset[0]['attention_mask']=}")

Map: 100%|██████████| 5/5 [00:00<00:00, 308.06 examples/s]
Map: 100%|██████████| 5/5 [00:00<00:00, 1629.87 examples/s]

dataset=Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 5
})
dataset[0]['input_ids']=[603, 105311, 22256, 29, 7535, 727, 1152, 5894, 20587, 744, 5, 361, 49063, 7076, 105311, 143005, 29, 1, 82208, 2]
dataset[0]['labels']=[603, 105311, 22256, 29, 7535, 727, 1152, 5894, 20587, 744, 5, 361, 49063, 7076, 105311, 143005, 29, 1, 82208, 2]
dataset[0]['attention_mask']=[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]





To start, labels and input_ids are identical. This means during training we will predict each token one at a time based on the previous tokens and learn by comparing what we predicted to the true label. Let's see what happens when we train a model like that. 

In [7]:
# training code inspired by
# https://mlabonne.github.io/blog/posts/Fine_Tune_Your_Own_Llama_2_Model_in_a_Colab_Notebook.html

# fresh LoRA-wrapped model built by load_model
model = load_model(model_name)
# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"

# How many times to iterate over the entire dataset
num_train_epochs = 15

# We're not aligning the sequence length (ie padding or truncating)
# so batch training won't work for our toy example.
per_device_train_batch_size = 1


# This one TrainingArguments object is passed to every trainer created in this
# notebook; the fixed seed keeps the runs comparable.
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    seed=1,
)

In [8]:
trainer = Trainer(
    model=model,
    train_dataset=dataset,
    args=training_arguments,
)

In [9]:
# Prompt for a word that is not part of the training set (expected answer: bueno).
# It ends with "<s>", the same marker add_special_tokens places right after the
# response template in the training text.
holdout_str = (
    '\n\n### Human: How do you say "good" in Spanish?\n\n### Assistant:<s>'  # bueno
)
# use the GPU when one is available, otherwise fall back to the CPU
device = "cuda" if torch.cuda.is_available() else "cpu"
# tokenized once here and reused by the generation cells below
holdout_input = tokenizer(holdout_str, return_tensors="pt").to(device)

In [10]:
def sample_generate(
    model: torch.nn.Module,
    tokenizer: PreTrainedTokenizerBase,
    inputs: BatchEncoding,
    **kwargs
) -> str:
    """Generate a continuation for ``inputs`` and decode only the new tokens.

    Args:
        model: Object whose ``generate`` method is called with ``inputs`` and
            ``kwargs`` unpacked as keyword arguments.
        tokenizer: Object whose ``batch_decode`` turns token ids into strings.
        inputs: Mapping that contains at least a 2-D ``"input_ids"`` entry.
        **kwargs: Extra generation options forwarded to ``model.generate``.

    Returns:
        The decoded continuation of the first sequence in the batch.
    """
    generated_ids = model.generate(**inputs, **kwargs)
    # The generated ids start with the prompt; skip those columns so that
    # only the continuation is decoded.
    prompt_length = inputs["input_ids"].shape[1]
    continuation_ids = generated_ids[:, prompt_length:]
    decoded = tokenizer.batch_decode(continuation_ids)
    return decoded[0]

In [11]:
# Word pairs of the toy training set, in matching order.
ENGLISH_WORDS = ["dog", "water", "mother", "hello", "tree"]
SPANISH_WORDS = ["perro", "agua", "madre", "hola", "árbol"]


def predict_training_set(
    model: torch.nn.Module,
    tokenizer: PreTrainedTokenizerBase,
    english_words: List[str] = ENGLISH_WORDS,
    spanish_words: List[str] = SPANISH_WORDS,
):
    """Print the model's answer next to the true answer for each word pair.

    Args:
        model: Model handed to ``sample_generate``.
        tokenizer: Tokenizer used to encode each prompt and decode the answer.
        english_words: Words to ask about, one prompt per word.
        spanish_words: Expected answers, paired positionally with
            ``english_words``.
    """
    for eng, expected in zip(english_words, spanish_words):
        prompt = f'\n\n### Human: How do you say "{eng}" in Spanish?\n\n### Assistant:<s>'
        # `device` is the notebook-level device string
        encoded_prompt = tokenizer(prompt, return_tensors="pt").to(device)
        predicted = sample_generate(model, tokenizer, encoded_prompt, max_new_tokens=5)
        print("real answer:", expected, "\tpredicted answer:", predicted)

In [12]:
def print_iterative_generate(model, tokenizer, inputs):
    """Print the model's one-token prediction after every prefix of the input.

    For each prefix length from 1 up to the full sequence, the prefix is fed
    to ``sample_generate`` with ``max_new_tokens=1``; the predicted pieces are
    concatenated and printed. This approximates what the training forward pass
    predicts at each position.
    """
    sequence_length = len(inputs["input_ids"][0])

    def _predict_after(prefix_length):
        # Copy first so that the caller's encoding keeps its full-length tensors.
        prefix_inputs = inputs.copy()
        prefix_inputs["input_ids"] = inputs["input_ids"][:, :prefix_length]
        prefix_inputs["attention_mask"] = inputs["attention_mask"][:, :prefix_length]
        return sample_generate(model, tokenizer, prefix_inputs, max_new_tokens=1)

    predictions = [_predict_after(n) for n in range(1, sequence_length + 1)]
    print("".join(predictions))

In [13]:
predict_training_set(model, tokenizer)

real answer: perro 	predicted answer: </s>
real answer: agua 	predicted answer: </s>
real answer: madre 	predicted answer: </s>
real answer: hola 	predicted answer: </s>
real answer: árbol 	predicted answer: </s>


In [14]:
original_output = sample_generate(model, tokenizer, holdout_input, max_new_tokens=5)
original_output

'</s>'

In [15]:
print_iterative_generate(model, tokenizer, holdout_input)

#include
 code
 to I get "it's" to a?
A: Spanish: How</s>


In [16]:
training1 = trainer.train()

100%|██████████| 75/75 [01:10<00:00,  1.06it/s]

{'train_runtime': 70.8013, 'train_samples_per_second': 1.059, 'train_steps_per_second': 1.059, 'train_loss': 4.480442301432292, 'epoch': 15.0}





In [17]:
predict_training_set(model, tokenizer)

real answer: perro 	predicted answer: </s>
real answer: agua 	predicted answer: </s>
real answer: madre 	predicted answer: mamá</s>
real answer: hola 	predicted answer: </s>
real answer: árbol 	predicted answer: tree</s>


We're still kind of confused. Let's continue training. 

In [18]:
trainer.train()

  0%|          | 0/75 [00:00<?, ?it/s]

100%|██████████| 75/75 [01:06<00:00,  1.12it/s]

{'train_runtime': 66.6733, 'train_samples_per_second': 1.125, 'train_steps_per_second': 1.125, 'train_loss': 2.1419362386067706, 'epoch': 15.0}





TrainOutput(global_step=75, training_loss=2.1419362386067706, metrics={'train_runtime': 66.6733, 'train_samples_per_second': 1.125, 'train_steps_per_second': 1.125, 'train_loss': 2.1419362386067706, 'epoch': 15.0})

In [19]:
predict_training_set(model, tokenizer)

real answer: perro 	predicted answer:  perro</s>
real answer: agua 	predicted answer:  agua</s>
real answer: madre 	predicted answer:  madre</s>
real answer: hola 	predicted answer:  hola</s>
real answer: árbol 	predicted answer:  árbol</s>


In [20]:
sample_generate(model, tokenizer, holdout_input, max_new_tokens=5)

' bueno</s>'

After 30 epochs, we learned what we were supposed to!

In [21]:
print_iterative_generate(model, tokenizer, holdout_input)

#
: How do you say "how morning in Spanish?

### Assistant: gu bueno


As we expected, we learned how to write the whole prompt (better). But we kinda don't care if we're focused on building a chat bot. Is there a way to learn just the chat part? 

### Masked approach
Ok so we're wasting time learning the human prompt. Let's switch up the labels to only focus on the assistant's response. Let's exploit the bos and eos tokens we added to the text earlier. 

In [22]:
def create_special_mask(
    example: Dict,
    bos_token_id: Optional[int] = None,
    eos_token_id: Optional[int] = None,
    mask_token_id: int = -100,
) -> Dict:
    """Mask human text and keep assistant text as it is.

    Walks through ``example["labels"]`` assuming the sequence starts with
    human text. Everything up to and including a bos token is replaced by
    ``mask_token_id``; the tokens that follow, up to and including the next
    eos token, are kept. After an eos token the text is treated as human
    text again, so multi-turn sequences are handled too.

    Args:
        example (Dict): Result of tokenizing some text; must have a "labels"
            entry holding token ids.
        bos_token_id (Optional[int]): Id that starts an assistant span.
            Defaults to the notebook-level ``tokenizer.bos_token_id``.
        eos_token_id (Optional[int]): Id that ends an assistant span.
            Defaults to the notebook-level ``tokenizer.eos_token_id``.
        mask_token_id (int): Value written over masked positions. -100 is how
            we tell the loss calculation to ignore a token.

    Returns:
        Dict: The same dict with "labels" replaced by the masked labels.
    """
    # Fall back to the notebook's tokenizer only when no ids were given, so the
    # function can also be used with a different tokenizer.
    if bos_token_id is None:
        bos_token_id = tokenizer.bos_token_id
    if eos_token_id is None:
        eos_token_id = tokenizer.eos_token_id

    # Build a new list instead of editing in place: if "labels" is the very
    # same list object as "input_ids", in-place masking would corrupt the inputs.
    masked_labels = []
    # assume we always start with a human text
    human_text = True
    for tok_id in example["labels"]:
        if human_text:
            # mask all human text up until and including the bos token
            masked_labels.append(mask_token_id)
            if tok_id == bos_token_id:
                human_text = False
        else:
            # assistant text (including its closing eos token) is kept as it is
            masked_labels.append(tok_id)
            if tok_id == eos_token_id:
                human_text = True
    example["labels"] = masked_labels
    return example

In [23]:
# sanity check that our format is correct
# we'd expect -100 for the human text and the actual token(s) for the assistant text
# mask human characters but keep the assistant text as it is
dataset_masked = dataset.map(create_special_mask)
# convert dataset from lists to torch tensors
dataset_masked.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
# single quotes inside the f-string keep this cell valid on Python versions before 3.12
print(f"{dataset_masked[0]['labels']=}")
label_ex = dataset_masked[0]["labels"]
# let's see just the non-masked text
print(f"non masked text: {tokenizer.decode(label_ex[label_ex != -100], skip_special_tokens=False)}")
# let's see the full label sequence, masked positions included
# -100 is not a real token, convert to something the tokenizer understands
# (id 0 is decoded as <unk> by this tokenizer)
label_ex[label_ex == -100] = 0
print(f"full 'label': {tokenizer.decode(label_ex, skip_special_tokens=False)}")

Map: 100%|██████████| 5/5 [00:00<00:00, 1222.47 examples/s]

dataset_masked[0]["labels"]=tensor([ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100, 82208,     2])
non masked text:  perro</s>
full 'label': <unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk> perro</s>





In [24]:
# Reset the model
model = load_model(model_name)

In [25]:
trainer = Trainer(
    model=model,
    train_dataset=dataset_masked,
    args=training_arguments,
)

In [26]:
training2 = trainer.train()

  0%|          | 0/75 [00:00<?, ?it/s]

100%|██████████| 75/75 [01:01<00:00,  1.22it/s]

{'train_runtime': 61.7164, 'train_samples_per_second': 1.215, 'train_steps_per_second': 1.215, 'train_loss': 1.8535097249348957, 'epoch': 15.0}





In [27]:
# Compare the training time of the masked run (training2) with the baseline (training1).
baseline_runtime = training1.metrics["train_runtime"]
masked_runtime = training2.metrics["train_runtime"]
print(f"{training2.metrics['train_runtime']=}")
print(f"{training1.metrics['train_runtime'] =}")
# Relative speed-up as a whole percentage. The percent format spec does the
# scaling and rounding in one step, which avoids float noise such as
# 14.000000000000002 that `100 * round(x, 2)` can produce.
print(f"{(baseline_runtime - masked_runtime) / baseline_runtime:.0%}")

training2.metrics['train_runtime']=61.7164
training1.metrics['train_runtime'] =70.8013
13.0%


We were faster this time by more than 10%. Presumably, the fact that we have fewer loss calculations makes things a bit quicker. 
I wouldn't bank on this speed up being this large - our example is pretty lopsided with much more human text than generated text. But when training times are in the hours, every little percentage matters.  

In [28]:
predict_training_set(model, tokenizer)

real answer: perro 	predicted answer:  perro</s>


real answer: agua 	predicted answer:  agua</s>
real answer: madre 	predicted answer:  madre</s>
real answer: hola 	predicted answer:  hola</s>
real answer: árbol 	predicted answer:  árbol</s>


In [29]:
sample_generate(model, tokenizer, holdout_input, max_new_tokens=5)

' bueno</s>'

Cool, this time we only needed 15 epochs to learn the task. Let's go back to how things are under the hood during training 

In [30]:
print_iterative_generate(model, tokenizer, holdout_input)

#include
 code
 to I get "we" in English?
A: Spanish: How bueno


Iteratively predicting the prompt leads to non-sense compared with our first training approach. This checks out: we masked the prompt during training and therefore don’t learn how to predict anything up until our real target: the assistant response.

### Finetuning using HuggingFace TRL

Let's have HuggingFace do this for us: we'll use TRL's data collator, which parses the text into its human and assistant parts and masks the labels accordingly. 

In [31]:
# reset the model
model = load_model(model_name)

In [32]:
# a huggingface function to do the copying of labels for you.
# using the instruction and response templates will mask everything between the instruction template
# and the start of the response_template
# here the eos and bos tokens inserted by add_special_tokens double as the two templates
collator = DataCollatorForCompletionOnlyLM(
    instruction_template=tokenizer.eos_token,
    response_template=tokenizer.bos_token,
    tokenizer=tokenizer,
)

In [33]:
# a beefed up trainer class for supervised fine-tuning.
# we can feed the raw text and have it run the DataCollatorForCompletionOnlyLM on that
# text to get the text data in the tokenized format causallm models expect
trainersft = SFTTrainer(
    model,
    train_dataset=dataset_text,
    dataset_text_field="text",
    data_collator=collator,
    args=training_arguments,
    tokenizer=tokenizer,
)

Map: 100%|██████████| 5/5 [00:00<00:00, 1248.38 examples/s]


Let's quickly check what the collator does on a single example. Note the brackets: DataCollatorForCompletionOnlyLM expects a list of examples. One upside is that it will handle truncation and padding so you can batch efficiently. 

In [34]:
print(dataset_text[0]["text"])
collator_output = collator([tokenizer(dataset_text[0]["text"])])
collator_output



### Human: How do you say "dog" in Spanish?

### Assistant:<s> perro</s>


{'input_ids': tensor([[   603, 105311,  22256,     29,   7535,    727,   1152,   5894,  20587,
            744,      5,    361,  49063,   7076, 105311, 143005,     29,      1,
          82208,      2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'labels': tensor([[ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100, 82208,  -100]])}

In [35]:
sftrain = trainersft.train()

  1%|▏         | 1/75 [00:00<01:09,  1.06it/s]

100%|██████████| 75/75 [01:00<00:00,  1.23it/s]

{'train_runtime': 60.8437, 'train_samples_per_second': 1.233, 'train_steps_per_second': 1.233, 'train_loss': 2.96381591796875, 'epoch': 15.0}





In [36]:
# Compare the training time of the TRL run (sftrain) with the baseline (training1).
baseline_runtime = training1.metrics["train_runtime"]
sft_runtime = sftrain.metrics["train_runtime"]
print(f"{sftrain.metrics['train_runtime']=}")
print(f"{training1.metrics['train_runtime'] =}")
# Relative speed-up as a whole percentage. The percent format spec does the
# scaling and rounding in one step, which avoids float noise such as
# 14.000000000000002 that `100 * round(x, 2)` produces here.
print(f"{(baseline_runtime - sft_runtime) / baseline_runtime:.0%}")

sftrain.metrics['train_runtime']=60.8437
training1.metrics['train_runtime'] =70.8013
14.000000000000002%


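Training with the TRL approach took about as long as our manual masked approach (60.8s vs. 61.7s in this run), so we keep the speed-up over the unmasked baseline. If you do see it run slower, that might be credited to the fact that the trainer has to tokenize the raw text for us rather than receiving data that was tokenized as a preprocessing step, as in the masked approach. Remember that this approach gives us free batching (you’d need to tweak the tokenization process to use the masked approach to batch properly), which should make things faster in the long run. 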


In [37]:
predict_training_set(model, tokenizer)

real answer: perro 	predicted answer:  perro</s>


real answer: agua 	predicted answer:  agua</s>
real answer: madre 	predicted answer:  madre</s>
real answer: hola 	predicted answer:  hola</s>
real answer: árbol 	predicted answer:  árbol</s>


In [38]:
sample_generate(model, tokenizer, holdout_input, max_new_tokens=5)

' bueno</s>'

In [39]:
print_iterative_generate(model, tokenizer, holdout_input)

#include
 code
 to I get "we" in a?
A: Spanish: How bueno


If we look closely at the sample output from the data collator, our labels looked like this: 

`tensor([[ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100, 82208,  -100]])`
This is slightly different than our manual masking approach:

`tensor([ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100, 82208,     2])`
Ignoring the slightly different tensor shape, we have an additional *2* token with the manual approach. This is because the data collator does exclusive masking of the instruction template. We hacked it to say that the "\</s>" token is the "instruction template" which signals the human is about to be the source of the text. We generally don't want to generate that actual instruction template, so the collator masks it. We still correctly output the end of sequence token, but that was luck, not actual learning. If you look back at the output from before finetuning, we always knew to end our sequences with \</s> even before we finetuned the model on our task. In fact, if we continue training, we forget that ability to end sequences with an end of sequence token.

In [40]:
# Continue training the same model, this time for 5 epochs only.
# The arguments are copied before the change so that the epoch count of the
# original arguments object (presumably the shared training_arguments) stays as it was.
argcopy = copy(trainersft.args)
argcopy.num_train_epochs = 5
trainersft.args = argcopy
trainersft.train()

100%|██████████| 25/25 [00:21<00:00,  1.15it/s]

{'train_runtime': 21.807, 'train_samples_per_second': 1.146, 'train_steps_per_second': 1.146, 'train_loss': 0.39307949066162107, 'epoch': 5.0}





TrainOutput(global_step=25, training_loss=0.39307949066162107, metrics={'train_runtime': 21.807, 'train_samples_per_second': 1.146, 'train_steps_per_second': 1.146, 'train_loss': 0.39307949066162107, 'epoch': 5.0})

In [41]:
predict_training_set(model, tokenizer)

real answer: perro 	predicted answer:  perro</s>
real answer: agua 	predicted answer:  agua
### Assistant:
real answer: madre 	predicted answer:  madre</s>
real answer: hola 	predicted answer:  hola
### Human:
real answer: árbol 	predicted answer:  árbol</s>


### Using special tokens to split the human and assistant text

How might we fix this problem?

One solution is to use the instruction and response template more as originally intended where they aren't the bos and eos tokens but rather actual text indicating whether it's the human or assistant speaking. Let's try using `"\n\n### Human:"` and `"\n\n### Assistant:"` as our instruction and response templates respectively. 

In [42]:
collator = DataCollatorForCompletionOnlyLM(
    instruction_template=INSTRUCTION_TEMPLATE_BASE,
    response_template=RESPONSE_TEMPLATE_BASE,
    tokenizer=tokenizer,
)

In [43]:
print(f"{RESPONSE_TEMPLATE_BASE=}")
print(f"{tokenizer(RESPONSE_TEMPLATE_BASE)=}")

RESPONSE_TEMPLATE_BASE='\n\n### Assistant:'
tokenizer(RESPONSE_TEMPLATE_BASE)={'input_ids': [603, 105311, 143005, 29], 'attention_mask': [1, 1, 1, 1]}


How to interpret this: We're expecting `\n\n### Assistant:` towards the end of our text to signal that this is the model's output. Specifically, we should see the pattern of tokens: [603, 105311, 143005, 29].

In [44]:
# Show the first training text and how the original tokenizer splits it.
# Single quotes inside the f-strings keep this cell valid on Python versions before 3.12.
print(f"{dataset_text['text'][0]=}")
print(f"{tokenizer(dataset_text['text'][0])=}")

dataset_text["text"][0]='\n\n### Human: How do you say "dog" in Spanish?\n\n### Assistant:<s> perro</s>'
tokenizer(dataset_text["text"][0])={'input_ids': [603, 105311, 22256, 29, 7535, 727, 1152, 5894, 20587, 744, 5, 361, 49063, 7076, 105311, 143005, 29, 1, 82208, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


We do see "\n\n### Assistant:" in the text, but rather than [**603**, 105311, 143005, 29], we get: [**7076**, 105311, 143005, 29].
What gives?


In [45]:
# desired token
tokenizer.decode(603)

'\n\n'

In [46]:
# token from the real string
tokenizer.decode(7076)

'?\n\n'

What happens when our tokenizer acts up like this? 

In [47]:
no_special_tokens = collator([tokenizer(dataset_text["text"][0])])
print(no_special_tokens)

{'input_ids': tensor([[   603, 105311,  22256,     29,   7535,    727,   1152,   5894,  20587,
            744,      5,    361,  49063,   7076, 105311, 143005,     29,      1,
          82208,      2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'labels': tensor([[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100]])}



### Assistant:` in the following instance: 

### Human: How do you say "dog" in Spanish?

### Assistant:<s> perro</s> This instance will be ignored in loss calculation. Note, if this happens often, consider increasing the `max_seq_length`.

### Human:` in the following instance: 

### Human: How do you say "dog" in Spanish?

### Assistant:<s> perro</s> This instance will be ignored in loss calculation. Note, if this happens often, consider increasing the `max_seq_length`.


_This instance will be ignored in loss calculation_
That's a rather ominous warning. 

If we inspect the transformed labels we get: 
```
'labels': tensor([[-100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
         -100, -100, -100, -100, -100, -100, -100, -100]]).
```
Basically, if we moved forward, we'd be masking everything so during training we will learn nothing. That's not great. What should we do?

You might be tempted to finagle with formatting until you hit success. Not a bad impulse, but contextualized tokenization is a dangerous rabbit hole. See what happens when we try some straightforward fixes: 

In [48]:
# Try adding a space between the question mark and response template
# The replacement is done outside of an f-string: nested double quotes and
# backslashes inside an f-string expression are a SyntaxError before Python 3.12.
spaced_text = dataset_text["text"][0].replace("?\n\n", "? \n\n")
print(tokenizer(spaced_text))


{'input_ids': [603, 105311, 22256, 29, 7535, 727, 1152, 5894, 20587, 744, 5, 361, 49063, 80379, 105311, 143005, 29, 1, 82208, 2], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


In [49]:
# We got: [80379, 105311, 143005, 29,]
tokenizer.decode(80379)

'? \n\n'

In short: not only is `?\n\n` part of the vocabulary, thus breaking our response_template, but so is `'? \n\n'`. I'll stop with the experimenting there and you'll have to trust me that continuing to finagle the formatting will only end in anguish. 

The more stable solution is to add your special token directly to the tokenizer. 

In [50]:
# add the special tokens we're looking for to the vocabulary
model = load_model(model_name)
# a second tokenizer, so that the original `tokenizer` keeps its vocabulary
tokenizer2 = AutoTokenizer.from_pretrained(
    model_name, trust_remote_code=True, padding_side="right"
)
# register both templates as special tokens
# (num_added_toks is not used again in this cell)
num_added_toks = tokenizer2.add_special_tokens(
    {"additional_special_tokens": [INSTRUCTION_TEMPLATE_BASE, RESPONSE_TEMPLATE_BASE]}
)

# best practice to prepare the model for the new tokens.
# NOTE(review): the resize happens after load_model has wrapped the model with
# LoRA; with the default LoraConfig the embedding rows of the new tokens are
# presumably not trainable - confirm (LoraConfig's modules_to_save may be needed).
model.resize_token_embeddings(len(tokenizer2))

collator = DataCollatorForCompletionOnlyLM(
    instruction_template=INSTRUCTION_TEMPLATE_BASE,
    # let's add the bos_token because we don't really need to learn that
    # it should always be after the base response template
    response_template=RESPONSE_TEMPLATE_BASE + tokenizer2.bos_token,
    tokenizer=tokenizer2,
)


# run the collator on one tokenized example to check which labels stay unmasked
with_special_tokens = collator([tokenizer2(dataset_text["text"][0])])
print(with_special_tokens)

{'input_ids': tensor([[250680,   7535,    727,   1152,   5894,  20587,    744,      5,    361,
          49063,     34, 250681,      1,  82208,      2]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'labels': tensor([[ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
          -100,  -100,  -100, 82208,     2]])}


Ran with no warnings! If we look at labels, we can see the last tokens are unmasked as we want. 

In [51]:
# sanity check
print(f"{RESPONSE_TEMPLATE_BASE=}")
print(f"{tokenizer2(RESPONSE_TEMPLATE_BASE)=}")
print(f"{str1=}")
print(f"{tokenizer2(str1)=}")

RESPONSE_TEMPLATE_BASE='\n\n### Assistant:'
tokenizer2(RESPONSE_TEMPLATE_BASE)={'input_ids': [250681], 'attention_mask': [1]}
str1='\n\n### Human: How do you say "dog" in Spanish?\n\n### Assistant: perro'
tokenizer2(str1)={'input_ids': [250680, 7535, 727, 1152, 5894, 20587, 744, 5, 361, 49063, 34, 250681, 82208], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}


After we added the special tokens, the response_template is encoded as a single token: **250681**, which is now how it gets tokenized in the actual string example: 

'input_ids': [250680, 7535, 727, 1152, 5894, 20587, 744, 5, 361, 49063, 34, **250681**, 82208]
Success!

In [52]:
holdout_input2 = tokenizer2(holdout_str, return_tensors="pt").to(device)

In [53]:
trainersft = SFTTrainer(
    model,
    train_dataset=dataset_text,
    dataset_text_field="text",
    data_collator=collator,
    args=training_arguments,
    max_seq_length=512,
    tokenizer=tokenizer2,
)

Map: 100%|██████████| 5/5 [00:00<00:00, 1677.05 examples/s]


In [54]:
spec_tok_sft = trainersft.train()

  0%|          | 0/75 [00:00<?, ?it/s]

100%|██████████| 75/75 [01:01<00:00,  1.22it/s]

{'train_runtime': 61.5561, 'train_samples_per_second': 1.218, 'train_steps_per_second': 1.218, 'train_loss': 3.4137898763020833, 'epoch': 15.0}





In [55]:
predict_training_set(model, tokenizer2)

real answer: perro 	predicted answer:  Dog</s>
real answer: agua 	predicted answer: water</s>
real answer: madre 	predicted answer: mamá</s>
real answer: hola 	predicted answer:  hola</s>
real answer: árbol 	predicted answer: tree</s>


We didn't learn the output properly! One cost of creating new special tokens is that the model has never seen them before and needs to learn them from scratch. Let's train a bit longer to see if we can figure it out. 

In [56]:
trainersft.train()

  0%|          | 0/75 [00:00<?, ?it/s]

100%|██████████| 75/75 [01:00<00:00,  1.23it/s]

{'train_runtime': 60.9864, 'train_samples_per_second': 1.23, 'train_steps_per_second': 1.23, 'train_loss': 0.5193242390950521, 'epoch': 15.0}





TrainOutput(global_step=75, training_loss=0.5193242390950521, metrics={'train_runtime': 60.9864, 'train_samples_per_second': 1.23, 'train_steps_per_second': 1.23, 'train_loss': 0.5193242390950521, 'epoch': 15.0})

In [57]:
predict_training_set(model, tokenizer2)

real answer: perro 	predicted answer:  perro</s>


real answer: agua 	predicted answer:  agua</s>
real answer: madre 	predicted answer:  madre</s>
real answer: hola 	predicted answer:  hola</s>
real answer: árbol 	predicted answer:  árbol</s>


In [58]:
sample_generate(model, tokenizer2, holdout_input2, max_new_tokens=5)

' buenas</s>'

We're close enough on the holdout case. However, we can see the tradeoff when we add instruction and response tokens. We now properly predict the end of sequence token, but we had to see more data and therefore train for longer to learn the new special tokens. Fortunately, many chat models like LLAMA-2-chat will already have specific instruction and response tokens. 

### Multi-turn example

A core functionality of modern chatbots is the ability to remember previous interactions. Up until now, we've trained off of individual human/assistant interactions. Let's briefly try learning a very simple multi-turn conversation to make sure our approach holds up. 

In [59]:
# start again from a fresh model and from the original tokenizer
# (this rebinds the notebook-level `tokenizer` name)
model = load_model(model_name)
tokenizer = AutoTokenizer.from_pretrained(
    model_name, trust_remote_code=True, padding_side="right"
)
# concatenate human:assistant interactions
# (two two-turn conversations and one single-turn conversation)
multi_turn_training_data = {
    "text": [str2 + str1, str3 + str4, str5],
}
dataset_text_multi = Dataset.from_dict(multi_turn_training_data)
# insert the bos/eos markers around every turn
dataset_text_multi = dataset_text_multi.map(lambda x: add_special_tokens(x, tokenizer))
print(f"{dataset_text_multi[0]=}")

Map: 100%|██████████| 3/3 [00:00<00:00, 986.51 examples/s]

dataset_text_multi[0]={'text': '\n\n### Human: How do you say "water" in Spanish?\n\n### Assistant:<s> agua</s>\n\n### Human: How do you say "dog" in Spanish?\n\n### Assistant:<s> perro</s>'}





In [60]:
# tokenize the text
dataset_multi = dataset_text_multi.map(
    lambda example: tokenizer(example["text"]), batched=True, remove_columns=["text"]
)
# copy the input_ids to labels
dataset_multi = dataset_multi.map(lambda x: {"labels": x["input_ids"]}, batched=True)
print(f"{dataset_multi=}")
print(f"{dataset_multi[0]['input_ids']=}")
print(f"{dataset_multi[0]['labels']=}")

Map: 100%|██████████| 3/3 [00:00<?, ? examples/s]
Map: 100%|██████████| 3/3 [00:00<00:00, 182.39 examples/s]

dataset_multi=Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 3
})
dataset_multi[0]['input_ids']=[603, 105311, 22256, 29, 7535, 727, 1152, 5894, 567, 37636, 5, 361, 49063, 7076, 105311, 143005, 29, 1, 14423, 2, 603, 105311, 22256, 29, 7535, 727, 1152, 5894, 20587, 744, 5, 361, 49063, 7076, 105311, 143005, 29, 1, 82208, 2]
dataset_multi[0]['labels']=[603, 105311, 22256, 29, 7535, 727, 1152, 5894, 567, 37636, 5, 361, 49063, 7076, 105311, 143005, 29, 1, 14423, 2, 603, 105311, 22256, 29, 7535, 727, 1152, 5894, 20587, 744, 5, 361, 49063, 7076, 105311, 143005, 29, 1, 82208, 2]





In [61]:
# let's test our special characters method
dataset_multi = dataset_multi.map(create_special_mask)
# convert dataset from lists to torch tensors
dataset_multi.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])
# single quotes inside the f-string keep this cell valid on Python versions before 3.12
print(f"{dataset_multi[0]['labels']=}")
label_ex = dataset_multi[0]["labels"]
# let's see just the non-masked text
print(f"non masked text: {tokenizer.decode(label_ex[label_ex != -100], skip_special_tokens=False)}")
# -100 is not a real token, convert to something the tokenizer understands
# before decoding the full label sequence
label_ex[label_ex == -100] = 0
print(f"full 'label': {tokenizer.decode(label_ex, skip_special_tokens=False)}")
 

Map: 100%|██████████| 3/3 [00:00<00:00, 842.85 examples/s]

dataset_multi[0]["labels"]=tensor([ -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100, 14423,     2,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100,
         -100,  -100,  -100,  -100,  -100,  -100,  -100,  -100, 82208,     2])
non masked text:  agua</s> perro</s>
full 'label': <unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk> agua</s><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk><unk> perro</s>





In [62]:
trainer = Trainer(
    model=model,
    train_dataset=dataset_multi,
    args=training_arguments,
)

In [63]:
trainer.train()

100%|██████████| 45/45 [00:51<00:00,  1.15s/it]

{'train_runtime': 51.605, 'train_samples_per_second': 0.872, 'train_steps_per_second': 0.872, 'train_loss': 2.0485241360134547, 'epoch': 15.0}





TrainOutput(global_step=45, training_loss=2.0485241360134547, metrics={'train_runtime': 51.605, 'train_samples_per_second': 0.872, 'train_steps_per_second': 0.872, 'train_loss': 2.0485241360134547, 'epoch': 15.0})

In [64]:
predict_training_set(model, tokenizer)

real answer: perro 	predicted answer: </s>
real answer: agua 	predicted answer: </s>
real answer: madre 	predicted answer: madre</s>
real answer: hola 	predicted answer: </s>
real answer: árbol 	predicted answer: tree</s>


We haven't learned the task properly. Multi-turn chat is a moderately more complex task and the number of examples per epoch is down because we've concatenated pieces together. Let's keep training. 

In [65]:
trainer.train()

  0%|          | 0/45 [00:00<?, ?it/s]

100%|██████████| 45/45 [00:51<00:00,  1.14s/it]

{'train_runtime': 51.4634, 'train_samples_per_second': 0.874, 'train_steps_per_second': 0.874, 'train_loss': 0.4767167833116319, 'epoch': 15.0}





TrainOutput(global_step=45, training_loss=0.4767167833116319, metrics={'train_runtime': 51.4634, 'train_samples_per_second': 0.874, 'train_steps_per_second': 0.874, 'train_loss': 0.4767167833116319, 'epoch': 15.0})

In [66]:
predict_training_set(model, tokenizer)

real answer: perro 	predicted answer:  perro</s>
real answer: agua 	predicted answer:  agua</s>
real answer: madre 	predicted answer:  madre</s>
real answer: hola 	predicted answer:  hola</s>
real answer: árbol 	predicted answer:  árbol</s>


In [67]:
sample_generate(model, tokenizer, holdout_input, max_new_tokens=5)

' bueno</s>'

Success! 

## Conclusion
Obviously our data is dumbed down and our task is very simple. However, as a proof of concept of sorts, we've learned a few tricks to finetune CausalLM models, ranging from manually copying input_ids, to manually masking labels, to using the TRL library to mask labels using bos/eos tokens, to adding special tokens to play nicely with the TRL modules.  