# Minimal Working Example: CE+NTL Loss Function

This notebook demonstrates a minimal working example of training a decoder-only Hugging Face language model with the CE+NTL loss function.

In [None]:
from typing import Dict, List, Union
import torch
from transformers import DataCollatorForLanguageModeling
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments

from ntl.tokenizer.abstract_tokenizer import NumberEncodingTokenizer
from ntl.data.data import load_txt_dataset
from ntl.loss_functions.base_number_token_loss import CEWithNTL

%load_ext autoreload
%autoreload 2

In [None]:
# Setting up a custom collator for decoder-only models


class LlamaQACollator(DataCollatorForLanguageModeling):
    """Collate question/answer examples for decoder-only fine-tuning.

    Each example's ``question`` and ``answer`` strings are concatenated and
    tokenized as a single sequence. The returned ``labels`` are a copy of
    ``input_ids`` in which the question prefix and all padding positions are
    replaced by ``-100``, so only answer tokens keep their ids.
    """

    def __init__(self, tokenizer: NumberEncodingTokenizer):
        """Store the tokenizer and its pad token id; MLM masking is disabled."""
        super().__init__(tokenizer, mlm=False)
        self.tokenizer = tokenizer
        self.pad_token_id = tokenizer.pad_token_id

    def __call__(self, examples: List[Dict[str, Union[str, List[int]]]]) -> Dict[str, torch.Tensor]:
        """Build a padded batch with answer-only labels.

        Args:
            examples: Items providing ``"question"`` and ``"answer"`` entries
                that are formatted into one string per example.

        Returns:
            Dict with ``input_ids``, ``attention_mask`` and ``labels`` tensors.
        """
        questions = [ex["question"] for ex in examples]

        # Merge questions and answers into single sequences (needed for decoder only)
        merged_sequences = [f"{ex['question']}{ex['answer']}" for ex in examples]

        # Tokenize merged sequences; the special-tokens mask is requested only to
        # locate where the real text starts in each row and is not returned.
        batch = self.tokenizer(
            merged_sequences,
            padding=True,
            truncation=True,
            return_tensors="pt",
            return_attention_mask=True,
            return_special_tokens_mask=True,
        )
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]

        labels = input_ids.clone()

        # Length of the leading run of special positions per row: cumprod stays 1
        # until the first regular token. This accounts for tokens the tokenizer
        # puts in front of the text (e.g. a BOS token, or padding when the
        # tokenizer pads on the left), which the question-only tokenization
        # below deliberately excludes.
        text_starts = batch["special_tokens_mask"].cumprod(dim=1).sum(dim=1).tolist()

        # Question lengths in tokens, without special tokens, in one batched call.
        # NOTE(review): assumes the question tokenizes the same way on its own as
        # it does as a prefix of the merged string - confirm for this tokenizer.
        question_ids = self.tokenizer(questions, add_special_tokens=False)["input_ids"]

        # Mask everything up to the end of the question; answer tokens are preserved.
        for row, (start, ids) in enumerate(zip(text_starts, question_ids)):
            labels[row, : start + len(ids)] = -100

        # Mask padding via the attention mask rather than by token id: the pad
        # token may share its id with a real token (e.g. when pad is set to EOS),
        # and an id comparison would also hide those real tokens.
        labels[attention_mask == 0] = -100

        return {"input_ids": input_ids, "attention_mask": attention_mask, "labels": labels}


In [None]:
# Load the pretrained checkpoint together with its tokenizer

model_name = "tinyllama/tinyLlama-1.1B-intermediate-step-1431k-3T"

tokenizer = AutoTokenizer.from_pretrained(model_name, token=False)
# Reuse the end-of-sequence token as the padding token so batches can be padded
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_name)

In [None]:
# Build the batch collator and read the text dataset from disk

data_collator = LlamaQACollator(tokenizer)
dataset = load_txt_dataset("data/mathematics_dataset-v1.0/arithmetic_val.txt")

In [None]:
# Setting up the trainer with the integrated CE+NTL loss function

ce_with_ntl = CEWithNTL(tokenizer=tokenizer, ntl_weight=0.3)


class CustomTrainer(Trainer):
    """Trainer that computes its loss with the module-level ``ce_with_ntl``."""

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None):
        """Run a forward pass and score the logits with ``ce_with_ntl``.

        Args:
            model: The model being trained; called as ``model(**inputs)``.
            inputs: Batch mapping produced by the data collator; its
                ``"labels"`` entry is passed to the loss.
            return_outputs: When true, return ``(loss, outputs)`` instead of
                the loss alone.
            num_items_in_batch: Accepted because recent ``Trainer`` versions
                pass this keyword to ``compute_loss``; intentionally unused,
                so older versions that never pass it behave as before.

        Returns:
            The loss, or a ``(loss, outputs)`` tuple if ``return_outputs``.
        """
        # Get model outputs
        outputs = model(**inputs)

        logits = outputs.logits
        labels = inputs.get("labels")

        # NOTE(review): logits and labels are passed unshifted; confirm that
        # CEWithNTL applies the next-token shift internally.
        loss = ce_with_ntl(logits, labels)

        return (loss, outputs) if return_outputs else loss

In [None]:
# Two-step training run on CPU, logging every step and saving no checkpoints

training_args = TrainingArguments(
    output_dir="./llama2-finetuned",
    # Run length and device
    max_steps=2,
    use_cpu=True,
    # Batch sizes
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    # Logging, checkpointing and reporting
    logging_strategy="steps",
    logging_steps=1,
    save_strategy="no",
    report_to="none",
    # Keep every dataset column so the collator still receives the raw examples
    remove_unused_columns=False,
)

trainer = CustomTrainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=dataset,
    eval_dataset=dataset,
)

trainer.train()