### Installation

In [1]:
%%capture
# Dependency installation. `%%capture` must stay on the first line of the cell;
# it swallows the (long) pip output.
import os
# The presence of "COLAB_" in the concatenated environment-variable names is
# used as the signal that the notebook is running on Google Colab.
if "COLAB_" not in "".join(os.environ.keys()):
    # Not on Colab: a plain install, letting pip resolve dependencies.
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # `--no-deps` installs exactly the listed packages (two of them pinned)
    # without pulling in their dependencies.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer

### Normal Hugging Face

In [1]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig

In [2]:
max_seq_length = 2048

# One name for the checkpoint so the model and tokenizer can never drift apart.
model_name = "unsloth/Llama-3.2-3B-Instruct"  # or choose "unsloth/Llama-3.2-1B-Instruct"

# 4-bit NF4 quantization with double quantization and a bfloat16 compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit = True,
    bnb_4bit_quant_type = "nf4",
    bnb_4bit_use_double_quant = True,
    bnb_4bit_compute_dtype = torch.bfloat16,
)

# Load the quantized base model first, then its matching tokenizer.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config = bnb_config,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
`low_cpu_mem_usage` was None, now default to True since model is quantized.


In [3]:
from peft import LoraConfig, TaskType

# LoRA adapter configuration.
# The base model is loaded with AutoModelForCausalLM (decoder-only), so the task
# type has to be CAUSAL_LM. SEQ_2_SEQ_LM is meant for encoder-decoder models and
# makes PEFT wrap the model in the wrong task-specific class.
peft_config = LoraConfig(
    task_type = TaskType.CAUSAL_LM,
    inference_mode = False,  # adapters are being trained, not just loaded
    r = 16,
    lora_alpha = 16,
    lora_dropout = 0,
    # Attention projections plus the MLP projections.
    target_modules = [
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
)

In [4]:
from peft import get_peft_model

# Wrap the base model with the LoRA adapters described by `peft_config`.
# NOTE(review): this rebinds `model`, so re-running this cell passes the already
# wrapped model back into get_peft_model — re-run the model-loading cell first.
model = get_peft_model(model, peft_config)
# Print the trainable vs. total parameter counts.
model.print_trainable_parameters()

trainable params: 24,313,856 || all params: 3,237,063,680 || trainable%: 0.7511


<a name="Data"></a>
### Data Prep


In [5]:
from datasets import load_dataset
# Load only the "train" split of mlabonne/FineTome-100k.
dataset = load_dataset("mlabonne/FineTome-100k", split = "train")

In [6]:
# Preview the first raw conversation. Index the row first, then the column:
# `dataset['conversations'][0]` would load the entire column just to show one item.
dataset[0]['conversations']

[{'from': 'human',
  'value': 'Explain what boolean operators are, what they do, and provide examples of how they can be used in programming. Additionally, describe the concept of operator precedence and provide examples of how it affects the evaluation of boolean expressions. Discuss the difference between short-circuit evaluation and normal evaluation in boolean expressions and demonstrate their usage in code. \n\nFurthermore, add the requirement that the code must be written in a language that does not support short-circuit evaluation natively, forcing the test taker to implement their own logic for short-circuit evaluation.\n\nFinally, delve into the concept of truthiness and falsiness in programming languages, explaining how it affects the evaluation of boolean expressions. Add the constraint that the test taker must write code that handles cases where truthiness and falsiness are implemented differently across different programming languages.'},
 {'from': 'gpt',
  'value': 'Boole

In [7]:
def standardize_data_formats(
    dataset,
    tokenizer             = None,
    aliases_for_system    = ["system",],
    aliases_for_user      = ["user", "human", "input",],
    aliases_for_assistant = ["gpt", "assistant", "output",],
):
    """
    Standardizes ShareGPT and other formats to user/assistant Hugging Face format.

    Get aliases for the system, user and assistant roles.
    These shall map to "system", "user" and "assistant" respectively.

    aliases_for_system    = ["system",],
    aliases_for_user      = ["user", "human", "input",],
    aliases_for_assistant = ["gpt", "assistant", "output",],
    """
    import collections
    import itertools

    # Check if vision tokenizer is used - if yes, we must use the format:
    # Text : {"role" : role, "content" : "Happy"}
    # VLMs : {"role" : role, "content" : [{"type" : "text", "text" : "Happy"}]}
    is_vlm = False
    if tokenizer is not None:
        if hasattr(tokenizer, "image_processor") or hasattr(tokenizer, "tokenizer"):
            is_vlm = True

    column_names = set(next(iter(dataset)).keys())
    if "conversations" not in column_names:
        return dataset

    convos = dataset[:10]["conversations"]
    uniques = collections.defaultdict(list)
    for convo in convos:
        for message in convo:
            for key, value in message.items():
                if type(value) is not str:
                    raise RuntimeError("Unsloth: Cannot standardize non text datasets!")
                uniques[key].append(value)
    pass

    # Must be only 2 entries
    assert(len(uniques.keys()) == 2)

    keys = list(uniques.keys())
    length_first  = len(set(uniques[keys[0]]))
    length_second = len(set(uniques[keys[1]]))

    if length_first < length_second:
        # Role is assigned to the first element
        role_key    = keys[0]
        content_key = keys[1]
    else:
        role_key    = keys[1]
        content_key = keys[0]
    pass

    # Check roles are in aliases
    all_aliases = set(aliases_for_system + aliases_for_user + aliases_for_assistant)
    roles = set(uniques[role_key])
    leftover_aliases = (all_aliases | roles) - all_aliases
    if len(leftover_aliases) != 0:
        raise TypeError(
            f"Unsloth: {list(leftover_aliases)} are not in aliases. Please update aliases."
        )
    pass

    # Mapping for aliases
    aliases_mapping = {}
    for x in aliases_for_system:    aliases_mapping[x] = "system"
    for x in aliases_for_user:      aliases_mapping[x] = "user"
    for x in aliases_for_assistant: aliases_mapping[x] = "assistant"

    def _standardize_dataset(examples):
        convos = examples["conversations"]
        all_convos = []
        for convo in convos:
            new_convo = []
            for message in convo:
                role = aliases_mapping[message[role_key]]
                text = message[content_key]
                if is_vlm: text = [ {"type" : "text", "text" : text} ]
                x = {"role" : role, "content" : text}
                new_convo.append(x)
            pass
            all_convos.append(new_convo)
        pass
        return { "conversations" : all_convos, }
    pass

    from multiprocessing import cpu_count
    num_proc = cpu_count()

    return dataset.map(
        _standardize_dataset,
        batched = True,
        desc = "Unsloth: Standardizing formats",
        num_proc = num_proc,
    )
pass


In [8]:
# Rewrite conversations into role/content messages. No tokenizer is passed, so
# message content stays a plain string (not the VLM list-of-parts form).
dataset = standardize_data_formats(dataset)

In [9]:
# Preview the first standardized conversation. Row-first indexing avoids
# loading the whole 'conversations' column just to display one entry.
dataset[0]['conversations']

[{'content': 'Explain what boolean operators are, what they do, and provide examples of how they can be used in programming. Additionally, describe the concept of operator precedence and provide examples of how it affects the evaluation of boolean expressions. Discuss the difference between short-circuit evaluation and normal evaluation in boolean expressions and demonstrate their usage in code. \n\nFurthermore, add the requirement that the code must be written in a language that does not support short-circuit evaluation natively, forcing the test taker to implement their own logic for short-circuit evaluation.\n\nFinally, delve into the concept of truthiness and falsiness in programming languages, explaining how it affects the evaluation of boolean expressions. Add the constraint that the test taker must write code that handles cases where truthiness and falsiness are implemented differently across different programming languages.',
  'role': 'user'},
 {'content': 'Boolean operators a

In [10]:
def formatting_prompts_func(example):
    """Render conversations to chat-template text.

    Args:
        example: Mapping with a "conversations" entry. When called through
            `Dataset.map(..., batched=True)` this is a batch, i.e. a list of
            conversations.

    Returns:
        `{"text": ...}` holding whatever the module-level `tokenizer`'s
        `apply_chat_template` produces with `tokenize=False` and no
        generation prompt appended.
    """
    rendered = tokenizer.apply_chat_template(
        example["conversations"],
        tokenize = False,
        add_generation_prompt = False,
    )
    return {"text": rendered}

In [11]:
# Render the first conversation with the chat template as a sanity check.
# Row-first indexing avoids loading the whole 'conversations' column.
tokenizer.apply_chat_template(dataset[0]['conversations'], tokenize = False, add_generation_prompt = False)

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 10 Apr 2025\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nExplain what boolean operators are, what they do, and provide examples of how they can be used in programming. Additionally, describe the concept of operator precedence and provide examples of how it affects the evaluation of boolean expressions. Discuss the difference between short-circuit evaluation and normal evaluation in boolean expressions and demonstrate their usage in code. \n\nFurthermore, add the requirement that the code must be written in a language that does not support short-circuit evaluation natively, forcing the test taker to implement their own logic for short-circuit evaluation.\n\nFinally, delve into the concept of truthiness and falsiness in programming languages, explaining how it affects the evaluation of boolean expressions. Add the constraint that the test taker must write code th

In [12]:
# Add a "text" column with the chat-template rendering of every conversation;
# batched=True hands formatting_prompts_func a batch of rows per call.
dataset = dataset.map(formatting_prompts_func, batched = True,)

In [13]:
# Preview the first rendered training string. Row-first indexing avoids
# loading the whole 'text' column just to display one entry.
dataset[0]['text']

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 10 Apr 2025\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nExplain what boolean operators are, what they do, and provide examples of how they can be used in programming. Additionally, describe the concept of operator precedence and provide examples of how it affects the evaluation of boolean expressions. Discuss the difference between short-circuit evaluation and normal evaluation in boolean expressions and demonstrate their usage in code. \n\nFurthermore, add the requirement that the code must be written in a language that does not support short-circuit evaluation natively, forcing the test taker to implement their own logic for short-circuit evaluation.\n\nFinally, delve into the concept of truthiness and falsiness in programming languages, explaining how it affects the evaluation of boolean expressions. Add the constraint that the test taker must write code th

In [14]:
from trl import DataCollatorForCompletionOnlyLM

# Header strings that open a user turn and an assistant turn in the rendered
# text; the collator uses them to locate instruction and response spans.
instruction_template = "<|start_header_id|>user<|end_header_id|>\n\n"
response_template = "<|start_header_id|>assistant<|end_header_id|>\n\n"

collator = DataCollatorForCompletionOnlyLM(
    response_template = response_template,
    instruction_template = instruction_template,
    tokenizer = tokenizer,
)

In [15]:
# Tokenize the first two rendered examples (unpadded) and let the collator pad
# them and build the labels. Slicing rows first (`dataset[:2]["text"]`) avoids
# loading the entire 'text' column just to take two items.
batch = collator([tokenizer(text, padding=False) for text in dataset[:2]["text"]])

In [16]:
# Display the collated batch to inspect what the collator produced.
batch

{'input_ids': tensor([[128000, 128000, 128006,  ...,   5718,     13, 128009],
        [128004, 128004, 128004,  ...,    220,     16, 128009]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [0, 0, 0,  ..., 1, 1, 1]]), 'labels': tensor([[  -100,   -100,   -100,  ...,   5718,     13, 128009],
        [  -100,   -100,   -100,  ...,    220,     16, 128009]])}

<a name="Train"></a>
### Train the model

In [17]:
from trl import SFTTrainer, SFTConfig

# Training hyperparameters, grouped by concern.
training_args = SFTConfig(
    # Sequence length and effective batch size (1 x 4 accumulation steps)
    max_seq_length = max_seq_length,
    per_device_train_batch_size = 1,
    gradient_accumulation_steps = 4,
    # Schedule: a short 60-step run with 5 warmup steps and linear decay
    warmup_steps = 5,
    max_steps = 60,
    learning_rate = 2e-4,
    lr_scheduler_type = "linear",
    # Optimizer and precision
    optim = "adamw_8bit",
    weight_decay = 0.01,
    fp16 = True,
    # gradient_checkpointing=True,
    # Reproducibility, logging and outputs
    seed = 3407,
    logging_steps = 1,
    output_dir = "outputs",
    label_names = ["labels"],
    report_to = "none", # Use this for WandB etc
)

trainer = SFTTrainer(
    model = model,
    args = training_args,
    train_dataset = dataset,
    data_collator = collator,
    formatting_func = formatting_prompts_func,
)

In [18]:
# @title Show current memory stats
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = Tesla T4. Max memory = 14.741 GB.
2.225 GB of memory reserved.


In [19]:
# Run training; the returned object's `.metrics` is read by the stats cell below.
trainer_stats = trainer.train() #batch size 1 takes around 10.3 GB VRAM

Step,Training Loss
1,0.9336
2,0.7192
3,0.9034
4,0.7844
5,1.1195
6,1.0084
7,0.7553
8,0.9089
9,0.9096
10,0.5617


In [20]:
# @title Show final memory and time stats
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

221.5583 seconds used for training.
3.69 minutes used for training.
Peak reserved memory = 14.199 GB.
Peak reserved memory % of max memory = 96.323 %.
Peak reserved memory for training % of max memory = 81.229 %.
