Installing dependencies

In [1]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth

Loading our model and tokenizer

In [2]:
from unsloth import FastLanguageModel
import torch
max_seq_length = 2048
dtype = None
load_in_4bit = True

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.4.7: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.96G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/235 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

Adding LoRA adapters so that we update only small percentage (1-10) of all the parameters.

In [3]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 16,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = False,
    loftq_config = None,
)

Unsloth 2025.4.7 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


# Data Preparation

We use [Azamorn/tiny-codes-csharp](https://huggingface.co/datasets/Azamorn/tiny-codes-csharp) dataset which includes the instructions and outputs.

In [4]:
alpaca_prompt = """Below is an instruction that describes a task. \
Write an output that completes the request using C#.

### Instruction:
{}

### Output:
{}"""

EOS_TOKEN = tokenizer.eos_token

def formatting_prompts_func(examples):
    instrs    = examples["instruction"]
    outputs = examples["output"]
    texts = [
        alpaca_prompt.format(i, r) + EOS_TOKEN
        for i, r in zip(instrs, outputs)
    ]
    return {"text": texts}

from datasets import load_dataset

dataset = load_dataset("Azamorn/tiny-codes-csharp", split="train")

dataset = dataset.map(formatting_prompts_func, batched = True)

README.md:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

tiny-codes-csharp.jsonl:   0%|          | 0.00/322M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/125478 [00:00<?, ? examples/s]

Map:   0%|          | 0/125478 [00:00<?, ? examples/s]

Check to see if data is loaded correctly

In [5]:
dataset['text']

['Below is an instruction that describes a task. Write an output that completes the request using C#.\n\n### Instruction:\nCreate a C# script snippet that Transforms Low Transportation: Public Transit Schedules for Decision Making for Beginners. Use if/else or switch/case statements to conditionally perform different actions based on the Consent. Dry-run, then include comments that outline the control flow and how you handle different scenarios.\n\n### Output:\nHere is a possible implementation of this functionality in C#:\n\n    // dry run\n    var consent = "Low";\n    Console.WriteLine(TransformLowTransportationSchedulesForDecisionMaking(consent));\n    \n    // actual function\n    public string TransformLowTransportationSchedulesForDecisionMaking(string consent) \n    {\n        switch (consent) \n        {\n            case "High":\n                return $"Transformed schedules for high transportation consent level.";\n                \n            case "Medium":\n              

# Training the model

Now we train the model using Huggingface TRL's SFTTrainer. We can modify the number of epochs and steps.

In [6]:
from trl import SFTTrainer
from transformers import TrainingArguments, TrainerCallback
from unsloth import is_bfloat16_supported

class HeartbeatCallback(TrainerCallback):
    def __init__(self, heartbeat_steps=500):
        self.heartbeat_steps = heartbeat_steps

    def on_step_end(self, args, state, control, **kwargs):
        # Called at the end of each optimization step
        if state.global_step % self.heartbeat_steps == 0:
            print(f"[Heartbeat] still training at step {state.global_step}")

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    callbacks=[HeartbeatCallback(heartbeat_steps=1000)],
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        num_train_epochs=1,
        max_steps=-1,
        learning_rate=2e-4,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_strategy="steps",
        logging_steps=1,
        save_strategy="steps",
        save_steps=500,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to="none",
    ),
)


Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/125478 [00:00<?, ? examples/s]

In [7]:
gpu_stats = torch.cuda.get_device_properties(0)
start_gpu_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
max_memory = round(gpu_stats.total_memory / 1024 / 1024 / 1024, 3)
print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
print(f"{start_gpu_memory} GB of memory reserved.")

GPU = NVIDIA A100-SXM4-40GB. Max memory = 39.557 GB.
7.623 GB of memory reserved.


In [8]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 125,478 | Num Epochs = 1 | Total steps = 15,684
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040/8,000,000,000 (0.52% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,0.8505
2,0.8836
3,0.7986
4,0.9598
5,0.7212
6,0.6285
7,0.6736
8,0.6161
9,0.6083
10,0.6251


[Heartbeat] still training at step 1000
[Heartbeat] still training at step 2000
[Heartbeat] still training at step 3000
[Heartbeat] still training at step 4000
[Heartbeat] still training at step 5000
[Heartbeat] still training at step 6000
[Heartbeat] still training at step 7000
[Heartbeat] still training at step 8000


Step,Training Loss
1,0.8505
2,0.8836
3,0.7986
4,0.9598
5,0.7212
6,0.6285
7,0.6736
8,0.6161
9,0.6083
10,0.6251


[Heartbeat] still training at step 9000
[Heartbeat] still training at step 10000
[Heartbeat] still training at step 11000
[Heartbeat] still training at step 12000
[Heartbeat] still training at step 13000
[Heartbeat] still training at step 14000
[Heartbeat] still training at step 15000


In [9]:
used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
used_memory_for_lora = round(used_memory - start_gpu_memory, 3)
used_percentage = round(used_memory / max_memory * 100, 3)
lora_percentage = round(used_memory_for_lora / max_memory * 100, 3)
print(f"{trainer_stats.metrics['train_runtime']} seconds used for training.")
print(
    f"{round(trainer_stats.metrics['train_runtime']/60, 2)} minutes used for training."
)
print(f"Peak reserved memory = {used_memory} GB.")
print(f"Peak reserved memory for training = {used_memory_for_lora} GB.")
print(f"Peak reserved memory % of max memory = {used_percentage} %.")
print(f"Peak reserved memory for training % of max memory = {lora_percentage} %.")

31244.7839 seconds used for training.
520.75 minutes used for training.
Peak reserved memory = 7.623 GB.
Peak reserved memory for training = 0.0 GB.
Peak reserved memory % of max memory = 19.271 %.
Peak reserved memory for training % of max memory = 0.0 %.


In [10]:
FastLanguageModel.for_inference(model)
inputs = tokenizer(
[
    alpaca_prompt.format(
        "Write a function that controls if given number is prime.", # instruction
        "" # output
    )
], return_tensors = "pt").to("cuda")

from transformers import TextStreamer
text_streamer = TextStreamer(tokenizer)
_ = model.generate(**inputs, streamer = text_streamer, max_new_tokens = 2048)

<|begin_of_text|>Below is an instruction that describes a task. Write an output that completes the request using C#.

### Instruction:
Write a function that controls if given number is prime.

### Output:
Here is a sample implementation of a function in C# that determines whether a given integer is prime or not:

    ```csharp
    public static bool IsPrime(int n) 
    {
        if (n <= 1) return false;
    
        for (int i = 2; i < n; i++) 
        {
            if (n % i == 0) 
            {
                return false;
            }
        }
    
        return true;
    }
    
    Console.WriteLine("Is 7 prime? " + IsPrime(7)); // Output: True
    Console.WriteLine("Is 9 prime? " + IsPrime(9)); // Output: False
    ```
In this implementation, we first check if the input number `n` is less than or equal to 1. If so, we return False immediately since all numbers less than or equal to 1 are not considered prime. Otherwise, we loop through all integers from 2 up to but excluding 