In [1]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [None]:
# !pip install datasets unsloth xformers

In [None]:
# from huggingface_hub import notebook_login

# notebook_login()

First of all, we are going to load the dataset containing the Loose Talk transcripts (in ShareGPT format).

In [2]:
from datasets import load_dataset
from unsloth import standardize_sharegpt

# Fetch the "train" split from the Hub and pass it straight through
# standardize_sharegpt; the result is what every later cell calls `dataset`.
dataset = standardize_sharegpt(
    load_dataset(
        "ussmaanaali/loose-talk-transcripts-sharegpt",
        split="train",
    )
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [3]:
# Sanity check: report how many conversations were loaded.
row_count = len(dataset)
print("Number of rows: ", row_count)

Number of rows:  1170


In [4]:
# Show the first record: a `conversations` list of {content, role} messages
# (system, user, assistant in the recorded output).
dataset[0]

{'conversations': [{'content': 'You are Moeen, the razor-sharp mimic artist. \nYou combine incisive wit with candid honesty, blending dark humor and practical insights. \nNever shy away from delivering a truth that cuts deep.',
   'role': 'system'},
  {'content': 'السلام علیکم، دوست ٹاک کے ساتھ انور مقصود آپ کی خدمت میں。 دوست ٹاک میں جو آج میرے مہمان ہیں وہ ہارمونیم نواز ہیں۔',
   'role': 'user'},
  {'content': 'سنئے فرندز کم، السلام علیکم۔', 'role': 'assistant'}]}

Now, let's load both the model (Llama 3.1 8B) and the tokenizer.

In [5]:
import torch
from trl import SFTTrainer
from transformers import TrainingArguments, TextStreamer
from unsloth.chat_templates import get_chat_template
from unsloth import FastLanguageModel, is_bfloat16_supported

# Sequence-length cap, shared with the trainer cell further down.
max_seq_length = 2048

# Load the 4-bit bitsandbytes build of Llama 3.1 8B together with its tokenizer.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Meta-Llama-3.1-8B-bnb-4bit",
    load_in_4bit=True,
    # No explicit dtype is requested here.
    # NOTE(review): presumably Unsloth then picks one suited to the GPU - confirm in its docs.
    dtype=None,
    max_seq_length=max_seq_length,
)

==((====))==  Unsloth 2025.3.18: Fast Llama patching. Transformers: 4.49.0.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


generation_config.json:   0%|          | 0.00/235 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

Instead of full fine-tuning, we are going to use LoRA fine-tuning.

In [6]:
# Wrap the base model with LoRA adapters; only these adapter weights are trained.
model = FastLanguageModel.get_peft_model(
    model,
    # Adapter size and scaling.
    r=32,
    lora_alpha=64,
    lora_dropout=0,
    bias="none",
    use_rslora=False,
    loftq_config=None,
    # Adapters go on every attention projection and every MLP projection.
    target_modules=[
        "q_proj", "k_proj", "v_proj", "o_proj",
        "gate_proj", "up_proj", "down_proj",
    ],
    # Unsloth's own gradient-checkpointing mode.
    use_gradient_checkpointing="unsloth",
)

Unsloth 2025.3.18 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


The next cell generates a new column (`text`) that contains each conversation rendered in the format needed for fine-tuning.

In [7]:
from unsloth import apply_chat_template

# ChatML-style layout with one system, one user and one assistant turn.
# {SYSTEM}, {INPUT} and {OUTPUT} are the placeholders filled per conversation.
chat_template = (
    "<|im_start|>system\n"
    "{SYSTEM}<|im_end|>\n"
    "<|im_start|>user\n"
    "{INPUT}<|im_end|>\n"
    "<|im_start|>assistant\n"
    "{OUTPUT}<|im_end|>"
)

# Render every conversation through the template above. The returned dataset
# carries the rendered string in a `text` column, which the trainer reads.
dataset = apply_chat_template(
    dataset,
    chat_template=chat_template,
    tokenizer=tokenizer,
)

Unsloth: We automatically added an EOS token to stop endless generations.


Map:   0%|          | 0/1170 [00:00<?, ? examples/s]

In [8]:
# Show the first record again to check the new `text` column: in the recorded
# output it holds the templated conversation wrapped in <|begin_of_text|> and
# <|end_of_text|>.
dataset[0]

{'conversations': [{'content': 'You are Moeen, the razor-sharp mimic artist. \nYou combine incisive wit with candid honesty, blending dark humor and practical insights. \nNever shy away from delivering a truth that cuts deep.',
   'role': 'system'},
  {'content': 'السلام علیکم، دوست ٹاک کے ساتھ انور مقصود آپ کی خدمت میں。 دوست ٹاک میں جو آج میرے مہمان ہیں وہ ہارمونیم نواز ہیں۔',
   'role': 'user'},
  {'content': 'سنئے فرندز کم، السلام علیکم۔', 'role': 'assistant'}],
 'text': '<|begin_of_text|><|im_start|>system\nYou are Moeen, the razor-sharp mimic artist. \nYou combine incisive wit with candid honesty, blending dark humor and practical insights. \nNever shy away from delivering a truth that cuts deep.<|im_end|>\n<|im_start|>user\nالسلام علیکم، دوست ٹاک کے ساتھ انور مقصود آپ کی خدمت میں。 دوست ٹاک میں جو آج میرے مہمان ہیں وہ ہارمونیم نواز ہیں۔<|im_end|>\n<|im_start|>assistant\nسنئے فرندز کم، السلام علیکم۔<|im_end|><|end_of_text|>'}

Finally, let's train for 5 epochs.

In [10]:
# Supervised fine-tuning over the pre-rendered `text` column.
trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    # --- data ---
    train_dataset=dataset,
    dataset_text_field="text",
    dataset_num_proc=2,
    max_seq_length=max_seq_length,
    # Packing is requested; the recorded run shows Unsloth disabling it.
    packing=True,
    args=TrainingArguments(
        # --- bookkeeping ---
        output_dir="output",
        report_to="none",
        logging_steps=1,
        seed=0,
        # --- schedule: 4 per device x 4 accumulation steps = 16 per update ---
        num_train_epochs=5,
        per_device_train_batch_size=4,
        gradient_accumulation_steps=4,
        # --- optimizer ---
        optim="adamw_8bit",
        learning_rate=2e-4,
        lr_scheduler_type="linear",
        warmup_steps=5,
        weight_decay=0.01,
        # --- precision: bf16 when the GPU supports it, otherwise fp16 ---
        bf16=is_bfloat16_supported(),
        fp16=not is_bfloat16_supported(),
    ),
)

# Run training; as the last expression, the TrainOutput summary is displayed.
trainer.train()

Unsloth: We found double BOS tokens - we shall remove one automatically.


Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/1170 [00:00<?, ? examples/s]

Unsloth: Hugging Face's packing is currently buggy - we're disabling it for now!


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,170 | Num Epochs = 5 | Total steps = 365
O^O/ \_/ \    Batch size per device = 4 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (4 x 4 x 1) = 16
 "-____-"     Trainable parameters = 83,886,080/8,000,000,000 (1.05% trained)


Step,Training Loss
1,2.0406
2,2.1282
3,1.8654
4,1.4465
5,1.4515
6,1.109
7,1.0211
8,0.954
9,1.0668
10,1.0307


Step,Training Loss
1,2.0406
2,2.1282
3,1.8654
4,1.4465
5,1.4515
6,1.109
7,1.0211
8,0.954
9,1.0668
10,1.0307


TrainOutput(global_step=365, training_loss=0.5770679015002839, metrics={'train_runtime': 5276.2081, 'train_samples_per_second': 1.109, 'train_steps_per_second': 0.069, 'total_flos': 8.704490389040333e+16, 'train_loss': 0.5770679015002839})

Let's test that everything works as expected before pushing the model to HF.

In [12]:
from transformers import TextStreamer

# Switch the model to Unsloth's faster inference mode before generating.
FastLanguageModel.for_inference(model)

# Persona prompt for this check, including an instruction to reply in Roman Urdu.
SYSTEM_PROMPT = """You are Moeen, the razor-sharp mimic artist.
You combine incisive wit with candid honesty, blending dark humor and practical insights. Never shy away from delivering a truth that cuts deep.
And you should always answer in roman urdu."""

messages = [
    {
        "role": "system",
        "content": SYSTEM_PROMPT,
    },
    {
        "role": "user",
        "content": "Kia aap pakistani ho?",
    },
]

# Render the chat with the tokenizer's template, leaving the assistant turn
# open, and move the resulting tensor to the GPU.
input_ids = tokenizer.apply_chat_template(
    messages,
    return_tensors="pt",
    add_generation_prompt=True,
).to("cuda")

# Stream only the newly generated tokens; the return value is discarded.
text_streamer = TextStreamer(tokenizer, skip_prompt=True)
_ = model.generate(
    input_ids,
    max_new_tokens=128,
    pad_token_id=tokenizer.eos_token_id,
    streamer=text_streamer,
)

پہلے ہم بنگالی ہیں, بنگالی ہیں, بنگالی ہیں, بنگالی ہیں, بنگالی ہیں, اب بتائیے ہم سے کیا Probleem ہے آپ کا؟<|im_end|><|end_of_text|>


Export the fine-tuned model to GGUF on Google Drive, then push it to HF for later download.

In [15]:
# The export target lives on Google Drive, so Drive must be mounted before this
# cell writes anything. The notebook's own mount cell sits further down, which
# breaks a top-to-bottom "Restart & Run All"; mounting here removes that
# hidden ordering dependency.
from google.colab import drive

drive.mount("/content/drive")

# Merge the LoRA weights into the base model and export it as an f16 GGUF file.
# NOTE: in the recorded run this call raised RuntimeError because the
# llama.cpp `llama-quantize` binary was missing after the llama.cpp install step.
model.save_pretrained_gguf(
    "/content/drive/MyDrive/Personal Project/loose-talk/model",
    tokenizer,
    quantization_method="f16",
)

Unsloth: You have 1 CPUs. Using `safe_serialization` is 10x slower.
We shall switch to Pytorch saving, which might take 3 minutes and not 30 minutes.
To force `safe_serialization`, set it to `None` instead.
Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 5.7G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 0.97 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


 47%|████▋     | 15/32 [00:01<00:01, 12.54it/s]
We will save to Disk and not RAM now.
100%|██████████| 32/32 [10:27<00:00, 19.60s/it]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving /content/drive/MyDrive/Personal Project/loose-talk/model/pytorch_model-00001-of-00004.bin...
Unsloth: Saving /content/drive/MyDrive/Personal Project/loose-talk/model/pytorch_model-00002-of-00004.bin...
Unsloth: Saving /content/drive/MyDrive/Personal Project/loose-talk/model/pytorch_model-00003-of-00004.bin...
Unsloth: Saving /content/drive/MyDrive/Personal Project/loose-talk/model/pytorch_model-00004-of-00004.bin...
Done.


Unsloth: Converting llama model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['f16'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...


RuntimeError: Unsloth: The file ('llama.cpp/llama-quantize' or 'llama.cpp/llama-quantize.exe' if you are on Windows WSL) or 'llama.cpp/quantize' does not exist.
But we expect this file to exist! Maybe the llama.cpp developers changed the name or check extension of the llama-quantize file.

In [None]:
# Export to the same Google Drive folder as the f16 attempt, this time with
# the q4_k_m quantization method.
model.save_pretrained_gguf(
    "/content/drive/MyDrive/Personal Project/loose-talk/model",
    tokenizer,
    quantization_method="q4_k_m",
)

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 0.45 out of 12.67 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 32/32 [12:23<00:00, 23.24s/it]


Unsloth: Saving tokenizer... Done.
Unsloth: Saving /content/drive/MyDrive/Personal Project/loose-talk/model/pytorch_model-00001-of-00004.bin...
Unsloth: Saving /content/drive/MyDrive/Personal Project/loose-talk/model/pytorch_model-00002-of-00004.bin...


In [14]:
from google.colab import drive
# Attach Google Drive at /content/drive; the GGUF export cells write under
# this mount point, so it has to be available before they run.
drive.mount("/content/drive")

Mounted at /content/drive


In [None]:
# Upload a GGUF export of the fine-tuned model and its tokenizer to the Hugging Face Hub.
# NOTE(review): the repo id still says "RickLLama", but this notebook trains on
# the loose-talk transcripts - confirm the intended destination repo before running.
# NOTE(review): the notebook_login() cell near the top is commented out; this
# call presumably needs Hub credentials with write access - confirm how the
# token is supplied.
# NOTE(review): no quantization_method is passed, so the library default applies - verify it is the one wanted.
model.push_to_hub_gguf("theneuralmaze/RickLLama-3.1-8B", tokenizer)