In [None]:
%%capture
# Install the fine-tuning stack. The branch is chosen by looking for any
# environment variable whose name contains "COLAB_".
import os, re
if "COLAB_" not in "".join(os.environ.keys()):
    # No COLAB_* variable found: a single plain install of unsloth.
    !pip install unsloth
else:
    # COLAB_* variable found: choose the xformers build from the torch
    # major.minor version already present, then install the pieces individually.
    import torch; v = re.match(r"[0-9]{1,}\.[0-9]{1,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.33.post1" if v=="2.9" else "0.0.32.post2" if v=="2.8" else "0.0.29.post3")
    # --no-deps installs only the named packages without resolving their dependencies.
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets==4.3.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
# The pins below run in both branches, after the installs above, so they decide
# the final transformers / trl versions (trl was installed unpinned in the else branch).
!pip install transformers==4.56.2
!pip install --no-deps trl==0.22.2
!pip install sentence-transformers  # For semantic similarity

In [None]:
# Core imports
import torch
import time
import json
import pandas as pd
import numpy as np
from datasets import load_dataset, Dataset
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template, train_on_responses_only
from trl import SFTTrainer, SFTConfig
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import re

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [None]:
def train_model(model_name, train_dataset, config, system_prompt=None):
    """
    Train a model with QLoRA and return training stats.
    Supports continuing training from an existing adapter.
    """

    print(f"Training {model_name}")

    # Track GPU memory before
    torch.cuda.reset_peak_memory_stats()
    start_time = time.time()

    # Load model
    # Unsloth handles loading base model + adapter automatically if an adapter path is provided

    model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="yuhueng/qwen3-4b-singlish-base",
    max_seq_length=2048,
    load_in_4bit=True,
    load_in_8bit=False,
    full_finetuning=False,
    )

    # Initialize LoRA config
    # If we loaded an adapter, this step ensures Unsloth's training patches are active
    model = FastLanguageModel.get_peft_model(
        model,
        r=config["r"],
        target_modules=config["target_modules"],
        lora_alpha=config["lora_alpha"],
        lora_dropout=0,
        bias="none",
        use_gradient_checkpointing="unsloth",
        random_state=3407,
    )


    trainable_params = 0
    for name, param in model.named_parameters():
        if "lora" in name or "modules_to_save" in name:
            param.requires_grad = True
            trainable_params += 1

    if trainable_params == 0:
        print("WARNING: No trainable parameters found! Forcing LoRA gradients...")
        for name, param in model.named_parameters():
            if "lora" in name:
                param.requires_grad = True

    print(f"Verified trainable parameters.")

    # Setup tokenizer
    tokenizer = get_chat_template(tokenizer, chat_template="qwen3-instruct")

    # Format dataset
    def formatting_prompts_func(examples):
        convos = examples["conversations"]
        texts = []
        for convo in convos:
            messages = []
            if system_prompt:
                messages.append({"role": "system", "content": system_prompt})
            messages.extend(convo)
            texts.append(tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=False))
        return {"text": texts}


    formatted_dataset = train_dataset.map(formatting_prompts_func, batched=True)

    # Trainer
    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=formatted_dataset,
        args=SFTConfig(
            dataset_text_field="text",
            per_device_train_batch_size=config["per_device_train_batch_size"],
            gradient_accumulation_steps=config["gradient_accumulation_steps"],
            warmup_steps=config["warmup_steps"],
            max_steps=config["max_steps"],
            learning_rate=config["learning_rate"],
            logging_steps=10,
            optim="adamw_8bit",
            weight_decay=0.001,
            lr_scheduler_type="linear",
            seed=3407,
            report_to="none",
            output_dir=f"outputs_{model_name}",
        ),
    )

    trainer = train_on_responses_only(
        trainer,
        instruction_part="<|im_start|>user\n",
        response_part="<|im_start|>assistant\n",
    )

    # Train
    trainer_stats = trainer.train()

    # Collect metrics
    training_time = time.time() - start_time
    peak_memory = torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024
    final_loss = trainer_stats.metrics.get("train_loss", trainer.state.log_history[-1].get("loss", None))

    # Save adapter
    save_path = f"singlish_adapter_{model_name.replace(' ', '_')}"
    model.save_pretrained(save_path)
    tokenizer.save_pretrained(save_path)

    results = {
        "model_name": model_name,
        "training_time_min": round(training_time / 60, 2),
        "peak_vram_gb": round(peak_memory, 2),
        "final_loss": round(final_loss, 4) if final_loss else None,
        "adapter_path": save_path,
    }

    print(f"\n{model_name} Training Complete:")
    print(f"  Time: {results['training_time_min']} min")
    print(f"  Peak VRAM: {results['peak_vram_gb']} GB")
    print(f"  Final Loss: {results['final_loss']}")
    print(f"  Saved to: {save_path}")

    # Cleanup to free VRAM
    del model, trainer
    torch.cuda.empty_cache()

    return results

def generate_response(model, tokenizer, prompt, system_prompt=None, max_new_tokens=128, temperature=0.7, top_p=0.9):
    """
    Generate a single reply to `prompt` using the tokenizer's chat template.

    Args:
        model: Model exposing `.device` and `.generate(...)`.
        tokenizer: Tokenizer exposing `apply_chat_template`, `decode` and
            `pad_token_id` (optionally `eos_token_id`).
        prompt: User message text.
        system_prompt: Optional system message placed before the user message.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature. A value of 0 (or None) switches to
            greedy decoding instead of sampling.
        top_p: Nucleus-sampling threshold; only used when sampling.

    Returns:
        The decoded continuation (prompt tokens removed), stripped of
        surrounding whitespace.
    """
    messages = []
    if system_prompt:
        messages.append({"role": "system", "content": system_prompt})
    messages.append({"role": "user", "content": prompt})

    input_ids = tokenizer.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_tensors="pt"
    ).to(model.device)

    # A single, unpadded prompt: every position is a real token. Passing the
    # mask explicitly keeps generate() from having to infer it from pad tokens.
    attention_mask = torch.ones_like(input_ids)

    # Fall back to EOS when the tokenizer defines no pad token.
    pad_token_id = tokenizer.pad_token_id
    if pad_token_id is None:
        pad_token_id = getattr(tokenizer, "eos_token_id", None)

    generation_kwargs = {
        "input_ids": input_ids,
        "attention_mask": attention_mask,
        "max_new_tokens": max_new_tokens,
        "pad_token_id": pad_token_id,
    }
    if temperature is not None and temperature > 0:
        generation_kwargs.update(do_sample=True, temperature=temperature, top_p=top_p)
    else:
        # Sampling needs a strictly positive temperature; decode greedily instead.
        generation_kwargs["do_sample"] = False

    with torch.no_grad():
        outputs = model.generate(**generation_kwargs)

    # Drop the prompt tokens so only the newly generated part is decoded.
    prompt_length = input_ids.shape[1]
    response = tokenizer.decode(outputs[0][prompt_length:], skip_special_tokens=True)
    return response.strip()

# Confirmation message: shows in the cell output that the definitions in this cell were executed.
print("Functions `train_model` and `generate_response` updated.")

Functions `train_model` and `generate_response` updated.


In [None]:
# Load the persona examples from a local JSON file as the "train" split.
persona_dataset = load_dataset("json", data_files="ah_beng_persona.json", split="train")

# Hyper-parameters consumed by train_model(): "r", "lora_alpha" and
# "target_modules" configure the LoRA adapter; the remaining keys are passed
# to SFTConfig.
PERSONA_TRAINING_CONFIG = {
    "r": 32,
    "lora_alpha": 32,
    "target_modules": ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"], # All linear layers
    "per_device_train_batch_size": 2,
    "gradient_accumulation_steps": 4,  # 2 x 4 = 8 examples per optimizer step on one GPU
    "max_steps": 90,  # training stops after this many optimizer steps
    "learning_rate": 2e-4,
    "warmup_steps": 5,
}

Generating train split: 0 examples [00:00, ? examples/s]

In [None]:


# train_model() reads the chat turns from a "conversations" column, while the
# JSON file stores them under "messages". Rename only when the old column is
# still present so that re-running this cell does not raise.
if "messages" in persona_dataset.column_names:
    persona_dataset = persona_dataset.rename_column("messages", "conversations")

In [None]:
def load_trained_model(model_path, adapter_path):
    """
    Return an inference-ready ``(model, tokenizer)`` pair.

    The base checkpoint at `model_path` is loaded in 4-bit, the adapter stored
    at `adapter_path` is attached on top of it, the tokenizer is given the
    "qwen3-instruct" chat template, and the model is switched to Unsloth's
    inference mode.
    """
    from peft import PeftModel

    load_kwargs = dict(
        model_name=model_path,
        max_seq_length=2048,
        load_in_4bit=True,
    )
    base, tok = FastLanguageModel.from_pretrained(**load_kwargs)

    # Wrap the base weights with the saved adapter.
    adapted = PeftModel.from_pretrained(base, adapter_path)

    # Use the same chat template as in training.
    tok = get_chat_template(tok, chat_template="qwen3-instruct")

    # Switch to inference mode before handing the model back.
    FastLanguageModel.for_inference(adapted)
    return adapted, tok

# Confirmation message: shows in the cell output that `load_trained_model` was (re)defined.
print("Function `load_trained_model` redefined.")

Function `load_trained_model` redefined.


# Without System Prompt

In [None]:
print("Training Ah Beng persona adapter...")

# No system_prompt argument is passed, so train_model() uses its default
# (None) and the conversations are rendered without a system message.
persona_results_singlish_AhBeng = train_model(
    model_name="4B-AhBeng-on-Singlish_no_system_prompt",  # run label; also determines the adapter save directory
    train_dataset=persona_dataset,
    config=PERSONA_TRAINING_CONFIG,
)

# Save the results of the persona adapter training
# (the stats dict returned by train_model, written as indented JSON).
with open("training_results_4B_ah_beng_persona_no_system_prompt.json", "w") as f:
    json.dump(persona_results_singlish_AhBeng, f, indent=2)

print("Ah Beng persona adapter training complete and results saved.")


Training Ah Beng persona adapter...
Training 4B-AhBeng-on-Singlish_no_system_prompt
==((====))==  Unsloth 2025.11.6: Fast Qwen3 patching. Transformers: 4.56.2.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.65G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/237 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

added_tokens.json:   0%|          | 0.00/707 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/614 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/11.4M [00:00<?, ?B/s]

chat_template.jinja: 0.00B [00:00, ?B/s]

Unsloth 2025.11.6 patched 36 layers with 36 QKV layers, 36 O layers and 36 MLP layers.


Verified trainable parameters.


Map:   0%|          | 0/102 [00:00<?, ? examples/s]

Unsloth: Tokenizing ["text"] (num_proc=16):   0%|          | 0/102 [00:00<?, ? examples/s]

Map (num_proc=16):   0%|          | 0/102 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 102 | Num Epochs = 7 | Total steps = 90
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 66,060,288 of 4,088,528,384 (1.62% trained)


Step,Training Loss
10,3.986
20,2.9145
30,2.4765
40,1.993
50,1.3804
60,0.9477
70,0.5675
80,0.3332
90,0.1931


Unsloth: Will smartly offload gradients to save VRAM!

4B-AhBeng-on-Singlish_no_system_prompt Training Complete:
  Time: 3.91 min
  Peak VRAM: 3.83 GB
  Final Loss: 1.6435
  Saved to: singlish_adapter_4B-AhBeng-on-Singlish_no_system_prompt
Ah Beng persona adapter training complete and results saved.


In [None]:
import gc

# Force memory cleanup before loading to prevent VRAM errors
gc.collect()
torch.cuda.empty_cache()

print("Loading Ah Beng Adapter...")
model_beng, tokenizer_beng = load_trained_model(
    model_path="yuhueng/qwen3-4b-singlish-base",
    adapter_path="singlish_adapter_4B-AhBeng-on-Singlish_no_system_prompt"
)

# Test Prompts
persona_prompts = [
    "why u so stupid knn?",
    "Hi",
    "what's up",
    "How is life",
    "Tell me a story",
    "do you wanna go on a date with me?",
    "Why is the sky blue?",
    "Best place to eat in singapore?"
]

# generate_response() samples tokens, so fix the RNG seed (same value as the
# training seed) to make the printed replies repeatable on the same setup.
torch.manual_seed(3407)

print("\n--- Persona Check ---\n")
for prompt in persona_prompts:
    # No system prompt is passed: the adapter was trained without one.
    response = generate_response(
        model_beng,
        tokenizer_beng,
        prompt,
        temperature=0.87, # Specific decoding parameters
        top_p=0.87,
        max_new_tokens=65
    )
    print(f"User: {prompt}")
    print(f"Ah Beng: {response}\n")

Loading Ah Beng Adapter...
==((====))==  Unsloth 2025.11.6: Fast Qwen3 patching. Transformers: 4.56.2.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!

--- Persona Check ---

User: why u so stupid knn?
Ah Beng: KNN not stupid lah, just slow like snail. But effective, if you don’t need speed.

User: Hi
Ah Beng: Hi lah! How’s your day so far? I still stuck with this non-stop scrolling problem.

User: what's up
Ah Beng: Nothing much lah. Still waiting for my refund from that scamming cousin. Every time I give, always got excuse.

User: How is life
Ah Beng: Wah knn confirm jialat sia. Money no pain no love, but later you die also no pain. Same for everything else

In [None]:
import torch
import gc
from unsloth import FastLanguageModel
from huggingface_hub import login
from peft import PeftModel # Import PeftModel
from unsloth.chat_templates import get_chat_template # Import get_chat_template

# Clear memory before loading to prevent VRAM issues
gc.collect()
torch.cuda.empty_cache()

# Login to Hugging Face Hub
# You will be prompted to enter your Hugging Face token. Ensure it has write permissions.
login()

# Define model and adapter paths
base_model_name = "yuhueng/qwen3-4b-singlish-base"
adapter_path = "singlish_adapter_4B-AhBeng-on-Singlish_no_system_prompt" # This is the adapter trained WITHOUT a system prompt

print(f"Loading base model from {base_model_name}...")

# Load the base model first
base_model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=base_model_name,
    max_seq_length=2048,
    load_in_4bit=True, # Load in 4-bit for initial loading
)

print(f"Loading adapter from {adapter_path} and applying it to the base model...")
# Load the PEFT adapter and apply it to the base model
model = PeftModel.from_pretrained(base_model, adapter_path)

# Set the chat template for the tokenizer (important for inference and consistent tokenization)
tokenizer = get_chat_template(tokenizer, chat_template="qwen3-instruct")

# Enable inference mode (optional; not required for the upload itself)
FastLanguageModel.for_inference(model)

# Define the name for your Hugging Face repository
hf_repo_name = "JithinBathula/ah-beng-singlish-no-system-prompt" # Renamed for clarity

print(f"Pushing the LoRA adapter and tokenizer to Hugging Face Hub: {hf_repo_name}")

# push_to_hub() on a PeftModel uploads the adapter weights only. The previous
# call passed `tokenizer=` and `merge_and_unload=True`, but neither has any
# effect there: nothing was merged and the tokenizer was never uploaded.
# Push the adapter and the tokenizer explicitly instead.
# NOTE(review): to publish standalone merged weights, Unsloth documents
# `model.push_to_hub_merged(repo, tokenizer, save_method="merged_16bit")`;
# verify it works for a model wrapped manually with PeftModel.from_pretrained
# (and for this 4-bit base checkpoint) before switching to it.
model.push_to_hub(hf_repo_name)
tokenizer.push_to_hub(hf_repo_name)

print("Adapter and tokenizer successfully uploaded to Hugging Face!")


VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

Loading base model from yuhueng/qwen3-4b-singlish-base...
==((====))==  Unsloth 2025.11.6: Fast Qwen3 patching. Transformers: 4.56.2.
   \\   /|    NVIDIA L4. Num GPUs = 1. Max memory: 22.161 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 8.9. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.33.post1. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Loading adapter from singlish_adapter_4B-AhBeng-on-Singlish_no_system_prompt and applying it to the base model...
Pushing the merged model to Hugging Face Hub: JithinBathula/ah-beng-singlish-no-system-prompt


Processing Files (0 / 0)      : |          |  0.00B /  0.00B            

New Data Upload               : |          |  0.00B /  0.00B            

  ...adapter_model.safetensors:   0%|          |  559kB /  264MB            

Model successfully uploaded to Hugging Face!


In [None]:
# Load model directly from the Hub repository to check that the upload is loadable.
# This rebinds the global `model`, replacing the PEFT-wrapped model created earlier.
# NOTE(review): the upload log in this notebook lists only adapter_model.safetensors,
# so this load presumably relies on transformers resolving the adapter's base model
# through its PEFT integration -- confirm.
# NOTE(review): AutoModel presumably returns the backbone without a language-modelling
# head; AutoModelForCausalLM is likely needed for text generation -- confirm.
from transformers import AutoModel
model = AutoModel.from_pretrained("JithinBathula/ah-beng-singlish-no-system-prompt", dtype="auto")