In [7]:
%%sh
# Fetch the bnb-4bit Mistral instruct checkpoint from the Hugging Face Hub
# into a local directory. The access token is taken from the HF_TOKEN variable.
MODEL_REPO="unsloth/mistral-7b-instruct-v0.2-bnb-4bit"
TARGET_DIR="./base_model"

huggingface-cli download "${MODEL_REPO}" \
    --local-dir "${TARGET_DIR}" \
    --token "${HF_TOKEN}"

Downloading '.gitattributes' to 'base_model/.cache/huggingface/download/wPaCkH-WbT7GsmxMKKrNZTV4nSM=.a6344aac8c09253b3b630fb776ae94478aa0275b.incomplete'
Download complete. Moving file to base_model/.gitattributes
Downloading 'README.md' to 'base_model/.cache/huggingface/download/Xn7B-BWUGOee2Y6hCZtEhtFu4BE=.15be1d0739032f5907387b43c73353325a49cc4c.incomplete'
Download complete. Moving file to base_model/README.md
Downloading 'config.json' to 'base_model/.cache/huggingface/download/8_PA_wEVGiVa2goH2H4KQOQpvVY=.40e1f573d0d8a844df016c015a0dd4d38bfbba26.incomplete'
Download complete. Moving file to base_model/config.json
Downloading 'generation_config.json' to 'base_model/.cache/huggingface/download/3EVKVggOldJcKSsGjSdoUCN1AyQ=.3e22f6e907f99d57857ae62725aacfd251ca8e37.incomplete'
Download complete. Moving file to base_model/generation_config.json
Downloading 'model.safetensors' to 'base_model/.cache/huggingface/download/xGOKKLRSlIhH692hSVvI1-gpoa8=.5ac048c8614d6888b433a9ddb4ba8ae063376575

/workspace/rasa-rag-challange-2025/base_model


In [1]:
from unsloth import FastLanguageModel
from transformers import BitsAndBytesConfig

# settings shared with the cells further down
random_seed = 42
max_seq_length = 2048

# bitsandbytes settings: request 4-bit weights when the base model is loaded
quantization_config = BitsAndBytesConfig(load_in_4bit=True)

# read the model and its tokenizer from the local snapshot directory
base_model_load_kwargs = {
    "model_name": "./base_model",
    "max_seq_length": max_seq_length,
    "quantization_config": quantization_config,
}
model, tokenizer = FastLanguageModel.from_pretrained(**base_model_load_kwargs)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.19: Fast Mistral patching. Transformers: 4.50.3.
   \\   /|    NVIDIA A100 80GB PCIe. Num GPUs = 1. Max memory: 79.254 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth: Will load ./base_model as a legacy tokenizer.


In [2]:
from unsloth import FastLanguageModel

# Projection layers that receive LoRA adapters: the attention projections
# (q/k/v/o) followed by the MLP projections (gate/up/down).
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the loaded model with LoRA adapters for parameter-efficient fine-tuning.
# `model` is rebound to the wrapped model, so this cell expects the freshly
# loaded base model as its input.
model = FastLanguageModel.get_peft_model(
    model,
    target_modules=lora_target_modules,
    r=16,
    lora_alpha=16,
    lora_dropout=0,
    bias="none",
    use_rslora=False,
    loftq_config=None,
    use_gradient_checkpointing="unsloth",
    random_state=random_seed,
)

Unsloth 2025.3.19 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


In [3]:
import datasets
from trl.extras.dataset_formatting import get_formatting_func_from_dataset
from unsloth.chat_templates import get_chat_template

# Locations of the fine-tuning splits (JSONL, absolute paths on the training host).
train = '/workspace/rasa-rag-challange-2025/tests/e2e_finetune/output_conversational/4_train_test_split/ft_splits/train.jsonl'
eval_file = '/workspace/rasa-rag-challange-2025/tests/e2e_finetune/output_conversational/4_train_test_split/ft_splits/val.jsonl'

# Read both splits from disk with the "json" loader.
train_dataset = datasets.load_dataset("json", split="train", data_files={"train": train})
eval_dataset = datasets.load_dataset("json", split="eval", data_files={"eval": eval_file})

# Optional sanity check: uncomment to print how a single eval example is formatted.
# print(get_formatting_func_from_dataset(train_dataset, tokenizer)(eval_dataset[0]))

# Key/role renaming handed to the chat template (ShareGPT-style "from"/"value" keys
# with "human"/"gpt" roles).
# NOTE(review): the formatting step reads a `messages` column; confirm that its
# records really use these keys and not plain "role"/"content".
sharegpt_mapping = {"role": "from", "content": "value", "user": "human", "assistant": "gpt"}

# Attach a chat template to the tokenizer so conversations can be rendered to text.
# NOTE(review): the template is "llama-3" although the base model downloaded in this
# notebook is a Mistral instruct checkpoint; confirm this choice is intentional.
tokenizer = get_chat_template(
    tokenizer,
    chat_template="llama-3",
    mapping=sharegpt_mapping,
)

# Define a function to format prompts for each example in the dataset
def formatting_prompts_func(examples):
    """Render every conversation in a batch to a single training string.

    Intended for use with ``Dataset.map(..., batched=True)``.

    Args:
        examples: Batch mapping whose ``"messages"`` entry is a list of
            conversations (one per example).

    Returns:
        dict: ``{"text": [...]}`` holding one rendered string per conversation,
        in the same order as the input batch.

    Raises:
        KeyError: If the batch has no ``"messages"`` column.
    """
    convos = examples["messages"]

    # Render with the module-level tokenizer's chat template; keep the result as
    # text (no tokenization) and do not append a generation prompt, because each
    # conversation is rendered as given.
    texts = [
        tokenizer.apply_chat_template(convo, tokenize=False, add_generation_prompt=False)
        for convo in convos
    ]

    return {"text": texts}

# Add the rendered "text" column to the training split first, then to the
# evaluation split, processing each dataset in batches.
train_dataset, eval_dataset = (
    split.map(formatting_prompts_func, batched=True)
    for split in (train_dataset, eval_dataset)
)

In [4]:
import torch
from transformers import TrainingArguments
from trl import SFTTrainer
from unsloth import is_bfloat16_supported

# Pick the mixed-precision mode once: bf16 when the hardware supports it,
# otherwise fall back to fp16.
use_bf16 = is_bfloat16_supported()

# configure training args
args = TrainingArguments(
    # --- optimisation ---
    seed=random_seed,
    num_train_epochs=5,  # a fixed step budget (max_steps=60) is left disabled
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    warmup_steps=5,
    optim="adamw_8bit",
    weight_decay=0.01,
    # --- precision ---
    bf16=use_bf16,
    fp16=not use_bf16,
    # --- evaluation ---
    eval_strategy="steps",
    eval_steps=50,
    per_device_eval_batch_size=8,
    # --- logging / outputs ---
    logging_steps=30,
    output_dir="outputs",
)

# Build the supervised fine-tuning trainer from the adapter-wrapped model,
# the chat-template tokenizer, both dataset splits and the training arguments.
trainer = SFTTrainer(
    args=args,
    model=model,
    tokenizer=tokenizer,
    max_seq_length=max_seq_length,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

In [5]:
# run fine-tuning with the trainer configured above; whatever trainer.train()
# returns is kept in `finetune_metrics` for later inspection
finetune_metrics = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,826 | Num Epochs = 5 | Total steps = 1,140
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040/3,794,014,208 (1.11% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
50,0.4706,0.01162
100,0.0065,0.003033
150,0.0027,0.002687
200,0.0026,0.002575
250,0.0026,0.002591
300,0.0028,0.002611
350,0.0025,0.002704
400,0.0027,0.002588
450,0.0024,0.002629
500,0.0027,0.002581


Unsloth: Not an error, but MistralForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient




After fine-tuning, the base model and fine-tuned adapters are [merged together and saved to disk](https://docs.unsloth.ai/basics/saving-models/saving-to-vllm) in 16-bit for future compatibility with the [vLLM](https://github.com/vllm-project/vllm) model serving library.

In [6]:
# Write the model together with its tokenizer to disk using the
# "merged_16bit" save method.
merged_model_dir = "./finetuned_model_2"
model.save_pretrained_merged(merged_model_dir, tokenizer, save_method="merged_16bit")

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 593.94 out of 944.44 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 32/32 [00:00<00:00, 70.79it/s]


Unsloth: Saving tokenizer... Done.
Done.


In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Collect the trainer's logged records into a table (one row per log entry)
# and display it; the plotting cell below reads from this frame.
trainer_log_records = trainer.state.log_history
log_history = pd.DataFrame(trainer_log_records)
log_history


In [None]:
# Plot training and validation loss against the training step.
# In the trainer's log history the periodic training loss is stored under
# "loss" and the periodic evaluation loss under "eval_loss". "train_loss" is
# only the single end-of-run summary value, so it is not a curve and is not
# plotted here.
fig, ax = plt.subplots()

# DataFrame.plot returns the Axes it drew on; both names end up bound to `ax`.
# dropna() keeps only the rows in which the respective loss was logged.
train_loss = log_history[["step", "loss"]].dropna().plot(
    x="step", y="loss", label="training loss", ax=ax
)
eval_loss = log_history[["step", "eval_loss"]].dropna().plot(
    x="step", y="eval_loss", label="validation loss", ax=ax
)

ax.set(title="Fine-tuning loss", xlabel="step", ylabel="loss")

# plt.show() renders under the inline backend; fig.show() only works with a
# GUI backend and emits a warning otherwise.
plt.show()