## Fine-tune Mistral NeMo (12B)

In [1]:
# Install necessary packages
!pip install unsloth bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
!pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
!pip install --no-deps unsloth

Collecting unsloth
  Downloading unsloth-2025.3.19-py3-none-any.whl.metadata (46 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/46.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m46.2/46.2 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting bitsandbytes
  Downloading bitsandbytes-0.45.5-py3-none-manylinux_2_24_x86_64.whl.metadata (5.0 kB)
Collecting xformers==0.0.29.post3
  Downloading xformers-0.0.29.post3-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (1.0 kB)
Collecting trl==0.15.2
  Downloading trl-0.15.2-py3-none-any.whl.metadata (11 kB)
Collecting cut_cross_entropy
  Downloading cut_cross_entropy-25.1.1-py3-none-any.whl.metadata (9.3 kB)
Collecting unsloth_zoo
  Downloading unsloth_zoo-2025.3.17-py3-none-any.whl.metadata (8.0 kB)
Collecting datasets>=2.21.0 (from trl==0.15.2)
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from



In [2]:
from unsloth import FastLanguageModel
from datasets import load_dataset
from transformers import TrainingArguments
from trl import SFTTrainer
import torch

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [3]:
# Parameters for the model and LoRA setup
max_seq_length = 2048
dtype = None  # Auto-detect dtype
load_in_4bit = True

In [6]:
# Load the pre-trained Mistral NeMo (12B) model
import os
model_name = "facebook/opt-125m"
# 1. Setting Hugging Face token as environment variable
os.environ["HUGGING_FACE_HUB_TOKEN"] = "YOUR_HUGGING_FACE_TOKEN"
# Or
# 2. Passing token directly to the function
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    token="YOUR_HUGGING_FACE_TOKEN"  # Replace with your token
)

==((====))==  Unsloth 2025.3.19: Fast Opt patching. Transformers: 4.50.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 7.5. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`
Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


In [7]:
# Add LoRA for efficient fine-tuning
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank, tune this based on hardware
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha=16,
    lora_dropout=0,  # Dropout rate
    bias="none",  # Optimized setting
    use_gradient_checkpointing="unsloth",  # Enable for long context lengths
    random_state=3407
)

Unsloth: Making `model.base_model.model.model.decoder` require gradients


In [19]:
from datasets import load_dataset

# Load the Python dataset from CodeSearchNet
dataset = load_dataset("code_search_net", "python")

# Formatting function
def formatting_func(example):
    doc = example["func_documentation_string"]
    code = example["func_code_string"]
    return f'"""{doc}"""\n{code}'

# Apply the formatting only to the train split
formatted_train = dataset["train"].map(formatting_prompts_func, batched=True)

In [21]:
from unsloth.trainer import SFTTrainer  # Import SFTTrainer from unsloth.trainer

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=dataset["train"],
    formatting_func=formatting_func,
    dataset_text_field="func_code_string",  # Technically this is optional if formatting_func is set
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    packing=False,
    args=TrainingArguments(
        per_device_train_batch_size=2,
        gradient_accumulation_steps=4,
        warmup_steps=5,
        max_steps=1000,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=1,
        optim="adamw_8bit",
        weight_decay=0.01,
        lr_scheduler_type="linear",
        seed=3407,
        output_dir="outputs",
        report_to="none",
    ),
)

Unsloth: Tokenizing ["func_code_string"] (num_proc=2):   0%|          | 0/412178 [00:00<?, ? examples/s]

In [22]:
# Start the fine-tuning process
trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 412,178 | Num Epochs = 1 | Total steps = 1,000
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 884,736/83,656,704 (1.06% trained)
`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`.


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.1408
2,2.2026
3,2.3137
4,2.3995
5,2.7164
6,2.2459
7,1.7626
8,2.725
9,1.9335
10,2.0722


TrainOutput(global_step=1000, training_loss=1.907670191168785, metrics={'train_runtime': 595.2907, 'train_samples_per_second': 13.439, 'train_steps_per_second': 1.68, 'total_flos': 2662323684249600.0, 'train_loss': 1.907670191168785})

In [23]:
# Inference - Test the fine-tuned model on a coding prompt
FastLanguageModel.for_inference(model)

# Test prompt to generate Python code from model
test_input = "### Code:\n# Write a function that returns the Fibonacci sequence\n### Response: "
inputs = tokenizer([test_input], return_tensors="pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens=64, use_cache=True)
generated_code = tokenizer.batch_decode(outputs, skip_special_tokens=True)

print("Generated Code:\n", generated_code[0])

OPTForCausalLM has no `_prepare_4d_causal_attention_mask_with_cache_position` method defined in its base modeling class. Compiled forward passes will be sub-optimal. If you're writing code, see Llama for an example implementation. If you're a user, please report this issue on GitHub.


Generated Code:
 ### Code:
# Write a function that returns the Fibonacci sequence
### Response:  ------------------------------------------,, is is is is is is and and,,,,,,,,,,,


In [24]:
# Save the fine-tuned model and tokenizer
model.save_pretrained("lora_mistral_neMo_12B")
tokenizer.save_pretrained("lora_mistral_neMo_12B")

('lora_mistral_neMo_12B/tokenizer_config.json',
 'lora_mistral_neMo_12B/special_tokens_map.json',
 'lora_mistral_neMo_12B/vocab.json',
 'lora_mistral_neMo_12B/merges.txt',
 'lora_mistral_neMo_12B/added_tokens.json',
 'lora_mistral_neMo_12B/tokenizer.json')