In [None]:
# Install Unsloth (Efficiency Library) and dependencies
print("Installing Unsloth & Dependencies... (This takes ~2 mins)")
!pip install "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"
!pip install --no-deps xformers trl peft accelerate bitsandbytes

print("✅ Install Complete!")

Installing Unsloth & Dependencies... (This takes ~2 mins)
Collecting unsloth@ git+https://github.com/unslothai/unsloth.git (from unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Cloning https://github.com/unslothai/unsloth.git to /tmp/pip-install-qdfvg13b/unsloth_0620cd369fd04810814b20e973df8fcf
  Running command git clone --filter=blob:none --quiet https://github.com/unslothai/unsloth.git /tmp/pip-install-qdfvg13b/unsloth_0620cd369fd04810814b20e973df8fcf
  Resolved https://github.com/unslothai/unsloth.git to commit ab4061e106792fa91e1eba3e4f3d45fa8aba121e
  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting unsloth_zoo>=2026.1.3 (from unsloth@ git+https://github.com/unslothai/unsloth.git->unsloth[colab-new]@ git+https://github.com/unslothai/unsloth.git)
  Downloading unsloth_zoo-2026.1.3-py3-none-any.whl.metadata (32 kB)
Collecting tyro (f

In [None]:
from unsloth import FastLanguageModel
import torch

# ---- Loading configuration (these names are reused by later cells) ----
# Supports RoPE Scaling internally, so choose any!
max_seq_length = 2048
# None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
dtype = None
# Use 4bit quantization to reduce memory usage. Can be False.
load_in_4bit = True

# Collect the load arguments in one place so the call below stays short.
pretrained_kwargs = dict(
    model_name = "unsloth/llama-3-8b-bnb-4bit",  # Pre-quantized Llama 3
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

print("Loading Model...")
model, tokenizer = FastLanguageModel.from_pretrained(**pretrained_kwargs)

print("✅ Model Loaded Successfully!")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!
Loading Model...
==((====))==  Unsloth 2026.1.3: Fast Llama patching. Transformers: 4.57.3.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = 0.0.33.post2. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/5.70G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/198 [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

✅ Model Loaded Successfully!


In [None]:
from datasets import load_dataset

# 1. Load the dataset: the "train" split of the "pqa_labeled" configuration of
#    PubMedQA (the recorded run reports 1000 examples in this split).
dataset = load_dataset("pubmed_qa", "pqa_labeled", split = "train")

# 2. Define the Prompt Format
#    Alpaca-style template with two positional `{}` placeholders, filled via
#    `str.format`: the first (under "### Input:") takes the question, the second
#    (under "### Response:") takes the answer, or "" when prompting for generation.
alpaca_prompt = """Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
You are a medical expert. Answer the following question truthfully and concisely.

### Input:
{}

### Response:
{}"""

# End-of-sequence marker taken from the loaded tokenizer; it is appended to every
# training example so the model knows when to stop generating.
EOS_TOKEN = tokenizer.eos_token

# 3. Formatting Function
def formatting_prompts_func(examples):
    """Render a batch of examples into Alpaca-style training strings.

    Each entry of ``examples["question"]`` is paired with the entry at the same
    position in ``examples["long_answer"]``; the pair is substituted into
    ``alpaca_prompt`` and ``EOS_TOKEN`` is appended.

    Returns:
        A dict with a single key, ``"text"``, holding the list of rendered strings.
    """
    questions = examples["question"]
    answers = examples["long_answer"]
    # Must add EOS_TOKEN, otherwise your generation will go on forever!
    rendered = [
        alpaca_prompt.format(question, answer) + EOS_TOKEN
        for question, answer in zip(questions, answers)
    ]
    return {"text": rendered}

# 4. Apply to dataset: batched map adds a "text" column built by formatting_prompts_func
dataset = dataset.map(
    formatting_prompts_func,
    batched = True,
)

# Show the first rendered example as a sanity check of the template.
first_formatted_text = dataset['text'][0]
print("✅ Data Formatted. Example:", first_formatted_text)

README.md: 0.00B [00:00, ?B/s]

pqa_labeled/train-00000-of-00001.parquet:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

✅ Data Formatted. Example: Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
You are a medical expert. Answer the following question truthfully and concisely.

### Input:
Do mitochondria play a role in remodelling lace plant leaves during programmed cell death?

### Response:
Results depicted mitochondrial dynamics in vivo as PCD progresses within the lace plant, and highlight the correlation of this organelle with other organelles during developmental PCD. To the best of our knowledge, this is the first report of mitochondria and chloroplasts moving on transvacuolar strands to form a ring structure surrounding the nucleus during developmental PCD. Also, for the first time, we have shown the feasibility for the use of CsA in a whole plant system. Overall, our findings implicate the mitochondria as playing a critical and early role in developmentally regulated PCD in the lace plant.<|end_of_text|>


In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Prepare model for training (adds the LoRA adapters)
# Projection layers that receive adapters, listed once so the call stays readable.
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

model = FastLanguageModel.get_peft_model(
    model,
    # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    r = 16,
    target_modules = lora_target_modules,
    lora_alpha = 16,
    # Supports any, but = 0 is optimized
    lora_dropout = 0,
    # Supports any, but = "none" is optimized
    bias = "none",
    # True or "unsloth" for very long context
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    # We support rank stabilized LoRA
    use_rslora = False,
    # And LoftQ
    loftq_config = None,
)

# Pick the mixed-precision mode once: bf16 where the GPU supports it, fp16 otherwise.
use_bf16 = is_bfloat16_supported()

# Supervised fine-tuning over the "text" column of the formatted dataset.
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    dataset_num_proc = 2,
    packing = False, # Can make training 5x faster for short sequences.
    args = TrainingArguments(
        per_device_train_batch_size = 2,
        gradient_accumulation_steps = 4, # total batch size = 2 x 4 = 8 on one GPU
        warmup_steps = 5,
        max_steps = 60, # Increase this to 300+ for real training!
        learning_rate = 2e-4,
        fp16 = not use_bf16,
        bf16 = use_bf16,
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        # Disable external experiment trackers. Without this, training stops at an
        # interactive Weights & Biases prompt waiting for keyboard input, so the
        # notebook cannot be run unattended (Restart & Run All).
        report_to = "none",
    ),
)

print("🚀 Starting Training...")
# Keep the returned training result for later inspection.
trainer_stats = trainer.train()
print("🏆 Training Complete!")

Unsloth 2026.1.3 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


Unsloth: Tokenizing ["text"] (num_proc=6):   0%|          | 0/1000 [00:00<?, ? examples/s]

The model is already on multiple devices. Skipping the move to device specified in `args`.


🚀 Starting Training...


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,000 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040 of 8,072,204,288 (0.52% trained)
wandb: (1) Create a W&B account
wandb: (2) Use an existing W&B account
wandb: (3) Don't visualize my results
wandb: Enter your choice:

 3


wandb: You chose "Don't visualize my results"


wandb: Detected [huggingface_hub.inference, openai] in use.
wandb: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
wandb: For more information, check out the docs at: https://weave-docs.wandb.ai/


Step,Training Loss
1,2.8682
2,2.9725
3,2.787
4,2.7149
5,2.5268
6,2.2618
7,1.8194
8,1.6362
9,1.5311
10,1.4415




0,1
train/epoch,▁▁▁▁▂▂▂▂▃▃▃▃▃▃▃▄▄▄▄▄▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇█████
train/global_step,▁▁▁▁▂▂▂▂▂▃▃▃▃▄▄▄▄▄▄▅▅▅▅▅▅▆▆▆▆▆▆▇▇▇▇▇▇▇██
train/grad_norm,█▇▇▅█▄▂▃▃▃▄▅▆▆▆▃▂▂▁▂▂▂▁▂▂▂▂▂▁▁▂▁▃▂▂▂▁▁▂▁
train/learning_rate,▂▄▇██▇▇▇▇▇▇▆▆▆▆▆▅▅▅▅▅▅▄▄▄▄▄▃▃▃▃▃▂▂▂▂▂▁▁▁
train/loss,██▆▅▄▂▂▃▂▂▂▂▁▂▂▂▂▃▁▂▂▂▁▂▂▂▂▂▂▂▁▂▂▁▂▁▂▁▃▃

0,1
total_flos,2789050619117568.0
train/epoch,0.48
train/global_step,60.0
train/grad_norm,0.61844
train/learning_rate,0.0
train/loss,1.611
train_loss,1.50696
train_runtime,290.5917
train_samples_per_second,1.652
train_steps_per_second,0.206


🏆 Training Complete!


In [None]:
# Switch the fine-tuned model into Unsloth's inference mode before generating
# (enables native 2x faster inference).
FastLanguageModel.for_inference(model)

# Reuse the training template so the prompt matches what the model was tuned on.
prompt = alpaca_prompt.format(
    "Do mitochondria play a role in heart disease?", # Input
    "", # Output - leave this blank for generation!
)

# The tokenizer takes a batch (list) of prompts; tensors are moved to the GPU.
inputs = tokenizer([prompt], return_tensors = "pt").to("cuda")

# Generate at most 64 new tokens, then decode the first (only) sequence in the batch.
outputs = model.generate(**inputs, max_new_tokens = 64, use_cache = True)
print(tokenizer.batch_decode(outputs)[0])

<|begin_of_text|>Below is an instruction that describes a task. Write a response that appropriately completes the request.

### Instruction:
You are a medical expert. Answer the following question truthfully and concisely.

### Input:
Do mitochondria play a role in heart disease?

### Response:
Mitochondria have been implicated in the development of heart disease, and mitochondrial dysfunction has been linked to the development of cardiac disease. In addition, mitochondrial DNA mutations have been linked to the development of cardiac disease. However, whether mitochondrial dysfunction is a cause or a consequence of cardiac disease remains to be determined.<|end_of_text|>


In [None]:
# 1. Convert the model to GGUF format (Quantized for laptop use)
print("⏳ Converting to GGUF format... (This takes 5-10 mins)")
model.save_pretrained_gguf(
    "medical_llama_3_8b",
    tokenizer,
    quantization_method = "q4_k_m",
)

# 2. Print download instructions (one print call, one message per line)
divider = "----------------------------------------------------------------"
print(
    "✅ Conversion Complete!",
    divider,
    "To download the model to your computer:",
    "1. Click the 'Folder' icon 📁 on the left sidebar of Colab.",
    "2. Navigate to the 'medical_llama_3_8b' folder.",
    "3. Look for the file ending in '.gguf' (approx 5GB).",
    "4. Right-click > Download.",
    divider,
    sep = "\n",
)

⏳ Converting to GGUF format... (This takes 5-10 mins)
Unsloth: Merging model weights to 16-bit format...


config.json:   0%|          | 0.00/768 [00:00<?, ?B/s]

Found HuggingFace hub cache directory: /root/.cache/huggingface/hub


model.safetensors.index.json: 0.00B [00:00, ?B/s]

Checking cache directory for required files...
Cache check failed: model-00001-of-00004.safetensors not found in local cache.
Not all required files found in cache. Will proceed with downloading.
Checking cache directory for required files...
Cache check failed: tokenizer.model not found in local cache.
Not all required files found in cache. Will proceed with downloading.


Unsloth: Preparing safetensor model files:   0%|          | 0/4 [00:00<?, ?it/s]

model-00001-of-00004.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files:  25%|██▌       | 1/4 [02:16<06:48, 136.16s/it]

model-00002-of-00004.safetensors:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files:  50%|█████     | 2/4 [05:05<05:11, 155.65s/it]

model-00003-of-00004.safetensors:   0%|          | 0.00/4.92G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files:  75%|███████▌  | 3/4 [08:24<02:55, 175.33s/it]

model-00004-of-00004.safetensors:   0%|          | 0.00/1.17G [00:00<?, ?B/s]

Unsloth: Preparing safetensor model files: 100%|██████████| 4/4 [08:55<00:00, 133.78s/it]


Note: tokenizer.model not found (this is OK for non-SentencePiece models)


Unsloth: Merging weights into 16bit: 100%|██████████| 4/4 [07:51<00:00, 117.77s/it]


Unsloth: Merge process complete. Saved to `/content/medical_llama_3_8b`
Unsloth: Converting to GGUF format...
==((====))==  Unsloth: Conversion from HF to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF f16 might take 3 minutes.
\        /    [2] Converting GGUF f16 to ['q4_k_m'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: Updating system package directories
Unsloth: All required system packages already installed!
Unsloth: Install llama.cpp and building - please wait 1 to 3 minutes
Unsloth: Cloning llama.cpp repository
Unsloth: Install GGUF and other packages
Unsloth: Successfully installed llama.cpp!
Unsloth: Preparing converter script...
Unsloth: [1] Converting model into f16 GGUF format.
This might take 3 minutes...
Unsloth: Initial conversion completed! Files: ['llama-3-8b.F16.gguf']
Unsloth: [