

---



---


***Before Finetuning***



---



---



In [None]:
# Install required libraries
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1" huggingface_hub hf_transfer
    !pip install --no-deps unsloth

In [2]:
# Load dataset and split into train, validation, test
from datasets import load_dataset

# Load and shuffle once
dataset = load_dataset("medalpaca/medical_meadow_mediqa")
data = dataset["train"].shuffle(seed=42)

# First split: 80% train, 20% temp
split1 = data.train_test_split(test_size=0.2, seed=42)

# Split the temp 20% into 50/50 → 10% val, 10% test
rest = split1["test"].train_test_split(test_size=0.5, seed=42)

train_full, val_full, test_full = split1["train"], rest["train"], rest["test"]

# Now sample fixed sizes *within* each split
train_set = train_full.shuffle(seed=42).select(range(200))
val_set   = val_full.shuffle(seed=42).select(range(50))
test_set  = test_full.shuffle(seed=42).select(range(50))

print("Sizes -> Train:", len(train_set), "Validation:", len(val_set), "Test:", len(test_set))


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/653 [00:00<?, ?B/s]

medical_meadow_mediqa.json:   0%|          | 0.00/15.8M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2208 [00:00<?, ? examples/s]

Sizes -> Train: 200 Validation: 50 Test: 50


In [None]:


from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.


model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

In [4]:
# Prompt template (alpaca style, same used in fine-tuning)

prompt_template = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
"""

In [5]:
import torch, json
from tqdm import tqdm

In [6]:
# Generate predictions from test set and save outputs

# Generation settings (keep same before and after finetuning)
MAX_NEW_TOKENS = 200
DO_SAMPLE = False
TEMPERATURE = 0.0 # deterministic

N = len(test_set)
results = []

# Ensure the model is in the correct data type for inference
# model.to(model.dtype) # This line is not needed for bitsandbytes models

for i in tqdm(range(N)):
    row = test_set[i]
    instr = row["instruction"].strip()
    inp   = row["input"].strip()
    output = row["output"].strip()

    # Build prompt from template
    prompt = prompt_template.format(instr, inp)

    # Tokenize prompt
    enc = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_seq_length).to(model.device)

    # Generate prediction
    with torch.no_grad():
        out = model.generate(
            **enc,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=DO_SAMPLE,
            temperature=TEMPERATURE,
            pad_token_id=tokenizer.pad_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    # Extract only generated continuation (skip the prompt part)
    prompt_len = enc["input_ids"].shape[1]
    gen_ids = out[0][prompt_len:]
    prediction = tokenizer.decode(gen_ids, skip_special_tokens=True).strip()

    # Save record
    results.append({
        "id": i + 1,
        "prompt": prompt,
        "output": output,
        "prediction": prediction
    })

100%|██████████| 50/50 [10:22<00:00, 12.46s/it]


In [13]:


# Save to Excel file
import pandas as pd

out_file = "before-finetuning.xlsx"

df = pd.DataFrame(results, columns=["id", "prompt", "output", "prediction"])
df.to_excel(out_file, index=False)

print(f"Saved {out_file} with {len(df)} rows.")



Saved before-finetuning.xlsx with 50 rows.


**NOTES**

Why we use Unsloth

1. The raw Meta-Llama-3.1-8B model is too big for free Colab (it needs >40GB VRAM).

Unsloth provides a wrapper (FastLanguageModel) that:

2. Loads the model in 4-bit quantization  fits on a T4 GPU (~15GB).

3. Gives us both the model and the tokenizer in one call.

4. Has helpers like .for_inference() to make generation easier/faster.



---



---


***Finetuning the Model using Unsloth***



---



---











In [None]:
%%capture
import os, re
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    import torch; v = re.match(r"[0-9\.]{3,}", str(torch.__version__)).group(0)
    xformers = "xformers==" + ("0.0.32.post2" if v == "2.8.0" else "0.0.29.post3")
    !pip install --no-deps bitsandbytes accelerate {xformers} peft trl triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf "datasets>=3.4.1,<4.0.0" "huggingface_hub>=0.34.0" hf_transfer
    !pip install --no-deps unsloth
!pip install transformers==4.55.4

In [None]:
# max_seq_length - maximum number of tokens the model can process in a single input but bigger means more gpu memory also.
# quantized our LLM to fit while using colab.

from unsloth import FastLanguageModel
import torch
max_seq_length = 2048 # Choose any! We auto support RoPE Scaling internally!
dtype = None # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
load_in_4bit = True # Use 4bit quantization to reduce memory usage. Can be False.

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Meta-Llama-3.1-8B",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

==((====))==  Unsloth 2025.9.3: Fast Llama patching. Transformers: 4.56.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.8.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.4.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


In [14]:
# peft - Parameter-Efficient Fine-Tuning.
# if r is bigger, more storage of memory and ability to learn, better results.
# lora_alpha - scales adapter's effects, decides how much effect lora has on the LLM.

model = FastLanguageModel.get_peft_model(
    model,
    r = 16, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = 16,
    lora_dropout = 0, # Supports any, but = 0 is optimized
    bias = "none",    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 3407,
    use_rslora = False,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.9.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


**Using our data.**

We'll be using
This data has been split into three sets.

1. train.jsonl
 2. test.jsonl
 3. validation.jsonl


**Why use Dataset.from_list(...) to make a HF Dataset?**

- Dataset.from_list([...]) converts a plain Python list of examples (e.g. [{ "text": "..." }, ...]) into a Hugging Face Dataset object that Trainer expects.

- The trainer in the notebook expects a Dataset with a column called "text" (that single field contains prompt + reference + EOS). from_list makes that exact shape quickly and simply

In [None]:
# Cell A: format in-memory splits into 'text' field for training
from datasets import Dataset

# Alpaca prompt (same as before)
alpaca_prompt = """Below is an instruction that describes a task, paired with an input that provides further context. Write a response that appropriately completes the request.

### Instruction:
{}

### Input:
{}

### Response:
{}"""

# get EOS token from tokenizer (important)
EOS_TOKEN = tokenizer.eos_token

# Simple formatting function for batched mapping
def format_batch(examples):
    # examples is a dict of lists: examples["instruction"] is a list of instructions
    texts = []
    instructions = examples["instruction"]
    inputs = examples["input"]
    outputs = examples["output"]
    for i in range(len(instructions)):
        instr = (instructions[i] or "").strip()
        inp = (inputs[i] or "").strip()
        out = (outputs[i] or "").strip()
        text = alpaca_prompt.format(instr, inp, out) + EOS_TOKEN
        texts.append(text)
    return {"text": texts}

# Apply the formatting to each split
train_dataset = train_set.map(format_batch, batched=True)
val_dataset   = val_set.map(format_batch, batched=True)
test_dataset  = test_set.map(format_batch, batched=True)

print("Prepared datasets:")
print("Train:", len(train_dataset), "Val:", len(val_dataset), "Test:", len(test_dataset))
# show a short example
print(train_dataset[0]["text"][:300])


**trl - Transformer Reinforcement Learning**

library designed to train transformer language models

In [None]:
from trl import SFTConfig, SFTTrainer
trainer = SFTTrainer(
    model = model,
    tokenizer = tokenizer,
    train_dataset = train_dataset,
    eval_dataset = val_dataset,
    dataset_text_field = "text",
    max_seq_length = max_seq_length,
    packing = False, # Can make training 5x faster for short sequences.
    args = SFTConfig(
        per_device_train_batch_size = 1,
        per_device_eval_batch_size = 1,
        gradient_accumulation_steps = 8,
        warmup_steps = 5,
        num_train_epochs = 1, # Set this for 1 full training run.
        # max_steps = 60,
        learning_rate = 2e-4,
        logging_steps = 1,
        optim = "adamw_8bit",
        weight_decay = 0.01,
        lr_scheduler_type = "linear",
        seed = 3407,
        output_dir = "outputs",
        report_to = "none",
        eval_strategy = "steps",
        eval_steps = 10,    # how often to run validation
        save_strategy = "no",
    ),
)

In [17]:
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 200 | Num Epochs = 1 | Total steps = 25
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 8 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040 of 8,072,204,288 (0.52% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss,Validation Loss
10,1.1812,1.204831
20,0.9675,1.126845


Unsloth: Not an error, but LlamaForCausalLM does not accept `num_items_in_batch`.
Using gradient accumulation will be very slightly less accurate.
Read more on gradient accumulation issues here: https://unsloth.ai/blog/gradient


In [20]:
# Run inference on your test set and save predictions

import json
from tqdm import tqdm

# Make sure model is in inference mode (faster)
FastLanguageModel.for_inference(model)

MAX_NEW_TOKENS = 128
DO_SAMPLE = False
TEMPERATURE = 0.0

results = []
N = len(test_set)

for i in tqdm(range(N)):
    item = test_set[i]

    instr = (item.get("instruction") or "").strip()
    inp   = (item.get("input") or "").strip()
    output = (item.get("output") or "").strip()   # <-- ground truth

    prompt = alpaca_prompt.format(instr, inp, "")

    enc = tokenizer(prompt, return_tensors="pt", truncation=True, max_length=max_seq_length).to(model.device)

    with torch.no_grad():
        out = model.generate(
            **enc,
            max_new_tokens=MAX_NEW_TOKENS,
            do_sample=DO_SAMPLE,
            temperature=TEMPERATURE,
            eos_token_id=tokenizer.eos_token_id,
            pad_token_id=tokenizer.pad_token_id
        )

    prompt_len = enc["input_ids"].shape[1]
    gen_ids = out[0][prompt_len:]
    prediction = tokenizer.decode(gen_ids, skip_special_tokens=True).strip()

    results.append({
        "id": i + 1,       # start ids at 1
        "prompt": prompt,
        "output": output,   # <-- add ground truth
        "prediction": prediction
    })


100%|██████████| 50/50 [07:37<00:00,  9.16s/it]


In [21]:
# Save the file

import pandas as pd

df = pd.DataFrame(results, columns=["id","prompt","output","prediction"])
df.to_excel("after-finetuning.xlsx", index=False)

print("Saved after-finetuning.xlsx with", len(df), "rows.")


Saved after-finetuning.xlsx with 50 rows.


In [2]:
# Evaluation : Used Chatgpt to evaluate both excel files 'before-finetuning.xlsx' and 'after-finetuning.xlsx' on the basis of
"""
A prediction is marked correct (1) if:
Core meaning preserved → it conveys the same main fact as the output.
No contradictions → does not conflict with output (e.g., wrong numbers, opposite advice).
Completeness relative to question → prediction can be shorter, but must include essential points.
Tone consistency (light check) → no inappropriate disclaimers (e.g., “I’m not a doctor”) if output is professional.
Readable & not cut off → prediction must not be incomplete or abruptly end.
A prediction is incorrect (0) if any of the above fail.
"""
# Chatgpt gave me two files 'before-finetuning-eval.xlsx' and 'after-finetuning-eval.xlsx' and we'll evaluate the metrics.




'\nA prediction is marked correct (1) if:\nCore meaning preserved → it conveys the same main fact as the output.\nNo contradictions → does not conflict with output (e.g., wrong numbers, opposite advice).\nCompleteness relative to question → prediction can be shorter, but must include essential points.\nTone consistency (light check) → no inappropriate disclaimers (e.g., “I’m not a doctor”) if output is professional.\nReadable & not cut off → prediction must not be incomplete or abruptly end.\nA prediction is incorrect (0) if any of the above fail.\n'

In [10]:
import pandas as pd
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

# 1. Load your file
df = pd.read_excel("/content/before-finetuning-eval.xlsx")

# 2. Extract the 'correct' column
# This is your predicted correctness (1 = correct, 0 = incorrect)
y_pred = df["correct_llm"]  # or df["correct"] if that's your column name

# 3. Define the ground truth labels
# Since every row has a valid output, the ground truth = 1 for all rows
y_true = [1] * len(y_pred)

# 4. Compute metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, zero_division=0)
recall = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)


Accuracy: 0.14
Precision: 1.0
Recall: 0.14
F1-score: 0.24561403508771928


In [7]:
# 1. Load your file
df = pd.read_excel("/content/after-finetuning-eval.xlsx")

# 2. Extract the 'correct' column
# This is your predicted correctness (1 = correct, 0 = incorrect)
y_pred = df["correct_llm"]   # or df["correct"] if that's your column name

# 3. Define the ground truth labels
# Since every row has a valid output, the ground truth = 1 for all rows
y_true = [1] * len(y_pred)

# 4. Compute metrics
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, zero_division=0)
recall = recall_score(y_true, y_pred, zero_division=0)
f1 = f1_score(y_true, y_pred, zero_division=0)

print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1-score:", f1)

50
Accuracy: 0.38
Precision: 1.0
Recall: 0.38
F1-score: 0.5507246376811594


In [None]:
# Final Results
"""
Finetuning improved accuracy by 24 percentage points and relatively increased it by 171%!

"""