# Model evaluation notebook

In [1]:
# Jupyter version
import ipywidgets as widgets
from IPython.display import display

# GColab version
# Colab form field: the `# @param` annotation lists the selectable model tags
# and allows free-text input.
# NOTE(review): no later cell visible in this notebook reads MODEL; the
# checkpoint that is actually loaded is hard-coded in the model-loading cell.
MODEL = 'phi4:14b' # @param ["gemma2:2b", "gemma2:9b", "gemma3:12b", "mistral:7b", "mixtral:8x7b", "phi:2.7b", "phi4:14b", "deepseek-r1:7b"] {allow-input: true}
# Echo the selected value as the cell output.
MODEL

'phi4:14b'

In [2]:
# Run hyperparameters, exposed as Colab form fields through the `# @param` annotations.
# The LORA_* values are passed to FastLanguageModel.get_peft_model further down.
LORA_RANK = 32 # @param {type:"slider", min:8, max:128, step:8}
LORA_ALPHA = 32 # @param {type:"slider", min:8, max:128, step:8}
LORA_RANK_STABILIZED = False # @param {type:"boolean"}
LORA_DROPOUT = 0 # @param {type:"slider", min:0, max:0.5, step:0.05}
LORA_BIAS = "none" # @param ["none", "all", "lora_only"]
# Passed to TrainingArguments(learning_rate=...).
LEARNING_RATE = 1e-4 # @param {type:"number"}
# Passed to FastLanguageModel.from_pretrained(load_in_4bit=...).
LOAD_IN_4_BIT = False # @param {type:"boolean"}
# Passed to TrainingArguments(num_train_epochs=...).
MAX_EPOCHS = 2 # @param {"type":"slider","min":1,"max":50,"step":1}

import datetime
# Identifier for this run: encodes the LoRA settings, learning rate and 4-bit
# flag, followed by a minute-resolution timestamp taken from the kernel's clock.
# It names the output directory under ./fine-tuned/.
# LORA_RANK_STABILIZED is encoded as 0/1; LOAD_IN_4_BIT as True/False.
# NOTE(review): MAX_EPOCHS is not part of the identifier.
RUN_CODE = (f"r{LORA_RANK}_a{LORA_ALPHA}_s{int(LORA_RANK_STABILIZED)}"
            + f"_d{LORA_DROPOUT}_b{LORA_BIAS}_l{LEARNING_RATE}_4b{LOAD_IN_4_BIT}"
            + "_" + datetime.datetime.now().strftime("%Y%m%d_%H%M")
)
# Echo the identifier as the cell output.
RUN_CODE

'r32_a32_s0_d0_bnone_l0.0001_4bFalse_20250516_2221'

In [3]:
# Start time: print the wall-clock time at the beginning of the run so it can
# be compared with the finish time printed in the Cleanup section.
!date

Fri May 16 10:21:15 PM UTC 2025


## Setup

In [4]:
from getpass import getpass

def get_secret(prompt, secret_name, secret_input=True):
  """Return a secret from the Colab secret store, or ask the user for it.

  The Colab ``userdata`` store is tried first. If it cannot be imported (not
  running in Colab), if the lookup fails, or if it yields ``None``, the user
  is prompted interactively instead.

  :param prompt: Text shown when falling back to an interactive prompt.
  :param secret_name: Name of the secret in the Colab secret store.
  :param secret_input: When True the prompt hides the typed value (``getpass``);
      when False a plain ``input`` prompt is used.
  :return: The secret value.
  """
  result = None
  try:
    from google.colab import userdata
    result = userdata.get(secret_name)
  except Exception:
    # Deliberately best-effort: any failure of the Colab path falls back to the
    # prompt. `Exception` (rather than a bare `except`) lets KeyboardInterrupt
    # and SystemExit propagate so the cell can still be interrupted.
    result = None

  # Explicit check instead of `assert`, which is stripped under `python -O`.
  if result is None:
    result = getpass(prompt) if secret_input else input(prompt)
  return result


### Install required libraries

In [5]:
%%capture
import os
# Treat the runtime as Colab when any environment variable name contains "COLAB_".
if "COLAB_" not in "".join(os.environ.keys()):
    # Outside Colab: a plain install that lets pip resolve unsloth's dependencies.
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    # Dependencies are installed explicitly (xformers and trl pinned), and unsloth
    # itself is then installed with --no-deps.
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth
    # NOTE(review): the two upgrades below are unpinned, so the installed versions
    # depend on the day the notebook is run; `datasets` was already installed above.
    !pip install -U transformers
    !pip install -qU datasets

In [6]:
# Load unsloth ASAP for better optimizations
from unsloth import FastLanguageModel

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [7]:
# Install/upgrade Weights & Biases plus the remaining training/evaluation packages.
# NOTE(review): this cell runs after `unsloth` has already been imported, so any
# package upgraded here that is already loaded in the kernel keeps its old
# version until the kernel restarts — TODO confirm this ordering is intended.
# NOTE(review): the second command relies on the `uv` tool being available.
!pip install wandb -qU
!uv pip install -q bitsandbytes transformers peft accelerate scipy einops evaluate trl rouge_score numpy

In [8]:
# Log in to your W&B account
import wandb
import random
import math
import os

In [9]:
# Authenticate against W&B with the key from the 'TFM_WAND_TOKEN' secret
# (get_secret prompts for it when the secret store is unavailable).
wandb.login(key=get_secret('W&B key: ', 'TFM_WAND_TOKEN'))

[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msergiomportela[0m ([33msergiomportela-universitat-oberta-de-catalunya[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [10]:
# W&B settings passed through environment variables; the training cell reports
# to W&B via TrainingArguments(report_to="wandb").
os.environ["WANDB_PROJECT"] = "fine-tune-tfm-2025-tests"  # name your W&B project
os.environ["WANDB_LOG_MODEL"] = "checkpoint"  # log all model checkpoints

In [11]:
from datasets import load_dataset

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    GenerationConfig
)
from tqdm import tqdm
from trl import SFTTrainer
import torch
import time
import pandas as pd
import numpy as np
from huggingface_hub import interpreter_login
from huggingface_hub import login as hf_login

# Authenticate against the Hugging Face Hub with the token from the
# 'TFM_HUGGINGFACE_TOKEN' secret (prompted for when the secret store is unavailable).
hf_login(token=get_secret('Hugging Face token: ', 'TFM_HUGGINGFACE_TOKEN'))

### Prepare code

In [12]:
import os

if not os.path.exists('src'):
  import urllib

  user = get_secret('User name: ', 'TFM_GH_USER')
  password = get_secret('Password: ', 'TFM_GH_TOKEN')
  password = urllib.parse.quote(password) # your password is converted into url format
  repopath = "tfm-smp-2025/fine-tuning"

  !git clone https://"$user":"$password"@github.com/"$repopath" src

  del password

Cloning into 'src'...
remote: Enumerating objects: 851, done.[K
remote: Counting objects: 100% (191/191), done.[K
remote: Compressing objects: 100% (99/99), done.[K
remote: Total 851 (delta 143), reused 132 (delta 92), pack-reused 660 (from 1)[K
Receiving objects: 100% (851/851), 157.17 KiB | 17.46 MiB/s, done.
Resolving deltas: 100% (585/585), done.


In [13]:
# Update code, if needed: fast-forward an existing ./src checkout to the latest
# remote state (a no-op right after a fresh clone).
!cd src && git pull

Already up to date.


#### Credentials

##### Result pusher setup

In [14]:
import os

if (
    (not os.path.exists(os.path.expanduser("~/.ssh/id_rsa")))
    or (not os.path.exists(os.path.expanduser("~/.ssh/known_hosts")))
):
  SSHKEY = get_secret('Result pusher SSH key: ', 'TFM_SSH_PUSHER_KEY')

  !mkdir ~/.ssh

  # Read locally with `cat ~/.ssh/result-pusher|tr '\n' '$';echo`
  with open(os.path.expanduser("~/.ssh/id_rsa"), 'wt') as f:
    f.write(SSHKEY.replace('$', '\n'))

  !chmod 0600 ~/.ssh/id_rsa
  !ssh-keygen -y -f ~/.ssh/id_rsa > ~/.ssh/id_rsa.pub
  !chmod 0600 ~/.ssh/id_rsa.pub

  # This won't copy the client key (not needed), but it will initialize the server's on the client
  !ssh-copy-id -i ~/.ssh/id_rsa -o StrictHostKeyChecking=accept-new result-pusher@kb.tfm.codigoparallevar.com

  del SSHKEY

/usr/bin/ssh-copy-id: INFO: Source of key(s) to be installed: "/root/.ssh/id_rsa.pub"
/usr/bin/ssh-copy-id: INFO: attempting to log in with the new key(s), to filter out any that are already installed

		(if you think this is a mistake, you may want to use -f option)



### Pull datasets

In [15]:
# !python3 src/scripts/pull_datasets.py

## Run fine tuning

In [16]:
# 4-bit NF4 quantization settings with float16 compute and no double quantization.
# NOTE(review): no later cell visible in this notebook passes `bnb_config` anywhere.
compute_dtype = torch.float16
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=False,
    bnb_4bit_quant_type='nf4',
    bnb_4bit_compute_dtype=compute_dtype,
)

In [17]:
# Candidate checkpoints; each assignment overwrites the previous one, so after
# this cell `model_name` is "unsloth/Phi-4-unsloth-bnb-4bit".
model_name='microsoft/phi-4'
model_name = 'unsloth/Phi-4'  # Unsloth's version
model_name = "unsloth/Phi-4-unsloth-bnb-4bit"  # Phi-4 Unsloth Dynamic 4-bit Quant
# NOTE(review): `device_map` is not passed to the loader below nor used by any
# later cell visible in this notebook.
device_map = {"": 0}


# Load the base model and its tokenizer through Unsloth.
# NOTE(review): the checkpoint is hard-coded to "unsloth/Phi-4" and ignores
# `model_name`, while the next cell loads the tokenizer from `model_name`.
# Confirm which checkpoint is intended and use one name for both.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Phi-4",
    max_seq_length = 16384, # max_seq_length,
    load_in_4bit = LOAD_IN_4_BIT, # load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

==((====))==  Unsloth 2025.5.4: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json:   0%|          | 0.00/29.9k [00:00<?, ?B/s]

model-00001-of-00006.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00002-of-00006.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00003-of-00006.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00004-of-00006.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00005-of-00006.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00006-of-00006.safetensors:   0%|          | 0.00/4.62G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/18.0k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.61M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/917k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/456 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.15M [00:00<?, ?B/s]

In [18]:
# Replace the tokenizer returned by FastLanguageModel.from_pretrained with one
# loaded from `model_name`, configured to pad on the left.
# NOTE(review): `model_name` is "unsloth/Phi-4-unsloth-bnb-4bit" at this point,
# whereas the model itself was loaded from "unsloth/Phi-4" — confirm the two
# tokenizers are interchangeable.
tokenizer = AutoTokenizer.from_pretrained(model_name,trust_remote_code=True,padding_side="left") # ,add_eos_token=True,add_bos_token=True,use_fast=False)
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/18.0k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.61M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/917k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.15M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [19]:
import json

def load_jsonl(path):
  """Read a JSON Lines file and return its records as a list.

  Blank (whitespace-only) lines are skipped, so a trailing empty line does not
  raise. The file is read as UTF-8 regardless of the platform's locale.

  :param path: Path of the ``.jsonl`` file.
  :return: One decoded JSON value per non-blank line, in file order.
  :raises json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  with open(path, encoding='utf-8') as f:
    return [
      json.loads(line)
      for line in f
      if line.strip()
    ]

def regen_jsonl(path):
  """Rewrite a JSON Lines file in place so every record is a JSON object.

  Records that are bare lists are wrapped as ``{"messages": <list>}``; all other
  records are written back unchanged. Running it again on an already converted
  file leaves the records as they are. Blank lines are dropped.

  :param path: Path of the ``.jsonl`` file to normalise.
  :raises json.JSONDecodeError: If a non-blank line is not valid JSON.
  """
  with open(path, encoding='utf-8') as f:
    records = [json.loads(line) for line in f if line.strip()]

  # Serialise everything before reopening the file: opening with 'w' truncates
  # it, so a failure while encoding must not leave a half-written file behind.
  rendered = [
    json.dumps({"messages": record} if isinstance(record, list) else record)
    for record in records
  ]

  with open(path, 'w', encoding='utf-8') as f:
    f.writelines(text + '\n' for text in rendered)

# Normalise both data files in place so every line is an object with a
# "messages" key.
# NOTE(review): both files must already exist in the working directory; the
# dataset-pull cell above is commented out — confirm how they get there.
regen_jsonl("fine-tuning-test.jsonl")
regen_jsonl("fine-tuning-data.jsonl")

In [20]:
import json
import datasets

# Load the two JSON Lines files as the "test" and "train" splits.
# NOTE(review): the recorded output of this cell is
# `NotImplementedError: Loading a dataset cached in a LocalFileSystem is not supported.`,
# and no later cell was executed. Presumably a `datasets`/`fsspec` version
# mismatch in the runtime (packages are upgraded in earlier cells without a
# kernel restart) — TODO confirm and pin compatible versions.
dataset = datasets.load_dataset("json", data_files={
    "test": "fine-tuning-test.jsonl",
    "train": "fine-tuning-data.jsonl",
}, )
# Echo the split summary as the cell output.
dataset # ['test'][0]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

NotImplementedError: Loading a dataset cached in a LocalFileSystem is not supported.

In [None]:
# Sanity check: show the last line of every fine-tuning-*.jsonl file.
!tail -n1 fine-tuning-*.jsonl

In [None]:
# Sanity check: display the first training record.
dataset['train'][0]

In [None]:
%%time
# Fix the random seed before generating.
# NOTE: `seed` is also read by the dataset pre-processing cell further down, so
# this cell must run before it.
from transformers import set_seed
seed = 42
set_seed(seed)

# Position of the sample conversation within the training split.
index = 0

conversation = dataset['train'][index]

import re

# Matches one `<|im_start|>ROLE<|im_sep|>CONTENT<|im_end|>` chunk.
# Group 1 is the role, group 2 the content; the content may contain '<' as long
# as it is not immediately followed by '|'.
PARSE_IM_RE = re.compile(r'<\|im_start\|>([^<]+)<\|im_sep\|>(([^<]|<[^|])*)<\|im_end\|>')

# Generate output
def gen(model, conversation, max_length, skip_last=True):
    """Render a conversation as a chat prompt and let the model continue it.

    Uses the notebook-level ``tokenizer`` for encoding and decoding.

    :param model: Model exposing ``device`` and ``generate``.
    :param conversation: Mapping with a ``'messages'`` list whose items carry
        ``'actor'`` and ``'text'``.
    :param max_length: Forwarded to ``model.generate(max_length=...)``.
    :param skip_last: When True the final message is left out of the prompt; it
        must then be an assistant message.
    :return: The decoded output up to (excluding) the first ``<|endoftext|>``.
    """
    messages = conversation['messages']
    final_position = len(messages) - 1

    rendered = []
    for position, message in enumerate(messages):
        if skip_last and position >= final_position:
            # The held-out turn is the reference answer, so it must be the assistant's.
            assert message['actor'] == 'assistant'
            continue
        # TODO: Check that these tokens are correct
        rendered.append(f"<|im_start|>{message['actor']}<|im_sep|>{message['text']}<|im_end|>")

    prompt = '\n'.join(rendered)
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    generated = model.generate(**encoded, max_length=max_length)
    decoded = ''.join(tokenizer.batch_decode(generated))
    return decoded.split('<|endoftext|>')[0]

def parse_out(output):
  """Split generated text into ``(role, content)`` pairs.

  :param output: Text containing chunks matched by ``PARSE_IM_RE``.
  :return: List of ``(role, content)`` tuples in order of appearance; empty
      when nothing matches.
  """
  # findall yields one tuple per chunk; only the first two groups are kept.
  return [(match[0], match[1]) for match in PARSE_IM_RE.findall(output)]

# Generate a continuation for the sample conversation with the base model and
# split it into (role, content) chunks.
res = gen(model, conversation, 8000,)
output = parse_out(res)

# Separator line of 99 dashes.
dash_line = '-' * 99
# print(f'INPUT PROMPT:\n{formatted_prompt}')
# print(dash_line)
print(f'BASELINE OUTPUT:\n{conversation["messages"][-1]["text"]}\n')
print(dash_line)
if output:
    # The last parsed chunk is reported as the model's answer.
    print(f'MODEL GENERATION - ZERO SHOT:\n{output[-1][1]}')
else:
    # Nothing matched the chat markup: show the raw text instead of failing
    # with an IndexError on `output[-1]`.
    print(f'MODEL GENERATION - ZERO SHOT (unparsed):\n{res}')

In [None]:
def create_prompt_formats(sample):
    """
    Render every conversation of a batch as one chat-markup string.

    :param sample: Batch dictionary whose ``'messages'`` entry is a list of
        conversations; each conversation is a list of steps with ``'actor'``
        and ``'text'`` keys.
    :return: ``{"text": [...]}`` with one string per conversation, its steps
        joined by newlines.
    """
    def _render(conversation):
        # TODO: Check that these tokens are correct
        return '\n'.join(
            f"<|im_start|>{step['actor']}<|im_sep|>{step['text']}<|im_end|>"
            for step in conversation
        )

    return {"text": [_render(conversation) for conversation in sample['messages']]}

In [None]:
from functools import partial

# SOURCE https://github.com/databrickslabs/dolly/blob/master/training/trainer.py
def get_max_length(model):
    """Return the model's maximum sequence length, defaulting to 1024.

    The config attributes ``n_positions``, ``max_position_embeddings`` and
    ``seq_length`` are tried in that order; the first truthy value wins.

    :param model: Object exposing a ``config`` attribute.
    :return: The length found in the config, or 1024 when none is set.
    """
    conf = model.config
    for length_setting in ("n_positions", "max_position_embeddings", "seq_length"):
        candidate = getattr(conf, length_setting, None)
        if candidate:
            print(f"Found max lenth: {candidate}")
            return candidate

    # None of the known attributes is set (or all are falsy).
    fallback = 1024
    print(f"Using default max length: {fallback}")
    return fallback


def preprocess_batch(batch, tokenizer, max_length):
    """
    Tokenize the ``"text"`` entries of a batch, truncating to ``max_length``.

    :param batch: Mapping with a ``"text"`` entry.
    :param tokenizer: Callable tokenizer.
    :param max_length: Maximum number of tokens per entry.
    :return: Whatever the tokenizer returns.
    """
    texts = batch["text"]
    encoded = tokenizer(texts, max_length=max_length, truncation=True)
    return encoded

# SOURCE https://github.com/databrickslabs/dolly/blob/master/training/trainer.py
def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int,seed, dataset):
    """Format and tokenize a dataset split so it is ready for training.

    :param tokenizer (AutoTokenizer): Model tokenizer
    :param max_length (int): Samples with this many tokens or more are dropped
    :param seed: Seed used for shuffling
    :param dataset: Split exposing ``map``, ``filter`` and ``shuffle``
    :return: The formatted, tokenized, filtered and shuffled split
    """
    def _tokenize(rows):
        # No truncation here: over-long samples are removed by the filter below.
        return tokenizer(rows['text'], truncation=False, max_length=max_length)

    def _fits(sample):
        return len(sample["input_ids"]) < max_length

    # Add the rendered prompt ("text" column) to each sample.
    print("Preprocessing dataset...")
    with_text = dataset.map(create_prompt_formats, batched=True)
    print("DS", with_text)

    tokenized = with_text.map(_tokenize, batched=True)

    # Drop over-long samples, then shuffle reproducibly.
    return tokenized.filter(_fits).shuffle(seed=seed)

In [None]:
## Pre-process dataset
# Use the model's own maximum sequence length as the cut-off for sample length.
max_length = get_max_length(model)
print(max_length)

# NOTE: `seed` is defined in the zero-shot generation cell above, so that cell
# must have been run first.
train_dataset = preprocess_dataset(tokenizer, max_length,seed, dataset['train'])
eval_dataset = preprocess_dataset(tokenizer, max_length,seed, dataset['test'])

In [None]:
# Wrap the base model with LoRA adapters configured from the hyperparameter cell.
# NOTE: this rebinds `model`, so re-running the cell without reloading the base
# model would pass an already wrapped model in again.
model = FastLanguageModel.get_peft_model(
    model,
    r = LORA_RANK, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = LORA_ALPHA,
    lora_dropout = LORA_DROPOUT, # Supports any, but = 0 is optimized
    bias = LORA_BIAS,    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 42,
    use_rslora = LORA_RANK_STABILIZED,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

In [None]:
# from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

In [None]:
# # 2 - Using the prepare_model_for_kbit_training method from PEFT
# # Preparing the Model for QLoRA
# original_model = prepare_model_for_kbit_training(original_model)

In [None]:
# config = LoraConfig(
#     r=32, # Rank
#     lora_alpha=16,
#     target_modules=[
#         # "q_proj",  # Not found, ignored
#         # "k_proj",  # Not found, ignored
#         # "v_proj",  # Not found, ignored
#         "o_proj",
#         # "gate_proj", # Not found, ignored
#         # "up_proj", # Not found, ignored
#         "down_proj"
#         ],
#     bias="none",
#     lora_dropout=0.2,
#     task_type="CAUSAL_LM",
# )

# # 1 - Enabling gradient checkpointing to reduce memory usage during fine-tuning
# original_model.gradient_checkpointing_enable()

# peft_model = get_peft_model(original_model, config)

In [None]:
# Report how many parameters the LoRA-wrapped model will actually train.
model.print_trainable_parameters()

In [None]:
# Checkpoints of this run go to ./fine-tuned/<RUN_CODE>.
output_dir = './fine-tuned/' + RUN_CODE
import transformers

peft_training_args = TrainingArguments(
    output_dir = output_dir,
    warmup_steps=1,
    per_device_train_batch_size=1, #
    per_device_eval_batch_size=1, # same as above
    gradient_accumulation_steps=1,
    num_train_epochs=MAX_EPOCHS,
    # max_steps=MAX_STEPS, # If given a MAX_STEPS, not all data is trained
    learning_rate=LEARNING_RATE,
    optim="paged_adamw_8bit",
    save_steps=100, # For `load_best_model_at_end=True` must be same as `eval_steps`
    eval_steps=100, # This is what takes most of the time! More steps between evals means faster training
    do_eval=True,
    logging_steps=25,
    logging_dir="./logs",
    eval_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
    gradient_checkpointing=True,
    report_to="wandb",
    # A real boolean: the string 'True' only worked because any non-empty string
    # is truthy, and 'False' would have been truthy as well.
    overwrite_output_dir = True,
    group_by_length=True,
)

# The generation cache is switched off for training.
model.config.use_cache = False

peft_trainer = transformers.Trainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    args=peft_training_args,
    # mlm=False selects causal language modeling rather than masked-token modeling.
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)


In [None]:
# Run the fine-tuning.
# NOTE(review): `res` is also the name the generation cells use for the
# generated text, so its meaning depends on which cell ran last.
res = peft_trainer.train()

## Test results

In [None]:
%%time
# Same setup as the zero-shot cell before training: reseed so the generation
# runs under the same seed as the earlier one.
# NOTE(review): this cell repeats the earlier setup and helper definitions verbatim.
from transformers import set_seed
seed = 42
set_seed(seed)

# Position of the sample conversation within the training split.
index = 0

# NOTE(review): the sample comes from the training split, so the model has seen
# it during fine-tuning — consider using dataset['test'] for evaluation.
conversation = dataset['train'][index]

import re

# Matches one `<|im_start|>ROLE<|im_sep|>CONTENT<|im_end|>` chunk.
# Group 1 is the role, group 2 the content; the content may contain '<' as long
# as it is not immediately followed by '|'.
PARSE_IM_RE = re.compile(r'<\|im_start\|>([^<]+)<\|im_sep\|>(([^<]|<[^|])*)<\|im_end\|>')

# Generate output
def gen(model, conversation, max_length, skip_last=True):
    """Render a conversation as a chat prompt and let the model continue it.

    Uses the notebook-level ``tokenizer`` for encoding and decoding.

    :param model: Model exposing ``device`` and ``generate``.
    :param conversation: Mapping with a ``'messages'`` list whose items carry
        ``'actor'`` and ``'text'``.
    :param max_length: Forwarded to ``model.generate(max_length=...)``.
    :param skip_last: When True the final message is left out of the prompt; it
        must then be an assistant message.
    :return: The decoded output up to (excluding) the first ``<|endoftext|>``.
    """
    messages = conversation['messages']
    final_position = len(messages) - 1

    rendered = []
    for position, message in enumerate(messages):
        if skip_last and position >= final_position:
            # The held-out turn is the reference answer, so it must be the assistant's.
            assert message['actor'] == 'assistant'
            continue
        # TODO: Check that these tokens are correct
        rendered.append(f"<|im_start|>{message['actor']}<|im_sep|>{message['text']}<|im_end|>")

    prompt = '\n'.join(rendered)
    encoded = tokenizer(prompt, return_tensors="pt").to(model.device)
    generated = model.generate(**encoded, max_length=max_length)
    decoded = ''.join(tokenizer.batch_decode(generated))
    return decoded.split('<|endoftext|>')[0]

def parse_out(output):
  """Split generated text into ``(role, content)`` pairs.

  :param output: Text containing chunks matched by ``PARSE_IM_RE``.
  :return: List of ``(role, content)`` tuples in order of appearance; empty
      when nothing matches.
  """
  # findall yields one tuple per chunk; only the first two groups are kept.
  return [(match[0], match[1]) for match in PARSE_IM_RE.findall(output)]

# Generate a continuation for the sample conversation with the fine-tuned model
# and split it into (role, content) chunks.
res = gen(model, conversation, 1000,)
output = parse_out(res)

# Separator line of 99 dashes.
dash_line = '-' * 99
# print(f'INPUT PROMPT:\n{formatted_prompt}')
# print(dash_line)
print(f'BASELINE OUTPUT:\n{conversation["messages"][-1]["text"]}\n')
print(dash_line)
# This cell runs after training, so the output is labelled as the fine-tuned
# model's; the previous "ZERO SHOT" label made it indistinguishable from the
# pre-training run.
if output:
    # The last parsed chunk is reported as the model's answer.
    print(f'MODEL GENERATION - FINE-TUNED:\n{output[-1][1]}')
else:
    # Nothing matched the chat markup: show the raw text instead of failing
    # with an IndexError on `output[-1]`.
    print(f'MODEL GENERATION - FINE-TUNED (unparsed):\n{res}')

## Upload results

In [None]:
# Upload the local `fine-tuned` directory (training checkpoints) to the
# result-pusher account, using the SSH identity installed in the Setup section.
!rsync -HPrz --mkpath \
  fine-tuned \
  result-pusher@kb.tfm.codigoparallevar.com:fine-tuning

### Quantize and convert

In [None]:
# Export the trained model as GGUF under fine-tuned/<RUN_CODE>/loadable, once
# per requested quantization method (f16, q4_k_m, q6_k).
peft_trainer.model.save_pretrained_gguf("fine-tuned/" + RUN_CODE + "/loadable", tokenizer, quantization_method = [ "f16", "q4_k_m", "q6_k"])

In [None]:
# Upload `fine-tuned` again so the GGUF files written by the previous cell
# reach the server as well.
!rsync -HPrz --mkpath \
  fine-tuned \
  result-pusher@kb.tfm.codigoparallevar.com:fine-tuning

## Cleanup

In [None]:
# Finish time: print the wall-clock time at the end of the run (compare with
# the start time printed near the top of the notebook).
!date

### Stop kernel

In [None]:
# Release the Colab runtime once everything has been uploaded.
# The rest of the notebook also supports non-Colab environments, where
# `google.colab` cannot be imported; in that case there is nothing to release.
try:
    from google.colab import runtime
except ImportError:
    runtime = None

if runtime is not None:
    runtime.unassign()