# Model evaluation notebook

In [1]:
# Jupyter version
import ipywidgets as widgets
from IPython.display import display

# GColab version
MODEL = 'phi4:14b' # @param ["gemma2:2b", "gemma2:9b", "gemma3:12b", "mistral:7b", "mixtral:8x7b", "phi:2.7b", "phi4:14b", "deepseek-r1:7b"] {allow-input: true}
MODEL

'phi4:14b'

In [2]:
LORA_RANK = 16 # @param {type:"slider", min:8, max:128, step:8}
LORA_ALPHA = 16 # @param {type:"slider", min:8, max:128, step:8}
LORA_RANK_STABILIZED = False # @param {type:"boolean"}
LORA_DROPOUT = 0 # @param {type:"slider", min:0, max:0.5, step:0.05}
LORA_BIAS = "none" # @param ["none", "all", "lora_only"]
LEARNING_RATE = 1e-4 # @param {type:"number"}
LOAD_IN_4_BIT = False # @param {type:"boolean"}
MAX_EPOCHS = 2 # @param {"type":"slider","min":1,"max":50,"step":1}

import datetime
RUN_CODE = (f"r{LORA_RANK}_a{LORA_ALPHA}_s{int(LORA_RANK_STABILIZED)}"
            + f"_d{LORA_DROPOUT}_b{LORA_BIAS}_l{LEARNING_RATE}_4b{LOAD_IN_4_BIT}"
            + "_" + datetime.datetime.now().strftime("%Y%m%d_%H%M")
)
RUN_CODE

'r16_a16_s0_d0_bnone_l0.0001_4bFalse_20250516_1601'

In [3]:
# Start time
!date

Fri May 16 04:01:22 PM UTC 2025


## Setup

In [4]:
from getpass import getpass

def get_secret(prompt, secret_name, secret_input=True):
  try:
    from google.colab import userdata
    result = userdata.get(secret_name)
    assert result is not None
  except:
    if secret_input:
      result = getpass(prompt)
    else:
      result = input(prompt)
  return result


### Install required libraries

In [5]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth
    !pip install -U transformers

In [6]:
# Load unsloth ASAP for better optimizations
from unsloth import FastLanguageModel

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [7]:
!pip install wandb -qU
!uv pip install -q bitsandbytes transformers peft accelerate scipy einops evaluate trl rouge_score numpy
!uv pip install -qU datasets

In [8]:
# Log in to your W&B account
import wandb
import random
import math
import os

In [9]:
wandb.login(key=get_secret('W&B key: ', 'TFM_WAND_TOKEN'))

[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33msergiomportela[0m ([33msergiomportela-universitat-oberta-de-catalunya[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [10]:
os.environ["WANDB_PROJECT"] = "fine-tune-tfm-2025-tests"  # name your W&B project
os.environ["WANDB_LOG_MODEL"] = "checkpoint"  # log all model checkpoints

In [11]:
from datasets import load_dataset

from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    HfArgumentParser,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    GenerationConfig
)
from tqdm import tqdm
from trl import SFTTrainer
import torch
import time
import pandas as pd
import numpy as np
from huggingface_hub import interpreter_login
from huggingface_hub import login as hf_login

hf_login(token=get_secret('Hugging Face token: ', 'TFM_HUGGINGFACE_TOKEN'))

### Prepare code

In [12]:
import os

if not os.path.exists('src'):
  import urllib

  user = get_secret('User name: ', 'TFM_GH_USER')
  password = get_secret('Password: ', 'TFM_GH_TOKEN')
  password = urllib.parse.quote(password) # your password is converted into url format
  repopath = "tfm-smp-2025/fine-tuning"

  !git clone https://"$user":"$password"@github.com/"$repopath" src

  del password

In [13]:
# Update code, if needed
!cd src && git pull

Already up to date.


#### Credentials

##### Result pusher setup

In [14]:
import os

if (
    (not os.path.exists(os.path.expanduser("~/.ssh/id_rsa")))
    or (not os.path.exists(os.path.expanduser("~/.ssh/known_hosts")))
):
  SSHKEY = get_secret('Result pusher SSH key: ', 'TFM_SSH_PUSHER_KEY')

  !mkdir ~/.ssh

  # Read locally with `cat ~/.ssh/result-pusher|tr '\n' '$';echo`
  with open(os.path.expanduser("~/.ssh/id_rsa"), 'wt') as f:
    f.write(SSHKEY.replace('$', '\n'))

  !chmod 0600 ~/.ssh/id_rsa
  !ssh-keygen -y -f ~/.ssh/id_rsa > ~/.ssh/id_rsa.pub
  !chmod 0600 ~/.ssh/id_rsa.pub

  # This won't copy the client key (not needed), but it will initialize the server's on the client
  !ssh-copy-id -i ~/.ssh/id_rsa -o StrictHostKeyChecking=accept-new result-pusher@kb.tfm.codigoparallevar.com

  del SSHKEY

### Pull datasets

In [15]:
# !python3 src/scripts/pull_datasets.py

## Run fine tuning

In [16]:
compute_dtype = getattr(torch, "float16")
bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type='nf4',
        bnb_4bit_compute_dtype=compute_dtype,
        bnb_4bit_use_double_quant=False,
    )

In [17]:
model_name='microsoft/phi-4'
model_name = 'unsloth/Phi-4'  # Unsloth's version
model_name = "unsloth/Phi-4-unsloth-bnb-4bit"  # Phi-4 Unsloth Dynamic 4-bit Quant
device_map = {"": 0}


model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Phi-4",
    max_seq_length = 16384, # max_seq_length,
    load_in_4bit = LOAD_IN_4_BIT, # load_in_4bit,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

==((====))==  Unsloth 2025.5.4: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

In [18]:
tokenizer = AutoTokenizer.from_pretrained(model_name,trust_remote_code=True,padding_side="left") # ,add_eos_token=True,add_bos_token=True,use_fast=False)
tokenizer.pad_token = tokenizer.eos_token

In [19]:
import json

def load_jsonl(path):
  with open(path) as f:
    return [
      json.loads(line)
      for line in f
    ]

def regen_jsonl(path):
  with open(path) as f:
    data = [
      json.loads(line)
      for line in f
    ]
  with open(path, 'w') as f:
    for line in data:
      if isinstance(line, list):
        f.write(json.dumps({"messages": line}) + '\n')
      else:
        f.write(json.dumps(line) + '\n')

regen_jsonl("fine-tuning-test.jsonl")
regen_jsonl("fine-tuning-data.jsonl")

In [20]:
import json
import datasets

dataset = datasets.load_dataset("json", data_files={
    "test": "fine-tuning-test.jsonl",
    "train": "fine-tuning-data.jsonl",
}, )
dataset # ['test'][0]

Generating test split: 0 examples [00:00, ? examples/s]

Generating train split: 0 examples [00:00, ? examples/s]

DatasetDict({
    test: Dataset({
        features: ['messages'],
        num_rows: 10
    })
    train: Dataset({
        features: ['messages'],
        num_rows: 2200
    })
})

In [21]:
!tail -n1 fine-tuning-*.jsonl

==> fine-tuning-data.jsonl <==
{"messages": [{"actor": "user", "text": "Extract the nouns from this natural language query.\n\n> List the team for which Doug Acomb played ?\n\nLet's reason step by step. Identify the nouns on the query, skip the ones that can be solved by a SPARQL verb (ignore, for example, \"count\" or \"number of\"), and output a json list like this.\n\n```json\n[\n    \"entity1\",\n    \"entity2\",\n    ...\n    \"entityN\"\n]\n```"}, {"actor": "assistant", "text": "```json\n[\n    \"List\",\n    \"team\",\n    \"Doug Acomb\",\n    \"played\"\n]\n```"}, {"actor": "user", "text": "\nThis are some examples on how the available properties can be used:\n\n### Subject: <http://dbpedia.org/resource/Doug_Acomb> ; Predicate: <http://dbpedia.org/ontology/formerTeam>\n\n```sparql\nSELECT DISTINCT ?object WHERE { <http://dbpedia.org/resource/Doug_Acomb> <http://dbpedia.org/ontology/formerTeam> ?object }\n```\n\n### Subject: <http://dbpedia.org/resource/Doug_Acomb> ; Predicate: 

In [22]:
dataset['train'][0]

{'messages': [{'actor': 'user',
   'text': 'Extract the nouns from this natural language query.\n\n> How many movies did Stanley Kubrick direct?\n\nLet\'s reason step by step. Identify the nouns on the query, skip the ones that can be solved by a SPARQL verb (ignore, for example, "count" or "number of"), and output a json list like this.\n\n```json\n[\n    "entity1",\n    "entity2",\n    ...\n    "entityN"\n]\n```'},
  {'actor': 'assistant',
   'text': '```json\n[\n    "movies",\n    "Stanley Kubrick",\n    "direct"\n]\n```'},
  {'actor': 'user',
   'text': "\nThis are some examples on how the available properties can be used:\n\n### Subject: <http://dbpedia.org/resource/Direct_selling> ; Predicate: <http://www.w3.org/ns/prov#wasDerivedFrom>\n\n```sparql\nSELECT DISTINCT ?object WHERE { <http://dbpedia.org/resource/Direct_selling> <http://www.w3.org/ns/prov#wasDerivedFrom> ?object }\n```\n\n### Subject: <http://dbpedia.org/resource/Direct_marketing> ; Predicate: <http://www.w3.org/ns/p

In [23]:
%%time
from transformers import set_seed
seed = 42
set_seed(seed)

index = 0

conversation = dataset['train'][index]

import re

PARSE_IM_RE = re.compile(r'<\|im_start\|>([^<]+)<\|im_sep\|>(([^<]|<[^|])*)<\|im_end\|>')

# Generate output
def gen(model, conversation, max_length, skip_last=True):
    conversation_steps = []
    for idx, step in enumerate(conversation['messages']):
        # TODO: Check that these tokens are correct
        if idx < (len(conversation['messages']) - 1) or not skip_last:
          conversation_steps.append(f"<|im_start|>{step['actor']}<|im_sep|>{step['text']}<|im_end|>")
        else:
          assert step['actor'] == 'assistant'

    inputs = tokenizer('\n'.join(conversation_steps), return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_length=max_length)
    return ''.join(tokenizer.batch_decode(outputs)).split('<|endoftext|>')[0] # , skip_special_tokens=True)

def parse_out(output):
  results = []
  for chunk in PARSE_IM_RE.findall(output):
    results.append((chunk[0], chunk[1]))
  return results

res = gen(model, conversation, 8000,)
output = parse_out(res)

dash_line = '-'.join('' for x in range(100))
# print(f'INPUT PROMPT:\n{formatted_prompt}')
# print(dash_line)
print(f'BASELINE OUTPUT:\n{conversation["messages"][-1]["text"]}\n')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output[-1][1]}')

BASELINE OUTPUT:
```sparql
SELECT DISTINCT COUNT(?uri) WHERE {?uri <http://dbpedia.org/ontology/director> <http://dbpedia.org/resource/Stanley_Kubrick>  . }
```

---------------------------------------------------------------------------------------------------
MODEL GENERATION - ZERO SHOT:
To find out how many movies Stanley Kubrick directed, you can use the following SPARQL query:

```sparql
SELECT (COUNT(?movie) AS ?numberOfMovies) WHERE {
    ?movie <http://dbpedia.org/ontology/director> <http://dbpedia.org/resource/Stanley_Kubrick> .
    ?movie <http://dbpedia.org/ontology/genre> <http://dbpedia.org/resource/Film> .
}
```

This query counts the number of movies (`?movie`) where Stanley Kubrick is listed as the director and the genre is "Film".
CPU times: user 9.55 s, sys: 310 ms, total: 9.86 s
Wall time: 9.87 s


In [24]:
def create_prompt_formats(sample):
    """
    Preparate a sample for prompt creation.
    :param sample: Sample dictionnary
    """
    batch = []
    for conversation in sample['messages']:
      conversation_steps = []
      for step in conversation:
          # TODO: Check that these tokens are correct
          conversation_steps.append(f"<|im_start|>{step['actor']}<|im_sep|>{step['text']}<|im_end|>")
      batch.append('\n'.join(conversation_steps))
    return {"text": batch}

In [25]:
from functools import partial

# SOURCE https://github.com/databrickslabs/dolly/blob/master/training/trainer.py
def get_max_length(model):
    conf = model.config
    max_length = None
    for length_setting in ["n_positions", "max_position_embeddings", "seq_length"]:
        max_length = getattr(model.config, length_setting, None)
        if max_length:
            print(f"Found max lenth: {max_length}")
            break
    if not max_length:
        max_length = 1024
        print(f"Using default max length: {max_length}")
    return max_length


def preprocess_batch(batch, tokenizer, max_length):
    """
    Tokenizing a batch
    """
    return tokenizer(
        batch["text"],
        max_length=max_length,
        truncation=True,
    )

# SOURCE https://github.com/databrickslabs/dolly/blob/master/training/trainer.py
def preprocess_dataset(tokenizer: AutoTokenizer, max_length: int,seed, dataset):
    """Format & tokenize it so it is ready for training
    :param tokenizer (AutoTokenizer): Model Tokenizer
    :param max_length (int): Maximum number of tokens to emit from tokenizer
    """

    # Add prompt to each sample
    print("Preprocessing dataset...")
    dataset = dataset.map(create_prompt_formats, batched=True)
    print("DS", dataset)

    # Apply preprocessing to each batch of the dataset & and remove 'instruction', 'context', 'response', 'category' fields
    # _preprocessing_function = partial(preprocess_batch, max_length=max_length, tokenizer=tokenizer)
    dataset = dataset.map(
        # _preprocessing_function,
        lambda x: tokenizer(x['text'], truncation=False, max_length=max_length),
        batched=True,
        # remove_columns=['user', 'assistant'],
    )

    # Filter out samples that have input_ids exceeding max_length
    dataset = dataset.filter(lambda sample: len(sample["input_ids"]) < max_length)

    # Shuffle dataset
    dataset = dataset.shuffle(seed=seed)

    return dataset

In [26]:
## Pre-process dataset
max_length = get_max_length(model)
print(max_length)

train_dataset = preprocess_dataset(tokenizer, max_length,seed, dataset['train'])
eval_dataset = preprocess_dataset(tokenizer, max_length,seed, dataset['test'])

Found max lenth: 16384
16384
Preprocessing dataset...


Map:   0%|          | 0/2200 [00:00<?, ? examples/s]

DS Dataset({
    features: ['messages', 'text'],
    num_rows: 2200
})


Map:   0%|          | 0/2200 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2200 [00:00<?, ? examples/s]

Preprocessing dataset...


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

DS Dataset({
    features: ['messages', 'text'],
    num_rows: 10
})


Map:   0%|          | 0/10 [00:00<?, ? examples/s]

Filter:   0%|          | 0/10 [00:00<?, ? examples/s]

In [27]:
model = FastLanguageModel.get_peft_model(
    model,
    r = LORA_RANK, # Choose any number > 0 ! Suggested 8, 16, 32, 64, 128
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj",
                      "gate_proj", "up_proj", "down_proj",],
    lora_alpha = LORA_ALPHA,
    lora_dropout = LORA_DROPOUT, # Supports any, but = 0 is optimized
    bias = LORA_BIAS,    # Supports any, but = "none" is optimized
    # [NEW] "unsloth" uses 30% less VRAM, fits 2x larger batch sizes!
    use_gradient_checkpointing = "unsloth", # True or "unsloth" for very long context
    random_state = 42,
    use_rslora = LORA_RANK_STABILIZED,  # We support rank stabilized LoRA
    loftq_config = None, # And LoftQ
)

Unsloth 2025.5.4 patched 40 layers with 40 QKV layers, 40 O layers and 40 MLP layers.


In [28]:
# from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

In [29]:
# # 2 - Using the prepare_model_for_kbit_training method from PEFT
# # Preparing the Model for QLoRA
# original_model = prepare_model_for_kbit_training(original_model)

In [30]:
# config = LoraConfig(
#     r=32, # Rank
#     lora_alpha=16,
#     target_modules=[
#         # "q_proj",  # Not found, ignored
#         # "k_proj",  # Not found, ignored
#         # "v_proj",  # Not found, ignored
#         "o_proj",
#         # "gate_proj", # Not found, ignored
#         # "up_proj", # Not found, ignored
#         "down_proj"
#         ],
#     bias="none",
#     lora_dropout=0.2,
#     task_type="CAUSAL_LM",
# )

# # 1 - Enabling gradient checkpointing to reduce memory usage during fine-tuning
# original_model.gradient_checkpointing_enable()

# peft_model = get_peft_model(original_model, config)

In [31]:
model.print_trainable_parameters()

trainable params: 65,536,000 || all params: 14,725,043,200 || trainable%: 0.4451


In [32]:
output_dir = f'./fine-tuned/' + RUN_CODE
import transformers

peft_training_args = TrainingArguments(
    output_dir = output_dir,
    warmup_steps=1,
    per_device_train_batch_size=1, #
    per_device_eval_batch_size=1, # same as above
    gradient_accumulation_steps=1,
    num_train_epochs=MAX_EPOCHS,
    # max_steps=MAX_STEPS, # If given a MAX_STEPS, not all data is trained
    learning_rate=LEARNING_RATE,
    optim="paged_adamw_8bit",
    save_steps=100, # For `load_best_model_at_end=True` must be same as `eval_steps`
    eval_steps=100, # This is what takes most of the time! More steps between evals means faster training
    do_eval=True,
    logging_steps=25,
    logging_dir="./logs",
    eval_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
    gradient_checkpointing=True,
    report_to="wandb",
    overwrite_output_dir = 'True',
    group_by_length=True,
)

model.config.use_cache = False

peft_trainer = transformers.Trainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    args=peft_training_args,
    data_collator=transformers.DataCollatorForLanguageModeling(tokenizer, mlm=False),
)


In [33]:
res = peft_trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 1,635 | Num Epochs = 2 | Total steps = 3,270
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 1
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 1 x 1) = 1
 "-____-"     Trainable parameters = 65,536,000/14,725,043,200 (0.45% trained)




Step,Training Loss,Validation Loss
100,0.1337,0.131901
200,0.1086,0.12601
300,0.1054,0.129648
400,0.108,0.119499


Unsloth: Will smartly offload gradients to save VRAM!


[34m[1mwandb[0m: Adding directory to artifact (./fine-tuned/r16_a16_s0_d0_bnone_l0.0001_4bFalse_20250516_1601/checkpoint-100)... Done. 0.9s
[34m[1mwandb[0m: Adding directory to artifact (./fine-tuned/r16_a16_s0_d0_bnone_l0.0001_4bFalse_20250516_1601/checkpoint-200)... Done. 0.9s
[34m[1mwandb[0m: Adding directory to artifact (./fine-tuned/r16_a16_s0_d0_bnone_l0.0001_4bFalse_20250516_1601/checkpoint-300)... Done. 0.9s
[34m[1mwandb[0m: Adding directory to artifact (./fine-tuned/r16_a16_s0_d0_bnone_l0.0001_4bFalse_20250516_1601/checkpoint-400)... Done. 0.9s


Step,Training Loss,Validation Loss
100,0.1337,0.131901
200,0.1086,0.12601
300,0.1054,0.129648
400,0.108,0.119499
500,0.119,0.119138
600,0.1112,0.11613
700,0.1008,0.114543
800,0.1082,0.111563
900,0.1164,0.109905
1000,0.1129,0.11203


[34m[1mwandb[0m: Adding directory to artifact (./fine-tuned/r16_a16_s0_d0_bnone_l0.0001_4bFalse_20250516_1601/checkpoint-500)... Done. 0.8s
[34m[1mwandb[0m: Adding directory to artifact (./fine-tuned/r16_a16_s0_d0_bnone_l0.0001_4bFalse_20250516_1601/checkpoint-600)... Done. 0.9s
[34m[1mwandb[0m: Adding directory to artifact (./fine-tuned/r16_a16_s0_d0_bnone_l0.0001_4bFalse_20250516_1601/checkpoint-700)... Done. 0.9s
[34m[1mwandb[0m: Adding directory to artifact (./fine-tuned/r16_a16_s0_d0_bnone_l0.0001_4bFalse_20250516_1601/checkpoint-800)... Done. 1.0s
[34m[1mwandb[0m: Adding directory to artifact (./fine-tuned/r16_a16_s0_d0_bnone_l0.0001_4bFalse_20250516_1601/checkpoint-900)... Done. 0.9s
[34m[1mwandb[0m: Adding directory to artifact (./fine-tuned/r16_a16_s0_d0_bnone_l0.0001_4bFalse_20250516_1601/checkpoint-1000)... Done. 1.0s
[34m[1mwandb[0m: Adding directory to artifact (./fine-tuned/r16_a16_s0_d0_bnone_l0.0001_4bFalse_20250516_1601/checkpoint-1100)... Done. 1.

## Test results

In [34]:
%%time
from transformers import set_seed
seed = 42
set_seed(seed)

index = 0

conversation = dataset['train'][index]

import re

PARSE_IM_RE = re.compile(r'<\|im_start\|>([^<]+)<\|im_sep\|>(([^<]|<[^|])*)<\|im_end\|>')

# Generate output
def gen(model, conversation, max_length, skip_last=True):
    conversation_steps = []
    for idx, step in enumerate(conversation['messages']):
        # TODO: Check that these tokens are correct
        if idx < (len(conversation['messages']) - 1) or not skip_last:
          conversation_steps.append(f"<|im_start|>{step['actor']}<|im_sep|>{step['text']}<|im_end|>")
        else:
          assert step['actor'] == 'assistant'

    inputs = tokenizer('\n'.join(conversation_steps), return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_length=max_length)
    return ''.join(tokenizer.batch_decode(outputs)).split('<|endoftext|>')[0] # , skip_special_tokens=True)

def parse_out(output):
  results = []
  for chunk in PARSE_IM_RE.findall(output):
    results.append((chunk[0], chunk[1]))
  return results

res = gen(model, conversation, 1000,)
output = parse_out(res)

dash_line = '-'.join('' for x in range(100))
# print(f'INPUT PROMPT:\n{formatted_prompt}')
# print(dash_line)
print(f'BASELINE OUTPUT:\n{conversation["messages"][-1]["text"]}\n')
print(dash_line)
print(f'MODEL GENERATION - ZERO SHOT:\n{output[-1][1]}')

ValueError: Input length of input_ids is 7798, but `max_length` is set to 1000. This can lead to unexpected behavior. You should consider increasing `max_length` or, better yet, setting `max_new_tokens`.

## Upload results

In [35]:
!rsync -HPrz --mkpath \
  fine-tuned \
  result-pusher@kb.tfm.codigoparallevar.com:fine-tuning

sending incremental file list
fine-tuned/r16_a16_s0_d0_bnone_l0.0001_4bFalse_20250516_1550/
fine-tuned/r16_a16_s0_d0_bnone_l0.0001_4bFalse_20250516_1555/
fine-tuned/r16_a16_s0_d0_bnone_l0.0001_4bFalse_20250516_1601/
fine-tuned/r16_a16_s0_d0_bnone_l0.0001_4bFalse_20250516_1601/checkpoint-100/
fine-tuned/r16_a16_s0_d0_bnone_l0.0001_4bFalse_20250516_1601/checkpoint-100/README.md
          5,087 100%    0.00kB/s    0:00:00 (xfr#1, to-chk=428/466)
fine-tuned/r16_a16_s0_d0_bnone_l0.0001_4bFalse_20250516_1601/checkpoint-100/adapter_config.json
            843 100%  823.24kB/s    0:00:00 (xfr#2, to-chk=427/466)
fine-tuned/r16_a16_s0_d0_bnone_l0.0001_4bFalse_20250516_1601/checkpoint-100/adapter_model.safetensors
    262,219,392 100%   16.03MB/s    0:00:15 (xfr#3, to-chk=426/466)
fine-tuned/r16_a16_s0_d0_bnone_l0.0001_4bFalse_20250516_1601/checkpoint-100/merges.txt
        916,646 100%    2.11MB/s    0:00:00 (xfr#4, to-chk=425/466)
fine-tuned/r16_a16_s0_d0_bnone_l0.0001_4bFalse_20250516_1601/che

### Quantize and convert

In [36]:
peft_trainer.model.save_pretrained_gguf("fine-tuned/" + RUN_CODE + "/loadable", tokenizer, quantization_method = [ "f16", "q4_k_m", "q6_k"])

Unsloth: Kaggle/Colab has limited disk space. We need to delete the downloaded
model which will save 4-16GB of disk space, allowing you to save on Kaggle/Colab.
Unsloth: Will remove a cached repo with size 29.3G


Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 48.94 out of 83.48 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


 15%|█▌        | 6/40 [00:00<00:01, 23.26it/s]
We will save to Disk and not RAM now.
100%|██████████| 40/40 [00:55<00:00,  1.38s/it]


Unsloth: Saving tokenizer... Done.
Done.


Unsloth: Converting llama model. Can use fast conversion = False.


==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['f16', 'q4_k_m', 'q6_k'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: CMAKE detected. Finalizing some steps for installation.
Unsloth: [1] Converting model at fine-tuned/r16_a16_s0_d0_bnone_l0.0001_4bFalse_20250516_1601/loadable into f16 GGUF format.
The output location will be /content/fine-tuned/r16_a16_s0_d0_bnone_l0.0001_4bFalse_20250516_1601/loadable/unsloth.F16.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: loadable
INFO:hf-to-gguf:Model architecture: LlamaForCausalLM
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Exporting model...
INFO:hf-to-gguf:gguf: loading model wei

In [37]:
!rsync -HPrz --mkpath \
  fine-tuned \
  result-pusher@kb.tfm.codigoparallevar.com:fine-tuning

sending incremental file list
fine-tuned/r16_a16_s0_d0_bnone_l0.0001_4bFalse_20250516_1601/checkpoint-100/README.md
          5,087 100%    4.18MB/s    0:00:00 (xfr#1, to-chk=445/484)
fine-tuned/r16_a16_s0_d0_bnone_l0.0001_4bFalse_20250516_1601/checkpoint-100/adapter_config.json
            843 100%  823.24kB/s    0:00:00 (xfr#2, to-chk=444/484)
fine-tuned/r16_a16_s0_d0_bnone_l0.0001_4bFalse_20250516_1601/checkpoint-100/adapter_model.safetensors
    262,219,392 100%  118.97MB/s    0:00:02 (xfr#3, to-chk=443/484)
fine-tuned/r16_a16_s0_d0_bnone_l0.0001_4bFalse_20250516_1601/checkpoint-100/merges.txt
        916,646 100%    7.81MB/s    0:00:00 (xfr#4, to-chk=442/484)
fine-tuned/r16_a16_s0_d0_bnone_l0.0001_4bFalse_20250516_1601/checkpoint-100/optimizer.pt
    133,784,532 100%  156.36MB/s    0:00:00 (xfr#5, to-chk=441/484)
fine-tuned/r16_a16_s0_d0_bnone_l0.0001_4bFalse_20250516_1601/checkpoint-100/rng_state.pth
         14,244 100%   17.03kB/s    0:00:00 (xfr#6, to-chk=440/484)
fine-tuned/r

## Cleanup

In [38]:
# Finish time
!date

Fri May 16 09:22:35 PM UTC 2025


### Stop kernel

In [39]:
from google.colab import runtime
runtime.unassign()