# Model conversion to GGUF notebook

In [1]:
# Start time: print the current date/time so the run duration can be read off
# against the timestamp printed by the last cell of the notebook.
!date

Thu May 15 03:05:31 PM UTC 2025


In [2]:
# Name of the fine-tuning run. It selects the run directory on the remote host
# and is embedded in the name of the exported model.
trainset = "r16_a16_s0_d0_bnone_l0.0001_4bFalse_20250514_2224"
# Checkpoint of that run to convert (remote directory `checkpoint-<value>`).
# Kept as a string because it is concatenated into paths further down.
checkpoint = "1900"

## Setup

In [3]:
from getpass import getpass

def get_secret(prompt, secret_name, secret_input=True):
  """Return a secret from Colab's user data, or ask the user for it.

  The Colab secret store is tried first. When it is unavailable (not running
  in Colab, the secret is missing, or access is refused) the value is
  requested interactively instead.

  Args:
    prompt: Text shown when the value has to be typed in.
    secret_name: Name of the secret in the Colab user data store.
    secret_input: When True the typed value is read with `getpass` (no echo);
      when False it is read with `input` (echoed).

  Returns:
    The secret value obtained from Colab or typed by the user.
  """
  try:
    from google.colab import userdata
    result = userdata.get(secret_name)
  except Exception:
    # Only ordinary failures trigger the interactive fallback;
    # KeyboardInterrupt / SystemExit must still abort the cell.
    result = None

  if result is None:
    if secret_input:
      result = getpass(prompt)
    else:
      result = input(prompt)
  return result


In [4]:
%%capture
import os
if "COLAB_" not in "".join(os.environ.keys()):
    !pip install unsloth
else:
    # Do this only in Colab notebooks! Otherwise use pip install unsloth
    !pip install --no-deps bitsandbytes accelerate xformers==0.0.29.post3 peft trl==0.15.2 triton cut_cross_entropy unsloth_zoo
    !pip install sentencepiece protobuf datasets huggingface_hub hf_transfer
    !pip install --no-deps unsloth
    !pip install -U transformers

from unsloth import FastLanguageModel  # Load unsloth ASAP

## Download pre-trained checkpoint

In [5]:
import os

if (
    (not os.path.exists(os.path.expanduser("~/.ssh/id_rsa")))
    or (not os.path.exists(os.path.expanduser("~/.ssh/known_hosts")))
):
  SSHKEY = get_secret('Result pusher SSH key: ', 'TFM_SSH_PUSHER_KEY')

  !mkdir ~/.ssh

  # Read locally with `cat ~/.ssh/result-pusher|tr '\n' '$';echo`
  with open(os.path.expanduser("~/.ssh/id_rsa"), 'wt') as f:
    f.write(SSHKEY.replace('$', '\n'))

  !chmod 0600 ~/.ssh/id_rsa
  !ssh-keygen -y -f ~/.ssh/id_rsa > ~/.ssh/id_rsa.pub
  !chmod 0600 ~/.ssh/id_rsa.pub

  # This won't copy the client key (not needed), but it will initialize the server's on the client
  !ssh-copy-id -i ~/.ssh/id_rsa -o StrictHostKeyChecking=accept-new result-pusher@kb.tfm.codigoparallevar.com

  del SSHKEY

/usr/bin/ssh-copy-id: INFO: Source of key(s) to be installed: "/root/.ssh/id_rsa.pub"
/usr/bin/ssh-copy-id: INFO: attempting to log in with the new key(s), to filter out any that are already installed

		(if you think this is a mistake, you may want to use -f option)



In [6]:
# Download the selected checkpoint directory from the remote host into ./fine-tune.
# "$trainset" and "$checkpoint" are filled in from the Python variables of the same name.
# rsync flags: -H keep hard links, -P show progress and keep partial files,
# -r recurse, -z compress in transit; --mkpath creates missing destination directories.
!rsync -HPrz --mkpath \
  result-pusher@kb.tfm.codigoparallevar.com:fine-tuning/fine-tuned/"$trainset"/checkpoint-"$checkpoint"/ \
    fine-tune

receiving incremental file list
created 1 directory for fine-tune
README.md
          5,087 100%    4.85MB/s    0:00:00 (xfr#1, to-chk=12/14)
adapter_config.json
            843 100%  823.24kB/s    0:00:00 (xfr#2, to-chk=11/14)
adapter_model.safetensors
    262,219,392 100%   17.60MB/s    0:00:14 (xfr#3, to-chk=10/14)
merges.txt
        916,646 100%    1.03MB/s    0:00:00 (xfr#4, to-chk=9/14)
optimizer.pt
    133,785,108 100%   16.71MB/s    0:00:07 (xfr#5, to-chk=8/14)
rng_state.pth
         14,244 100%   32.73kB/s    0:00:00 (xfr#6, to-chk=7/14)
scheduler.pt
          1,064 100%    2.44kB/s    0:00:00 (xfr#7, to-chk=6/14)
special_tokens_map.json
            456 100%    1.05kB/s    0:00:00 (xfr#8, to-chk=5/14)
tokenizer.json
      7,153,264 100%   12.40MB/s    0:00:00 (xfr#9, to-chk=4/14)
tokenizer_config.json
         17,987 100%   31.88kB/s    0:00:00 (xfr#10, to-chk=3/14)
trainer_state.json
         18,180 100%   32.22kB/s    0:00:00 (xfr#11, to-chk=2/14)
training_args.bin
         

## Load pre-trained model

In [7]:
from peft import PeftModel
import torch

# Device placement map; defined here but not passed to either loader below.
# NOTE(review): presumably "everything on GPU 0" — confirm whether it is still needed.
device_map = {"": 0}

# Base model: Phi-4 through Unsloth, without 4-bit quantisation and with a
# maximum sequence length of 16384.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/Phi-4",
    max_seq_length=16384,
    load_in_4bit=False,
    # token = "hf_...", # use one if using gated models like meta-llama/Llama-2-7b-hf
)

# Wrap the base model with the adapter that was downloaded into ./fine-tune,
# in inference mode (not trainable).
ft_model = PeftModel.from_pretrained(
    model,
    "fine-tune",
    torch_dtype=torch.float16,
    is_trainable=False,
)

==((====))==  Unsloth 2025.5.3: Fast Llama patching. Transformers: 4.51.3.
   \\   /|    NVIDIA A100-SXM4-40GB. Num GPUs = 1. Max memory: 39.557 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.0. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.29.post3. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors.index.json:   0%|          | 0.00/29.9k [00:00<?, ?B/s]

model-00001-of-00006.safetensors:   0%|          | 0.00/4.93G [00:00<?, ?B/s]

model-00002-of-00006.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00003-of-00006.safetensors:   0%|          | 0.00/4.90G [00:00<?, ?B/s]

model-00004-of-00006.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00005-of-00006.safetensors:   0%|          | 0.00/4.95G [00:00<?, ?B/s]

model-00006-of-00006.safetensors:   0%|          | 0.00/4.62G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/6 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/18.0k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.61M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/917k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/456 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.15M [00:00<?, ?B/s]

## Save to GGUF

In [8]:
# Name of the export: identifies the base model, the training run and the checkpoint.
# It is used below as the local output directory and as the source of the upload.
outname = f"phi-4-{trainset}-cp-{checkpoint}"

### Fix llama.cpp

- See: https://github.com/unslothai/unsloth/issues/748#issuecomment-2238395604

In [None]:
!bash -c 'git clone --depth=1 --single-branch -b b3345 https://github.com/ggml-org/llama.cpp'
!bash -c 'cd llama.cpp && git submodule update --init --recursive'
!bash -c 'cd llama.cpp && make clean'
!bash -c 'cd llama.cpp && make all -j'

In [11]:
# Export the fine-tuned model to GGUF inside the `outname` directory, producing
# one file per requested variant: f16, q4_k_m and q6_k.
# The weights are merged and saved first, then converted (see the log below).
ft_model.save_pretrained_gguf(outname, tokenizer, quantization_method = [ "f16", "q4_k_m", "q6_k"])

Unsloth: Merging 4bit and LoRA weights to 16bit...
Unsloth: Will use up to 61.08 out of 83.48 RAM for saving.
Unsloth: Saving model... This might take 5 minutes ...


100%|██████████| 40/40 [01:15<00:00,  1.89s/it]


Unsloth: Saving tokenizer... Done.
Done.
==((====))==  Unsloth: Conversion from QLoRA to GGUF information
   \\   /|    [0] Installing llama.cpp might take 3 minutes.
O^O/ \_/ \    [1] Converting HF to GGUF 16bits might take 3 minutes.
\        /    [2] Converting GGUF 16bits to ['f16', 'q4_k_m', 'q6_k'] might take 10 minutes each.
 "-____-"     In total, you will have to wait at least 16 minutes.

Unsloth: Installing llama.cpp. This might take 3 minutes...
Unsloth: [1] Converting model at phi-4-r16_a16_s0_d0_bnone_l0.0001_4bFalse_20250514_2224-cp-1900 into f16 GGUF format.
The output location will be /content/phi-4-r16_a16_s0_d0_bnone_l0.0001_4bFalse_20250514_2224-cp-1900/unsloth.F16.gguf
This might take 3 minutes...
INFO:hf-to-gguf:Loading model: phi-4-r16_a16_s0_d0_bnone_l0.0001_4bFalse_20250514_2224-cp-1900
INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only
INFO:hf-to-gguf:Set model parameters
INFO:hf-to-gguf:gguf: context length = 16384
INFO:hf-to-gguf:gguf: embe

## Upload data back

In [12]:
# Remote destination for the exported files: a `loadable` directory inside the
# same checkpoint directory the adapter was downloaded from.
rsyncto="result-pusher@kb.tfm.codigoparallevar.com:fine-tuning/fine-tuned/" + trainset + "/checkpoint-" + checkpoint + "/loadable"

## Upload and completion

In [15]:
# Upload the whole contents of the export directory to the remote destination.
# "$outname" and "$rsyncto" are filled in from the Python variables of the same name.
# As the transfer log shows, this includes the merged safetensors shards, not only the GGUF files.
!rsync -HPrz --mkpath "$outname"/ "$rsyncto"

sending incremental file list
config.json
            793 100%    0.00kB/s    0:00:00 (xfr#1, to-chk=16/18)
generation_config.json
            170 100%  166.02kB/s    0:00:00 (xfr#2, to-chk=15/18)
merges.txt
        916,646 100%   72.85MB/s    0:00:00 (xfr#3, to-chk=14/18)
model-00001-of-00006.safetensors
  4,933,658,528 100%   21.62MB/s    0:03:37 (xfr#4, to-chk=13/18)
model-00002-of-00006.safetensors
  4,954,693,112 100%   21.58MB/s    0:03:38 (xfr#5, to-chk=12/18)
model-00003-of-00006.safetensors
  4,902,243,992 100%   21.59MB/s    0:03:36 (xfr#6, to-chk=11/18)
model-00004-of-00006.safetensors
  4,954,672,440 100%   21.61MB/s    0:03:38 (xfr#7, to-chk=10/18)
model-00005-of-00006.safetensors
  4,954,672,432 100%   21.62MB/s    0:03:38 (xfr#8, to-chk=9/18)
model-00006-of-00006.safetensors
  4,619,116,224 100%   21.59MB/s    0:03:24 (xfr#9, to-chk=8/18)
model.safetensors.index.json
         29,894 100%   50.60kB/s    0:00:00 (xfr#10, to-chk=7/18)
special_tokens_map.json
            570

In [16]:
# End time: compare with the timestamp printed by the first cell to get the total run duration.
!date

Thu May 15 05:35:32 PM UTC 2025
