# Llama 3.2 fine tuning with combined data set

Fine-tuning the 3B model on the combined data set, which merges the four simple data sets. Training was successful and stopped early, once the validation perceptual loss reached 0.
The validation set contains a single example from each of the four data sets (color, size, color-word, size-color-text), while the test set is a 101-element set with roughly equal proportions of each. The final loss looks good enough, although it could probably be improved.


In [1]:
import unsloth
import os
import numpy as np
import pandas as pd

import torch
from trl import SFTTrainer
from transformers import TrainingArguments, TextStreamer
from unsloth.chat_templates import get_chat_template
from unsloth import FastLanguageModel
from datasets import Dataset
from unsloth import is_bfloat16_supported

# Saving model
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Warnings
import warnings
warnings.filterwarnings("ignore")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
max_seq_length = 2000

def load_model():
    """Load the 4-bit Llama 3.2 3B checkpoint and wrap it with LoRA adapters.

    The sequence length comes from the notebook-level ``max_seq_length``.

    Returns:
        tuple: ``(model, tokenizer)`` where ``model`` is the PEFT-wrapped model
        returned by ``FastLanguageModel.get_peft_model``.
    """
    # Attention projections plus the MLP projections receive adapters.
    lora_targets = ["q_proj", "k_proj", "v_proj", "up_proj", "down_proj", "o_proj", "gate_proj"]

    base_model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="unsloth/Llama-3.2-3B-unsloth-bnb-4bit",
        max_seq_length=max_seq_length,
        dtype=None,
        load_in_4bit=True,
    )

    peft_model = FastLanguageModel.get_peft_model(
        base_model,
        target_modules=lora_targets,
        r=16,
        lora_alpha=16,
        lora_dropout=0,
        use_rslora=True,
        use_gradient_checkpointing="unsloth",
        loftq_config=None,
        random_state=32,
    )

    return peft_model, tokenizer

In [3]:
def create_trainer(model, tokenizer, training_data, max_steps):
    """Build an ``SFTTrainer`` for the given model and training split.

    Args:
        model: Model to fine-tune.
        tokenizer: Tokenizer handed to the trainer.
        training_data: Dataset with a ``"text"`` column to train on.
        max_steps: Total optimizer steps to run, or ``None`` to keep the
            epoch-based budget (``num_train_epochs=40``).

    Returns:
        The configured ``SFTTrainer`` (checkpoints go to ``"output"``).
    """
    # Pick the mixed-precision mode once; exactly one of bf16/fp16 is enabled.
    bf16_available = is_bfloat16_supported()

    training_arguments = TrainingArguments(
        output_dir="output",
        seed=0,
        save_total_limit=3,
        logging_steps=1,
        num_train_epochs=40,
        per_device_train_batch_size=8,
        gradient_accumulation_steps=16,
        optim="adamw_8bit",
        learning_rate=3e-4,
        lr_scheduler_type="linear",
        warmup_steps=150,
        weight_decay=0.01,
        bf16=bf16_available,
        fp16=not bf16_available,
    )

    # The step cap is applied after construction so that ``None`` leaves the
    # TrainingArguments default untouched.
    if max_steps is not None:
        training_arguments.max_steps = max_steps

    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        args=training_arguments,
        train_dataset=training_data,
        dataset_text_field="text",
        dataset_num_proc=10,
        max_seq_length=max_seq_length,
        packing=True,
    )
    return trainer

In [4]:
import numpy as np
from utils.similarity import calculate_metrics
from torch.utils.tensorboard import SummaryWriter
from PIL import Image
import torch

log_dir = 'output/runs'

def add_image_to_tensorboard(name, step, img_path):
    """Log the image stored at ``img_path`` to TensorBoard.

    Args:
        name: TensorBoard tag for the image.
        step: Global step the image is recorded at.
        img_path: Path of the image file to load.

    The event file is written under the notebook-level ``log_dir``.
    """
    # Close the file handle as soon as the pixels are copied into the array.
    with Image.open(img_path) as image:
        image_array = np.array(image.convert('RGB'))

    # (H, W, 3) uint8 -> (3, H, W) float in [0, 1], the layout add_image expects by default.
    image_tensor = torch.from_numpy(image_array).permute(2, 0, 1).float() / 255.0

    # The context manager flushes and closes the writer instead of leaking one per call.
    with SummaryWriter(log_dir=log_dir) as writer:
        writer.add_image(name, image_tensor, step)
    
def add_text_to_tensorboard(name, step, text):
    """Log a text snippet to TensorBoard.

    Args:
        name: TensorBoard tag for the text.
        step: Global step the text is recorded at.
        text: String to record.

    The event file is written under the notebook-level ``log_dir``.
    """
    # The context manager flushes and closes the writer instead of leaking one per call.
    with SummaryWriter(log_dir=log_dir) as writer:
        writer.add_text(name, text, step)

def postprocess_text(preds, labels):
    """Strip surrounding whitespace and drop ``<unk>`` markers from both lists.

    Args:
        preds: Iterable of predicted strings.
        labels: Iterable of reference strings.

    Returns:
        tuple: ``(cleaned_preds, cleaned_labels)``. Predictions stay a flat list
        of strings; every label is wrapped in its own single-element list.
    """
    def _clean(text):
        # Whitespace is stripped first, then the marker is removed.
        return text.strip().replace('<unk>', '')

    cleaned_preds = []
    for pred in preds:
        cleaned_preds.append(_clean(pred))

    cleaned_labels = []
    for label in labels:
        cleaned_labels.append([_clean(label)])

    return cleaned_preds, cleaned_labels

def compute_metrics(decoded_predictions, decoded_labels, steps):
    """Score generated markup against the references and log the results to TensorBoard.

    For every (prediction, label) pair the tokenizer's EOS token is removed from
    the prediction, both texts are logged, and ``calculate_metrics`` is called.
    Pairs for which it returns ``None`` are left out of the averages and have no
    screenshots logged.

    Args:
        decoded_predictions: Iterable of generated strings.
        decoded_labels: Iterable of reference strings, aligned with the predictions.
        steps: Global step used for every TensorBoard entry.

    Returns:
        dict: ``{"similarity": float, "perceptual_loss": float}`` holding the mean
        over all scored pairs. Both values are NaN when no pair could be scored.
    """
    similarity_scores = []
    perceptual_losses = []

    # Tags are numbered from 1 so they read naturally in TensorBoard.
    for index, (prediction, label) in enumerate(zip(decoded_predictions, decoded_labels), start=1):
        prediction = prediction.replace(tokenizer.eos_token, '')

        add_text_to_tensorboard(f'valid_{index}_label_text', steps, label)
        add_text_to_tensorboard(f'valid_{index}_prediction_text', steps, prediction)

        metrics = calculate_metrics(prediction, label)
        if metrics is None:
            # Nothing to score for this pair; it is excluded from the means.
            continue

        similarity_scores.append(metrics['similarity'])
        perceptual_losses.append(metrics['perceptual_loss'])

        add_image_to_tensorboard(f'valid_{index}_expectation', steps, metrics['expected_screenshot_path'])
        add_image_to_tensorboard(f'valid_{index}_prediction', steps, metrics['predicted_screenshot_path'])

    if not similarity_scores:
        print("No prediction could be scored; reporting NaN metrics.")

    # np.mean([]) yields NaN only via a runtime warning; make that outcome explicit.
    results = {
        "similarity": float(np.mean(similarity_scores)) if similarity_scores else float("nan"),
        "perceptual_loss": float(np.mean(perceptual_losses)) if perceptual_losses else float("nan"),
    }

    # Close the writer so the scalars are flushed and no writer is leaked per call.
    with SummaryWriter(log_dir=log_dir) as writer:
        writer.add_scalar('similarity', results['similarity'], steps)
        writer.add_scalar('perceptual_loss', results['perceptual_loss'], steps)

    print("Similarity:", results['similarity'])
    print("Perceptual loss:", results['perceptual_loss'])

    return results

def test_prediction(model, data, steps):
    """Generate markup for every row in ``data`` and score it with ``compute_metrics``.

    Args:
        model: Model used for generation (expected to already be in inference mode).
        data: Iterable of rows exposing ``'svg'`` (input) and ``'html'`` (reference).
        steps: Global step forwarded to ``compute_metrics`` for logging.

    Returns:
        Whatever ``compute_metrics`` returns for the generated answers.
    """
    print("Generating predictions...")
    answers, labels = [], []

    for row in data:
        # Leave the response slot empty so the model continues after "### Response:".
        prompt = data_prompt.format(row['svg'], "")
        inputs = tokenizer([prompt], return_tensors = "pt").to("cuda")

        outputs = model.generate(**inputs, max_new_tokens = max_seq_length, use_cache = True)

        # The decoded text still contains the prompt; keep only what follows the last marker.
        decoded = tokenizer.batch_decode(outputs)[0]
        answers.append(decoded.split("### Response:")[-1])
        labels.append(row['html'])

    print("Computing metrics...")
    return compute_metrics(answers, labels, steps)

In [5]:
# Start from a clean slate: remove the `output` directory, which holds both the
# trainer checkpoints (output_dir="output") and the TensorBoard logs ('output/runs').
!rm -rf output

In [6]:
# Install the archive tooling; the log below shows `unzip` being pulled in by this install.
!apt install zip -y
# Recreate the target directory so stale files from an earlier run cannot survive.
!rm -rf data-rb-combined
!mkdir -p data-rb-combined
# Download the combined data set, which is already split into train/valid/test.
# The archive is saved as `model.zip` even though it contains data, not a model.
!wget "https://www.dropbox.com/scl/fi/m3yr7sesw87qmb8xz1fe7/data-rb-combined.zip?rlkey=hw0xj9481owxrql4eewujmbod&dl=1" -O model.zip
!unzip model.zip -d data-rb-combined

# Recreate an empty `data-rb-validate` directory.
# NOTE(review): nothing in this notebook writes to it directly - presumably it is
# a scratch directory for `utils.similarity`; confirm.
!rm -rf data-rb-validate
!mkdir -p data-rb-validate

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  unzip
The following NEW packages will be installed:
  unzip zip
0 upgraded, 2 newly installed, 0 to remove and 53 not upgraded.
Need to get 350 kB of archives.
After this operation, 930 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 unzip amd64 6.0-26ubuntu3.2 [175 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/main amd64 zip amd64 3.0-12build2 [176 kB]
Fetched 350 kB in 1s (335 kB/s)m[33m
debconf: delaying package configuration, since apt-utils is not installed

7[0;23r8[1ASelecting previously unselected package unzip.
(Reading database ... 36713 files and directories currently installed.)
Preparing to unpack .../unzip_6.0-26ubuntu3.2_amd64.deb ...
7[24;0f[42m[30mProgress: [  0%][49m[39m [..........................................................] 87[24;0f[42m

In [7]:
from datasets import load_from_disk, DatasetDict
# Load the unpacked data set from the directory the archive was extracted into.
dataset = load_from_disk('data-rb-combined')

In [8]:
# Load the base model with LoRA adapters attached, plus its tokenizer.
model, tokenizer = load_model()

# Prompt template shared by training and inference. The first `{}` receives the
# SVG input, the second the HTML/CSS response (left empty when generating).
data_prompt = """Your job is to take an SVG file of a web design and convert it into a pixel-perfect HTML and CSS markup and stylesheet.

### Input:
{}

### Response:
{}"""

# Appended to every training example so each one ends with the end-of-sequence token.
EOS_TOKEN = tokenizer.eos_token
def formatting_prompt(examples):
    """Render a batch of examples into full training prompts.

    Args:
        examples: Batch mapping with aligned ``"svg"`` and ``"html"`` lists.

    Returns:
        dict: ``{"text": [...]}`` with one filled-in ``data_prompt`` per example,
        each terminated by ``EOS_TOKEN``.
    """
    svg_inputs = examples["svg"]
    html_outputs = examples["html"]

    rendered = [
        data_prompt.format(svg, html) + EOS_TOKEN
        for svg, html in zip(svg_inputs, html_outputs)
    ]
    return {"text": rendered}



==((====))==  Unsloth 2025.3.1: Fast Llama patching. Transformers: 4.46.3.
   \\   /|    GPU: NVIDIA H100 NVL. Max memory: 93.111 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 9.0. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/2.35G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

Unsloth 2025.3.1 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [9]:
# Add the rendered prompt as a `text` column to every split (batched for speed).
training_data = dataset.map(formatting_prompt, batched=True)

Map:   0%|          | 0/299847 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Map:   0%|          | 0/101 [00:00<?, ? examples/s]

In [10]:
# Display the splits to confirm the new `text` column and the row counts.
training_data

DatasetDict({
    train: Dataset({
        features: ['svg', 'html', 'text'],
        num_rows: 299847
    })
    valid: Dataset({
        features: ['svg', 'html', 'text'],
        num_rows: 4
    })
    test: Dataset({
        features: ['svg', 'html', 'text'],
        num_rows: 101
    })
})

In [11]:
def get_token_lengths(examples):
    """Tokenize the ``text`` column of a batch and report each example's length.

    Args:
        examples: Batch mapping containing a ``'text'`` list.

    Returns:
        The tokenizer output, requested with ``return_length=True`` and with
        neither truncation nor padding applied.
    """
    # Truncation and padding stay off here so the reported lengths are the raw ones.
    return tokenizer(
        examples['text'],
        return_length=True,
        padding=False,
        truncation=False,
    )

# Tokenize every split once so each example carries its token count in `length`.
tokenized_data = training_data.map(get_token_lengths, batched=True)

def filter_function(example):
    """Return True when the example's token count fits within ``max_seq_length``."""
    token_count = example['length']
    return token_count <= max_seq_length

# Drop every example whose prompt plus response exceeds the sequence length.
filtered_data = tokenized_data.filter(filter_function)

# Print the per-split row counts to see how many examples survived the filter.
print(filtered_data)

Map:   0%|          | 0/299847 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Map:   0%|          | 0/101 [00:00<?, ? examples/s]

Filter:   0%|          | 0/299847 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4 [00:00<?, ? examples/s]

Filter:   0%|          | 0/101 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['svg', 'html', 'text', 'input_ids', 'attention_mask', 'length'],
        num_rows: 299847
    })
    valid: Dataset({
        features: ['svg', 'html', 'text', 'input_ids', 'attention_mask', 'length'],
        num_rows: 4
    })
    test: Dataset({
        features: ['svg', 'html', 'text', 'input_ids', 'attention_mask', 'length'],
        num_rows: 101
    })
})


In [12]:
# The tokenizer columns were only needed for filtering; drop them before saving.
# NOTE(review): this rebinds `filtered_data`, so re-running the cell without
# re-running the filter cell presumably fails on the missing columns - confirm.
filtered_data = filtered_data.remove_columns(["input_ids", "attention_mask", "length"])
# Cache the filtered splits on disk, keyed by the sequence length used for filtering.
filtered_data.save_to_disk('data-rb-combined-filtered-' + str(max_seq_length))

Saving the dataset (0/4 shards):   0%|          | 0/299847 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/101 [00:00<?, ? examples/s]

In [13]:
from datasets import load_from_disk

# Reload the cached, filtered splits from the directory written by the previous cell.
filtered_data = load_from_disk('data-rb-combined-filtered-' + str(max_seq_length))

# Display the splits to confirm only `svg`, `html` and `text` remain.
filtered_data

DatasetDict({
    train: Dataset({
        features: ['svg', 'html', 'text'],
        num_rows: 299847
    })
    valid: Dataset({
        features: ['svg', 'html', 'text'],
        num_rows: 4
    })
    test: Dataset({
        features: ['svg', 'html', 'text'],
        num_rows: 101
    })
})

In [14]:
import torch
from tqdm import tqdm

# Becomes True after the first training segment; later segments resume from a checkpoint.
resume = False

# Train in 100-step segments and evaluate on the validation split after each one.
# Step 0 evaluates the untrained adapters as a baseline.
for steps in tqdm(range(0, 1500, 100)):
    print(f"Steps: {steps}")

    if steps > 0:
        # A fresh trainer is built per segment with `steps` as the total step cap.
        # NOTE(review): the linear scheduler (warmup_steps=150) is configured per
        # trainer, so the learning-rate schedule presumably depends on the
        # changing cap - confirm this is intended.
        trainer = create_trainer(model, tokenizer, filtered_data['train'], steps)
        if resume:
            # Continue from a checkpoint in the trainer's output directory.
            trainer.train(resume_from_checkpoint=True)
        else:
            trainer.train()
            resume = True
        
    # Switch to inference mode for generation.
    model = FastLanguageModel.for_inference(model)

    results = test_prediction(model, filtered_data['valid'], steps)

    # Early stop: every validation render matches its reference exactly.
    # The model is left in inference mode when this triggers.
    if results is not None and results['perceptual_loss'] == 0.0:
        break

    # Switch back to training mode for the next segment.
    model = FastLanguageModel.for_training(model)

    

  0%|          | 0/15 [00:00<?, ?it/s]

Steps: 0
Generating predictions...
Computing metrics...


  7%|▋         | 1/15 [02:30<35:00, 150.06s/it]

Similarity: 0.784655911847949
Perceptual loss: 0.4888540618121624
Steps: 100


Generating train split: 0 examples [00:00, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 119,262 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 100
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
1,0.7443
2,0.7307
3,0.7198
4,0.7394
5,0.7268
6,0.7282
7,0.7171
8,0.7194
9,0.7067
10,0.7024


Generating predictions...
Computing metrics...


 13%|█▎        | 2/15 [38:16<4:46:57, 1324.39s/it]

Similarity: 0.9513074830520054
Perceptual loss: 0.05823707673698664
Steps: 200


Generating train split: 0 examples [00:00, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 119,262 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 200
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
101,0.1619
102,0.1536
103,0.1607
104,0.1503
105,0.1574
106,0.1557
107,0.1535
108,0.1579
109,0.159
110,0.1537


Generating predictions...
Computing metrics...


 20%|██        | 3/15 [1:13:35<5:37:25, 1687.15s/it]max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.9517559228010214
Perceptual loss: 0.04647174966521561
Steps: 300


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 119,262 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 300
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
201,0.1458
202,0.1456
203,0.148
204,0.1483
205,0.1408
206,0.1417
207,0.1529
208,0.1467
209,0.1514
210,0.1455


Generating predictions...
Computing metrics...


 27%|██▋       | 4/15 [1:46:56<5:32:03, 1811.21s/it]max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.802166077635411
Perceptual loss: 0.1376871329266578
Steps: 400


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 119,262 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 400
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
301,0.1417
302,0.1428
303,0.1447
304,0.1451
305,0.1417
306,0.146
307,0.1404
308,0.1426
309,0.1441
310,0.1405


Generating predictions...
Computing metrics...


 33%|███▎      | 5/15 [2:20:17<5:13:17, 1879.71s/it]max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.8021790189530293
Perceptual loss: 0.13716587665840052
Steps: 500


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 119,262 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 500
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
401,0.1368
402,0.1379
403,0.1374
404,0.1445
405,0.1392
406,0.1418
407,0.1406
408,0.1431
409,0.1382
410,0.1421


Generating predictions...
Computing metrics...


 40%|████      | 6/15 [2:53:41<4:48:17, 1921.92s/it]max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.8021790224703637
Perceptual loss: 0.13716531558020506
Steps: 600


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 119,262 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 600
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
501,0.1386
502,0.1372
503,0.137
504,0.1378
505,0.1367
506,0.1424
507,0.1424
508,0.1407
509,0.138
510,0.1302


Generating predictions...
Computing metrics...


 47%|████▋     | 7/15 [3:26:58<4:19:31, 1946.41s/it]max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.9517771487481884
Perceptual loss: 0.04590102335242818
Steps: 700


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 119,262 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 700
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
601,0.1321
602,0.1363
603,0.1399
604,0.1395
605,0.1364
606,0.1409
607,0.1437
608,0.1389
609,0.1371
610,0.1371


Generating predictions...
Computing metrics...


 53%|█████▎    | 8/15 [4:00:16<3:49:00, 1962.93s/it]max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.9776591491885483
Perceptual loss: 0.03592189773917198
Steps: 800


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 119,262 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 800
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
701,0.1418
702,0.1364
703,0.1388
704,0.135
705,0.1397
706,0.1397
707,0.1358
708,0.1384
709,0.1349
710,0.1352


Generating predictions...
Computing metrics...


 60%|██████    | 9/15 [4:33:34<3:17:21, 1973.64s/it]max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.9776591491885483
Perceptual loss: 0.03592189773917198
Steps: 900


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 119,262 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 900
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
801,0.141
802,0.1348
803,0.1345
804,0.1333
805,0.1441
806,0.1312
807,0.1337
808,0.14
809,0.1341
810,0.1354


Generating predictions...
Computing metrics...


 67%|██████▋   | 10/15 [5:06:51<2:45:05, 1981.07s/it]max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.9776591491885483
Perceptual loss: 0.03592189773917198
Steps: 1000


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 119,262 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 1,000
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
901,0.1355
902,0.1409
903,0.1313
904,0.1359
905,0.1403
906,0.1345
907,0.1408
908,0.1352
909,0.1355
910,0.137


Generating predictions...
Computing metrics...


 73%|███████▎  | 11/15 [5:40:10<2:12:25, 1986.49s/it]max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.998759717962821
Perceptual loss: 0.00906180590391159
Steps: 1100


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 119,262 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 1,100
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
1001,0.1343
1002,0.1276
1003,0.1282
1004,0.1399
1005,0.1375
1006,0.1372
1007,0.1343
1008,0.1425
1009,0.1358
1010,0.1421


Generating predictions...
Computing metrics...


 80%|████████  | 12/15 [6:13:21<1:39:23, 1987.89s/it]

Similarity: 0.998759717962821
Perceptual loss: 0.00906180590391159
Steps: 1200


max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 119,262 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 1,200
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
1101,0.138
1102,0.132
1103,0.1334
1104,0.1396
1105,0.1382
1106,0.14
1107,0.138
1108,0.1396
1109,0.1347
1110,0.1309


Generating predictions...
Computing metrics...


 87%|████████▋ | 13/15 [6:46:37<1:06:20, 1990.21s/it]max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.998759717962821
Perceptual loss: 0.00906180590391159
Steps: 1300


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 119,262 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 1,300
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
1201,0.1343
1202,0.1342
1203,0.1338
1204,0.1352
1205,0.1364
1206,0.1373
1207,0.1362
1208,0.1381
1209,0.1395
1210,0.1321


Generating predictions...
Computing metrics...


 87%|████████▋ | 13/15 [7:19:48<1:07:39, 2029.87s/it]

Similarity: 1.0
Perceptual loss: 0.0





In [15]:
# Spot-check a single test example: generate markup for its SVG and print it
# next to the reference HTML.
test_index = 0
text = filtered_data['test'][test_index]['svg']
model = FastLanguageModel.for_inference(model)
inputs = tokenizer(
[
    data_prompt.format(
        # input slot: the SVG to convert
        text,
        # response slot: left empty so the model generates it
        "",
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = max_seq_length, use_cache = True)
answer=tokenizer.batch_decode(outputs)
# The decoded text still contains the prompt; keep only what follows the last marker.
answer = answer[0].split("### Response:")[-1]

# Reference first, then the generated answer (which still ends with the EOS token).
print(filtered_data['test'][test_index]['html'])
print("Answer of the question is:", answer)

<body><div></div></body>

<style>


        body {
            margin: 0;
        }

        div {
            background-color: black;
            position: absolute;
            height: 100vh;
            width: 92%;
            top: 0;
            left: 0;
        }


</style>
Answer of the question is: 
<body><div></div></body>

<style>

        body {
            margin: 0;
        }

        div {
            background-color: black;
            position: absolute;
            height: 100vh;
            width: 93%;
            top: 0;
            left: 0;
        }

</style><|end_of_text|>


In [16]:
# Final evaluation on the held-out test split.
# `steps` is the loop variable left over from the training cell, so the metrics
# are logged at the step where training stopped. compute_metrics hard-codes the
# `valid_` tag prefix, so these entries share tags with the validation runs.
test_prediction(model, filtered_data['test'], steps)

Generating predictions...
Computing metrics...
Similarity: 0.9947140620987173
Perceptual loss: 0.010002789617225638


{'similarity': 0.9947140620987173, 'perceptual_loss': 0.010002789617225638}