# Llama 3.2 fine-tuning with size-color-text-bare dataset, fixed dependencies

2025-02-28 23:00

Running the llama-size-color-text-bare-dataset with the fixed dependency versions. Much better results than previously. Note that dependencies were first installed with the version script.

The training goes pretty fast and reaches a reasonably low loss. Nevertheless, the loss plateaus after a while and the results don't improve; if anything, they slightly overfit. It makes me wonder if we'd get better results with a larger model or perhaps the Instruct version. Some of the sizes are still not calculated very well, mostly the ones related to vw.

Note also that in Tensorboard the images are mislabeled: expected vs predicted are swapped. Otherwise they are correctly presented.

In [1]:
# Order is important!
import unsloth
import os
import numpy as np
import pandas as pd

import torch
from trl import SFTTrainer, SFTConfig
from transformers import TrainingArguments, TextStreamer
from unsloth.chat_templates import get_chat_template
from unsloth import FastLanguageModel
from datasets import Dataset
from unsloth import is_bfloat16_supported

# Saving model
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Warnings
import warnings
warnings.filterwarnings("ignore")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
# Token budget shared by model loading, the SFT config and the dataset length filter.
max_seq_length = 5020

def load_model():
    """Load the 4-bit Llama 3.2 1B checkpoint and wrap it with LoRA adapters.

    Reads the notebook-level ``max_seq_length`` for the model's sequence length.

    Returns:
        A ``(model, tokenizer)`` tuple: the model returned by
        ``FastLanguageModel.get_peft_model`` and the tokenizer returned by
        ``FastLanguageModel.from_pretrained``.
    """
    # Projection layers that receive LoRA adapters.
    lora_target_modules = [
        "q_proj", "k_proj", "v_proj",
        "up_proj", "down_proj", "o_proj", "gate_proj",
    ]

    base_model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="unsloth/Llama-3.2-1B-bnb-4bit",
        max_seq_length=max_seq_length,
        load_in_4bit=True,
        dtype=None,
    )

    peft_model = FastLanguageModel.get_peft_model(
        base_model,
        r=16,
        lora_alpha=16,
        lora_dropout=0,
        target_modules=lora_target_modules,
        use_rslora=True,
        use_gradient_checkpointing="unsloth",
        random_state=32,
        loftq_config=None,
    )

    return peft_model, tokenizer

In [3]:
def create_trainer(model, tokenizer, training_data, max_steps):
    """Build an ``SFTTrainer`` for ``model`` over ``training_data``.

    Args:
        model: Model handed to the trainer.
        tokenizer: Tokenizer handed to the trainer.
        training_data: Training dataset; its ``"text"`` column is used
            (``dataset_text_field``).
        max_steps: Step limit assigned to the config after it is constructed,
            or ``None`` to leave the config's own value untouched.

    Returns:
        The configured ``SFTTrainer``.
    """
    use_bf16 = is_bfloat16_supported()

    config_kwargs = dict(
        # Optimisation schedule
        learning_rate=3e-4,
        lr_scheduler_type="linear",
        warmup_steps=150,
        weight_decay=0.01,
        optim="adamw_8bit",
        num_train_epochs=1,
        # Batching
        per_device_train_batch_size=8,
        gradient_accumulation_steps=16,
        # Mixed precision: bf16 when the hardware supports it, fp16 otherwise
        fp16=not use_bf16,
        bf16=use_bf16,
        # Logging / checkpointing
        logging_steps=1,
        output_dir="output",
        save_total_limit=3,
        seed=0,
        # Dataset handling
        dataset_text_field="text",
        max_seq_length=max_seq_length,
        dataset_num_proc=10,
        packing=True,
    )
    sft_config = SFTConfig(**config_kwargs)

    # The step limit is applied after construction, and only when one was given.
    if max_steps is not None:
        sft_config.max_steps = max_steps

    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=training_data,
        args=sft_config,
    )
    return trainer

In [4]:
from json import JSONDecodeError
import numpy as np
from utils.similarity import calculate_metrics
from torch.utils.tensorboard import SummaryWriter
from PIL import Image
import torch
import json

# Directory that every SummaryWriter in this notebook writes its event files to.
log_dir = 'output/runs'
# HTML page template; it is passed to apply_to_templates(), which fills in its
# '{{NAME}}' placeholders.
with open('size-color-text-page-compressed.html', 'r') as f:
    html_template = f.read()

def add_image_to_tensorboard(name, step, img_path):
    """Log the image stored at ``img_path`` to TensorBoard.

    Args:
        name: Tag under which the image is logged.
        step: Global step recorded with the image.
        img_path: Path of an image file that PIL can open.
    """
    # Close the underlying file handle as soon as the pixels are decoded.
    with Image.open(img_path) as image:
        image_array = np.array(image.convert('RGB'))

    # (H, W, C) uint8 -> (C, H, W) float in [0, 1], the layout add_image uses by default.
    image_tensor = torch.from_numpy(image_array).permute(2, 0, 1).float() / 255.0

    # The context manager closes the writer on exit, so the event is flushed to
    # disk and no writer is left open after every call.
    with SummaryWriter(log_dir=log_dir) as writer:
        writer.add_image(name, image_tensor, step)
    
def add_text_to_tensorboard(name, step, text):
    """Log ``text`` to TensorBoard under tag ``name`` at global step ``step``."""
    # The context manager closes the writer on exit, so the entry is flushed to
    # disk and no writer is left open after every call.
    with SummaryWriter(log_dir=log_dir) as writer:
        writer.add_text(name, text, step)

def postprocess_text(preds, labels):
    """Strip whitespace and remove ``<unk>`` markers from predictions and labels.

    Args:
        preds: Iterable of prediction strings.
        labels: Iterable of label strings.

    Returns:
        ``(preds, labels)`` where ``preds`` is a list of cleaned strings and
        ``labels`` is a list of single-element lists, each holding one cleaned
        label.
    """
    def _clean(text):
        # Strip first, then drop the marker (whitespace exposed by the removal stays).
        return text.strip().replace('<unk>', '')

    cleaned_preds = list(map(_clean, preds))
    wrapped_labels = [[_clean(label)] for label in labels]
    return cleaned_preds, wrapped_labels

def apply_to_templates(text, template):
    """Fill ``{{NAME}}`` placeholders in ``template`` from a JSON object.

    Args:
        text: JSON document expected to decode to an object (dict) mapping
            placeholder names to values.
        template: Template string containing ``{{NAME}}`` placeholders.

    Returns:
        The template with every placeholder named in the JSON object replaced
        by ``str(value)``, or ``None`` when ``text`` is not valid JSON or does
        not decode to a dict. Placeholders without a matching key are left
        untouched.
    """
    try:
        parsed = json.loads(text)
    except JSONDecodeError:
        return None

    if isinstance(parsed, dict):
        rendered = template
        for key in parsed:
            placeholder = '{{' + key + '}}'
            rendered = rendered.replace(placeholder, str(parsed[key]))
        return rendered

    return None

def compute_metrics(decoded_predictions, decoded_labels, steps):
    """Score predictions against labels and log texts, images and scalars to TensorBoard.

    Each prediction/label pair is rendered into the notebook-level
    ``html_template`` and compared with ``calculate_metrics``. Pairs whose
    prediction or label cannot be applied to the template (invalid JSON or not
    a JSON object), or for which ``calculate_metrics`` returns ``None``, are
    logged as text but left out of the averages.

    Args:
        decoded_predictions: Iterable of decoded model outputs; the tokenizer's
            EOS token is removed from each before scoring.
        decoded_labels: Iterable of reference strings, paired with the
            predictions by position.
        steps: Global step used for every TensorBoard entry.

    Returns:
        Dict with ``"similarity"`` and ``"perceptual_loss"`` floats, each the
        mean over the scored pairs. Both are NaN when no pair could be scored.
    """
    similarity_scores = []
    perceptual_losses = []

    # Tags are numbered from 1 so they read valid_1_..., valid_2_..., in TensorBoard.
    for index, (prediction, label) in enumerate(zip(decoded_predictions, decoded_labels), start=1):
        prediction = prediction.replace(tokenizer.eos_token, '')

        add_text_to_tensorboard(f'valid_{index}_label_text', steps, label)
        add_text_to_tensorboard(f'valid_{index}_prediction_text', steps, prediction)

        applied_label = apply_to_templates(label, html_template)
        applied_prediction = apply_to_templates(prediction, html_template)

        metrics = None
        if applied_label is not None and applied_prediction is not None:
            add_text_to_tensorboard(f'valid_{index}_label_text_applied', steps, applied_label)
            add_text_to_tensorboard(f'valid_{index}_prediction_text_applied', steps, applied_prediction)

            # The parameters here are in reverse!
            # NOTE(review): the notebook notes that expected/predicted images show up
            # swapped in TensorBoard; the argument order is kept as it was - confirm
            # against calculate_metrics' signature before swapping.
            metrics = calculate_metrics(
                applied_label,
                applied_prediction
            )

        if metrics is None:
            continue

        similarity_scores.append(metrics['similarity'])
        perceptual_losses.append(metrics['perceptual_loss'])

        add_image_to_tensorboard(f'valid_{index}_expectation', steps, metrics['expected_screenshot_path'])
        add_image_to_tensorboard(f'valid_{index}_prediction', steps, metrics['predicted_screenshot_path'])

    # An empty score list is reported as NaN explicitly instead of relying on
    # np.mean([]) and its (suppressed) RuntimeWarning.
    nan = float('nan')
    results = {
        "similarity": float(np.mean(similarity_scores)) if similarity_scores else nan,
        "perceptual_loss": float(np.mean(perceptual_losses)) if perceptual_losses else nan,
    }

    # Close the writer when done so the scalars are flushed and nothing is left open.
    with SummaryWriter(log_dir=log_dir) as writer:
        writer.add_scalar('similarity', results['similarity'], steps)
        writer.add_scalar('perceptual_loss', results['perceptual_loss'], steps)

    print("Similarity:", results['similarity'])
    print("Perceptual loss:", results['perceptual_loss'])

    return results

def test_prediction(model, data, steps):
    """Generate a prediction for every row of ``data`` and score the whole set.

    Uses the notebook-level ``tokenizer`` and ``data_prompt``. For each row the
    prompt is built from ``row['svg']`` with an empty response slot, the model
    generates a completion, and everything after the last ``"### Response:"``
    marker is taken as the answer.

    Args:
        model: Model used for generation.
        data: Iterable of rows with ``'svg'`` (input) and ``'html'`` (label) entries.
        steps: Global step passed on to ``compute_metrics`` for logging.

    Returns:
        The metrics dict produced by ``compute_metrics``. Returning it (rather
        than ``None``) is what lets the training loop's early-stop check on
        ``results['perceptual_loss']`` take effect.
    """
    answers = []
    labels = []
    print("Generating predictions...")
    for row in data:
        # Input goes in the first slot; the response slot is left empty for the model.
        prompt = data_prompt.format(row['svg'], "")
        inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

        outputs = model.generate(**inputs, max_new_tokens=5020, use_cache=True)
        answer = tokenizer.batch_decode(outputs)
        answers.append(answer[0].split("### Response:")[-1])
        labels.append(row['html'])

    print("Computing metrics...")
    return compute_metrics(answers, labels, steps)

In [5]:
# Remove the output directory (trainer output_dir and TensorBoard log_dir live under it).
!rm -rf output

In [6]:
# Install zip tooling (unzip is used below to extract the dataset).
!apt install zip -y
# Download the dataset archive (saved locally as model.zip) and extract it into a
# freshly created data-rb-size-color-text-bare directory.
!rm -rf data-rb-size-color-text-bare
!mkdir -p data-rb-size-color-text-bare
!wget "https://www.dropbox.com/scl/fi/or7eexwsl7s9ud8otg4y4/data-rb-size-color-text-bare.zip?rlkey=35kkqe2k0a4xorh8q6ow7c1in&dl=1" -O model.zip
!unzip model.zip -d data-rb-size-color-text-bare

# Recreate an empty data-rb-validate directory.
!rm -rf data-rb-validate
!mkdir -p data-rb-validate

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  unzip
The following NEW packages will be installed:
  unzip zip
0 upgraded, 2 newly installed, 0 to remove and 53 not upgraded.
Need to get 350 kB of archives.
After this operation, 930 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 unzip amd64 6.0-26ubuntu3.2 [175 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/main amd64 zip amd64 3.0-12build2 [176 kB]
Fetched 350 kB in 1s (656 kB/s)m[33m
debconf: delaying package configuration, since apt-utils is not installed

7[0;23r8[1ASelecting previously unselected package unzip.
(Reading database ... 36713 files and directories currently installed.)
Preparing to unpack .../unzip_6.0-26ubuntu3.2_amd64.deb ...
7[24;0f[42m[30mProgress: [  0%][49m[39m [..........................................................] 87[24;0f[42m

In [7]:
from datasets import load_from_disk
dataset = load_from_disk('data-rb-size-color-text-bare')

# Hold out exactly four examples for evaluation. An integer test_size is an
# absolute row count (no float rounding of 4/len(dataset)), and the fixed seed
# makes the split reproducible across kernel restarts.
dataset = dataset.train_test_split(test_size=4, seed=0)

dataset

DatasetDict({
    train: Dataset({
        features: ['svg', 'html'],
        num_rows: 99849
    })
    test: Dataset({
        features: ['svg', 'html'],
        num_rows: 4
    })
})

In [8]:
# Notebook-global model and tokenizer; the evaluation helpers defined earlier
# read `tokenizer` from this scope.
model, tokenizer = load_model()

# Prompt template with two positional slots: the input (filled from the 'svg'
# column) and the response (filled from the 'html' column during training, left
# empty at inference time).
data_prompt = """Your job is to take variable parameters extracted from an SVG file of a web design and convert it into a variable set of parameters of HTML and CSS markup and stylesheet that represents the design in pixel-perfect accuracy.

### Input:
{}

### Response:
{}"""

# Appended to every training example by formatting_prompt().
EOS_TOKEN = tokenizer.eos_token
def formatting_prompt(examples):
    """Render a batch of examples into full training texts.

    Args:
        examples: Batch mapping with parallel ``"svg"`` and ``"html"`` sequences.

    Returns:
        Dict with a single ``"text"`` key holding, for each pair, the prompt
        template filled with the svg input and html output, followed by the
        EOS token.
    """
    pairs = zip(examples["svg"], examples["html"])
    rendered = [data_prompt.format(svg, html) + EOS_TOKEN for svg, html in pairs]
    return {"text": rendered}



==((====))==  Unsloth 2025.2.15: Fast Llama patching. Transformers: 4.46.3.
   \\   /|    GPU: NVIDIA H100 NVL. Max memory: 93.111 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 9.0. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.03G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/230 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/50.6k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/459 [00:00<?, ?B/s]

Unsloth 2025.2.15 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


In [9]:
# Add a 'text' column (prompt + answer + EOS) to both the train and test splits.
training_data = dataset.map(formatting_prompt, batched=True)

Map:   0%|          | 0/99849 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

In [10]:
# Display the splits to confirm the new 'text' column is present.
training_data

DatasetDict({
    train: Dataset({
        features: ['svg', 'html', 'text'],
        num_rows: 99849
    })
    test: Dataset({
        features: ['svg', 'html', 'text'],
        num_rows: 4
    })
})

In [11]:
def get_token_lengths(examples):
    """Tokenize a batch's ``text`` column and report each example's length.

    Nothing is truncated or padded, and ``return_length=True`` is requested so
    the tokenizer output carries a ``length`` entry alongside its other fields.

    Args:
        examples: Batch mapping with a ``'text'`` sequence.

    Returns:
        The tokenizer output for the batch.
    """
    return tokenizer(
        examples['text'],
        truncation=False,    # keep full sequences so their true length is measured
        padding=False,       # no padding, for the same reason
        return_length=True,
    )

# Tokenize every example so its token count is available in the 'length' column.
tokenized_data = training_data.map(get_token_lengths, batched=True)

def filter_function(example):
    """Return True when the example's token count fits within ``max_seq_length``."""
    token_count = example['length']
    return token_count <= max_seq_length

# Drop examples longer than max_seq_length tokens.
filtered_data = tokenized_data.filter(filter_function)

# Show the per-split row counts after filtering.
print(filtered_data)

Map:   0%|          | 0/99849 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Filter:   0%|          | 0/99849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['svg', 'html', 'text', 'input_ids', 'attention_mask', 'length'],
        num_rows: 99849
    })
    test: Dataset({
        features: ['svg', 'html', 'text', 'input_ids', 'attention_mask', 'length'],
        num_rows: 4
    })
})


In [12]:
# The tokenizer columns were only needed for the length filter; drop them and
# save the filtered dataset to disk, with max_seq_length encoded in the directory name.
# NOTE(review): this cell rebinds filtered_data, so re-running it without re-running
# the filter cell would try to remove columns that are already gone.
filtered_data = filtered_data.remove_columns(["input_ids", "attention_mask", "length"])
filtered_data.save_to_disk('data-rb-size-color-text-bare-filtered-' + str(max_seq_length))

Saving the dataset (0/1 shards):   0%|          | 0/99849 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4 [00:00<?, ? examples/s]

In [13]:
from datasets import load_from_disk

# Reload the filtered dataset that was saved to disk for this max_seq_length.
filtered_data = load_from_disk('data-rb-size-color-text-bare-filtered-' + str(max_seq_length))

# Display the splits that training and evaluation will use.
filtered_data

DatasetDict({
    train: Dataset({
        features: ['svg', 'html', 'text'],
        num_rows: 99849
    })
    test: Dataset({
        features: ['svg', 'html', 'text'],
        num_rows: 4
    })
})

In [14]:
import torch
from tqdm import tqdm
import os

import sys
# Lift Python's cap on int <-> str conversion length (0 disables the limit).
sys.set_int_max_str_digits(0)

# The first training round calls train() without a checkpoint; every later round
# passes resume_from_checkpoint=True.
resume = False

# Train in 100-step increments up to 1500 steps, evaluating on the test split
# before any training (steps == 0) and after every increment.
for steps in tqdm(range(0, 1501, 100)):
    print(f"Steps: {steps}")

    if steps > 0:
        # A fresh trainer whose max_steps is the cumulative target for this round.
        # NOTE(review): the config uses a linear LR schedule, so rebuilding it with
        # a larger max_steps each round presumably changes the decay horizon on
        # every resume - confirm this is intended.
        trainer = create_trainer(model, tokenizer, filtered_data['train'], steps)
        if resume:
            trainer.train(resume_from_checkpoint=True)
        else:
            trainer.train()
            resume = True
        
    # Switch to inference mode for generation; switched back to training mode below.
    model = FastLanguageModel.for_inference(model)

    results = test_prediction(model, filtered_data['test'], steps)

    # Stop early once the perceptual loss is exactly zero.
    # NOTE(review): only effective if test_prediction returns the metrics dict;
    # a None result skips this check.
    if results is not None and results['perceptual_loss'] == 0.0:
        break

    model = FastLanguageModel.for_training(model)

    

  0%|          | 0/16 [00:00<?, ?it/s]

Steps: 0
Generating predictions...
Computing metrics...


  6%|▋         | 1/16 [03:15<48:56, 195.78s/it]

Similarity: 0.8597965896129608
Perceptual loss: 0.43494701385498047
Steps: 100


Generating train split: 0 examples [00:00, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 14,137 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 100
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss
1,0.8633
2,0.8625
3,0.8639
4,0.8638
5,0.8634
6,0.8591
7,0.8553
8,0.85
9,0.8424
10,0.8336


Generating predictions...
Computing metrics...


 12%|█▎        | 2/16 [43:11<5:47:39, 1490.00s/it]

Similarity: 0.8281217093579472
Perceptual loss: 0.6363855302333832
Steps: 200


Generating train split: 0 examples [00:00, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 14,137 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 200
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss
101,0.5192
102,0.5185
103,0.5175
104,0.5161
105,0.5164
106,0.5146
107,0.5145
108,0.514
109,0.5126
110,0.512


Generating predictions...
Computing metrics...


 19%|█▉        | 3/16 [1:23:14<6:53:09, 1906.87s/it]max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.8366979093290865
Perceptual loss: 0.5140989422798157
Steps: 300


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 14,137 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 300
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss
201,0.4848
202,0.4839
203,0.4871
204,0.4929
205,0.4929
206,0.4955
207,0.4876
208,0.492
209,0.4906
210,0.4897


Generating predictions...
Computing metrics...


 25%|██▌       | 4/16 [2:02:48<6:58:16, 2091.39s/it]max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.9097845990676433
Perceptual loss: 0.3651899918913841
Steps: 400


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 14,137 | Num Epochs = 4
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 400
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss
301,0.4408
302,0.4419
303,0.4483
304,0.4663
305,0.4707
306,0.4696
307,0.4684
308,0.4597
309,0.452
310,0.4463


Generating predictions...
Computing metrics...


 31%|███▏      | 5/16 [2:42:07<6:41:03, 2187.57s/it]max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.9887408097361913
Perceptual loss: 0.07822196837514639
Steps: 500


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 14,137 | Num Epochs = 5
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 500
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss
401,0.3051
402,0.304
403,0.307
404,0.3478
405,0.3091
406,0.3299
407,0.3176
408,0.32
409,0.3127
410,0.3126


Generating predictions...
Computing metrics...


 38%|███▊      | 6/16 [3:21:17<6:13:50, 2243.07s/it]max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.986567432837459
Perceptual loss: 0.04981125466292724
Steps: 600


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 14,137 | Num Epochs = 6
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 600
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss
501,0.2899
502,0.289
503,0.29
504,0.2894
505,0.2921
506,0.2912
507,0.2897
508,0.2897
509,0.2892
510,0.2887


Generating predictions...
Computing metrics...


 44%|████▍     | 7/16 [4:00:28<5:41:45, 2278.36s/it]max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.9865985290528752
Perceptual loss: 0.04936793298111297
Steps: 700


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 14,137 | Num Epochs = 7
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 700
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss
601,0.288
602,0.2867
603,0.2886
604,0.2897
605,0.2908
606,0.2885
607,0.2898
608,0.2881
609,0.2876
610,0.2885


Generating predictions...
Computing metrics...


 50%|█████     | 8/16 [4:39:43<5:07:01, 2302.66s/it]

Similarity: 0.9854268664937094
Perceptual loss: 0.058563041646266356
Steps: 800


max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 14,137 | Num Epochs = 8
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 800
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss
701,0.286
702,0.284
703,0.2884
704,0.2897
705,0.2872
706,0.2861
707,0.2881
708,0.2885
709,0.2882
710,0.2874


Generating predictions...
Computing metrics...


 56%|█████▋    | 9/16 [5:18:55<4:30:27, 2318.16s/it]max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.9865286883738233
Perceptual loss: 0.049868506263010204
Steps: 900


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 14,137 | Num Epochs = 9
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 900
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss
801,0.2846
802,0.2836
803,0.285
804,0.2854
805,0.2866
806,0.2838
807,0.2886
808,0.2848
809,0.2859
810,0.2846


Generating predictions...
Computing metrics...


 62%|██████▎   | 10/16 [5:58:10<3:52:56, 2329.43s/it]max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.9865250394376128
Perceptual loss: 0.050081464753020555
Steps: 1000


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 14,137 | Num Epochs = 10
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 1,000
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss
901,0.2838
902,0.2829
903,0.2837
904,0.2849
905,0.2846
906,0.2833
907,0.2838
908,0.2833
909,0.283
910,0.2833


Generating predictions...
Computing metrics...


 69%|██████▉   | 11/16 [6:37:27<3:14:49, 2337.80s/it]max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.9858255486446991
Perceptual loss: 0.05531542110838927
Steps: 1100


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 14,137 | Num Epochs = 10
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 1,100
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss
1001,0.2823
1002,0.2827
1003,0.2842
1004,0.284
1005,0.2876
1006,0.2833
1007,0.2827
1008,0.284
1009,0.2831
1010,0.2858


Generating predictions...
Computing metrics...


 75%|███████▌  | 12/16 [7:16:44<2:36:14, 2343.61s/it]max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.9858247999262175
Perceptual loss: 0.05521556723397225
Steps: 1200


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 14,137 | Num Epochs = 11
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 1,200
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss
1101,0.2816
1102,0.2802
1103,0.2812
1104,0.2807
1105,0.2828
1106,0.2822
1107,0.2818
1108,0.2805
1109,0.2819
1110,0.2816


Generating predictions...
Computing metrics...


 81%|████████▏ | 13/16 [7:56:00<1:57:22, 2347.38s/it]max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.986554990421338
Perceptual loss: 0.04968157425173558
Steps: 1300


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 14,137 | Num Epochs = 12
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 1,300
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss
1201,0.2819
1202,0.283
1203,0.2817
1204,0.282
1205,0.2827
1206,0.2836
1207,0.2826
1208,0.2816
1209,0.2832
1210,0.2815


Generating predictions...
Computing metrics...


 88%|████████▊ | 14/16 [8:35:17<1:18:20, 2350.43s/it]max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.9858547211047097
Perceptual loss: 0.05481568045797758
Steps: 1400


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 14,137 | Num Epochs = 13
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 1,400
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss
1301,0.28
1302,0.2811
1303,0.2816
1304,0.2819
1305,0.2813
1306,0.2813
1307,0.2806
1308,0.2824
1309,0.2833
1310,0.2815


Generating predictions...
Computing metrics...


 94%|█████████▍| 15/16 [9:14:32<39:11, 2351.79s/it]  max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.9865586393575485
Perceptual loss: 0.04946861576172523
Steps: 1500


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 14,137 | Num Epochs = 14
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 1,500
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss
1401,0.2808
1402,0.2798
1403,0.2812
1404,0.2795
1405,0.2814
1406,0.28
1407,0.2835
1408,0.2793
1409,0.2791
1410,0.2811


Generating predictions...
Computing metrics...


100%|██████████| 16/16 [9:53:47<00:00, 2226.73s/it]

Similarity: 0.9865586393575485
Perceptual loss: 0.04946861576172523





In [15]:
# Spot-check a single test example: print the reference answer next to the model's answer.
test_index = 0
text = filtered_data['test'][test_index]['svg']
model = FastLanguageModel.for_inference(model)
inputs = tokenizer(
[
    data_prompt.format(
        # input slot: the example's svg field
        text,
        # response slot: left empty so the model generates it
        "",
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 5020, use_cache = True)
answer=tokenizer.batch_decode(outputs)
# Keep only what follows the last "### Response:" marker.
answer = answer[0].split("### Response:")[-1]

print(filtered_data['test'][test_index]['html'])
print("Answer of the question is:", answer)

{"FONT_SIZE4": "110%", "COLOR8": "#450a54", "COLOR7": "#d230ef", "SIZE3": "234px", "FONT_SIZE3": "4em", "COLOR6": "#1b38ab", "COLOR5": "#8f1c2b", "SIZE2": "69vw", "FONT_SIZE2": "19pt", "COLOR4": "#10c7a8", "COLOR3": "#6c597e", "FONT_SIZE1": "28px", "COLOR2": "#a213bf", "SIZE1": "66vh", "COLOR1": "#a8b6e5", "WORD4": "UNTIL", "WORD3": "APPLE", "WORD2": "THROW", "WORD1": "BEGAN"}
Answer of the question is: 
{"FONT_SIZE4": "111%", "COLOR8": "#450a54", "COLOR7": "#d230ef", "SIZE3": "234px", "FONT_SIZE3": "4em", "COLOR6": "#1b38ab", "COLOR5": "#8f1c2b", "SIZE2": "69vw", "FONT_SIZE2": "19pt", "COLOR4": "#10c7a8", "COLOR3": "#6c597e", "FONT_SIZE1": "28px", "COLOR2": "#a213bf", "SIZE1": "66vh", "COLOR1": "#a8b6e5", "WORD4": "UNTIL", "WORD3": "APPLE", "WORD2": "THROW", "WORD1": "BEGAN"}<|end_of_text|>


In [16]:
# Re-score the whole test split with the current model; `steps` is the value left
# over from the training loop, so this cell depends on that loop having run.
test_prediction(model, filtered_data['test'], steps)

Generating predictions...
Computing metrics...
Similarity: 0.9906618096260912
Perceptual loss: 0.04154091063537635
