# Llama 3.2 fine tuning with size-color-text-bare dataset, fixed dependencies, ran longer

2025-02-28 20:23

Same as 20250227-llama-size-color-text-bare-dataset-fixed, but run for longer (2,500 steps). No further improvement in perceptual loss: it plateaus at roughly 0.05–0.06 from about step 600 onwards. Trying a larger model next.


In [1]:
# Install the C/C++ toolchain (gcc, g++, make) into the container.
# NOTE(review): presumably required by a later package that compiles native code — confirm.
!apt-get install build-essential -y

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  binutils binutils-common binutils-x86-64-linux-gnu bzip2 cpp cpp-11 dirmngr
  dpkg-dev fakeroot g++ g++-11 gcc gcc-11 gcc-11-base gnupg gnupg-l10n
  gnupg-utils gpg-agent gpg-wks-client gpg-wks-server gpgsm
  libalgorithm-diff-perl libalgorithm-diff-xs-perl libalgorithm-merge-perl
  libasan6 libatomic1 libbinutils libcc1-0 libctf-nobfd0 libctf0 libdpkg-perl
  libfakeroot libfile-fcntllock-perl libgcc-11-dev libgomp1 libisl23 libitm1
  libksba8 liblocale-gettext-perl liblsan0 libmpc3 libmpfr6 libnpth0
  libquadmath0 libstdc++-11-dev libtsan0 libubsan1 lto-disabled-list make
  patch pinentry-curses xz-utils
Suggested packages:
  binutils-doc bzip2-doc cpp-doc gcc-11-locales dbus-user-session
  pinentry-gnome3 tor debian-keyring g++-multilib g++-11-multilib gcc-11-doc
  gcc-multilib manpages-dev autoconf automake libtool flex bison gdb gcc

In [2]:
# Replace the preinstalled torch stack with wheels from the CUDA 12.6 index.
# NOTE(review): the model-loading cell further down reports "Torch: 2.5.1+cu124",
# so a later install (or a different session) appears to replace this build — confirm.
!pip uninstall torch torchvision torchaudio -y && pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126

# Install unsloth from PyPI first, then swap it for the current GitHub build.
!pip install unsloth
!pip uninstall unsloth -y && pip install --upgrade --no-cache-dir "unsloth[colab-new] @ git+https://github.com/unslothai/unsloth.git"

# Evaluation / rendering dependencies. Nothing here is version-pinned, so a
# re-run may resolve to different versions than the ones that produced this output.
!pip install sacrebleu
!pip install pytest-playwright
!playwright install
!pip install matplotlib
!pip install pillow
# NOTE(review): torchvision was already installed by the first command above.
!pip install torchvision
!pip install lpips

# System libraries needed by the Playwright browsers.
!playwright install-deps  

!pip install -U numpy
!pip install tensorboard

Found existing installation: torch 2.4.0
Uninstalling torch-2.4.0:
  Successfully uninstalled torch-2.4.0
Found existing installation: torchvision 0.19.0
Uninstalling torchvision-0.19.0:
  Successfully uninstalled torchvision-0.19.0
Found existing installation: torchaudio 2.4.0
Uninstalling torchaudio-2.4.0:
  Successfully uninstalled torchaudio-2.4.0
[0mLooking in indexes: https://download.pytorch.org/whl/cu126
Collecting torch
  Downloading https://download.pytorch.org/whl/cu126/torch-2.6.0%2Bcu126-cp311-cp311-manylinux_2_28_x86_64.whl.metadata (28 kB)
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cu126/torchvision-0.21.0%2Bcu126-cp311-cp311-linux_x86_64.whl.metadata (6.1 kB)
Collecting torchaudio
  Downloading https://download.pytorch.org/whl/cu126/torchaudio-2.6.0%2Bcu126-cp311-cp311-linux_x86_64.whl.metadata (6.6 kB)
Collecting sympy==1.13.1 (from torch)
  Downloading https://download.pytorch.org/whl/sympy-1.13.1-py3-none-any.whl (6.2 MB)
[2K     [90m━━━

In [1]:
import unsloth
import os
import numpy as np
import pandas as pd

import torch
from trl import SFTTrainer, SFTConfig
from transformers import TrainingArguments, TextStreamer
from unsloth.chat_templates import get_chat_template
from unsloth import FastLanguageModel
from datasets import Dataset
from unsloth import is_bfloat16_supported

# Saving model
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Warnings
import warnings
warnings.filterwarnings("ignore")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
# Maximum sequence length (in tokens) shared by model loading, the SFT config
# and the dataset length filter.
max_seq_length = 5020

def load_model():
    """Load the 4-bit Llama 3.2 1B base model and attach LoRA adapters to it.

    Returns:
        A ``(model, tokenizer)`` tuple: the PEFT-wrapped model and the
        tokenizer returned alongside the base model.
    """
    base_model, tokenizer = FastLanguageModel.from_pretrained(
        model_name="unsloth/Llama-3.2-1B-bnb-4bit",
        max_seq_length=max_seq_length,
        dtype=None,
        load_in_4bit=True,
    )

    # Adapters are attached to every attention and MLP projection.
    lora_target_modules = [
        "q_proj", "k_proj", "v_proj", "up_proj", "down_proj", "o_proj", "gate_proj",
    ]

    peft_model = FastLanguageModel.get_peft_model(
        base_model,
        r=16,
        lora_alpha=16,
        lora_dropout=0,
        target_modules=lora_target_modules,
        use_rslora=True,
        use_gradient_checkpointing="unsloth",
        random_state=32,
        loftq_config=None,
    )
    return peft_model, tokenizer

In [3]:
def create_trainer(model, tokenizer, training_data, max_steps):
    """Build an ``SFTTrainer`` for the given model and training split.

    Args:
        model: The model to fine-tune.
        tokenizer: Tokenizer passed through to the trainer.
        training_data: Training dataset; the prompt is read from its "text" field.
        max_steps: Total optimizer steps to train for. When ``None`` the
            config keeps its ``num_train_epochs=1`` setting.

    Returns:
        The configured ``SFTTrainer`` (training is not started here).
    """
    use_bf16 = is_bfloat16_supported()

    config = SFTConfig(
        # Optimisation
        learning_rate=3e-4,
        lr_scheduler_type="linear",
        warmup_steps=150,
        weight_decay=0.01,
        optim="adamw_8bit",
        # 8 sequences per device x 16 accumulation steps = 128 per optimizer step.
        per_device_train_batch_size=8,
        gradient_accumulation_steps=16,
        num_train_epochs=1,
        # Precision: bf16 when the hardware supports it, otherwise fp16.
        bf16=use_bf16,
        fp16=not use_bf16,
        # Logging / checkpoints
        logging_steps=1,
        output_dir="output2",
        save_total_limit=3,
        seed=0,
        # Dataset handling
        dataset_text_field="text",
        max_seq_length=max_seq_length,
        dataset_num_proc=10,
        packing=True,
    )

    # Applied after construction, exactly as the caller-supplied step budget.
    if max_steps is not None:
        config.max_steps = max_steps

    trainer = SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=training_data,
        args=config,
    )
    return trainer

In [4]:
from json import JSONDecodeError
import numpy as np
from utils.similarity import calculate_metrics
from torch.utils.tensorboard import SummaryWriter
from PIL import Image
import torch
import json

# TensorBoard event files are written under the trainer's output directory ("output2").
log_dir = 'output2/runs'
# HTML page template; its {{NAME}} placeholders are filled in by apply_to_templates.
with open('size-color-text-page-compressed.html', 'r') as f:
    html_template = f.read()

def add_image_to_tensorboard(name, step, img_path):
    """Log the image stored at ``img_path`` to TensorBoard under tag ``name``.

    Args:
        name: TensorBoard tag for the image.
        step: Global step value to associate with the image.
        img_path: Path of an image file that PIL can open.
    """
    # Release the file handle as soon as the pixels have been copied out.
    with Image.open(img_path) as image:
        image_array = np.array(image.convert('RGB'))

    # (H, W, C) uint8 -> (C, H, W) float in [0, 1], the layout add_image expects by default.
    image_tensor = torch.from_numpy(image_array).permute(2, 0, 1).float() / 255.0

    # A writer is opened per call; closing it flushes the event file and stops
    # its background thread instead of leaking one writer per logged image.
    with SummaryWriter(log_dir=log_dir) as writer:
        writer.add_image(name, image_tensor, step)
    
def add_text_to_tensorboard(name, step, text):
    """Log ``text`` to TensorBoard under tag ``name`` at the given ``step``.

    Args:
        name: TensorBoard tag for the text entry.
        step: Global step value to associate with the text.
        text: The string to record.
    """
    # Close the writer on exit so the entry is flushed and the writer is not leaked.
    with SummaryWriter(log_dir=log_dir) as writer:
        writer.add_text(name, text, step)

def postprocess_text(preds, labels):
    """Strip surrounding whitespace and remove ``<unk>`` markers from decoded text.

    Args:
        preds: Iterable of predicted strings.
        labels: Iterable of reference strings.

    Returns:
        ``(clean_preds, clean_labels)`` where ``clean_preds`` is a list of
        strings and ``clean_labels`` is a list of single-element lists.
    """
    def _clean(raw):
        # Whitespace is stripped first, then the marker is removed.
        return raw.strip().replace('<unk>', '')

    clean_preds = []
    for pred in preds:
        clean_preds.append(_clean(pred))

    clean_labels = []
    for label in labels:
        clean_labels.append([_clean(label)])

    return clean_preds, clean_labels

def apply_to_templates(text, template):
    """Fill ``{{NAME}}`` placeholders in ``template`` from a JSON object string.

    Args:
        text: JSON text expected to decode to an object mapping placeholder
            names to values.
        template: Template string containing ``{{NAME}}`` placeholders.

    Returns:
        The template with every placeholder named in the JSON object replaced
        by ``str(value)``, or ``None`` when ``text`` is not valid JSON or does
        not decode to an object.
    """
    try:
        parsed = json.loads(text)
    except JSONDecodeError:
        return None

    if isinstance(parsed, dict):
        rendered = template
        for key, value in parsed.items():
            placeholder = '{{' + key + '}}'
            rendered = rendered.replace(placeholder, str(value))
        return rendered

    return None

def compute_metrics(decoded_predictions, decoded_labels, steps):
    """Render predictions and labels through the HTML template and score them.

    Each prediction/label pair is logged to TensorBoard as text. Pairs where
    both sides parse into the template are passed to ``calculate_metrics``;
    the resulting scores are averaged and the screenshots are logged.

    Args:
        decoded_predictions: Decoded model outputs, one string per sample.
        decoded_labels: Reference strings, aligned with the predictions.
        steps: Global step value used for every TensorBoard entry.

    Returns:
        A dict with the mean ``"similarity"`` and ``"perceptual_loss"`` over
        the scored pairs. Both values are NaN when no pair could be scored.
    """
    similarity_scores = []
    perceptual_losses = []

    pairs = zip(decoded_predictions, decoded_labels)
    for index, (prediction, label) in enumerate(pairs, start=1):
        prediction = prediction.replace(tokenizer.eos_token, '')

        add_text_to_tensorboard(f'valid_{index}_label_text', steps, label)
        add_text_to_tensorboard(f'valid_{index}_prediction_text', steps, prediction)

        applied_label = apply_to_templates(label, html_template)
        applied_prediction = apply_to_templates(prediction, html_template)

        # A sample that cannot be rendered contributes no score.
        if applied_label is None or applied_prediction is None:
            continue

        add_text_to_tensorboard(f'valid_{index}_label_text_applied', steps, applied_label)
        add_text_to_tensorboard(f'valid_{index}_prediction_text_applied', steps, applied_prediction)

        # NOTE (original author): the parameters here are in reverse!
        # calculate_metrics is defined outside this file — TODO confirm its
        # argument order and whether the screenshot keys below are swapped too.
        metrics = calculate_metrics(
            applied_label,
            applied_prediction
        )
        if metrics is None:
            continue

        similarity_scores.append(metrics['similarity'])
        perceptual_losses.append(metrics['perceptual_loss'])

        add_image_to_tensorboard(f'valid_{index}_expectation', steps, metrics['expected_screenshot_path'])
        add_image_to_tensorboard(f'valid_{index}_prediction', steps, metrics['predicted_screenshot_path'])

    # np.mean([]) also yields NaN, but only via a RuntimeWarning; make the
    # "nothing could be scored" case explicit instead.
    nan = float("nan")
    results = {
        "similarity": float(np.mean(similarity_scores)) if similarity_scores else nan,
        "perceptual_loss": float(np.mean(perceptual_losses)) if perceptual_losses else nan,
    }

    # Close the writer so the scalars are flushed and the writer is not leaked.
    with SummaryWriter(log_dir=log_dir) as writer:
        writer.add_scalar('similarity', results['similarity'], steps)
        writer.add_scalar('perceptual_loss', results['perceptual_loss'], steps)

    print("Similarity:", results['similarity'])
    print("Perceptual loss:", results['perceptual_loss'])

    return results

def test_prediction(model, data, steps):
    """Generate a response for every row of ``data`` and score the results.

    Args:
        model: Model used for generation; inputs are moved to "cuda".
        data: Iterable of rows with an ``'svg'`` prompt input and an
            ``'html'`` reference answer.
        steps: Global step value forwarded to ``compute_metrics`` for logging.

    Returns:
        The metrics dict produced by ``compute_metrics``, so callers can act
        on the scores (for example to stop training early).
    """
    answers = []
    labels = []
    print("Generating predictions...")
    for row in data:
        # The response slot is left empty so the model completes it.
        prompt = data_prompt.format(row['svg'], "")
        inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

        outputs = model.generate(**inputs, max_new_tokens=5020, use_cache=True)
        decoded = tokenizer.batch_decode(outputs)
        # The decoded text includes the prompt; keep only what follows the marker.
        answers.append(decoded[0].split("### Response:")[-1])
        labels.append(row['html'])

    print("Computing metrics...")
    # Previously the result was discarded, so this function always returned None.
    return compute_metrics(answers, labels, steps)

In [5]:
# Start from a clean slate: removes the trainer's output directory, including
# any checkpoints and the TensorBoard logs under output2/runs.
!rm -rf output2

In [6]:
# Fetch the dataset archive and unpack it into a fresh directory.
!apt install zip -y
!rm -rf data-rb-size-color-text-bare
!mkdir -p data-rb-size-color-text-bare
# The archive is saved as model.zip, but it is unpacked into the dataset directory below.
!wget "https://www.dropbox.com/scl/fi/or7eexwsl7s9ud8otg4y4/data-rb-size-color-text-bare.zip?rlkey=35kkqe2k0a4xorh8q6ow7c1in&dl=1" -O model.zip
!unzip model.zip -d data-rb-size-color-text-bare

# NOTE(review): data-rb-validate is recreated empty here; nothing visible in this notebook uses it — confirm.
!rm -rf data-rb-validate
!mkdir -p data-rb-validate

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  unzip
The following NEW packages will be installed:
  unzip zip
0 upgraded, 2 newly installed, 0 to remove and 53 not upgraded.
Need to get 350 kB of archives.
After this operation, 930 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 unzip amd64 6.0-26ubuntu3.2 [175 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/main amd64 zip amd64 3.0-12build2 [176 kB]
Fetched 350 kB in 1s (656 kB/s)m[33m
debconf: delaying package configuration, since apt-utils is not installed

7[0;23r8[1ASelecting previously unselected package unzip.
(Reading database ... 36713 files and directories currently installed.)
Preparing to unpack .../unzip_6.0-26ubuntu3.2_amd64.deb ...
7[24;0f[42m[30mProgress: [  0%][49m[39m [..........................................................] 87[24;0f[42m

In [7]:
from datasets import load_from_disk

# Load the unpacked dataset from disk.
raw_dataset = load_from_disk('data-rb-size-color-text-bare')

# Hold out exactly four rows for validation. An integer test_size is an
# absolute row count, which avoids the float rounding of 4/len(dataset); the
# fixed seed keeps the same four rows across runs so scores stay comparable.
dataset = raw_dataset.train_test_split(test_size=4, seed=0)

dataset

DatasetDict({
    train: Dataset({
        features: ['svg', 'html'],
        num_rows: 99849
    })
    test: Dataset({
        features: ['svg', 'html'],
        num_rows: 4
    })
})

In [5]:
# Load the LoRA-wrapped model and its tokenizer (see load_model).
model, tokenizer = load_model()

# Prompt template with two positional slots: the first receives the input
# parameters, the second the expected response (left empty at inference time).
# The "### Response:" marker is also what the inference code splits on.
data_prompt = """Your job is to take variable parameters extracted from an SVG file of a web design and convert it into a variable set of parameters of HTML and CSS markup and stylesheet that represents the design in pixel-perfect accuracy.

### Input:
{}

### Response:
{}"""

# Appended to every training example by formatting_prompt.
EOS_TOKEN = tokenizer.eos_token
def formatting_prompt(examples):
    """Build the training text for a batch of examples.

    Args:
        examples: Batch mapping with parallel ``"svg"`` and ``"html"`` lists.

    Returns:
        A dict with a ``"text"`` list: each entry is the prompt template
        filled with the pair and terminated by the EOS token.
    """
    svg_inputs = examples["svg"]
    html_outputs = examples["html"]
    return {
        "text": [
            data_prompt.format(svg, html) + EOS_TOKEN
            for svg, html in zip(svg_inputs, html_outputs)
        ],
    }



==((====))==  Unsloth 2025.2.15: Fast Llama patching. Transformers: 4.46.3.
   \\   /|    GPU: NVIDIA H100 NVL. Max memory: 93.111 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 9.0. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.2.15 patched 16 layers with 16 QKV layers, 16 O layers and 16 MLP layers.


In [9]:
# Add the formatted "text" column to both splits; formatting_prompt receives whole batches.
training_data = dataset.map(formatting_prompt, batched=True)

Map:   0%|          | 0/99849 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

In [10]:
training_data

DatasetDict({
    train: Dataset({
        features: ['svg', 'html', 'text'],
        num_rows: 99849
    })
    test: Dataset({
        features: ['svg', 'html', 'text'],
        num_rows: 4
    })
})

In [11]:
def get_token_lengths(examples):
    """Tokenize a batch of prompt texts as-is and report each token count.

    No truncation or padding is applied, so the returned ``length`` values
    describe the full, unmodified examples.
    """
    return tokenizer(
        examples['text'],
        truncation=False,
        padding=False,
        return_length=True,
    )

def filter_function(example):
    """Keep an example only if its token count fits within max_seq_length."""
    return max_seq_length >= example['length']

# Measure every example, then drop the ones that are too long for the model.
tokenized_data = training_data.map(get_token_lengths, batched=True)
filtered_data = tokenized_data.filter(filter_function)

print(filtered_data)

Map:   0%|          | 0/99849 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

Filter:   0%|          | 0/99849 [00:00<?, ? examples/s]

Filter:   0%|          | 0/4 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['svg', 'html', 'text', 'input_ids', 'attention_mask', 'length'],
        num_rows: 99849
    })
    test: Dataset({
        features: ['svg', 'html', 'text', 'input_ids', 'attention_mask', 'length'],
        num_rows: 4
    })
})


In [12]:
# Drop the columns added only for length measurement, then cache the filtered
# dataset on disk under a name that records the length limit used.
filtered_data = filtered_data.remove_columns(["input_ids", "attention_mask", "length"])
filtered_data.save_to_disk('data-rb-size-color-text-bare-filtered-' + str(max_seq_length))

Saving the dataset (0/1 shards):   0%|          | 0/99849 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4 [00:00<?, ? examples/s]

In [6]:
from datasets import load_from_disk

# Reload the cached, length-filtered dataset written by the save step, so the
# tokenize/filter cells do not have to be re-run in a fresh session.
filtered_data = load_from_disk('data-rb-size-color-text-bare-filtered-' + str(max_seq_length))

filtered_data

DatasetDict({
    train: Dataset({
        features: ['svg', 'html', 'text'],
        num_rows: 99849
    })
    test: Dataset({
        features: ['svg', 'html', 'text'],
        num_rows: 4
    })
})

In [7]:
import torch
from tqdm import tqdm
import os

import sys
# 0 disables Python's limit on int <-> str conversion length.
# NOTE(review): the reason this is needed is not visible here — confirm.
sys.set_int_max_str_digits(0)

# False until the first training run has finished; afterwards each new
# trainer resumes from the latest checkpoint instead of starting over.
resume = False

# `steps` is the cumulative step target: 0 (baseline evaluation only), then
# 100, 200, ... 2500. Evaluation runs after every 100-step increment.
for steps in tqdm(range(0, 2501, 100)):
    print(f"Steps: {steps}")

    if steps > 0:
        # A fresh trainer is built each round with max_steps set to the new target.
        trainer = create_trainer(model, tokenizer, filtered_data['train'], steps)
        if resume:
            trainer.train(resume_from_checkpoint=True)
        else:
            trainer.train()
            resume = True
        
    # Switch to inference mode for generation.
    model = FastLanguageModel.for_inference(model)

    results = test_prediction(model, filtered_data['test'], steps)

    # Early stop on a perfect score. This depends on test_prediction returning
    # the metrics dict; when it returns None the check is simply skipped.
    if results is not None and results['perceptual_loss'] == 0.0:
        break

    # Switch back to training mode for the next round.
    model = FastLanguageModel.for_training(model)

    

  0%|          | 0/26 [00:00<?, ?it/s]

Steps: 0
Generating predictions...
Computing metrics...


  4%|▍         | 1/26 [03:14<1:20:51, 194.04s/it]max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.8597965896129608
Perceptual loss: 0.43494701385498047
Steps: 100


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 14,137 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 100
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss
1,0.8633
2,0.8625
3,0.8639
4,0.8638
5,0.8634
6,0.8591
7,0.8553
8,0.85
9,0.8424
10,0.8336


Generating predictions...
Computing metrics...


  8%|▊         | 2/26 [42:51<9:51:28, 1478.69s/it]

Similarity: 0.8340339411981404
Perceptual loss: 0.5016977861523628
Steps: 200


max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 14,137 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 200
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss
101,0.5206
102,0.5201
103,0.5192
104,0.5175
105,0.5181
106,0.5165
107,0.5164
108,0.5159
109,0.5145
110,0.514


Generating predictions...
Computing metrics...


 12%|█▏        | 3/26 [1:22:29<12:04:12, 1889.23s/it]max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.8398889794945718
Perceptual loss: 0.4800076484680176
Steps: 300


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 14,137 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 300
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss
201,0.4847
202,0.4838
203,0.4966
204,0.5278
205,0.5098
206,0.5012
207,0.5004
208,0.4989
209,0.497
210,0.4978


Generating predictions...
Computing metrics...


 15%|█▌        | 4/26 [2:02:08<12:43:32, 2082.36s/it]max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.9545239628962009
Perceptual loss: 0.22109356243163347
Steps: 400


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 14,137 | Num Epochs = 4
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 400
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss
301,0.3861
302,0.3877
303,0.385
304,0.3859
305,0.4422
306,0.4026
307,0.3925
308,0.384
309,0.381
310,0.3751


Generating predictions...
Computing metrics...


 19%|█▉        | 5/26 [2:41:35<12:44:48, 2185.15s/it]max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.9848422558390394
Perceptual loss: 0.0718755564012099
Steps: 500


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 14,137 | Num Epochs = 5
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 500
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss
401,0.3027
402,0.3016
403,0.303
404,0.3005
405,0.301
406,0.2999
407,0.3018
408,0.3007
409,0.2995
410,0.2997


Generating predictions...
Computing metrics...


 23%|██▎       | 6/26 [3:20:59<12:28:38, 2245.95s/it]max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.9849982745854231
Perceptual loss: 0.07372726130415685
Steps: 600


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 14,137 | Num Epochs = 6
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 600
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss
501,0.291
502,0.2902
503,0.2915
504,0.2936
505,0.3017
506,0.2938
507,0.2933
508,0.2935
509,0.292
510,0.2914


Generating predictions...
Computing metrics...


 27%|██▋       | 7/26 [4:00:20<12:03:04, 2283.41s/it]max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.9854293908395485
Perceptual loss: 0.05891591910040006
Steps: 700


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 14,137 | Num Epochs = 7
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 700
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss
601,0.2888
602,0.2876
603,0.2894
604,0.2897
605,0.2917
606,0.2893
607,0.2902
608,0.2887
609,0.289
610,0.2886


Generating predictions...
Computing metrics...


 31%|███       | 8/26 [4:39:38<11:32:12, 2307.34s/it]max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.9854525721722294
Perceptual loss: 0.058260552963474765
Steps: 800


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 14,137 | Num Epochs = 8
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 800
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss
701,0.2868
702,0.2848
703,0.2894
704,0.2933
705,0.2895
706,0.2885
707,0.2892
708,0.2903
709,0.2894
710,0.2895


Generating predictions...
Computing metrics...


 35%|███▍      | 9/26 [5:19:02<10:58:45, 2325.01s/it]max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.9854525721722294
Perceptual loss: 0.058260552963474765
Steps: 900


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 14,137 | Num Epochs = 9
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 900
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss
801,0.286
802,0.2846
803,0.286
804,0.2852
805,0.285
806,0.2843
807,0.286
808,0.2847
809,0.285
810,0.2843


Generating predictions...
Computing metrics...


 38%|███▊      | 10/26 [5:58:26<10:23:13, 2337.12s/it]max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.9854481745175372
Perceptual loss: 0.058373657579068094
Steps: 1000


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 14,137 | Num Epochs = 10
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 1,000
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss
901,0.2845
902,0.2834
903,0.2844
904,0.2885
905,0.2905
906,0.2864
907,0.2877
908,0.2857
909,0.2853
910,0.2865


Generating predictions...
Computing metrics...


 42%|████▏     | 11/26 [6:37:50<9:46:18, 2345.22s/it] max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.9854481745175372
Perceptual loss: 0.058373657579068094
Steps: 1100


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 14,137 | Num Epochs = 10
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 1,100
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss
1001,0.283
1002,0.2835
1003,0.2848
1004,0.2842
1005,0.2872
1006,0.2854
1007,0.2847
1008,0.2837
1009,0.2846
1010,0.2866


Generating predictions...
Computing metrics...


 46%|████▌     | 12/26 [7:17:15<9:08:37, 2351.27s/it]max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.986550846237392
Perceptual loss: 0.049778976070228964
Steps: 1200


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 14,137 | Num Epochs = 11
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 1,200
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss
1101,0.2824
1102,0.281
1103,0.2818
1104,0.2813
1105,0.2838
1106,0.2844
1107,0.2838
1108,0.2811
1109,0.2836
1110,0.2832


Generating predictions...
Computing metrics...


 50%|█████     | 13/26 [7:56:42<8:30:29, 2356.10s/it]max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.9854271199644555
Perceptual loss: 0.05854733884916641
Steps: 1300


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 14,137 | Num Epochs = 12
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 1,300
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss
1201,0.2826
1202,0.2837
1203,0.2828
1204,0.2838
1205,0.2822
1206,0.2884
1207,0.285
1208,0.2848
1209,0.2861
1210,0.2848


Generating predictions...
Computing metrics...


 54%|█████▍    | 14/26 [8:36:08<7:51:48, 2359.03s/it]max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.9854227223097634
Perceptual loss: 0.05866044346475974
Steps: 1400


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 14,137 | Num Epochs = 13
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 1,400
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss
1301,0.281
1302,0.2818
1303,0.2821
1304,0.282
1305,0.2811
1306,0.2818
1307,0.2814
1308,0.2829
1309,0.2839
1310,0.2823


Generating predictions...
Computing metrics...


 58%|█████▊    | 15/26 [9:15:30<7:12:40, 2360.00s/it]max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.9854525721722294
Perceptual loss: 0.058260552963474765
Steps: 1500


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 14,137 | Num Epochs = 14
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 1,500
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss
1401,0.2815
1402,0.2805
1403,0.2822
1404,0.2812
1405,0.2834
1406,0.2807
1407,0.2841
1408,0.2808
1409,0.28
1410,0.2823


Generating predictions...
Computing metrics...


 62%|██████▏   | 16/26 [9:54:57<6:33:39, 2361.94s/it]max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.9854525721722294
Perceptual loss: 0.058260552963474765
Steps: 1600


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 14,137 | Num Epochs = 15
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 1,600
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss
1501,0.2798
1502,0.2796
1503,0.2816
1504,0.2805
1505,0.2831
1506,0.2803
1507,0.2807
1508,0.2808
1509,0.2831
1510,0.2819


Generating predictions...
Computing metrics...


 65%|██████▌   | 17/26 [10:34:24<5:54:30, 2363.43s/it]max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.9854525721722294
Perceptual loss: 0.058260552963474765
Steps: 1700


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 14,137 | Num Epochs = 16
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 1,700
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss
1601,0.2787
1602,0.2787
1603,0.2803
1604,0.2804
1605,0.2794
1606,0.2818
1607,0.2814
1608,0.2802
1609,0.2787
1610,0.281


Generating predictions...
Computing metrics...


 69%|██████▉   | 18/26 [11:13:48<5:15:10, 2363.78s/it]max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.9854271199644555
Perceptual loss: 0.05854733884916641
Steps: 1800


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 14,137 | Num Epochs = 17
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 1,800
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss
1701,0.2798
1702,0.2801
1703,0.2796
1704,0.2801
1705,0.2819
1706,0.2814
1707,0.2791
1708,0.2783
1709,0.2799
1710,0.2794


Generating predictions...
Computing metrics...


 73%|███████▎  | 19/26 [11:53:14<4:35:50, 2364.37s/it]max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.9865333394991467
Perceptual loss: 0.04973969887942076
Steps: 1900


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 14,137 | Num Epochs = 18
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 1,900
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss
1801,0.2798
1802,0.2795
1803,0.2789
1804,0.2795
1805,0.2814
1806,0.2799
1807,0.2784
1808,0.279
1809,0.2781
1810,0.2801


Step,Training Loss
1901,0.2784
1902,0.2802
1903,0.2801
1904,0.2829
1905,0.2783
1906,0.2789
1907,0.2802
1908,0.2813
1909,0.2797
1910,0.2793


Generating predictions...
Computing metrics...


 81%|████████  | 21/26 [13:12:05<3:17:04, 2364.93s/it]max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.9865333394991467
Perceptual loss: 0.04973969887942076
Steps: 2100


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 14,137 | Num Epochs = 20
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 2,100
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss
2001,0.2792
2002,0.2788
2003,0.2808
2004,0.2801
2005,0.28
2006,0.2793
2007,0.2775
2008,0.2781
2009,0.2781
2010,0.2795


Generating predictions...
Computing metrics...


 85%|████████▍ | 22/26 [13:51:32<2:37:41, 2365.48s/it]max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.9854307689005509
Perceptual loss: 0.05833438038825989
Steps: 2200


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 14,137 | Num Epochs = 20
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 2,200
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss
2101,0.2768
2102,0.2788
2103,0.2783
2104,0.2775
2105,0.2793
2106,0.2776
2107,0.2788
2108,0.2786
2109,0.2786
2110,0.2784


Generating predictions...
Computing metrics...


 88%|████████▊ | 23/26 [14:30:58<1:58:17, 2365.75s/it]max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.9854307689005509
Perceptual loss: 0.05833438038825989
Steps: 2300


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 14,137 | Num Epochs = 21
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 2,300
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss
2201,0.2771
2202,0.2775
2203,0.2783
2204,0.2794
2205,0.2773
2206,0.2779
2207,0.2794
2208,0.2779
2209,0.2787
2210,0.2782


Generating predictions...
Computing metrics...


 92%|█████████▏| 24/26 [15:10:27<1:18:53, 2366.62s/it]max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.9865333394991467
Perceptual loss: 0.04973969887942076
Steps: 2400


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 14,137 | Num Epochs = 22
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 2,400
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss
2301,0.2786
2302,0.2771
2303,0.279
2304,0.278
2305,0.2778
2306,0.2788
2307,0.2776
2308,0.2783
2309,0.279
2310,0.2783


Generating predictions...
Computing metrics...


 96%|█████████▌| 25/26 [15:49:54<39:26, 2366.72s/it]  max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.9865333394991467
Perceptual loss: 0.04973969887942076
Steps: 2500


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 14,137 | Num Epochs = 23
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 2,500
 "-____-"     Number of trainable parameters = 11,272,192


Step,Training Loss
2401,0.2777
2402,0.2767
2403,0.2779
2404,0.2787
2405,0.2789
2406,0.2795
2407,0.2786
2408,0.2783
2409,0.2748
2410,0.2779


Generating predictions...
Computing metrics...


100%|██████████| 26/26 [16:29:17<00:00, 2282.97s/it]

Similarity: 0.9854307689005509
Perceptual loss: 0.05833438038825989





In [8]:
# Spot-check: generate a response for one held-out example and print it next
# to the reference answer.
test_index = 0
text = filtered_data['test'][test_index]['svg']
model = FastLanguageModel.for_inference(model)
inputs = tokenizer(
[
    data_prompt.format(
        # input slot: the example's 'svg' field
        text,
        # response slot: left empty for the model to complete
        "",
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = 5020, use_cache = True)
answer=tokenizer.batch_decode(outputs)
# The decoded text includes the prompt; keep only what follows the response marker.
answer = answer[0].split("### Response:")[-1]

print(filtered_data['test'][test_index]['html'])
print("Answer of the question is:", answer)

{"FONT_SIZE4": "110%", "COLOR8": "#450a54", "COLOR7": "#d230ef", "SIZE3": "234px", "FONT_SIZE3": "4em", "COLOR6": "#1b38ab", "COLOR5": "#8f1c2b", "SIZE2": "69vw", "FONT_SIZE2": "19pt", "COLOR4": "#10c7a8", "COLOR3": "#6c597e", "FONT_SIZE1": "28px", "COLOR2": "#a213bf", "SIZE1": "66vh", "COLOR1": "#a8b6e5", "WORD4": "UNTIL", "WORD3": "APPLE", "WORD2": "THROW", "WORD1": "BEGAN"}
Answer of the question is: 
{"FONT_SIZE4": "110%", "COLOR8": "#450a54", "COLOR7": "#d230ef", "SIZE3": "234px", "FONT_SIZE3": "4em", "COLOR6": "#1b38ab", "COLOR5": "#8f1c2b", "SIZE2": "69vw", "FONT_SIZE2": "19pt", "COLOR4": "#10c7a8", "COLOR3": "#6c597e", "FONT_SIZE1": "28px", "COLOR2": "#a213bf", "SIZE1": "66vh", "COLOR1": "#a8b6e5", "WORD4": "UNTIL", "WORD3": "APPLE", "WORD2": "THROW", "WORD1": "BEGAN"}<|end_of_text|>


In [9]:
# Re-run the evaluation on the held-out rows. `steps` is whatever value the
# training loop above left behind, so the results are logged at that step.
test_prediction(model, filtered_data['test'], steps)

Generating predictions...
Computing metrics...
Similarity: 0.993928965172685
Perceptual loss: 0.03703510426566936
