# Llama 3.2 3B fine tuning with chopped data set

2025-03-03 16:01

Fixed deps. The chopped data set might not be as useful as I hoped. The perceptual loss goes down and then comes back up after the first epoch, indicating overfitting. The validation set is the manual test set. I have seen signs of potentially good screenshots around epoch 1, at least for some of the validation tests.

On the test set, however, the perceptual loss isn't too bad. It's possible that this is because many of these "webpages" are too simple and not styled at all. I suggest running it with an actual data set of full web pages.

In [1]:
import unsloth
import os
import numpy as np
import pandas as pd

import torch
from trl import SFTTrainer
from transformers import TrainingArguments, TextStreamer
from unsloth.chat_templates import get_chat_template
from unsloth import FastLanguageModel
from datasets import Dataset
from unsloth import is_bfloat16_supported

# Saving model
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Warnings
import warnings
warnings.filterwarnings("ignore")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
# Sequence-length budget (in tokens) handed to the model loader; other cells
# in this notebook read the same global.
max_seq_length = 7500


def load_model():
    """Load the 4-bit Llama 3.2 3B checkpoint and wrap it with LoRA adapters.

    Returns:
        tuple: ``(model, tokenizer)`` where ``model`` is the object returned
        by ``FastLanguageModel.get_peft_model`` and ``tokenizer`` comes from
        ``FastLanguageModel.from_pretrained``.
    """
    base_options = dict(
        model_name="unsloth/Llama-3.2-3B-unsloth-bnb-4bit",
        max_seq_length=max_seq_length,
        load_in_4bit=True,
        dtype=None,
    )
    base_model, tokenizer = FastLanguageModel.from_pretrained(**base_options)

    # Adapters are attached to every attention and MLP projection.
    lora_options = dict(
        r=16,
        lora_alpha=16,
        lora_dropout=0,
        target_modules=["q_proj", "k_proj", "v_proj", "up_proj", "down_proj", "o_proj", "gate_proj"],
        use_rslora=True,
        use_gradient_checkpointing="unsloth",
        random_state=32,
        loftq_config=None,
    )
    peft_model = FastLanguageModel.get_peft_model(base_model, **lora_options)

    return peft_model, tokenizer

In [3]:
def create_trainer(model, tokenizer, training_data, max_steps=None):
    """Build an ``SFTTrainer`` for fine-tuning on pre-formatted prompt text.

    Args:
        model: Model to fine-tune.
        tokenizer: Tokenizer paired with ``model``.
        training_data: Dataset exposing a ``"text"`` column with the full
            training string for each example.
        max_steps: Total number of training steps. ``None`` (the default)
            leaves the run length to ``num_train_epochs``.

    Returns:
        SFTTrainer: A configured trainer; nothing is trained until its
        ``train()`` method is called.
    """
    training_arguments = TrainingArguments(
        learning_rate=3e-4,
        lr_scheduler_type="linear",
        per_device_train_batch_size=8,
        gradient_accumulation_steps=16,
        num_train_epochs=40,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        # Passed through the constructor (instead of being patched on
        # afterwards) so the arguments object is complete once built.
        # -1 is the TrainingArguments default: "use num_train_epochs".
        max_steps=-1 if max_steps is None else max_steps,
        optim="adamw_8bit",
        weight_decay=0.01,
        warmup_steps=150,
        output_dir="output",
        seed=0,
        save_total_limit=3,
    )

    return SFTTrainer(
        model=model,
        tokenizer=tokenizer,
        train_dataset=training_data,
        dataset_text_field="text",
        max_seq_length=max_seq_length,
        dataset_num_proc=10,
        packing=True,
        args=training_arguments,
    )

In [4]:
import numpy as np
from utils.similarity import calculate_metrics
from torch.utils.tensorboard import SummaryWriter
from PIL import Image
import torch

# Directory the TensorBoard helpers in this cell pass to SummaryWriter.
log_dir = 'output/runs'

def add_image_to_tensorboard(name, step, img_path):
    """Log the image file at ``img_path`` to TensorBoard.

    Args:
        name: TensorBoard tag for the image.
        step: Global step value the image is recorded under.
        img_path: Path of the image file to read.
    """
    # Open inside a context manager so the file handle is released as soon as
    # the pixels have been copied into the array.
    with Image.open(img_path) as image:
        image_array = np.array(image.convert('RGB'))

    # (H, W, 3) 8-bit pixels -> (3, H, W) floats scaled to [0, 1].
    image_tensor = torch.from_numpy(image_array).permute(2, 0, 1).float() / 255.0

    # Leaving the `with` block closes the writer, which flushes the event file
    # instead of leaving an open writer behind on every call.
    with SummaryWriter(log_dir=log_dir) as writer:
        writer.add_image(name, image_tensor, step)
    
def add_text_to_tensorboard(name, step, text):
    """Log a text snippet to TensorBoard.

    Args:
        name: TensorBoard tag for the text.
        step: Global step value the text is recorded under.
        text: The string to record.
    """
    # Leaving the `with` block closes the writer, which flushes the event file
    # instead of leaving an open writer behind on every call.
    with SummaryWriter(log_dir=log_dir) as writer:
        writer.add_text(name, text, step)

def postprocess_text(preds, labels):
    """Trim whitespace and remove ``<unk>`` markers from predictions and labels.

    Args:
        preds: Iterable of predicted strings.
        labels: Iterable of reference strings.

    Returns:
        tuple: ``(cleaned_preds, wrapped_labels)``. ``cleaned_preds`` is a flat
        list of strings; each cleaned label is wrapped in its own
        single-element list.
    """
    def _scrub(text):
        # Strip first, then drop the marker: whitespace that sat next to a
        # removed "<unk>" is intentionally left in place.
        return text.strip().replace('<unk>', '')

    cleaned_preds = list(map(_scrub, preds))
    wrapped_labels = [[_scrub(label)] for label in labels]
    return cleaned_preds, wrapped_labels

def compute_metrics(decoded_predictions, decoded_labels, steps):
    """Score generated markup against reference markup and log to TensorBoard.

    For every prediction/label pair the raw texts are logged, the pair is
    passed to ``calculate_metrics``, and, when that returns a result, its
    scores are collected and both screenshots are logged.

    Args:
        decoded_predictions: Iterable of generated strings.
        decoded_labels: Iterable of reference strings, aligned with
            ``decoded_predictions``.
        steps: Global step used for every TensorBoard record.

    Returns:
        dict: ``{"similarity": float, "perceptual_loss": float}`` holding the
        mean over all scored pairs. Both values are NaN when no pair could
        be scored.
    """
    similarity_scores = []
    perceptual_losses = []

    # Tags are numbered from 1 and always carry the "valid_" prefix, whichever
    # split the caller passes in.
    for index, (prediction, label) in enumerate(zip(decoded_predictions, decoded_labels), start=1):
        prediction = prediction.replace(tokenizer.eos_token, '')

        add_text_to_tensorboard(f'valid_{index}_label_text', steps, label)
        add_text_to_tensorboard(f'valid_{index}_prediction_text', steps, prediction)

        metrics = calculate_metrics(prediction, label)

        # calculate_metrics may return None (presumably when the markup could
        # not be rendered or compared -- confirm in utils.similarity); such
        # pairs are left out of the averages.
        if metrics is None:
            continue

        similarity_scores.append(metrics['similarity'])
        perceptual_losses.append(metrics['perceptual_loss'])

        add_image_to_tensorboard(f'valid_{index}_expectation', steps, metrics['expected_screenshot_path'])
        add_image_to_tensorboard(f'valid_{index}_prediction', steps, metrics['predicted_screenshot_path'])

    if not similarity_scores:
        # Be explicit instead of relying on np.mean([]) quietly producing NaN
        # (the notebook silences the accompanying warning).
        print("No prediction could be scored; reporting NaN metrics.")
        return {"similarity": float("nan"), "perceptual_loss": float("nan")}

    results = {
        "similarity": float(np.mean(similarity_scores)),
        "perceptual_loss": float(np.mean(perceptual_losses)),
    }

    # Leaving the `with` block closes the writer and flushes the scalars.
    with SummaryWriter(log_dir=log_dir) as writer:
        writer.add_scalar('similarity', results['similarity'], steps)
        writer.add_scalar('perceptual_loss', results['perceptual_loss'], steps)

    print("Similarity:", results['similarity'])
    print("Perceptual loss:", results['perceptual_loss'])

    return results

def test_prediction(model, data, steps):
    """Generate a response for every row of ``data`` and score the results.

    Args:
        model: Model used for generation.
        data: Iterable of rows exposing ``'svg'`` (prompt input) and
            ``'html'`` (reference output).
        steps: Step value forwarded to ``compute_metrics``.

    Returns:
        Whatever ``compute_metrics`` returns for the generated answers.
    """
    print("Generating predictions...")
    answers, labels = [], []

    for row in data:
        # The response slot stays empty so the model has to fill it in.
        prompt = data_prompt.format(row['svg'], "")
        inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

        outputs = model.generate(**inputs, max_new_tokens=max_seq_length, use_cache=True)
        decoded = tokenizer.batch_decode(outputs)

        # The decoded text still contains the prompt; keep only what follows
        # the last response marker.
        answers.append(decoded[0].split("### Response:")[-1])
        labels.append(row['html'])

    print("Computing metrics...")
    return compute_metrics(answers, labels, steps)

In [5]:
# Start from a clean slate: "output" is the trainer's output_dir and also
# holds the TensorBoard logs (output/runs).
!rm -rf output

In [7]:
# Install the archive tools needed to unpack the downloads below.
!apt install zip -y
# Fetch the chopped training data into a fresh data-rb-chopped directory.
# NOTE(review): the archive is saved as model.zip although it holds a data set,
# and the URL ends in `dl=11`, which looks like a typo for `dl=1` -- confirm.
!rm -rf data-rb-chopped
!mkdir -p data-rb-chopped
!wget "https://www.dropbox.com/scl/fi/hsqsp79okuob4u63oj7j3/data-rb-chopped.zip?rlkey=ey3a4ap5h6v9mcaava1bps52n&dl=11" -O model.zip
!unzip model.zip -d data-rb-chopped

# Fetch the validation data into a fresh data-rb-validate directory.
!rm -rf data-rb-validate
!mkdir -p data-rb-validate
!wget "https://www.dropbox.com/scl/fi/5szml8y5l248mcabj9rqg/verify-dataset.zip?rlkey=se33rwtxgngn0ts1i0pc8f6wk&st=1d68x9zt&dl=1" -O validate.zip
!unzip validate.zip -d data-rb-validate

Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following additional packages will be installed:
  unzip
The following NEW packages will be installed:
  unzip zip
0 upgraded, 2 newly installed, 0 to remove and 36 not upgraded.
Need to get 350 kB of archives.
After this operation, 930 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 unzip amd64 6.0-26ubuntu3.2 [175 kB]
Get:2 http://archive.ubuntu.com/ubuntu jammy/main amd64 zip amd64 3.0-12build2 [176 kB]
Fetched 350 kB in 1s (673 kB/s)m
debconf: delaying package configuration, since apt-utils is not installed

7[0;23r8[1ASelecting previously unselected package unzip.
(Reading database ... 36713 files and directories currently installed.)
Preparing to unpack .../unzip_6.0-26ubuntu3.2_amd64.deb ...
7[24;0f[42m[30mProgress: [  0%][49m[39m [..........................................................] 87[24;0f[42m[30mP

In [8]:
from datasets import load_from_disk, DatasetDict
# Raw inputs: the chopped SVG/HTML pairs and the separate validation set.
# The raw data gets its own name so `dataset` only ever refers to the final
# DatasetDict.
raw_dataset = load_from_disk('data-rb-chopped')
visual_validation_dataset = load_from_disk('data-rb-validate')

# Hold out exactly 100 examples for testing. An integer test_size is an
# absolute count (no float rounding), and the fixed seed keeps the held-out
# examples identical across runs.
train_test_split = raw_dataset.train_test_split(test_size=100, seed=0)
train_dataset = train_test_split["train"]
temp_dataset = train_test_split["test"]

dataset = DatasetDict({
    "train": train_dataset,
    "valid": visual_validation_dataset,
    "test": temp_dataset,
})

In [14]:
# Show the splits with their columns and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['svg', 'html'],
        num_rows: 214712
    })
    valid: Dataset({
        features: ['svg', 'html'],
        num_rows: 5
    })
    test: Dataset({
        features: ['svg', 'html'],
        num_rows: 100
    })
})

In [6]:
# Load the model and tokenizer via load_model() defined earlier.
model, tokenizer = load_model()

# Prompt template with two positional slots: the SVG input and the response.
# The response slot receives the reference HTML for training examples and an
# empty string when generating.
data_prompt = """Your job is to take an SVG file of a web design and convert it into a pixel-perfect HTML and CSS markup and stylesheet.

### Input:
{}

### Response:
{}"""

# End-of-sequence marker appended to every formatted training example.
EOS_TOKEN = tokenizer.eos_token
def formatting_prompt(examples):
    """Render a batch of SVG/HTML pairs into full training strings.

    Args:
        examples: Batch mapping with parallel ``"svg"`` and ``"html"`` lists.

    Returns:
        dict: ``{"text": [...]}`` with one string per pair: the prompt
        template filled with both values, followed by ``EOS_TOKEN``.
    """
    svg_batch = examples["svg"]
    html_batch = examples["html"]
    rendered = [
        data_prompt.format(svg, html) + EOS_TOKEN
        for svg, html in zip(svg_batch, html_batch)
    ]
    return {"text": rendered}



==((====))==  Unsloth 2025.3.1: Fast Llama patching. Transformers: 4.46.3.
   \\   /|    GPU: NVIDIA H100 NVL. Max memory: 93.003 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 9.0. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Unsloth 2025.3.1 patched 28 layers with 28 QKV layers, 28 O layers and 28 MLP layers.


In [19]:
# Add the rendered "text" column to every split. Despite its name,
# `training_data` holds the train, valid and test splits.
training_data = dataset.map(formatting_prompt, batched=True)

Map:   0%|          | 0/214712 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

In [20]:
# Confirm that every split now carries the "text" column.
training_data

DatasetDict({
    train: Dataset({
        features: ['svg', 'html', 'text'],
        num_rows: 214712
    })
    valid: Dataset({
        features: ['svg', 'html', 'text'],
        num_rows: 5
    })
    test: Dataset({
        features: ['svg', 'html', 'text'],
        num_rows: 100
    })
})

In [21]:
def get_token_lengths(examples):
    """Tokenize a batch of ``text`` entries and report their token counts.

    Nothing is truncated or padded, so the reported ``length`` reflects the
    full size of each example.

    Args:
        examples: Batch mapping with a ``'text'`` list.

    Returns:
        The tokenizer output for the batch, including a ``length`` entry.
    """
    tokenizer_options = {
        "truncation": False,   # keep the full sequence
        "padding": False,      # no padding, lengths stay exact
        "return_length": True,
    }
    return tokenizer(examples['text'], **tokenizer_options)

def filter_function(example):
    """Return True when the example's token count fits in ``max_seq_length``."""
    token_count = example['length']
    return max_seq_length >= token_count


# Measure every example, then drop the ones that exceed the length budget.
# The filter runs on all splits, so valid and test shrink as well.
tokenized_data = training_data.map(get_token_lengths, batched=True)
filtered_data = tokenized_data.filter(filter_function)

print(filtered_data)

Map:   0%|          | 0/214712 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Filter:   0%|          | 0/214712 [00:00<?, ? examples/s]

Filter:   0%|          | 0/5 [00:00<?, ? examples/s]

Filter:   0%|          | 0/100 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['svg', 'html', 'text', 'input_ids', 'attention_mask', 'length'],
        num_rows: 120211
    })
    valid: Dataset({
        features: ['svg', 'html', 'text', 'input_ids', 'attention_mask', 'length'],
        num_rows: 4
    })
    test: Dataset({
        features: ['svg', 'html', 'text', 'input_ids', 'attention_mask', 'length'],
        num_rows: 59
    })
})


In [22]:
# The tokenizer columns were only needed for filtering; drop them before
# persisting so the saved data keeps just the svg/html/text columns.
tokenizer_columns = ["input_ids", "attention_mask", "length"]
filtered_data = filtered_data.remove_columns(tokenizer_columns)

# The length budget is part of the directory name.
filtered_data.save_to_disk(f'data-rb-chopped-filtered-{max_seq_length}')

Saving the dataset (0/17 shards):   0%|          | 0/120211 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/4 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/59 [00:00<?, ? examples/s]

In [7]:
from datasets import load_from_disk

# Reload the previously saved, length-filtered splits so the tokenize/filter
# cells do not have to be re-run.
filtered_data_dir = f'data-rb-chopped-filtered-{max_seq_length}'
filtered_data = load_from_disk(filtered_data_dir)

filtered_data

Loading dataset from disk:   0%|          | 0/17 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['svg', 'html', 'text'],
        num_rows: 120211
    })
    valid: Dataset({
        features: ['svg', 'html', 'text'],
        num_rows: 4
    })
    test: Dataset({
        features: ['svg', 'html', 'text'],
        num_rows: 59
    })
})

In [8]:
import torch
from tqdm import tqdm

# False until the first training chunk has run; afterwards every chunk
# continues from the latest checkpoint.
resume = False

# Alternate between training up to `steps` total steps and scoring the
# validation split. The first pass (steps == 0) only scores the untrained model.
#
# NOTE(review): every chunk builds a new trainer whose max_steps is the running
# total, so the data is re-prepared each time and the linear learning-rate
# schedule is presumably re-derived for each new total -- confirm the
# effective learning-rate curve in the training logs.
for steps in tqdm(range(0, 1500, 50)):
    print(f"Steps: {steps}")

    if steps != 0:
        trainer = create_trainer(model, tokenizer, filtered_data['train'], steps)
        train_kwargs = {"resume_from_checkpoint": True} if resume else {}
        trainer.train(**train_kwargs)
        resume = True

    model = FastLanguageModel.for_inference(model)
    results = test_prediction(model, filtered_data['valid'], steps)

    # Stop early once the validation renders match the references exactly.
    reached_zero_loss = results is not None and results['perceptual_loss'] == 0.0
    if reached_zero_loss:
        break

    model = FastLanguageModel.for_training(model)

    

  0%|          | 0/30 [00:00<?, ?it/s]

Steps: 0
Generating predictions...
Computing metrics...


  3%|▎         | 1/30 [03:15<1:34:28, 195.45s/it]

Similarity: 0.5739374347031116
Perceptual loss: 0.5538427978754044
Steps: 50


Generating train split: 0 examples [00:00, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 54,787 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 50
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
1,0.9385
2,0.9021
3,0.9551
4,1.0136
5,1.0039
6,0.9468
7,0.9276
8,1.0059
9,1.0442
10,0.9584


Generating predictions...
Computing metrics...


  7%|▋         | 2/30 [1:29:45<24:22:22, 3133.66s/it]

Similarity: 0.3827984225004911
Perceptual loss: 0.7434577792882919
Steps: 100


Generating train split: 0 examples [00:00, ? examples/s]

max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 54,787 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 100
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
51,0.624
52,0.5875
53,0.6303
54,0.5923
55,0.5941
56,0.5633
57,0.5689
58,0.5671
59,0.5419
60,0.5187


Generating predictions...
Computing metrics...


 10%|█         | 3/30 [2:54:53<30:15:43, 4034.96s/it]

Similarity: 0.38714014291763305
Perceptual loss: 0.6896161139011383
Steps: 150


max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 54,787 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 150
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
101,0.3738
102,0.3803
103,0.3849
104,0.4064
105,0.3951
106,0.3684
107,0.3737
108,0.3943
109,0.3587
110,0.3718


Generating predictions...
Computing metrics...


max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.6230820212687832
Perceptual loss: 0.6282994747161865
Steps: 200


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 54,787 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 200
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
151,0.3299
152,0.3038
153,0.3636
154,0.3121
155,0.3503
156,0.3383
157,0.3475
158,0.3556
159,0.3233
160,0.3094


Generating predictions...
Computing metrics...


max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.4772056562826037
Perceptual loss: 0.6732154190540314
Steps: 250


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 54,787 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 250
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
201,0.246
202,0.2753
203,0.2597
204,0.271
205,0.2979
206,0.2721
207,0.3104
208,0.3038
209,0.2784
210,0.2671


Generating predictions...
Computing metrics...


 20%|██        | 6/30 [6:53:37<30:22:34, 4556.45s/it]

Similarity: 0.5514663877844668
Perceptual loss: 0.5249455897137523
Steps: 300


max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 54,787 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 300
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
251,0.2457
252,0.2151
253,0.2763
254,0.2711
255,0.2559
256,0.2736
257,0.2568
258,0.2632
259,0.2448
260,0.2321


Generating predictions...
Computing metrics...


 23%|██▎       | 7/30 [8:10:14<29:11:38, 4569.51s/it]

Similarity: 0.5551341425726435
Perceptual loss: 0.5178780211135745
Steps: 350


max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 54,787 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 350
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
301,0.2394
302,0.2534
303,0.2557
304,0.2336
305,0.2396
306,0.2259
307,0.2615
308,0.2418
309,0.231
310,0.2317


Generating predictions...
Computing metrics...


 27%|██▋       | 8/30 [9:29:05<28:14:22, 4621.01s/it]

Similarity: 0.7102779510125401
Perceptual loss: 0.5625910349190235
Steps: 400


max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 54,787 | Num Epochs = 1
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 400
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
351,0.2655
352,0.2358
353,0.2401
354,0.2293
355,0.2422
356,0.2375
357,0.2339
358,0.2632
359,0.2525
360,0.225


Generating predictions...
Computing metrics...


max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.8451267016267593
Perceptual loss: 0.4153260560706258
Steps: 450


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 54,787 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 450
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
401,0.2217
402,0.2164
403,0.2209
404,0.2252
405,0.2335
406,0.2202
407,0.2289
408,0.2368
409,0.2214
410,0.2222


Generating predictions...
Computing metrics...


 33%|███▎      | 10/30 [12:00:15<25:23:17, 4569.86s/it]

Similarity: 0.6221776869282827
Perceptual loss: 0.5600565690547228
Steps: 500


max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 54,787 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 500
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
451,0.2236
452,0.1961
453,0.2096
454,0.2133
455,0.2006
456,0.2142
457,0.2089
458,0.2398
459,0.2109
460,0.213


Generating predictions...
Computing metrics...


 37%|███▋      | 11/30 [13:20:03<24:28:19, 4636.80s/it]

Similarity: 0.8330716487716245
Perceptual loss: 0.38403721805661917
Steps: 550


max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 54,787 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 550
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
501,0.2343
502,0.2086
503,0.2179
504,0.2521
505,0.1993
506,0.2283
507,0.2052
508,0.1944
509,0.2248
510,0.1912


Generating predictions...
Computing metrics...


 40%|████      | 12/30 [14:37:28<23:11:46, 4639.24s/it]

Similarity: 0.5518277260771811
Perceptual loss: 0.5489231944084167
Steps: 600


max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 54,787 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 600
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
551,0.2615
552,0.1959
553,0.2172
554,0.2006
555,0.1838
556,0.2048
557,0.2095
558,0.2135
559,0.1972
560,0.1927


Generating predictions...
Computing metrics...


max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.9389215500352293
Perceptual loss: 0.3270674031227827
Steps: 650


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 54,787 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 650
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
601,0.1983
602,0.2178
603,0.1962
604,0.1974
605,0.1998
606,0.2191
607,0.2106
608,0.2087
609,0.2016
610,0.1925


Generating predictions...
Computing metrics...


 47%|████▋     | 14/30 [17:14:47<20:50:33, 4689.62s/it]

Similarity: 0.7781338190396461
Perceptual loss: 0.4890983998775482
Steps: 700


max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 54,787 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 700
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
651,0.1904
652,0.2014
653,0.204
654,0.2072
655,0.1807
656,0.1992
657,0.1896
658,0.2109
659,0.1967
660,0.1824


Generating predictions...
Computing metrics...


max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.6294065007865357
Perceptual loss: 0.5716382376849651
Steps: 750


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 54,787 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 750
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
701,0.1826
702,0.1975
703,0.1882
704,0.2035
705,0.1947
706,0.1922
707,0.2067
708,0.1842
709,0.1849
710,0.1836


Generating predictions...
Computing metrics...


max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.3933006390929222
Perceptual loss: 0.6978883594274521
Steps: 800


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 54,787 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 800
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
751,0.1792
752,0.1931
753,0.1897
754,0.1969
755,0.1962
756,0.177
757,0.1786
758,0.185
759,0.1832
760,0.1917


Generating predictions...
Computing metrics...


max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.5413852951664011
Perceptual loss: 0.6129406988620758
Steps: 850


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 54,787 | Num Epochs = 2
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 850
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
801,0.1666
802,0.1786
803,0.1863
804,0.1934
805,0.178
806,0.2078
807,0.1659
808,0.2062
809,0.1948
810,0.1943


Generating predictions...
Computing metrics...


max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.4756374711170792
Perceptual loss: 0.6539374589920044
Steps: 900


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 54,787 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 900
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
851,0.1859
852,0.1763
853,0.1911
854,0.1939
855,0.1791
856,0.1725
857,0.3753
858,0.2004
859,0.1708
860,0.1937


Generating predictions...
Computing metrics...


max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.5732909811733066
Perceptual loss: 0.5055544710485265
Steps: 950


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 54,787 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 950
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
901,0.1828
902,0.1694
903,0.1776
904,0.1763
905,0.1771
906,0.208
907,0.1892
908,0.1846
909,0.1626
910,0.1776


Generating predictions...
Computing metrics...


max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.5736369788646698
Perceptual loss: 0.5335205942392349
Steps: 1000


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 54,787 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 1,000
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
951,0.1712
952,0.1654
953,0.1794
954,0.1772
955,0.1703
956,0.1892
957,0.1764
958,0.1882
959,0.1791
960,0.1646


Generating predictions...
Computing metrics...


max_steps is given, it will override any value given in num_train_epochs


Similarity: 0.8183461479381093
Perceptual loss: 0.4401044137775898
Steps: 1050


==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 54,787 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 1,050
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
1001,0.187
1002,0.1815
1003,0.1871
1004,0.1757
1005,0.1589
1006,0.2022
1007,0.1534
1008,0.1824
1009,0.1626
1010,0.1738


Generating predictions...
Computing metrics...


 73%|███████▎  | 22/30 [27:42:57<10:23:45, 4678.19s/it]

Similarity: 0.3849829245358705
Perceptual loss: 0.6867330968379974
Steps: 1100


max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 54,787 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 1,100
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
1051,0.164
1052,0.1541
1053,0.1583
1054,0.1578
1055,0.1775
1056,0.1669
1057,0.2012
1058,0.1622
1059,0.1788
1060,0.1721


Generating predictions...
Computing metrics...


 77%|███████▋  | 23/30 [29:04:21<9:12:59, 4739.89s/it] 

Similarity: 0.3996595917269587
Perceptual loss: 0.6767501384019852
Steps: 1150


max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 54,787 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 1,150
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
1101,0.1522
1102,0.1837
1103,0.1523
1104,0.19
1105,0.1729
1106,0.1875
1107,0.1565
1108,0.1632
1109,0.1587
1110,0.165


Generating predictions...
Computing metrics...


 80%|████████  | 24/30 [30:21:26<7:50:31, 4705.33s/it]

Similarity: 0.4684059005230665
Perceptual loss: 0.6253143101930618
Steps: 1200


max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 54,787 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 1,200
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
1151,0.1601
1152,0.1607
1153,0.1828
1154,0.1621
1155,0.1633
1156,0.1521
1157,0.1622
1158,0.1616
1159,0.161
1160,0.1864


Generating predictions...
Computing metrics...


 83%|████████▎ | 25/30 [31:39:40<6:31:50, 4702.14s/it]

Similarity: 0.39499870557337996
Perceptual loss: 0.7042859643697739
Steps: 1250


max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 54,787 | Num Epochs = 3
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 1,250
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
1201,0.1711
1202,0.1988
1203,0.1668
1204,0.1685
1205,0.1652
1206,0.1642
1207,0.16
1208,0.1538
1209,0.1575
1210,0.1676


Generating predictions...
Computing metrics...


 87%|████████▋ | 26/30 [32:54:30<5:09:13, 4638.40s/it]

Similarity: 0.3839351836591959
Perceptual loss: 0.6893871426582336
Steps: 1300


max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 54,787 | Num Epochs = 4
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 1,300
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
1251,0.1659
1252,0.1632
1253,0.1683
1254,0.1513
1255,0.1711
1256,0.1607
1257,0.1721
1258,0.1574
1259,0.165
1260,0.1904


Generating predictions...
Computing metrics...


 90%|█████████ | 27/30 [34:12:13<3:52:17, 4645.74s/it]

Similarity: 0.4291782978922129
Perceptual loss: 0.6770193874835968
Steps: 1350


max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 54,787 | Num Epochs = 4
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 1,350
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
1301,0.1698
1302,0.1651
1303,0.1693
1304,0.1703
1305,0.1525
1306,0.1718
1307,0.166
1308,0.1667
1309,0.1553
1310,0.1564


Generating predictions...
Computing metrics...


 93%|█████████▎| 28/30 [35:33:43<2:37:17, 4718.92s/it]

Similarity: 0.3935340289026498
Perceptual loss: 0.6850200891494751
Steps: 1400


max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 54,787 | Num Epochs = 4
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 1,400
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
1351,0.1527
1352,0.1574
1353,0.1458
1354,0.1639
1355,0.1548
1356,0.1727
1357,0.1685
1358,0.1621
1359,0.1594
1360,0.1467


Generating predictions...
Computing metrics...


 97%|█████████▋| 29/30 [36:55:13<1:19:30, 4770.39s/it]

Similarity: 0.4698920926079154
Perceptual loss: 0.6358620822429657
Steps: 1450


max_steps is given, it will override any value given in num_train_epochs
==((====))==  Unsloth - 2x faster free finetuning | Num GPUs = 1
   \\   /|    Num examples = 54,787 | Num Epochs = 4
O^O/ \_/ \    Batch size per device = 8 | Gradient Accumulation steps = 16
\        /    Total batch size = 128 | Total steps = 1,450
 "-____-"     Number of trainable parameters = 24,313,856


Step,Training Loss
1401,0.1744
1402,0.1461
1403,0.1422
1404,0.1521
1405,0.1475
1406,0.1733
1407,0.1774
1408,0.1549
1409,0.1531
1410,0.1688


Generating predictions...
Computing metrics...


100%|██████████| 30/30 [38:10:37<00:00, 4581.25s/it]  

Similarity: 0.7088590345345437
Perceptual loss: 0.43334976583719254





In [11]:
# Generate markup for one test example and print it next to its input and
# reference output for manual inspection.
test_index = 0
test_row = filtered_data['test'][test_index]
text = test_row['svg']

model = FastLanguageModel.for_inference(model)

# The response slot stays empty so the model has to fill it in.
prompt = data_prompt.format(text, "")
inputs = tokenizer([prompt], return_tensors="pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens=max_seq_length, use_cache=True)
decoded = tokenizer.batch_decode(outputs)
# The decoded text still contains the prompt; keep only the generated part.
answer = decoded[0].split("### Response:")[-1]

print(test_row['svg'])
print(test_row['html'])
print("Answer of the question is:", answer)

<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="401" height="868" viewBox="0 0 401 868"><g id="html1"><g data-tag="head" id="head1" data-z-index="auto" data-stacking-context="true" aria-owns="script1"><g data-tag="script" id="script1" data-z-index="auto" data-stacking-context="true"/></g><g data-tag="body" id="body1" data-z-index="auto" data-stacking-context="true" role="document" mask="url(#mask-for-body11)" aria-owns="p1 style1"><mask id="mask-for-body11"><rect width="401" height="868" x="8" y="8" fill="#ffffff"/></mask><g data-tag="p" id="p1" data-z-index="auto" data-stacking-context="true"><text color="rgb(0, 0, 0)" dominant-baseline="text-after-edge" font-family="&quot;Times New Roman&quot;" font-size="16px" font-size-adjust="none" font-stretch="100%" font-style="normal" font-variant="normal" font-weight="400" direction="ltr" letter-spacing="normal" text-decoration="none solid rgb(0, 0, 0)" text-anchor="start" text-rendering="auto" unicode

In [10]:
# Score the held-out test split. `steps` is the variable left behind by the
# training loop, so this cell only works after that loop has run; `steps+1`
# records the results one step after the last validation pass. The TensorBoard
# tags still carry the "valid_" prefix used by compute_metrics.
test_prediction(model, filtered_data['test'], steps+1)

Generating predictions...
Computing metrics...
Similarity: 0.9519440559470719
Perceptual loss: 0.09252261793560554


{'similarity': 0.9519440559470719, 'perceptual_loss': 0.09252261793560554}