# Llama 3.2 90B fine tuning with large data set

In [1]:
import unsloth
import os
import numpy as np
import pandas as pd

import torch
from trl import SFTTrainer
from transformers import TrainingArguments, TextStreamer, AutoTokenizer
from unsloth.chat_templates import get_chat_template
from unsloth import FastLanguageModel, FastVisionModel
from datasets import Dataset
from unsloth import is_bfloat16_supported
from unsloth import UnslothTrainer, UnslothTrainingArguments
from transformers import BitsAndBytesConfig

# Saving model
from transformers import AutoTokenizer, AutoModelForSequenceClassification

# Warnings
import warnings
warnings.filterwarnings("ignore")

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.
🦥 Unsloth Zoo will now patch everything to make training faster!


In [2]:
max_seq_length = 12000

In [3]:
import numpy as np
from utils.similarity import calculate_metrics
from torch.utils.tensorboard import SummaryWriter
from PIL import Image
import torch

log_dir = 'output/runs'

VIEWPORT_SIZES = {
    'DESKTOP': {'width': 1440, 'height': 900},
    'TABLET': {'width': 834, 'height': 1210},
    'MOBILE': {'width': 393, 'height': 852},
}

def add_image_to_tensorboard(name, step, img_path):
    image = Image.open(img_path)
    image = image.convert('RGB')
    image_array = np.array(image)
    image_tensor = torch.from_numpy(image_array)
    image_tensor = image_tensor.permute(2, 0, 1)
    image_tensor = image_tensor.float() / 255.0
    
    writer = SummaryWriter(log_dir=log_dir)
    writer.add_image(name, image_tensor, step)
    
def add_text_to_tensorboard(name, step, text):
    writer = SummaryWriter(log_dir=log_dir)
    writer.add_text(name, text, step)

def postprocess_text(preds, labels):
    preds = [pred.strip().replace('<unk>', '') for pred in preds]
    labels = [[label.strip().replace('<unk>', '')] for label in labels]

    return preds, labels

def compute_metrics(decoded_predictions, decoded_labels, steps):
    similarity_scores = []
    perceptual_losses = []
    index = 1
    
    for prediction, label in zip(decoded_predictions, decoded_labels):
        prediction = prediction.replace(tokenizer.eos_token, '')
        
        add_text_to_tensorboard(f'valid_{index}_label_text', steps, label)
        add_text_to_tensorboard(f'valid_{index}_prediction_text', steps, prediction)
        
        metrics = calculate_metrics(prediction, label, 
                                    VIEWPORT_SIZES['MOBILE']['width'], VIEWPORT_SIZES['MOBILE']['height'])
        
        if metrics is not None:
            similarity_scores.append(metrics['similarity'])
            perceptual_losses.append(metrics['perceptual_loss'])
            
            add_image_to_tensorboard(f'valid_{index}_expectation', steps, metrics['expected_screenshot_path'])
            add_image_to_tensorboard(f'valid_{index}_prediction', steps, metrics['predicted_screenshot_path'])
        
        index += 1

    results = {
        "similarity": float(np.mean(similarity_scores)),
        "perceptual_loss": float(np.mean(perceptual_losses)),
    }
    
    writer = SummaryWriter(log_dir=log_dir)
    writer.add_scalar('similarity', results['similarity'], steps)
    writer.add_scalar('perceptual_loss', results['perceptual_loss'], steps)
    
    print("Similarity:", results['similarity'])
    print("Perceptual loss:", results['perceptual_loss'])

    return results

def test_prediction(model, data, steps):
    answers = []
    labels = []
    print("Generating predictions...")
    for row in data:
        inputs = tokenizer(
        [
            data_prompt.format(
                #instructions
                row['svg'],
                #answer
                "",
            )
        ], return_tensors = "pt").to("cuda")
        
        outputs = model.generate(**inputs, max_new_tokens = max_seq_length, use_cache = True)
        answer = tokenizer.batch_decode(outputs)
        answers.append(answer[0].split("### Response:")[-1])
        labels.append(row['html'])

    print("Computing metrics...")
    return compute_metrics(answers, labels, steps)

In [4]:
from datasets import load_from_disk

filtered_data = load_from_disk('data-rb-large-filtered-' + str(max_seq_length))


filtered_data

Loading dataset from disk:   0%|          | 0/23 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['svg', 'html', 'text'],
        num_rows: 100502
    })
    valid: Dataset({
        features: ['svg', 'html', 'text'],
        num_rows: 4
    })
    test: Dataset({
        features: ['svg', 'html', 'text'],
        num_rows: 23
    })
})

In [5]:
# Doing some reshuffling because:
# 1. The validation data set is arguably too simple for the 90B model and it does a pretty good job even before fine-tuning
# 2. The test data set is too small because of the filtering

from datasets import concatenate_datasets, DatasetDict

new_valid_data = concatenate_datasets([
    filtered_data['valid'],
    filtered_data['train'].select(range(25))
])

new_test_data = concatenate_datasets([
    filtered_data['test'],
    filtered_data['train'].select(range(25, 52))
])

new_train_data = filtered_data['train'].select(range(25, len(filtered_data['train'])))

filtered_data = DatasetDict({
    'train': new_train_data,
    'valid': new_valid_data,
    'test': new_test_data
})

In [6]:
filtered_data

DatasetDict({
    train: Dataset({
        features: ['svg', 'html', 'text'],
        num_rows: 100477
    })
    valid: Dataset({
        features: ['svg', 'html', 'text'],
        num_rows: 29
    })
    test: Dataset({
        features: ['svg', 'html', 'text'],
        num_rows: 50
    })
})

In [7]:
data_prompt = """Your job is to take an SVG file of a web design and convert it into a pixel-perfect HTML and CSS markup and stylesheet.

### Input:
{}

### Response:
{}"""

In [8]:
from transformers import AutoModel

model, _ = FastVisionModel.from_pretrained('./output/final', 
                                        device_map="cuda:0",
                                        max_seq_length = max_seq_length,
                                        dtype = torch.float16,
                                        load_in_4bit = True,)

tokenizer = AutoTokenizer.from_pretrained("unsloth/Llama-3.2-90B-Vision-Instruct-bnb-4bit")

test_index = 0
text = filtered_data['test'][test_index]['svg']
model = FastVisionModel.for_inference(model)
inputs = tokenizer(
[
    data_prompt.format(
        #instructions
        text,
        #answer
        "",
    )
], return_tensors = "pt").to("cuda")

outputs = model.generate(**inputs, max_new_tokens = max_seq_length, use_cache = True)
answer=tokenizer.batch_decode(outputs)
answer = answer[0].split("### Response:")[-1]

print(filtered_data['test'][test_index]['svg'])
print(filtered_data['test'][test_index]['html'])
print("Answer of the question is:", answer)

==((====))==  Unsloth 2025.3.3: Fast Mllama vision patching. Transformers: 4.46.3.
   \\   /|    GPU: NVIDIA H100 NVL. Max memory: 93.111 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.5.1+cu124. CUDA: 9.0. CUDA Toolkit: 12.4. Triton: 3.1.0
\        /    Bfloat16 = TRUE. FA [Xformers = 0.0.28.post3. FA2 = False]
 "-____-"     Free Apache license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


Loading checkpoint shards:   0%|          | 0/10 [00:00<?, ?it/s]

<svg xmlns="http://www.w3.org/2000/svg" xmlns:xlink="http://www.w3.org/1999/xlink" width="393" height="852" viewBox="0 0 393 852"><g id="html1"><g data-tag="head" id="head1" data-z-index="auto" data-stacking-context="true" aria-owns="script1"><g data-tag="script" id="script1" data-z-index="auto" data-stacking-context="true"/></g><g data-tag="body" id="body1" data-z-index="auto" data-stacking-context="true" role="document" aria-owns="center1 hr1 center2 style1"><g data-tag="center" id="center1" data-z-index="auto" data-stacking-context="true" aria-owns="h11"><g data-tag="h1" id="h11" data-z-index="auto" data-stacking-context="true" role="heading" aria-level="1"><text color="rgb(0, 0, 0)" dominant-baseline="text-after-edge" font-family="&quot;Times New Roman&quot;" font-size="32px" font-size-adjust="none" font-stretch="100%" font-style="normal" font-variant="normal" font-weight="700" direction="ltr" letter-spacing="normal" text-decoration="none solid rgb(0, 0, 0)" text-anchor="start" tex

In [9]:
test_prediction(model, filtered_data['test'], 500)

Generating predictions...
Computing metrics...
Script failed with error: Traceback (most recent call last):
  File "/utils/similarity.py", line 192, in <module>
    take_screenshot(args.predicted_url, args.predicted_screenshot_path, args.viewport_width, args.viewport_height)
  File "/utils/similarity.py", line 97, in take_screenshot
    page.goto(url)
  File "/opt/conda/lib/python3.11/site-packages/playwright/sync_api/_generated.py", line 9018, in goto
    self._sync(
  File "/opt/conda/lib/python3.11/site-packages/playwright/_impl/_sync_base.py", line 115, in _sync
    return task.result()
           ^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/site-packages/playwright/_impl/_page.py", line 551, in goto
    return await self._main_frame.goto(**locals_to_params(locals()))
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/site-packages/playwright/_impl/_frame.py", line 145, in goto
    await self._channel.send("goto", locals_to_par

{'similarity': 0.9530310030860966, 'perceptual_loss': 0.30120282709522517}