In [1]:
from torchvision import transforms
import numpy as np
import os
import sys
sys.path.append(os.path.abspath(".."))  
from models.ui_dataset import UIDataset
from torch.utils.data import random_split, DataLoader
import torch
import wandb
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score, recall_score, precision_score
from wandb.integration.xgboost import WandbCallback
from transformers import AutoModel, AutoProcessor
from transformers.image_utils import load_image
from PIL import Image
from tqdm import tqdm

In [7]:
# Load the vision-language checkpoint and its processor from the SAME repo id.
# The processor was previously pulled from "google/siglip-base-patch16-256",
# a different checkpoint than the model weights; reusing `ckpt` keeps the
# preprocessing configuration paired with the model that consumes it.
ckpt = "google/siglip2-base-patch16-256"
model = AutoModel.from_pretrained(ckpt, device_map="auto").eval()
processor = AutoProcessor.from_pretrained(ckpt)


preprocessor_config.json:   0%|          | 0.00/368 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/711 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/798k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/409 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.40M [00:00<?, ?B/s]

In [8]:
# Choose the compute device by preference: CUDA, then Apple MPS, then CPU.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"

In [9]:
# Dataset root. It can be overridden with the UI_DATASET_DIR environment
# variable so the notebook is not tied to one machine; the original location
# stays as the default.
dataset_path = os.environ.get(
    "UI_DATASET_DIR", "/Users/yuriivoievidka/UCU/CV/cv-project/data"
)

ui_dataset_all = UIDataset(root_dir=dataset_path, processor=processor)
# shuffle=False keeps batches in dataset order; the labels built further down
# are read from `image_paths` and rely on that order.
# NOTE(review): assumes dataset index i corresponds to image_paths[i] - confirm in UIDataset.
ui_dataset_all_dataloader = DataLoader(ui_dataset_all, batch_size=32, shuffle=False)

# The model was loaded with device_map="auto", so send inputs to wherever its
# weights were placed instead of to an independently chosen device.
model_device = model.device

all_embeddings = []
with torch.no_grad():
    # The labels yielded by the loader are not used here.
    for images, _batch_labels in tqdm(ui_dataset_all_dataloader):
        embeddings = model.get_image_features(pixel_values=images.to(model_device))

        # Scale every embedding to unit L2 norm along the feature axis.
        embeddings = embeddings / embeddings.norm(dim=-1, keepdim=True)
        all_embeddings.append(embeddings.cpu())

all_embeddings = torch.cat(all_embeddings, dim=0)

print("Embeddings shape:", all_embeddings.shape)

# Convert embeddings to numpy
all_embeddings_np = all_embeddings.numpy()

# Encode labels as integers. The distinct labels are sorted first: enumerating a
# bare set gives an arbitrary order that can change between kernel runs, which
# would silently change what each integer in the saved label file means.
class_names = sorted({label for _, label in ui_dataset_all.image_paths})
label_to_index = {label: idx for idx, label in enumerate(class_names)}
all_labels_np = np.array([label_to_index[label] for _, label in ui_dataset_all.image_paths])

# Fail loudly rather than save misaligned arrays.
assert len(all_labels_np) == len(all_embeddings_np), "embeddings/labels length mismatch"

# Save embeddings and labels
np.save("siglip_embeddings.npy", all_embeddings_np)
np.save("siglip_labels.npy", all_labels_np)

  0%|          | 0/130 [00:00<?, ?it/s]

100%|██████████| 130/130 [02:47<00:00,  1.28s/it]

Embeddings shape: torch.Size([4149, 768])





In [None]:
# Reload the arrays written by the embedding cell.
embeddings = np.load("siglip_embeddings.npy")
labels = np.load("siglip_labels.npy")

# Stratify on the class labels so both splits keep the class proportions; a
# plain random split can leave a rare class out of the training data.
# Stratification needs at least two samples per class, so fall back to the
# unstratified split when that does not hold.
class_counts = np.bincount(labels)
stratify_labels = labels if class_counts.size and class_counts.min() >= 2 else None

X_train, X_test, y_train, y_test = train_test_split(
    embeddings,
    labels,
    test_size=0.3,
    random_state=42,
    stratify=stratify_labels,
)

In [11]:
# Candidate values tried for each hyperparameter of the sweep.
_search_space = {
    "booster": ["gbtree", "gblinear"],
    "max_depth": [3, 6, 9, 12],
    "learning_rate": [0.1, 0.05, 0.2],
    "subsample": [1, 0.5, 0.3],
}

# Random search that maximizes the logged "f1_score" metric.
sweep_config = dict(
    method="random",
    metric=dict(name="f1_score", goal="maximize"),
    parameters={name: {"values": values} for name, values in _search_space.items()},
)

In [12]:
# Register the sweep definition with W&B and keep the id it returns.
sweep_id = wandb.sweep(
    sweep=sweep_config,
    project="ui-classification-experiments",
)

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Create sweep with ID: tx1ewc0i
Sweep URL: https://wandb.ai/urik-voevidka-ukrainian-catholic-university/ui-classification-experiments/sweeps/tx1ewc0i


In [None]:
def train():
    """Run one W&B trial: fit an XGBoost classifier and log its test metrics.

    Reads the notebook-level ``X_train``, ``y_train``, ``X_test`` and ``y_test``.
    The hyperparameters ``booster``, ``max_depth``, ``learning_rate`` and
    ``subsample`` are read from the run config; the values in ``defaults`` are
    fallbacks for running outside a sweep (under a sweep the sweep's values are
    expected to take precedence).

    The ``epochs``, ``test_size`` and ``random_state`` entries are recorded in
    the run config only; this function does not pass them to the model.

    Returns:
        None. Accuracy, weighted F1 / recall / precision and the per-class
        classification report are logged to the W&B run.
    """
    defaults = {
        "learning_rate": 0.02,
        "architecture": "XGBoost",
        "dataset": "DesktopUI",
        "epochs": 100,
        "model": "SigLIP",
        "test_size": 0.3,
        "random_state": 42,
        # XGBoost's documented defaults, so config.booster / max_depth /
        # subsample exist even when no sweep supplies them.
        "booster": "gbtree",
        "max_depth": 6,
        "subsample": 1,
    }

    # The context manager closes the run even if fitting or scoring raises.
    with wandb.init(config=defaults) as run:
        config = run.config

        # Named `classifier` so it does not shadow the notebook-level `model`.
        classifier = xgb.XGBClassifier(
            objective="multi:softmax",
            booster=config.booster,
            max_depth=config.max_depth,
            learning_rate=config.learning_rate,
            subsample=config.subsample,
        )
        classifier.fit(X_train, y_train)

        # Vectorised equivalent of rounding each prediction to an integer class id.
        predictions = np.rint(classifier.predict(X_test)).astype(int)

        # zero_division=0 returns the same value as the default for classes
        # with no predicted or true samples, without emitting a warning.
        run.log({
            "accuracy": accuracy_score(y_test, predictions),
            "f1_score": f1_score(y_test, predictions, average="weighted", zero_division=0),
            "recall": recall_score(y_test, predictions, average="weighted", zero_division=0),
            "precision": precision_score(y_test, predictions, average="weighted", zero_division=0),
            "classification_report": classification_report(
                y_test, predictions, output_dict=True, zero_division=0
            ),
        })
  