In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score, recall_score, precision_score
import wandb
import xgboost as xgb

In [1]:
import os
import torch
import numpy as np
from tqdm import tqdm
from PIL import Image
from torch.utils.data import Dataset, DataLoader
from transformers import Blip2Processor, Blip2Model

# Set device
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"

# Load BLIP-2 model and processor
model_id = "Salesforce/blip2-opt-2.7b"  # You can use a different BLIP-2 model variant
blip_model = Blip2Model.from_pretrained(model_id).to(device)
processor = Blip2Processor.from_pretrained(model_id)

# Define custom dataset
class UIDataset(Dataset):
    def __init__(self, root_dir, processor=None):
        self.root_dir = root_dir
        self.processor = processor
        self.image_paths = []

        for label in os.listdir(self.root_dir):
            subdir_path = os.path.join(self.root_dir, label)
            if os.path.isdir(subdir_path):
                for filename in os.listdir(subdir_path):
                    if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tiff', '.webp')):
                        file_path = os.path.join(subdir_path, filename)
                        self.image_paths.append((file_path, label))  # Store (image path, label)

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, index):
        file_path, label = self.image_paths[index]

        image = Image.open(file_path).convert("RGB")  

        if self.processor:
            inputs = self.processor(images=image, return_tensors="pt")
            image_tensor = inputs["pixel_values"].squeeze(0)  # Extract processed image tensor

        return image_tensor, label


config.json:   0%|          | 0.00/1.03k [00:00<?, ?B/s]

model.safetensors.index.json:   0%|          | 0.00/122k [00:00<?, ?B/s]

Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:   0%|          | 0.00/10.0G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/4.98G [00:00<?, ?B/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

preprocessor_config.json:   0%|          | 0.00/432 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/882 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/798k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/3.56M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

processor_config.json:   0%|          | 0.00/68.0 [00:00<?, ?B/s]

FileNotFoundError: [Errno 2] No such file or directory: 'path_to_your_dataset'

In [2]:
# Create dataset and dataloader
dataset_path = "/Users/yuriivoievidka/UCU/CV/cv-project/data"
ui_dataset_all = UIDataset(root_dir=dataset_path, processor=processor)
ui_dataset_all_dataloader = DataLoader(ui_dataset_all, batch_size=32, shuffle=False)

# Extract image embeddings using BLIP-2 vision encoder
with torch.no_grad():
    all_embeddings = []
    for batch in tqdm(ui_dataset_all_dataloader):
        images, labels = batch
        images = images.to(device)

        embeddings = blip_model.vision_model(images)  # Extract features from BLIP-2 vision encoder
        embeddings = embeddings.pooler_output  # Take the pooled output if needed

        embeddings /= embeddings.norm(dim=-1, keepdim=True)  # Normalize embeddings

        all_embeddings.append(embeddings.cpu())

    all_embeddings = torch.cat(all_embeddings, dim=0)

print("Embeddings shape:", all_embeddings.shape)

# Convert embeddings to numpy
all_embeddings_np = all_embeddings.numpy()

# Encode labels as integers
label_to_index = {label: idx for idx, label in enumerate(set(label for _, label in ui_dataset_all.image_paths))}
all_labels_np = np.array([label_to_index[label] for _, label in ui_dataset_all.image_paths])

# Save embeddings and labels
np.save("blip2_embeddings.npy", all_embeddings_np)
np.save("blip2_labels.npy", all_labels_np)

100%|██████████| 130/130 [07:54<00:00,  3.65s/it]

Embeddings shape: torch.Size([4149, 1408])





In [6]:
embeddings = np.load("blip2_embeddings.npy")  
labels = np.load("blip2_labels.npy")  

X_train, X_test, y_train, y_test = train_test_split(embeddings, labels, test_size=0.3, random_state=42)

In [7]:
sweep_config = {
    "method": "random", 
    "metric": {
      "name": "accuracy",
      "goal": "maximize"   
    },
    "parameters": {
        "booster": {
            "values": ["gbtree","gblinear"]
        },
        "max_depth": {
            "values": [3, 6, 9, 12]
        },
        "learning_rate": {
            "values": [0.1, 0.05, 0.2]
        },
        "subsample": {
            "values": [1, 0.5, 0.3]
        }
    }
}

In [10]:
sweep_id = wandb.sweep(sweep_config, project="ui-classification-experiments")

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Create sweep with ID: fjdy11wf
Sweep URL: https://wandb.ai/urik-voevidka-ukrainian-catholic-university/ui-classification-experiments/sweeps/fjdy11wf


In [11]:
def train():
  config={
        "learning_rate": 0.02,
        "architecture": "XGBoost",
        "dataset": "DesktopUI",
        "epochs": 100,
        "model": "XGBoost",
        "test_size": 0.3,
        "random_state": 42
    }

  wandb.init(config=config)  # defaults are over-ridden during the sweep
  config = wandb.config

  model = xgb.XGBClassifier(objective="multi:softmax", booster=config.booster, max_depth=config.max_depth,
                        learning_rate=config.learning_rate, subsample=config.subsample)
  model.fit(X_train, y_train)

  y_pred = model.predict(X_test)
  predictions = [round(value) for value in y_pred]

  accuracy = accuracy_score(y_test, predictions)
  
  f1 = f1_score(y_test, predictions, average="weighted")
  recall = recall_score(y_test, predictions, average="weighted")
  precision = precision_score(y_test, predictions, average="weighted")

  wandb.log({
      "accuracy": accuracy,
      "f1_score": f1,
      "recall": recall, 
      "precision": precision,
      "classification_report": classification_report(y_test, predictions, output_dict=True)
  })
  

In [12]:
wandb.agent(sweep_id, train, count=25)

[34m[1mwandb[0m: Agent Starting Run: c6k7o7eg with config:
[34m[1mwandb[0m: 	booster: gblinear
[34m[1mwandb[0m: 	learning_rate: 0.2
[34m[1mwandb[0m: 	max_depth: 6
[34m[1mwandb[0m: 	subsample: 0.3
[34m[1mwandb[0m: Currently logged in as: [33murik-voevidka[0m ([33murik-voevidka-ukrainian-catholic-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Parameters: { "max_depth", "subsample" } are not used.



0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.84578
f1_score,0.84659
precision,0.84891
recall,0.84578


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: h3aravin with config:
[34m[1mwandb[0m: 	booster: gbtree
[34m[1mwandb[0m: 	learning_rate: 0.05
[34m[1mwandb[0m: 	max_depth: 3
[34m[1mwandb[0m: 	subsample: 1


0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.78474
f1_score,0.77823
precision,0.78898
recall,0.78474


[34m[1mwandb[0m: Agent Starting Run: 760qoeuy with config:
[34m[1mwandb[0m: 	booster: gbtree
[34m[1mwandb[0m: 	learning_rate: 0.05
[34m[1mwandb[0m: 	max_depth: 3
[34m[1mwandb[0m: 	subsample: 1


0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.78474
f1_score,0.77823
precision,0.78898
recall,0.78474


[34m[1mwandb[0m: Agent Starting Run: kgtyqipr with config:
[34m[1mwandb[0m: 	booster: gbtree
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_depth: 3
[34m[1mwandb[0m: 	subsample: 1


0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.80803
f1_score,0.80438
precision,0.80865
recall,0.80803


[34m[1mwandb[0m: Agent Starting Run: 25j8c4p3 with config:
[34m[1mwandb[0m: 	booster: gbtree
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_depth: 12
[34m[1mwandb[0m: 	subsample: 1


0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.8241
f1_score,0.82005
precision,0.82523
recall,0.8241


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: o3vbyqyv with config:
[34m[1mwandb[0m: 	booster: gblinear
[34m[1mwandb[0m: 	learning_rate: 0.2
[34m[1mwandb[0m: 	max_depth: 9
[34m[1mwandb[0m: 	subsample: 0.5


Parameters: { "max_depth", "subsample" } are not used.



0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.84578
f1_score,0.84659
precision,0.84891
recall,0.84578


[34m[1mwandb[0m: Agent Starting Run: vnvpl2yr with config:
[34m[1mwandb[0m: 	booster: gbtree
[34m[1mwandb[0m: 	learning_rate: 0.2
[34m[1mwandb[0m: 	max_depth: 6
[34m[1mwandb[0m: 	subsample: 1


0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.83614
f1_score,0.8337
precision,0.836
recall,0.83614


[34m[1mwandb[0m: Agent Starting Run: 1q4bepst with config:
[34m[1mwandb[0m: 	booster: gbtree
[34m[1mwandb[0m: 	learning_rate: 0.05
[34m[1mwandb[0m: 	max_depth: 3
[34m[1mwandb[0m: 	subsample: 0.5


0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.78956
f1_score,0.78277
precision,0.79142
recall,0.78956


[34m[1mwandb[0m: Agent Starting Run: 2nndupue with config:
[34m[1mwandb[0m: 	booster: gblinear
[34m[1mwandb[0m: 	learning_rate: 0.05
[34m[1mwandb[0m: 	max_depth: 9
[34m[1mwandb[0m: 	subsample: 1


Parameters: { "max_depth", "subsample" } are not used.



0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.85622
f1_score,0.85663
precision,0.8576
recall,0.85622


[34m[1mwandb[0m: Agent Starting Run: 3ozjcrni with config:
[34m[1mwandb[0m: 	booster: gbtree
[34m[1mwandb[0m: 	learning_rate: 0.05
[34m[1mwandb[0m: 	max_depth: 6
[34m[1mwandb[0m: 	subsample: 0.5


0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.81365
f1_score,0.80915
precision,0.81568
recall,0.81365


[34m[1mwandb[0m: Agent Starting Run: 1dfenp2c with config:
[34m[1mwandb[0m: 	booster: gblinear
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_depth: 6
[34m[1mwandb[0m: 	subsample: 0.5


Parameters: { "max_depth", "subsample" } are not used.



0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.85221
f1_score,0.85285
precision,0.85438
recall,0.85221


[34m[1mwandb[0m: Agent Starting Run: sudyj4fm with config:
[34m[1mwandb[0m: 	booster: gblinear
[34m[1mwandb[0m: 	learning_rate: 0.2
[34m[1mwandb[0m: 	max_depth: 3
[34m[1mwandb[0m: 	subsample: 1


Parameters: { "max_depth", "subsample" } are not used.



0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.84659
f1_score,0.8474
precision,0.84967
recall,0.84659


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: kggmolf5 with config:
[34m[1mwandb[0m: 	booster: gbtree
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_depth: 6
[34m[1mwandb[0m: 	subsample: 0.5


0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.82892
f1_score,0.82595
precision,0.82985
recall,0.82892


[34m[1mwandb[0m: Agent Starting Run: 44idk9dx with config:
[34m[1mwandb[0m: 	booster: gbtree
[34m[1mwandb[0m: 	learning_rate: 0.05
[34m[1mwandb[0m: 	max_depth: 9
[34m[1mwandb[0m: 	subsample: 0.5


0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.81365
f1_score,0.80983
precision,0.81615
recall,0.81365


[34m[1mwandb[0m: Agent Starting Run: 8kzgzepf with config:
[34m[1mwandb[0m: 	booster: gbtree
[34m[1mwandb[0m: 	learning_rate: 0.05
[34m[1mwandb[0m: 	max_depth: 9
[34m[1mwandb[0m: 	subsample: 1


0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.81285
f1_score,0.80891
precision,0.81328
recall,0.81285


[34m[1mwandb[0m: Agent Starting Run: vdargm48 with config:
[34m[1mwandb[0m: 	booster: gbtree
[34m[1mwandb[0m: 	learning_rate: 0.2
[34m[1mwandb[0m: 	max_depth: 9
[34m[1mwandb[0m: 	subsample: 0.3


0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.84659
f1_score,0.84545
precision,0.84716
recall,0.84659


[34m[1mwandb[0m: Agent Starting Run: vex7bbba with config:
[34m[1mwandb[0m: 	booster: gbtree
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_depth: 12
[34m[1mwandb[0m: 	subsample: 1


0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.8241
f1_score,0.82005
precision,0.82523
recall,0.8241


[34m[1mwandb[0m: Agent Starting Run: 5e31bpo7 with config:
[34m[1mwandb[0m: 	booster: gbtree
[34m[1mwandb[0m: 	learning_rate: 0.2
[34m[1mwandb[0m: 	max_depth: 6
[34m[1mwandb[0m: 	subsample: 0.5


0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.84337
f1_score,0.84161
precision,0.8437
recall,0.84337


[34m[1mwandb[0m: Agent Starting Run: k1htlw9x with config:
[34m[1mwandb[0m: 	booster: gblinear
[34m[1mwandb[0m: 	learning_rate: 0.2
[34m[1mwandb[0m: 	max_depth: 3
[34m[1mwandb[0m: 	subsample: 1


Parameters: { "max_depth", "subsample" } are not used.



0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.84739
f1_score,0.84821
precision,0.85043
recall,0.84739


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: ptuvgq7i with config:
[34m[1mwandb[0m: 	booster: gblinear
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_depth: 9
[34m[1mwandb[0m: 	subsample: 0.3


Parameters: { "max_depth", "subsample" } are not used.



0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.8506
f1_score,0.85123
precision,0.85267
recall,0.8506


[34m[1mwandb[0m: Agent Starting Run: q9rlf9dh with config:
[34m[1mwandb[0m: 	booster: gblinear
[34m[1mwandb[0m: 	learning_rate: 0.2
[34m[1mwandb[0m: 	max_depth: 3
[34m[1mwandb[0m: 	subsample: 0.5


Parameters: { "max_depth", "subsample" } are not used.



0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.84659
f1_score,0.8474
precision,0.84967
recall,0.84659


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: ytg7epme with config:
[34m[1mwandb[0m: 	booster: gbtree
[34m[1mwandb[0m: 	learning_rate: 0.2
[34m[1mwandb[0m: 	max_depth: 6
[34m[1mwandb[0m: 	subsample: 1


0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.83614
f1_score,0.8337
precision,0.836
recall,0.83614


[34m[1mwandb[0m: Agent Starting Run: of6t98uq with config:
[34m[1mwandb[0m: 	booster: gbtree
[34m[1mwandb[0m: 	learning_rate: 0.05
[34m[1mwandb[0m: 	max_depth: 9
[34m[1mwandb[0m: 	subsample: 0.3


0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.80482
f1_score,0.79945
precision,0.80768
recall,0.80482


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: rh7ibcw3 with config:
[34m[1mwandb[0m: 	booster: gbtree
[34m[1mwandb[0m: 	learning_rate: 0.2
[34m[1mwandb[0m: 	max_depth: 12
[34m[1mwandb[0m: 	subsample: 0.5


0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.82972
f1_score,0.82819
precision,0.83058
recall,0.82972


[34m[1mwandb[0m: Agent Starting Run: do5fhng0 with config:
[34m[1mwandb[0m: 	booster: gblinear
[34m[1mwandb[0m: 	learning_rate: 0.2
[34m[1mwandb[0m: 	max_depth: 12
[34m[1mwandb[0m: 	subsample: 1


Parameters: { "max_depth", "subsample" } are not used.



0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.84659
f1_score,0.8474
precision,0.84967
recall,0.84659
