# CLIP as feature extractor + XGBoost classification




CLIP encoder has very strong feature extraction capabilities. In pair with XGBoost it is fast and effective way to classify small datasets.
 

In [1]:
from torchvision import transforms
import numpy as np
import os
import sys
sys.path.append(os.path.abspath(".."))  
from models.ui_dataset import UIDataset
from torch.utils.data import random_split, DataLoader
import torch
import wandb
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score, recall_score, precision_score
from wandb.integration.xgboost import WandbCallback
import torch
import os
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import CLIPModel, CLIPFeatureExtractor
from PIL import Image
from tqdm import tqdm

In [2]:
wandb.login()

[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.
[34m[1mwandb[0m: Currently logged in as: [33murik-voevidka[0m ([33murik-voevidka-ukrainian-catholic-university[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


True

In [3]:
dataset_path = "/Users/yuriivoievidka/UCU/CV/cv-project/data"

In [4]:
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
model_id = "openai/clip-vit-base-patch32"
clip_model = CLIPModel.from_pretrained(model_id).to(device)
feature_extractor = CLIPFeatureExtractor.from_pretrained(model_id)  



In [5]:
class UIDataset(Dataset):
    def __init__(self, root_dir, feature_extractor=None):
        self.root_dir = root_dir
        self.feature_extractor = feature_extractor
        self.image_paths = []

        for label in os.listdir(self.root_dir):
            subdir_path = os.path.join(self.root_dir, label)
            if os.path.isdir(subdir_path):
                for filename in os.listdir(subdir_path):
                    if filename.lower().endswith(('.png', '.jpg', '.jpeg', '.bmp', '.gif', '.tiff', '.webp')):
                        file_path = os.path.join(subdir_path, filename)
                        self.image_paths.append((file_path, label))  # Store (image path, label)

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, index):
        file_path, label = self.image_paths[index]

        image = Image.open(file_path).convert("RGB")  
        
        if self.feature_extractor:
            image = self.feature_extractor(images=image, return_tensors="pt")["pixel_values"].squeeze(0)

        return image, label


In [6]:
dataset_path = "/Users/yuriivoievidka/UCU/CV/cv-project/data"
ui_dataset_all = UIDataset(root_dir=dataset_path, feature_extractor=feature_extractor)

ui_train_size = int(len(ui_dataset_all) * 0.6)
ui_val_size = int(len(ui_dataset_all) * 0.2)
ui_test_size = len(ui_dataset_all) - ui_train_size - ui_val_size
train_dataset, val_dataset, test_dataset = random_split(ui_dataset_all, [ui_train_size, ui_val_size, ui_test_size])

ui_dataset_all_dataloader = DataLoader(ui_dataset_all, batch_size=32, shuffle=True)


In [7]:
with torch.no_grad():
    all_embeddings = []
    for batch in tqdm(ui_dataset_all_dataloader):
        images, labels = batch
        images = images.to(device)

        embeddings = clip_model.get_image_features(images)

        embeddings /= embeddings.norm(dim=-1, keepdim=True)

        all_embeddings.append(embeddings.cpu())

    all_embeddings = torch.cat(all_embeddings, dim=0)

print("Embeddings shape:", all_embeddings.shape)

100%|██████████| 130/130 [02:13<00:00,  1.03s/it]

Embeddings shape: torch.Size([4149, 512])





In [9]:
all_embeddings_np = all_embeddings.numpy()

label_to_index = {label: idx for idx, label in enumerate(set(label for _, label in ui_dataset_all.image_paths))}
all_labels_np = np.array([label_to_index[label] for _, label in ui_dataset_all.image_paths])

np.save("clip_embeddings.npy", all_embeddings_np)
np.save("clip_labels.npy", all_labels_np)

In [10]:
!export WANDB_API_KEY="7e54b169f751025afccd3ef14aacaf584f797cb1"

In [8]:
embeddings = np.load("clip_embeddings.npy")  
labels = np.load("clip_labels.npy")  

X_train, X_test, y_train, y_test = train_test_split(embeddings, labels, test_size=0.3, random_state=42)

In [9]:
sweep_config = {
    "method": "random", 
    "metric": {
      "name": "accuracy",
      "goal": "maximize"   
    },
    "parameters": {
        "booster": {
            "values": ["gbtree","gblinear"]
        },
        "max_depth": {
            "values": [3, 6, 9, 12]
        },
        "learning_rate": {
            "values": [0.1, 0.05, 0.2]
        },
        "subsample": {
            "values": [1, 0.5, 0.3]
        }
    }
}

In [10]:
sweep_id = wandb.sweep(sweep_config, project="ui-classification-experiments")

Create sweep with ID: 9rkuh7kp
Sweep URL: https://wandb.ai/urik-voevidka-ukrainian-catholic-university/ui-classification-experiments/sweeps/9rkuh7kp


In [11]:
def train():
  config={
        "learning_rate": 0.02,
        "architecture": "XGBoost",
        "dataset": "DesktopUI",
        "epochs": 100,
        "model": "XGBoost",
        "test_size": 0.3,
        "random_state": 42
    }

  wandb.init(config=config)  # defaults are over-ridden during the sweep
  config = wandb.config

  model = xgb.XGBClassifier(objective="multi:softmax", booster=config.booster, max_depth=config.max_depth,
                        learning_rate=config.learning_rate, subsample=config.subsample)
  model.fit(X_train, y_train)

  y_pred = model.predict(X_test)
  predictions = [round(value) for value in y_pred]

  accuracy = accuracy_score(y_test, predictions)
  
  f1 = f1_score(y_test, predictions, average="weighted")
  recall = recall_score(y_test, predictions, average="weighted")
  precision = precision_score(y_test, predictions, average="weighted")

  wandb.log({
      "accuracy": accuracy,
      "f1_score": f1,
      "recall": recall, 
      "precision": precision,
      "classification_report": classification_report(y_test, predictions, output_dict=True)
  })
  

In [12]:
wandb.agent(sweep_id, train, count=25)

[34m[1mwandb[0m: Agent Starting Run: bxw5a2ia with config:
[34m[1mwandb[0m: 	booster: gblinear
[34m[1mwandb[0m: 	learning_rate: 0.05
[34m[1mwandb[0m: 	max_depth: 9
[34m[1mwandb[0m: 	subsample: 0.3


Parameters: { "max_depth", "subsample" } are not used.



0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.44096
f1_score,0.42114
precision,0.41561
recall,0.44096


[34m[1mwandb[0m: Agent Starting Run: py7hh5nn with config:
[34m[1mwandb[0m: 	booster: gblinear
[34m[1mwandb[0m: 	learning_rate: 0.05
[34m[1mwandb[0m: 	max_depth: 9
[34m[1mwandb[0m: 	subsample: 0.3


Parameters: { "max_depth", "subsample" } are not used.



0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.44177
f1_score,0.42187
precision,0.41637
recall,0.44177


[34m[1mwandb[0m: Agent Starting Run: o1bpfbnh with config:
[34m[1mwandb[0m: 	booster: gbtree
[34m[1mwandb[0m: 	learning_rate: 0.2
[34m[1mwandb[0m: 	max_depth: 3
[34m[1mwandb[0m: 	subsample: 0.5


0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.42329
f1_score,0.40235
precision,0.39439
recall,0.42329


[34m[1mwandb[0m: Agent Starting Run: yc9ds863 with config:
[34m[1mwandb[0m: 	booster: gblinear
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_depth: 3
[34m[1mwandb[0m: 	subsample: 1


Parameters: { "max_depth", "subsample" } are not used.



0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.43213
f1_score,0.41846
precision,0.41194
recall,0.43213


[34m[1mwandb[0m: Agent Starting Run: fz6mkht1 with config:
[34m[1mwandb[0m: 	booster: gbtree
[34m[1mwandb[0m: 	learning_rate: 0.05
[34m[1mwandb[0m: 	max_depth: 9
[34m[1mwandb[0m: 	subsample: 1


0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.44337
f1_score,0.41909
precision,0.41588
recall,0.44337


[34m[1mwandb[0m: Agent Starting Run: 3snz0ssy with config:
[34m[1mwandb[0m: 	booster: gblinear
[34m[1mwandb[0m: 	learning_rate: 0.05
[34m[1mwandb[0m: 	max_depth: 12
[34m[1mwandb[0m: 	subsample: 1


Parameters: { "max_depth", "subsample" } are not used.



0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.44096
f1_score,0.42107
precision,0.41565
recall,0.44096


[34m[1mwandb[0m: Agent Starting Run: a35zx1m9 with config:
[34m[1mwandb[0m: 	booster: gbtree
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_depth: 3
[34m[1mwandb[0m: 	subsample: 0.5


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.44498
f1_score,0.41848
precision,0.39629
recall,0.44498


[34m[1mwandb[0m: Agent Starting Run: o2gpt5tm with config:
[34m[1mwandb[0m: 	booster: gbtree
[34m[1mwandb[0m: 	learning_rate: 0.2
[34m[1mwandb[0m: 	max_depth: 6
[34m[1mwandb[0m: 	subsample: 0.5


0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.44739
f1_score,0.42339
precision,0.40287
recall,0.44739


[34m[1mwandb[0m: Agent Starting Run: iwzmvqbc with config:
[34m[1mwandb[0m: 	booster: gblinear
[34m[1mwandb[0m: 	learning_rate: 0.2
[34m[1mwandb[0m: 	max_depth: 3
[34m[1mwandb[0m: 	subsample: 0.5


Parameters: { "max_depth", "subsample" } are not used.



0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.4249
f1_score,0.41718
precision,0.4135
recall,0.4249


[34m[1mwandb[0m: Agent Starting Run: mmzxrhzu with config:
[34m[1mwandb[0m: 	booster: gbtree
[34m[1mwandb[0m: 	learning_rate: 0.05
[34m[1mwandb[0m: 	max_depth: 6
[34m[1mwandb[0m: 	subsample: 1


0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.43293
f1_score,0.40813
precision,0.38686
recall,0.43293


[34m[1mwandb[0m: Agent Starting Run: t7xd2oj9 with config:
[34m[1mwandb[0m: 	booster: gblinear
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_depth: 12
[34m[1mwandb[0m: 	subsample: 1


Parameters: { "max_depth", "subsample" } are not used.



0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.43373
f1_score,0.42003
precision,0.41355
recall,0.43373


[34m[1mwandb[0m: Agent Starting Run: ondctahb with config:
[34m[1mwandb[0m: 	booster: gbtree
[34m[1mwandb[0m: 	learning_rate: 0.2
[34m[1mwandb[0m: 	max_depth: 12
[34m[1mwandb[0m: 	subsample: 0.5


0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.42972
f1_score,0.40696
precision,0.39646
recall,0.42972


[34m[1mwandb[0m: Agent Starting Run: xzd52iwu with config:
[34m[1mwandb[0m: 	booster: gblinear
[34m[1mwandb[0m: 	learning_rate: 0.05
[34m[1mwandb[0m: 	max_depth: 12
[34m[1mwandb[0m: 	subsample: 0.5


Parameters: { "max_depth", "subsample" } are not used.



0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.44096
f1_score,0.42114
precision,0.41561
recall,0.44096


[34m[1mwandb[0m: Agent Starting Run: n9yk6tbu with config:
[34m[1mwandb[0m: 	booster: gbtree
[34m[1mwandb[0m: 	learning_rate: 0.05
[34m[1mwandb[0m: 	max_depth: 12
[34m[1mwandb[0m: 	subsample: 0.3


0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.45141
f1_score,0.42508
precision,0.40266
recall,0.45141


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: ds5bhs87 with config:
[34m[1mwandb[0m: 	booster: gblinear
[34m[1mwandb[0m: 	learning_rate: 0.05
[34m[1mwandb[0m: 	max_depth: 3
[34m[1mwandb[0m: 	subsample: 1


Parameters: { "max_depth", "subsample" } are not used.



0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.44177
f1_score,0.4218
precision,0.41641
recall,0.44177


[34m[1mwandb[0m: Agent Starting Run: 9n508syv with config:
[34m[1mwandb[0m: 	booster: gbtree
[34m[1mwandb[0m: 	learning_rate: 0.05
[34m[1mwandb[0m: 	max_depth: 9
[34m[1mwandb[0m: 	subsample: 0.3


0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.45944
f1_score,0.43272
precision,0.41005
recall,0.45944


[34m[1mwandb[0m: Agent Starting Run: jx2unpfo with config:
[34m[1mwandb[0m: 	booster: gbtree
[34m[1mwandb[0m: 	learning_rate: 0.05
[34m[1mwandb[0m: 	max_depth: 6
[34m[1mwandb[0m: 	subsample: 0.3


0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.44819
f1_score,0.42198
precision,0.39914
recall,0.44819


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: v2k87jv5 with config:
[34m[1mwandb[0m: 	booster: gblinear
[34m[1mwandb[0m: 	learning_rate: 0.05
[34m[1mwandb[0m: 	max_depth: 3
[34m[1mwandb[0m: 	subsample: 0.3


Parameters: { "max_depth", "subsample" } are not used.



0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.44096
f1_score,0.42114
precision,0.41561
recall,0.44096


[34m[1mwandb[0m: Agent Starting Run: 41f3b4hs with config:
[34m[1mwandb[0m: 	booster: gblinear
[34m[1mwandb[0m: 	learning_rate: 0.05
[34m[1mwandb[0m: 	max_depth: 6
[34m[1mwandb[0m: 	subsample: 0.5


Parameters: { "max_depth", "subsample" } are not used.



0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.44096
f1_score,0.42114
precision,0.41561
recall,0.44096


[34m[1mwandb[0m: Agent Starting Run: b1e0y4cr with config:
[34m[1mwandb[0m: 	booster: gbtree
[34m[1mwandb[0m: 	learning_rate: 0.2
[34m[1mwandb[0m: 	max_depth: 12
[34m[1mwandb[0m: 	subsample: 0.5


0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.42972
f1_score,0.40696
precision,0.39646
recall,0.42972


[34m[1mwandb[0m: Agent Starting Run: ek26xipq with config:
[34m[1mwandb[0m: 	booster: gblinear
[34m[1mwandb[0m: 	learning_rate: 0.05
[34m[1mwandb[0m: 	max_depth: 12
[34m[1mwandb[0m: 	subsample: 0.3


Parameters: { "max_depth", "subsample" } are not used.



0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.44177
f1_score,0.42187
precision,0.41637
recall,0.44177


[34m[1mwandb[0m: Agent Starting Run: t0u8kwen with config:
[34m[1mwandb[0m: 	booster: gblinear
[34m[1mwandb[0m: 	learning_rate: 0.05
[34m[1mwandb[0m: 	max_depth: 6
[34m[1mwandb[0m: 	subsample: 1


Parameters: { "max_depth", "subsample" } are not used.



0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.44096
f1_score,0.42107
precision,0.41565
recall,0.44096


[34m[1mwandb[0m: Agent Starting Run: apxgc1wa with config:
[34m[1mwandb[0m: 	booster: gbtree
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_depth: 12
[34m[1mwandb[0m: 	subsample: 0.3


0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.44418
f1_score,0.41875
precision,0.3967
recall,0.44418


[34m[1mwandb[0m: Sweep Agent: Waiting for job.
[34m[1mwandb[0m: Job received.
[34m[1mwandb[0m: Agent Starting Run: t65tq0jj with config:
[34m[1mwandb[0m: 	booster: gbtree
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_depth: 6
[34m[1mwandb[0m: 	subsample: 0.3


0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.45141
f1_score,0.42546
precision,0.4027
recall,0.45141


[34m[1mwandb[0m: Agent Starting Run: 5151by7v with config:
[34m[1mwandb[0m: 	booster: gblinear
[34m[1mwandb[0m: 	learning_rate: 0.1
[34m[1mwandb[0m: 	max_depth: 9
[34m[1mwandb[0m: 	subsample: 0.5


Parameters: { "max_depth", "subsample" } are not used.



0,1
accuracy,▁
f1_score,▁
precision,▁
recall,▁

0,1
accuracy,0.43293
f1_score,0.41927
precision,0.41268
recall,0.43293


In [13]:
y_pred = model.predict(X_test)
predictions = [round(value) for value in y_pred]
classes = ["clean-ui", "to-crop", "unnecessary"]

NameError: name 'model' is not defined

In [14]:
print("Classification Report:")
print(classification_report(y_test, y_pred, target_names=classes))

Classification Report:


NameError: name 'y_pred' is not defined