## Training classifier

Using SmolVLM for extracting text and image embeddings

Can expand to Qwen2.5VM if needed

In [None]:
!nvcc --version

In [None]:
!pip install qwen_vl_utils

In [None]:
!pip install num2words

In [None]:
!pip install scikit-multilearn

In [None]:
!pip install -q accelerate datasets peft bitsandbytes tensorboard

In [None]:
!pip install flash-attn --no-build-isolation

In [None]:
from transformers import AutoProcessor, TrainingArguments, Trainer, BitsAndBytesConfig, Idefics3ForConditionalGeneration
from PIL import Image
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, Subset
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model
from skmultilearn.model_selection import iterative_train_test_split
from collections import Counter
import json
import os

In [None]:
from google.colab import drive
drive.mount('/content/gdrive')

In [4]:
from huggingface_hub import notebook_login

notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

## SmolVLM2 Instruct Model

In [None]:
USE_LORA = False
USE_QLORA = True

model_path = "HuggingFaceTB/SmolVLM2-2.2B-Instruct"
processor = AutoProcessor.from_pretrained(model_path)

try:
    # Attempt to use CUDA
    if torch.cuda.is_available():
        device = torch.device("cuda")
    else:
        raise Exception("CUDA not available.")
except Exception as e:
    print("Error: CUDA not found or not available. Using CPU instead.")
    device = torch.device("cpu")

if USE_QLORA or USE_LORA:
    lora_config = LoraConfig(
        r=8,
        lora_alpha=8,
        lora_dropout=0.1,
        target_modules=['down_proj','o_proj','k_proj','q_proj','gate_proj','up_proj','v_proj'],
        use_dora=False if USE_QLORA else True,
        init_lora_weights="gaussian"
    )
    lora_config.inference_mode = False
    if USE_QLORA:
        bnb_config = BitsAndBytesConfig(
            load_in_4bit=True,
            bnb_4bit_use_double_quant=True,
            bnb_4bit_quant_type="nf4",
            bnb_4bit_compute_dtype=torch.bfloat16
        )

    model = Idefics3ForConditionalGeneration.from_pretrained(
        model_path,
        quantization_config=bnb_config if USE_QLORA else None,
        #_attn_implementation="flash_attention_2",
        device_map="auto"
    )
    if getattr(model, "peft_config", None) is None:
        model.add_adapter(lora_config)
        model.enable_adapters()
    model = prepare_model_for_kbit_training(model)
    model = get_peft_model(model, lora_config)
    print(model.get_nb_trainable_parameters())
else:
    model = Idefics3ForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        #_attn_implementation="flash_attention_2",
    ).to(device)


In [5]:
# Continuing training model
from peft import PeftModel
model_path = "jerick5555/SmolVLM2-2.2B-Instruct-vqav2"
processor = AutoProcessor.from_pretrained("HuggingFaceTB/SmolVLM2-2.2B-Instruct")

model = Idefics3ForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=torch.bfloat16,
        #_attn_implementation="flash_attention_2",
    ).to("cuda")

model = PeftModel.from_pretrained(model, model_path)

model = model.to("cuda")  # or "cpu" if GPU is not available
model.train()

print(model.get_nb_trainable_parameters())

You are using a model of type smolvlm to instantiate a model of type idefics3. This is not supported for all configurations of models and can yield errors.


Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

(0, 2257321840)


## Data Loader

In [6]:
# For example, a flattened version of your label tree:
ALL_CLASSES = [
    # Main classes
    "Abandoned Trolleys", "Animals", "Cleanliness", "Construction Sites",
    "Drains & Sewers", "Drinking Water", "Housing", "Illegal Parking",
    "Others", "Parks & Greenery", "Pests", "Roads & Footprints",
    "Shared Bicycles", "Smoking",
    "Others",  # this is often used as a catch-all category
    #Sub classes
    "Cold Storage", "FairPrice", "Giant", "Ikea", "Mustafa", "Other Trolleys", "ShengSong",
    "Bird Issues", "Cat Issues", "Dead Animal", "Dog Issues", "Injured Animal", "Other Animal Issues",
    "Bulky Waste in Common Areas", "Dirty Public Areas", "High-rise Littering", "Overflowing Litter Bin",
    "Construction Noise",
    "Choked Drain or Stagnant Water", "Damaged Drain", "Flooding", "Sewage Smell", "Sewer Choke or Overflow",
    "No Water", "Water Leak", "Water Pressure", "Water Quality",
    "Common Area Maintenance", "HDB Car Park Maintenance", "Lightning Maintenance", "Playground & Fitness Facilities Maintenance",
    "HDB or URA Car Park", "Motorcycle at Void Deck", "Road",
    "Fallen Tree or Branch", "Other Parks and Greenery Issues", "Overgrown Grass", "Park Facilities Maintenance", "Park Lighting Maintenance",
    "Bee & Hornets", "Cockroaches in Food Establishment", "Mosquitoes", "Rodents in Common Areas", "Rodents in Food Establishment",
    "Covered Linkway Maintenance", "Damaged Road Signs", "Faulty Streetlight", "Footpath Maintenance", "Road Maintenance",
    "Anywheel", "HelloRide", "Other Bicycles",
    "Food Premises", "Other Public Areas", "Parks & Park Connectors"
]

label_to_idx = {label: idx for idx, label in enumerate(ALL_CLASSES)}
NUM_CLASSES = len(ALL_CLASSES)

data_root = "/kaggle/input/huawei-data/2. Sorted"  # Adjust to your data folder path
label_vector_file = "/kaggle/input/huawei-label/label_vectors.json"  # Path to your label vector JSON file

# System prompt for the model
system_prompt = (
    "You are an expert in municipal services issues. Your task is to analyze the provided input, "
    "which may include an image and a description, and categorize the issue into one or more categories "
    "from the predefined list of municipal service issue types. Additionally, assess the severity of the issue "
    "as one of the following: Low, Medium, or High.\n\n"
    "The predefined list of categories is as follows:\n"
    + "\n".join(f"- {category}" for category in ALL_CLASSES) +
    "\n\nYour response should be in the following JSON format:\n"
    "{\n"
    "    \"categories\": [categories],\n"
    "    \"severity\": severity\n"
    "}\n\n"
    "Ensure that the categories are selected from the provided list of issue types, and the severity is determined "
    "based on the details provided in the input."
)


### Dataset

In [7]:
# Define a training dataset
class FixMyStreetDataset(Dataset):
    def __init__(self, data_root, label_vector_file):
        """
        data_root: Folder containing the image and JSON files.
        label_vector_file: Path to the JSON file mapping report IDs to label lists.
        """
        self.data_root = data_root

        # Load label mapping (file names without extension as keys)
        with open(label_vector_file, 'r', encoding='utf-8') as f:
            self.label_vector = json.load(f)

        self.report_ids = list(self.label_vector.keys())

    def __len__(self):
        return len(self.report_ids)

    def __getitem__(self, idx):
        report_id = self.report_ids[idx]

        # Get label vector entry for the current report ID
        label_entry = self.label_vector[report_id]

        image_path = label_entry["image_path"]

        # Load the JSON metadata
        # Get the image path and derive the JSON path
        image_path = image_path.replace("\\", "/")  # Normalize backslashes to forward slashes
        json_path = os.path.splitext(image_path)[0] + ".json"  # Replace the image extension with .json

        # Replace the old root ("data/2. Sorted") with the new data_root
        json_path = json_path.replace("data/2. Sorted", self.data_root)
        image_path = image_path.replace("data/2. Sorted", self.data_root)

        # Replace & with and
        json_path = json_path.replace("&", "and")
        image_path = image_path.replace("&", "and")

        # Normalize the path
        json_path = os.path.normpath(json_path)

        with open(json_path, 'r', encoding="utf-8") as f:
            metadata = json.load(f)

        # Prepare the text content
        input_text = metadata["description"] + "\n\n"
        input_text += "Nearby location tags: " + ", ".join([f"{k}: {v}" for tag in metadata["tags"]["nearby"] for k, v in tag.items()]) + "\n\n"
        input_text += "Enclosing location tags: " + ", ".join([f"{k}: {v}" for tag in metadata["tags"]["enclosing"] for k, v in tag.items()])

        # Check if the image exists
        if image_path:
            # Load the image
            image = Image.open(image_path)
            if image.mode != "RGB":
                image = image.convert("RGB")
        else:
            # Placeholder for missing image
            image = torch.zeros((3, 224, 224), dtype=torch.uint8)

        output_text = f'{{"categories": {json.dumps(label_entry["labels"])}, "severity": {json.dumps(label_entry["severity"])}}}'


        return input_text, image, output_text


In [8]:
image_token_id = processor.tokenizer.additional_special_tokens_ids[
            processor.tokenizer.additional_special_tokens.index("<image>")]

# Batching function
def collate_fn(data_stream):
    texts = []
    images = []

    for data in data_stream:
        user_content = [{"type": "text", "text": data[0]}, {"type": "image"}]

        messages = [
                {"role": "system", "content": system_prompt},
                {"role": "user", "content": user_content},
                {"role": "assistant", "content": [{"type": "text", "text": data[2]}]}
            ]
        
        # Process the inputs using the processor
        text = processor.apply_chat_template(
            messages,
            add_generation_prompt=False,
        )
        texts.append(text.strip())
        images.append([data[1]])

    batch = processor(text=texts, images=images, return_tensors="pt", padding=True)
    labels = batch["input_ids"].clone()
    labels[labels == processor.tokenizer.pad_token_id] = -100
    labels[labels == image_token_id] = -100
    batch["labels"] = labels

    return batch

In [9]:
# Create dataset and dataloader
dataset = FixMyStreetDataset(data_root=data_root,
                             label_vector_file=label_vector_file)

data_loader = DataLoader(dataset, batch_size=8, shuffle=True, collate_fn=collate_fn)

In [None]:
len(dataset)

In [None]:
dataset[0][2]

In [None]:
def labels_to_multi_hot(label_list, label_to_idx, num_classes):
    """
    Convert a list of label strings into a multi-hot tensor.

    Parameters:
      label_list (List[str]): The labels assigned to one example.
      label_to_idx (Dict[str,int]): Mapping from each label string to its index.
      num_classes (int): Total number of possible classes.

    Returns:
      torch.Tensor of shape (num_classes,) with 1s at the indices corresponding
      to label_list, 0s elsewhere.
    """
    multi_hot = torch.zeros(num_classes, dtype=torch.float32)
    for label in label_list:
        idx = label_to_idx.get(label)
        if idx is not None:
            multi_hot[idx] = 1.0
    return multi_hot


In [None]:
all_indices = list(range(len(dataset)))

X = np.arange(len(dataset)).reshape(-1, 1)
Y = np.stack([labels_to_multi_hot(dataset.report_ids[i], label_to_idx, NUM_CLASSES).numpy()
              for i in range(len(dataset))])

X_train, Y_train, X_test, Y_test = iterative_train_test_split(X, Y, test_size=0.2)
train_idx = X_train.ravel().tolist()
test_idx  = X_test.ravel().tolist()

train_dataset = Subset(dataset, train_idx)
test_dataset  = Subset(dataset, test_idx)

In [None]:
train_dataset[0]

In [None]:
test_dataset[0]

In [12]:
# 1. Count how often each label appears in the entire dataset
all_labels = [
    label
    for entry in dataset.label_vector.values()
    for label in entry["labels"]
]
label_freq = Counter(all_labels)  # counts per label :contentReference[oaicite:0]{index=0}

# 2. Identify “rare” labels (frequency ≤ 10)
rare_labels = {lbl for lbl, cnt in label_freq.items() if cnt <= 30}

# 3. Collect indices of dataset items that have at least one rare label
rare_indices = [
    idx
    for idx, report_id in enumerate(dataset.report_ids)
    if any(
        lbl in rare_labels
        for lbl in dataset.label_vector[report_id]["labels"]
    )
]

# 4. Create a Subset for these “rare-label” examples
rare_subset = Subset(dataset, rare_indices)  # torch.utils.data.Subset usage :contentReference[oaicite:1]{index=1}

In [13]:
len(rare_subset)

129

## Fine-tuning

In [14]:
model_name = model_path.split("/")[-1]

training_args = TrainingArguments(
    num_train_epochs=1,
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    warmup_steps=50,
    learning_rate=1e-4,
    weight_decay=0.01,
    logging_steps=25,
    save_strategy="steps",
    save_steps=150,
    save_total_limit=1,
    optim="paged_adamw_8bit", # for 8-bit, keep this, else adamw_hf
    bf16=True, # underlying precision for 8bit
    output_dir=f"./{model_name}-vqav3",
    hub_model_id=f"{model_name}-vqav3",
    report_to="tensorboard",
    remove_unused_columns=False,
    gradient_checkpointing=True
)

In [15]:
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=collate_fn,
    train_dataset=rare_subset
)

No label_names provided for model class `PeftModel`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


In [16]:
trainer.train()



Step,Training Loss


You are using a model of type smolvlm to instantiate a model of type idefics3. This is not supported for all configurations of models and can yield errors.


TrainOutput(global_step=8, training_loss=1.1990596055984497, metrics={'train_runtime': 1715.6462, 'train_samples_per_second': 0.075, 'train_steps_per_second': 0.005, 'total_flos': 2398203286448256.0, 'train_loss': 1.1990596055984497, 'epoch': 0.9696969696969697})

In [17]:
trainer.push_to_hub()

You are using a model of type smolvlm to instantiate a model of type idefics3. This is not supported for all configurations of models and can yield errors.


training_args.bin:   0%|          | 0.00/5.37k [00:00<?, ?B/s]

Upload 5 LFS files:   0%|          | 0/5 [00:00<?, ?it/s]

events.out.tfevents.1745772089.e4755d98c020.109.0:   0%|          | 0.00/8.47k [00:00<?, ?B/s]

events.out.tfevents.1745775145.e4755d98c020.345.0:   0%|          | 0.00/8.82k [00:00<?, ?B/s]

adapter_model.safetensors:   0%|          | 0.00/42.2M [00:00<?, ?B/s]

events.out.tfevents.1745773356.e4755d98c020.219.0:   0%|          | 0.00/8.47k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/jerick5555/SmolVLM2-2.2B-Instruct-vqav2-vqav3/commit/feea31b0086a907c9e0b769d90cdfdc6174d25e4', commit_message='End of training', commit_description='', oid='feea31b0086a907c9e0b769d90cdfdc6174d25e4', pr_url=None, repo_url=RepoUrl('https://huggingface.co/jerick5555/SmolVLM2-2.2B-Instruct-vqav2-vqav3', endpoint='https://huggingface.co', repo_type='model', repo_id='jerick5555/SmolVLM2-2.2B-Instruct-vqav2-vqav3'), pr_revision=None, pr_num=None)