Task:  Fine tune ConvNeXt V2 (facebook/convnextv2-huge-384)

Architecture: ConvNeXt V2 builds on the success of ConvNeXt V1, which was designed to improve the efficiency and performance of convolutional networks, making them competitive with transformer models.

Disaster imagery can include intricate details (e.g., damaged buildings, roads, etc.), and ConvNeXt V2 is particularly good at capturing such local patterns due to its advanced convolutional layers.
State-of-the-art: ConvNeXt V2 is one of the most powerful convolutional models, with an architecture designed to handle large-scale image classification tasks like ImageNet. It has shown excellent performance in both high-level and fine-grained image tasks.
Efficiency: While it's large, ConvNeXt V2 is optimized for efficiency compared to some transformer models, making it more manageable in terms of computational cost for training.

In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-3.1.0-py3-none-any.whl.metadata (20 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.9.0,>=2023.1.0 (from fsspec[http]<=2024.9.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.9.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.1.0-py3-none-any.whl (480 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m480.6/480.6 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m11.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.9.0-py3-none-any.whl 

In [4]:
import torch
import numpy as np
from torch.optim import AdamW
from torch.cuda.amp import autocast, GradScaler
from torch.utils.data import DataLoader, IterableDataset
from datasets import load_dataset
from torchvision import transforms
from transformers import AutoModelForImageClassification, AutoImageProcessor
from tqdm.auto import tqdm
from google.colab import drive
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score


In [5]:
# Mount Google Drive to save the best model
drive.mount('/content/drive')
best_model_path = '/content/drive/MyDrive/best_model.pth'

Mounted at /content/drive


In [13]:
# Load the LADI dataset
ds = load_dataset("MITLL/LADI-v2-dataset", streaming=True)

README.md:   0%|          | 0.00/7.49k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/41 [00:00<?, ?it/s]

In [14]:
# Define the label keys for multi-label classification
label_keys = ['bridges_any', 'buildings_any', 'buildings_affected_or_greater', 'buildings_minor_or_greater',
              'debris_any', 'flooding_any', 'flooding_structures', 'roads_any', 'roads_damage',
              'trees_any', 'trees_damage', 'water_any']

In [15]:
# Model and processor setup
model_name = "facebook/convnextv2-huge-22k-384"
processor = AutoImageProcessor.from_pretrained(model_name)

# Load the model while ignoring the size mismatch for the classifier layer
model = AutoModelForImageClassification.from_pretrained(
    model_name,
    num_labels=len(label_keys),
    ignore_mismatched_sizes=True  # Ignore classifier weight size mismatch
)

Some weights of ConvNextV2ForImageClassification were not initialized from the model checkpoint at facebook/convnextv2-huge-22k-384 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([1000, 2816]) in the checkpoint and torch.Size([12, 2816]) in the model instantiated
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([12]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [16]:
# Move the model to GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model.to(device)

ConvNextV2ForImageClassification(
  (convnextv2): ConvNextV2Model(
    (embeddings): ConvNextV2Embeddings(
      (patch_embeddings): Conv2d(3, 352, kernel_size=(4, 4), stride=(4, 4))
      (layernorm): ConvNextV2LayerNorm()
    )
    (encoder): ConvNextV2Encoder(
      (stages): ModuleList(
        (0): ConvNextV2Stage(
          (downsampling_layer): Identity()
          (layers): Sequential(
            (0): ConvNextV2Layer(
              (dwconv): Conv2d(352, 352, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), groups=352)
              (layernorm): ConvNextV2LayerNorm()
              (pwconv1): Linear(in_features=352, out_features=1408, bias=True)
              (act): GELUActivation()
              (grn): ConvNextV2GRN()
              (pwconv2): Linear(in_features=1408, out_features=352, bias=True)
              (drop_path): Identity()
            )
            (1): ConvNextV2Layer(
              (dwconv): Conv2d(352, 352, kernel_size=(7, 7), stride=(1, 1), padding=(3, 3), group

In [17]:
# Define optimizer and gradient scaler for mixed precision
optimizer = AdamW(model.parameters(), lr=2e-5)
scaler = GradScaler()

  scaler = GradScaler()


In [18]:
# Image preprocessing transformation
image_transforms = transforms.Compose([
    transforms.Resize((384, 384)),  # Resize to match the input size of ConvNeXtV2
    transforms.ToTensor(),  # Convert image to tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize using ImageNet stats
])

In [19]:
# StreamDataset class for handling image and label processing
class StreamDataset(IterableDataset):
    def __init__(self, dataset, split_name, label_keys, image_transforms):
        self.dataset = dataset
        self.split_name = split_name
        self.label_keys = label_keys
        self.image_transforms = image_transforms

    def process_item(self, item):
        image = item['image']
        labels = [int(item[key]) for key in self.label_keys]

        # Apply transformations to the image
        processed_image = self.image_transforms(image)
        return processed_image, labels

    def __iter__(self):
        for item in self.dataset[self.split_name]:
            yield self.process_item(item)

# Function to process the dataset for training
def process_dataset(model, dataset, split_name, label_keys, image_transforms, optimizer=None, train=False, batch_size=8):
    model.train() if train else model.eval()

    running_loss = 0.0
    all_labels = []
    all_preds = []

    processed_dataset = StreamDataset(dataset, split_name, label_keys, image_transforms)
    loader = DataLoader(processed_dataset, batch_size=batch_size, collate_fn=lambda x: tuple(zip(*x)))

    if not train:
        torch.no_grad()

    for batch_images, batch_labels in tqdm(loader):
        batch_images = torch.stack(batch_images).to(device)
        batch_labels = torch.tensor(batch_labels, dtype=torch.float32).to(device)

        if train:
            with autocast():
                outputs = model(batch_images)
                loss = torch.nn.BCEWithLogitsLoss()(outputs.logits, batch_labels)

            running_loss += loss.item()

            optimizer.zero_grad()
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
        else:
            with torch.no_grad():
                outputs = model(batch_images)
                loss = torch.nn.BCEWithLogitsLoss()(outputs.logits, batch_labels)
                running_loss += loss.item()

        logits = outputs.logits.cpu().detach().numpy()
        predictions = torch.sigmoid(torch.tensor(logits)).cpu().detach().numpy()

        all_preds.extend(predictions)
        all_labels.extend(batch_labels.cpu().numpy())

        torch.cuda.empty_cache()

    return running_loss / len(all_preds), all_labels, all_preds

# Training loop
num_epochs = 5
batch_size = 8

# Initialize variables for best model tracking and early stopping
best_val_loss = float("inf")
patience, no_improvement = 2, 0

for epoch in range(num_epochs):
    print(f"Epoch {epoch+1}/{num_epochs}")

    # Training step
    train_loss, train_labels, train_preds = process_dataset(model, ds, 'train', label_keys, image_transforms, optimizer, train=True, batch_size=batch_size)
    print(f"Training Loss: {train_loss:.4f}")

    del train_labels, train_preds
    torch.cuda.empty_cache()

    # Validation step
    val_loss, val_labels, val_preds = process_dataset(model, ds, 'validation', label_keys, image_transforms, batch_size=batch_size)
    print(f"Validation Loss: {val_loss:.4f}")

     # Save the model if validation loss improves
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        torch.save(model.state_dict(), best_model_path)
        print(f"New best model saved with validation loss: {best_val_loss:.4f}")
        no_improvement = 0
    else:
        no_improvement += 1

    # Early stopping check
    if no_improvement >= patience:
        print("Early stopping due to no improvement.")
        break

    del val_labels, val_preds
    torch.cuda.empty_cache()

print("Training complete. You can now evaluate the model using the evaluation pipeline.")

Epoch 1/5


0it [00:00, ?it/s]

  with autocast():


Training Loss: 0.0255


0it [00:00, ?it/s]

Validation Loss: 0.0211
New best model saved with validation loss: 0.0211
Epoch 2/5


0it [00:00, ?it/s]

Training Loss: 0.0136


0it [00:00, ?it/s]

Validation Loss: 0.0244
Epoch 3/5


0it [00:00, ?it/s]

Training Loss: 0.0070


0it [00:00, ?it/s]

Validation Loss: 0.0287
Early stopping due to no improvement.
Training complete. You can now evaluate the model using the evaluation pipeline.


### Model Eval

In [20]:
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, roc_auc_score

# Load the best model weights from Google Drive
model.load_state_dict(torch.load(best_model_path))

# Set the model to evaluation mode
model.eval()

# Function to compute evaluation metrics
def compute_metrics(labels, predictions, threshold=0.5):
    # Convert predictions list to NumPy array
    predictions = np.array(predictions)

    # Convert probabilities to binary predictions using the threshold
    predictions_bin = (predictions >= threshold).astype(int)

    # Calculate accuracy, precision, recall, F1 score, and ROC AUC
    accuracy = accuracy_score(labels, predictions_bin)
    precision, recall, f1, _ = precision_recall_fscore_support(labels, predictions_bin, average='macro', zero_division=1)
    roc_auc = roc_auc_score(labels, predictions, average='macro')

    print(f"Accuracy: {accuracy:.4f}")
    print(f"Precision: {precision:.4f}")
    print(f"Recall: {recall:.4f}")
    print(f"F1 Score: {f1:.4f}")
    print(f"ROC AUC: {roc_auc:.4f}")

    return accuracy, precision, recall, f1, roc_auc

# Function to evaluate the model on the test set
def evaluate_model(model, dataset, label_keys, image_transforms, batch_size=8):
    print("Evaluating the model on the test dataset...")

    # Process the test dataset and get predictions and labels
    test_loss, test_labels, test_preds = process_dataset(model, dataset, 'test', label_keys, image_transforms, batch_size=batch_size)
    print(f"Test Loss: {test_loss:.4f}")

    # Compute metrics for the test set
    metrics = compute_metrics(test_labels, test_preds)

    # Clear memory
    del test_labels, test_preds
    torch.cuda.empty_cache()

    return metrics

# Run the evaluation on the test dataset
test_metrics = evaluate_model(model, ds, label_keys, image_transforms, batch_size=batch_size)


  model.load_state_dict(torch.load(best_model_path))


Evaluating the model on the test dataset...


0it [00:00, ?it/s]

Test Loss: 0.0218
Accuracy: 0.5033
Precision: 0.7876
Recall: 0.4960
F1 Score: 0.5677
ROC AUC: 0.9089
