# Multiheader Classification Beam Damage Classification

## Data Augmentation  & Oversampling

In [4]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter

# === CONFIG ===
original_csv = "Final-Competition-2025/cleaned_renamed_labeled_dataset.csv"
output_train_csv = "oversampled_train.csv"
output_val_csv = "validation_set.csv"

# === STEP 1: Load dataset ===
print("📥 Loading dataset...")
df = pd.read_csv(original_csv)
print(f"✅ Loaded {len(df)} samples")

# === STEP 2: Split into train and val ===
train_df, val_df = train_test_split(
    df, test_size=0.2, stratify=df["class_label"], random_state=42
)
print(f"🧪 Split: {len(train_df)} train / {len(val_df)} val")

# === STEP 3: Oversample rare damage labels ===
train_df = train_df.copy()
train_df["damage_labels"] = train_df["damage_labels"].fillna("").astype(str).str.split(",")

# Count all labels
flat_labels = [int(lbl) for sub in train_df["damage_labels"] for lbl in sub if lbl.strip().isdigit()]
label_counts = Counter(flat_labels)
print("📊 Original label distribution:", dict(label_counts))

# Define rare labels
rare_labels = [label for label, count in label_counts.items() if count < 50]
print(f"🔍 Rare labels (<50 occurrences): {rare_labels}")

# Oversample each rare label based on its scarcity
augmented = [train_df]
for label in rare_labels:
    matching_rows = train_df[train_df["damage_labels"].apply(lambda lst: str(label) in lst)]
    n_repeat = max(1, 3 * (50 - label_counts[label]) // (len(matching_rows) + 1))
    print(f"🔁 Oversampling label {label}: duplicating {len(matching_rows)} rows x{n_repeat}")
    augmented.append(pd.concat([matching_rows] * n_repeat))

augmented_train_df = pd.concat(augmented, ignore_index=True)

# Convert list back to string
augmented_train_df["damage_labels"] = augmented_train_df["damage_labels"].apply(lambda x: ",".join(x))

# === STEP 4: Save train and validation sets ===
val_df = val_df.copy()
val_df["damage_labels"] = val_df["damage_labels"].fillna("").astype(str)

augmented_train_df.to_csv(output_train_csv, index=False)
val_df.to_csv(output_val_csv, index=False)

print(f"✅ Saved oversampled training set → {output_train_csv}")
print(f"✅ Saved validation set → {output_val_csv}")


📥 Loading dataset...
✅ Loaded 452 samples
🧪 Split: 361 train / 91 val
📊 Original label distribution: {1: 60, 3: 15, 8: 124, 7: 48, 9: 35, 6: 103, 0: 96, 4: 56, 10: 3, 5: 4}
🔍 Rare labels (<50 occurrences): [3, 7, 9, 10, 5]
🔁 Oversampling label 3: duplicating 15 rows x6
🔁 Oversampling label 7: duplicating 48 rows x1
🔁 Oversampling label 9: duplicating 35 rows x1
🔁 Oversampling label 10: duplicating 3 rows x35
🔁 Oversampling label 5: duplicating 4 rows x27
✅ Saved oversampled training set → oversampled_train.csv
✅ Saved validation set → validation_set.csv


In [6]:
# Import the helper module
from m3pro_gpu_helper import setup_m3pro_gpu, seed_everything

# Replace your device setup with this line
device = setup_m3pro_gpu()

# Replace your current seeding function with this
seed_everything(42)

🖥️ System: Darwin arm64
🚀 Using Apple Silicon GPU via MPS
📊 PyTorch Version: 2.7.0


## Training a Multi-Header Classification Model for Beam Damage Classification

In [None]:
import os
import random
import numpy as np
import pandas as pd
import cv2
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset, random_split
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm
import albumentations as A
from albumentations.pytorch import ToTensorV2
from torchvision import models

# # === SEED EVERYTHING FOR REPRODUCIBILITY ===
# def set_seed(seed=42):
#     random.seed(seed)
#     np.random.seed(seed)
#     torch.manual_seed(seed)
#     torch.cuda.manual_seed(seed)
#     torch.cuda.manual_seed_all(seed)
#     torch.backends.cudnn.deterministic = True
#     torch.backends.cudnn.benchmark = False
#     os.environ['PYTHONHASHSEED'] = str(seed)

# set_seed(42)

# === Albumentations Dataset ===
class AlbumentationsDamageDataset(Dataset):
    def __init__(self, csv_path, root_dir, transform=None):
        self.df = pd.read_csv(csv_path)
        self.root_dir = root_dir
        self.transform = transform
        self.class_folders = {18: "Class A", 19: "Class B", 20: "Class C"}

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        class_label = int(row["class_label"])
        subfolder = self.class_folders[class_label]
        img_path = os.path.join(self.root_dir, subfolder, row["id"])

        image = cv2.imread(img_path)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        if self.transform:
            image = self.transform(image=image)["image"]

        class_tensor = torch.tensor(class_label - 18, dtype=torch.long)

        damage_vector = torch.zeros(11)
        if pd.notna(row["damage_labels"]):
            for l in str(row["damage_labels"]).split(","):
                if l.strip().isdigit():
                    damage_vector[int(l)] = 1.0

        return image, class_tensor, damage_vector


class MultiTaskDamageModel(nn.Module):
    def __init__(self, version='b4', num_classes=3, num_damage_labels=11):
        super().__init__()
        
        if version == 'b3':
            base = models.efficientnet_b3(weights="IMAGENET1K_V1")
        elif version == 'b4':
            base = models.efficientnet_b4(weights="IMAGENET1K_V1")
        elif version == 'b5':
            base = models.efficientnet_b5(weights="IMAGENET1K_V1")
        
        else:
            raise ValueError(f"Unsupported EfficientNet version: {version}")

        in_features = base.classifier[1].in_features
        base.classifier = nn.Identity()
        
        self.backbone = base
        self.class_head = nn.Sequential(nn.Dropout(0.4), nn.Linear(in_features, num_classes))
        self.damage_head = nn.Sequential(nn.Dropout(0.4), nn.Linear(in_features, num_damage_labels))

    def forward(self, x):
        features = self.backbone(x)
        class_logits = self.class_head(features)
        damage_logits = self.damage_head(features)
        return class_logits, damage_logits


# === Transformations ===
train_transform = A.Compose([
    A.RandomResizedCrop(size=(300, 300), scale=(0.8, 1.0), ratio=(0.9, 1.1), p=1.0),
    A.HorizontalFlip(p=0.5),
    A.RandomBrightnessContrast(p=0.2),
    A.ShiftScaleRotate(shift_limit=0.05, scale_limit=0.05, rotate_limit=15, p=0.3),
    A.Normalize(),
    ToTensorV2()
])

val_transform = A.Compose([
    A.Resize(height=300, width=300),
    A.Normalize(),
    ToTensorV2()
])


# === Config ===
csv_path = "oversampled_train.csv"
image_root = "Final-Competition-2025"
num_epochs = 50
batch_size = 32
learning_rate = 1e-4
val_split = 0.1
threshold = 0.5 # with 0.4 we got 64 % model:version='b4',batch=32, 
num_damage_labels = 11
damage_ignore_label = 2

# === Dataset & Dataloaders ===
full_dataset = AlbumentationsDamageDataset(csv_path, image_root, transform=train_transform)
val_size = int(len(full_dataset) * val_split)
train_size = len(full_dataset) - val_size
train_dataset, val_dataset = random_split(full_dataset, [train_size, val_size], generator=torch.Generator().manual_seed(42))
val_dataset.dataset.transform = val_transform

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

# === Model, Losses, Optimizer ===
model = MultiTaskDamageModel().to(device)
ce_loss = nn.CrossEntropyLoss(label_smoothing=0.1)
rare_labels = [1, 3, 4]
pos_weights = torch.tensor([5.0 if i in rare_labels else 1.0 for i in range(num_damage_labels)])
pos_weights[damage_ignore_label] = 0.0
bce_loss = nn.BCEWithLogitsLoss(pos_weight=pos_weights.to(device))
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# === Training Loop ===
best_val_damage_f1 = 0.0
for epoch in range(1, num_epochs + 1):
    model.train()
    total_loss = 0.0
    class_preds_train, class_targets_train = [], []
    damage_preds_train, damage_targets_train = [], []

    for images, class_labels, damage_labels in tqdm(train_loader, desc=f"Epoch {epoch}/{num_epochs}"):
        images, class_labels, damage_labels = images.to(device), class_labels.to(device), damage_labels.to(device)
        optimizer.zero_grad()
        class_outputs, damage_outputs = model(images)
        loss = ce_loss(class_outputs, class_labels) + bce_loss(damage_outputs, damage_labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()

        class_preds = class_outputs.argmax(dim=1)
        damage_probs = torch.sigmoid(damage_outputs)
        damage_bin = (damage_probs > threshold).float()
        class_preds_train.extend(class_preds.cpu().numpy())
        class_targets_train.extend(class_labels.cpu().numpy())
        damage_preds_train.append(damage_bin.cpu())
        damage_targets_train.append(damage_labels.cpu())

    # === Validation ===
    model.eval()
    val_loss = 0.0
    class_preds_val, class_targets_val = [], []
    damage_preds_val, damage_targets_val = [], []
    with torch.no_grad():
        for images, class_labels, damage_labels in val_loader:
            images, class_labels, damage_labels = images.to(device), class_labels.to(device), damage_labels.to(device)
            class_outputs, damage_outputs = model(images)
            loss = ce_loss(class_outputs, class_labels) + bce_loss(damage_outputs, damage_labels)
            val_loss += loss.item()

            class_preds = class_outputs.argmax(dim=1)
            class_preds_val.extend(class_preds.cpu().numpy())
            class_targets_val.extend(class_labels.cpu().numpy())

            damage_probs = torch.sigmoid(damage_outputs)
            damage_bin = (damage_probs > threshold).float()
            for i in range(damage_bin.size(0)):
                if damage_bin[i].sum() == 0:
                    top_index = damage_probs[i].topk(1).indices.item()
                    if top_index != damage_ignore_label:
                        damage_bin[i, top_index] = 1.0
            damage_preds_val.append(damage_bin.cpu())
            damage_targets_val.append(damage_labels.cpu())

    # === Metrics ===
    damage_preds_train = torch.cat(damage_preds_train).numpy()
    damage_targets_train = torch.cat(damage_targets_train).numpy()
    damage_preds_val = torch.cat(damage_preds_val).numpy()
    damage_targets_val = torch.cat(damage_targets_val).numpy()

    train_acc = accuracy_score(class_targets_train, class_preds_train)
    val_acc = accuracy_score(class_targets_val, class_preds_val)
    train_f1 = f1_score(class_targets_train, class_preds_train, average="macro")
    val_f1 = f1_score(class_targets_val, class_preds_val, average="macro")
    train_damage_f1 = f1_score(damage_targets_train, damage_preds_train, average="macro", zero_division=0)
    val_damage_f1 = f1_score(damage_targets_val, damage_preds_val, average="macro", zero_division=0)

    print(f"\n📊 Epoch {epoch} Summary:")
    print(f"Train Loss: {total_loss:.4f} | Train Acc: {train_acc:.4f} | Train F1: {train_f1:.4f} | Damage F1: {train_damage_f1:.4f}")
    print(f" Val Loss: {val_loss:.4f} | Val Acc:   {val_acc:.4f} | Val F1:   {val_f1:.4f} | Damage F1: {val_damage_f1:.4f}")

    if val_damage_f1 > best_val_damage_f1:
        best_val_damage_f1 = val_damage_f1
        torch.save(model.state_dict(), "best_model_by_val_damage_f1.pth")
        print("💾 Best model updated!")

torch.save(model.state_dict(), "multitask_model.pth")
print("\n✅ Training complete. Final model saved.")
print(f"🏆 Best model F1 (damage): {best_val_damage_f1:.4f}")

  original_init(self, **validated_kwargs)
Epoch 1/50:   9%|▉         | 2/22 [05:07<51:13, 153.67s/it]  


KeyboardInterrupt: 

In [None]:
import os
import torch
from torchvision import transforms
from PIL import Image
import pandas as pd
import numpy as np
# from model import MultiTaskDamageModel

# === CONFIG ===
test_dir = "test_data/beam"
model_path = "multitask_model.pth"
thresholds = [0.3, 0.35,0.5, 0.75, 0.8]  # Only run best
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# === Allowed damage labels by class ===
allowed_damage = {
    18: [0],
    19: [3, 4, 6, 8],
    20: [1, 5, 7, 9, 10]
}

# === Load model ===
model = MultiTaskDamageModel().to(device)
model.load_state_dict(torch.load(model_path, map_location=device))
model.eval()

# === Transform ===
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor()
])

# === Run threshold sweep (only 0.35 here) ===
for threshold in thresholds:
    results = []
    label_counts = []

    for fname in sorted(os.listdir(test_dir), key=lambda x: int(os.path.splitext(x)[0])):
        if not fname.lower().endswith((".jpg", ".jpeg", ".png")):
            continue

        img_path = os.path.join(test_dir, fname)
        image = Image.open(img_path).convert("RGB")
        image = transform(image).unsqueeze(0).to(device)

        with torch.no_grad():
            class_logits, damage_logits = model(image)

            class_pred_raw = class_logits.argmax(dim=1).item()
            class_label = class_pred_raw + 18

            damage_probs = torch.sigmoid(damage_logits).squeeze().cpu().numpy()
            damage_pred = damage_probs > threshold

            # === Filter only allowed damage indices for the predicted class
            valid_indices = allowed_damage[class_label]

            damage_labels = [str(i) for i in valid_indices if damage_pred[i]]

            # 🔧 Fallback: assign top-1 if none passed threshold
            if not damage_labels:
                best_idx = valid_indices[np.argmax(damage_probs[valid_indices])]
                damage_labels = [str(best_idx)]

        # === Final formatted row
        image_id = os.path.splitext(fname)[0]
        full_row = [image_id, str(class_label)] + damage_labels
        results.append(full_row)
        label_counts.append(len(damage_labels))

    # === Format and Save Submission CSV ===
    df = pd.DataFrame(results)
    df.columns = ["ID", "class"] + [f"damage_{i}" for i in range(df.shape[1] - 2)]

    def clean_row(row):
        return ",".join([str(int(x)) for x in row if str(x).isdigit()])

    df["class"] = df.iloc[:, 1:].apply(clean_row, axis=1)
    df = df[["ID", "class"]]

    filename = f"submission_thresh_{str(threshold).replace('.', '')}.csv"
    df.to_csv(filename, index=False)

    # === Logging ===
    print(f"\n=== Threshold: {threshold} ===")
    print(f"✅ Saved: {filename}")
    print(f"Images: {len(label_counts)}")
    print(f"Avg labels/image: {np.mean(label_counts):.2f}")
    print(f"Min: {np.min(label_counts)}, Max: {np.max(label_counts)}, Median: {np.median(label_counts)}")
    print("Example rows:")