<a href="https://colab.research.google.com/github/szha0052/Lightweight-Multi-Modal-Classification-Using-EfficientNet-B0-and-MiniLM/blob/main/COMP5329_A2_All.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# COMP5329 Assignment 2 (Group 15)

In [None]:
from google.colab import drive
drive.mount('/content/drive/')

%cd /content/drive/MyDrive/COMP5329_A2/

In [None]:
!pip install scikit-multilearn

In [None]:

import os
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from skmultilearn.model_selection import IterativeStratification
from sklearn.preprocessing import MultiLabelBinarizer
from PIL import Image
from torchvision import transforms, models
from transformers import BertTokenizer
from torch.optim import AdamW
from sklearn.metrics import f1_score
import numpy as np
from transformers import AutoModel, AutoConfig

import time
from skmultilearn.model_selection import IterativeStratification
from sklearn.preprocessing import MultiLabelBinarizer
import zipfile

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  from .autonotebook import tqdm as notebook_tqdm


## 1. Data Preprocessing

### 1.1 Process the data, remove redundant commas (",") in the text, and ensure the data can be successfully read

In [None]:
# with zipfile.ZipFile('filename.zip', 'r') as zip_ref:
#     zip_ref.extractall('.')

In [None]:

input_file = 'COMP5329S1A2Dataset/train.csv'
output_file = 'process/train_cleaned.csv'


with open(input_file, "r", encoding="utf-8") as fin, open(output_file, "w", encoding="utf-8") as fout:
    for line in fin:
        comma1 = line.find(',')
        comma2 = line.find(',', comma1+1)
        if comma1 == -1 or comma2 == -1:
            fout.write(line)
            continue

        part1 = line[:comma2+1]
        part2 = line[comma2+1:]
        part2_no_comma = part2.replace(',', '')
        fout.write(part1 + part2_no_comma)


### 1.2 Data Preprocessing

In [None]:

image_dir = 'COMP5329S1A2Dataset/data'
csv_path = 'process/train_cleaned.csv'


df = pd.read_csv(csv_path)


label_lists = df['Labels'].apply(lambda x: list(map(int, str(x).split())))

# Convert labels starting from 1 to start from 0
for i in range(len(label_lists)):
    label_lists[i] = [x - 1 for x in label_lists[i]]

# Count the total number of label categories
num_labels = max([max(labels) for labels in label_lists]) + 1
mlb = MultiLabelBinarizer(classes=range(num_labels))
y_bin = mlb.fit_transform(label_lists)

# Convert all text to lowercase
df['Caption'] = df['Caption'].str.replace('.', '', regex=False).str.lower()

### 1.3 Stratified Sampling

In [None]:
# Romdom seed
seed = 42
torch.manual_seed(seed)
np.random.seed(seed)

# Stratified sampling with multiple labels: 80% training set, 20% validation set
splitter = IterativeStratification(n_splits=2, order=1)
train_idx, val_idx = next(splitter.split(np.zeros(len(df)), y_bin))
train_df = df.iloc[train_idx].reset_index(drop=True)
val_df = df.iloc[val_idx].reset_index(drop=True)

# Convert the label columns in train_df and val_df to string format and subtract 1 from all values
train_df['Labels'] = train_df['Labels'].apply(lambda x: ' '.join(map(str, [int(i) - 1 for i in str(x).split()])))
val_df['Labels'] = val_df['Labels'].apply(lambda x: ' '.join(map(str, [int(i) - 1 for i in str(x).split()])))


# Save to CSV
train_df.to_csv('process/train_split.csv', index=False)
val_df.to_csv('process/val_split.csv', index=False)


## 2. Model Training

### 2.1 Load dataset into DataLoader

In [None]:
class MultimodalDataset(Dataset):
    def __init__(self, csv_path, image_dir, num_classes=20, max_length=128, is_train=True):
        self.data = pd.read_csv(csv_path, quotechar='"', on_bad_lines='skip')
        self.image_dir = image_dir
        self.num_classes = num_classes
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.max_length = max_length
        self.is_train = is_train

        if self.is_train:
            self.transform = transforms.Compose([
                transforms.Resize((256, 256)),
                transforms.RandomResizedCrop(224),
                transforms.RandomHorizontalFlip(),
                transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
            ])
        else:
            self.transform = transforms.Compose([
                transforms.Resize((224, 224)),
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
            ])


    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        img_path = os.path.join(self.image_dir, row['ImageID'])
        image = self.transform(Image.open(img_path).convert('RGB'))

        caption = str(row['Caption'])
        text = self.tokenizer(caption, truncation=True, padding='max_length',
                              max_length=self.max_length, return_tensors='pt')
        input_ids = text['input_ids'].squeeze(0)
        attention_mask = text['attention_mask'].squeeze(0)
        if self.is_train:
            label_indices = list(map(int, str(row['Labels']).split()))
            labels = torch.zeros(self.num_classes)
            labels[label_indices] = 1.0

            return {
                'image': image,
                'input_ids': input_ids,
                'attention_mask': attention_mask,
                'labels': labels
            }
        else:
            return {
                'image': image,
                'input_ids': input_ids,
                'attention_mask': attention_mask
            }


In [None]:
# Load the dataset after layering
train_set = MultimodalDataset(csv_path='process/train_split.csv', image_dir=image_dir, num_classes=num_labels, is_train=True)
val_set = MultimodalDataset(csv_path='process/val_split.csv', image_dir=image_dir, num_classes=num_labels, is_train=True)

# DataLoader Settings
train_loader = DataLoader(train_set, batch_size=16, shuffle=True)
val_loader = DataLoader(val_set, batch_size=16)

### 2.2 Define Model Structure

In [None]:
class MultiModalClassifier(nn.Module):
    def __init__(self, num_labels):
        super(MultiModalClassifier, self).__init__()
        resnet = models.efficientnet_b0(pretrained=True)
        self.image_model = resnet.features
        self.image_fc = nn.Linear(1280, 512)

        self.text_model = AutoModel.from_pretrained('nreimers/MiniLM-L6-H384-uncased')
        self.text_fc = nn.Linear(384, 512)

        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(512 * 2, num_labels)

    def forward(self, image, input_ids, attention_mask):

        img_feat = self.image_model(image)


        img_feat = nn.functional.adaptive_avg_pool2d(img_feat, 1)
        img_feat = img_feat.view(img_feat.size(0), -1)
        img_feat = self.image_fc(img_feat)

        text_output = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
        text_feat = self.text_fc(text_output.last_hidden_state[:, 0, :])

        fused = torch.cat((img_feat, text_feat), dim=1)
        fused = self.dropout(fused)
        out = self.classifier(fused)
        return torch.sigmoid(out)

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Initialize the model, optimizer, and loss function
model = MultiModalClassifier(num_labels=num_labels).to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = nn.BCELoss()



### 2.3 Train Model

In [None]:
def train_loop(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for batch in dataloader:
        image = batch['image'].to(device)
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(image, input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)



In [None]:
def evaluate(model, dataloader, device, threshold=0.5):
    model.eval()
    preds, trues = [], []
    with torch.no_grad():
        for batch in dataloader:
            image = batch['image'].to(device)
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].cpu().numpy()
            outputs = model(image, input_ids, attention_mask).cpu().numpy()
            preds.append((outputs > threshold).astype(int))
            trues.append(labels)
    preds = np.vstack(preds)
    trues = np.vstack(trues)
    return f1_score(trues, preds, average='micro')

In [None]:
# Train Model
start_time = time.time()
for epoch in range(8):
    loss = train_loop(model, train_loader, optimizer, criterion, device)
    train_f1 = evaluate(model, train_loader, device)
    val_f1 = evaluate(model, val_loader, device)
    print(f"EfficientNet + MiniLM: Epoch {epoch+1} - Loss: {loss:.4f}, Train F1: {train_f1:.4f}, Val F1: {val_f1:.4f}")

end_time = time.time()

# Clear Cache
torch.cuda.empty_cache()







EfficientNet + MiniLM: Epoch 1 - Loss: 0.1802, Train F1: 0.7390, Val F1: 0.7346
EfficientNet + MiniLM: Epoch 2 - Loss: 0.1036, Train F1: 0.8141, Val F1: 0.8070
EfficientNet + MiniLM: Epoch 3 - Loss: 0.0868, Train F1: 0.8374, Val F1: 0.8224
EfficientNet + MiniLM: Epoch 4 - Loss: 0.0794, Train F1: 0.8472, Val F1: 0.8268
EfficientNet + MiniLM: Epoch 5 - Loss: 0.0743, Train F1: 0.8578, Val F1: 0.8311
EfficientNet + MiniLM: Epoch 6 - Loss: 0.0705, Train F1: 0.8658, Val F1: 0.8310
EfficientNet + MiniLM: Epoch 7 - Loss: 0.0665, Train F1: 0.8788, Val F1: 0.8376
EfficientNet + MiniLM: Epoch 8 - Loss: 0.0635, Train F1: 0.8871, Val F1: 0.8351


In [None]:
# Store Results
all_models = pd.DataFrame(columns=['Model_name', 'Loss', 'Train F1', 'Val F1', 'Time'])
new_row = pd.DataFrame({
    'Model_name': ['EfficientNet + MiniLM'],
    'Loss': [loss],
    'Train F1': [train_f1],
    'Val F1': [val_f1],
    'Time': [end_time - start_time]
})
all_models = pd.concat([all_models, new_row], ignore_index=True)


  all_models = pd.concat([all_models, new_row], ignore_index=True)


### 2.4 Save Model (Quantized Model float32 -> int8)

In [None]:

quantized_model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)

torch.save(quantized_model.state_dict(), 'model/quantized_model.pth')

## 3. Prediction

### 3.1 Load Model

In [None]:
class PreMultiModalClassifier(nn.Module):
    def __init__(self, num_labels):
        super(PreMultiModalClassifier, self).__init__()
        # Load only the efficientnet_b0 framework
        resnet = models.efficientnet_b0(pretrained=False)
        self.image_model = resnet.features
        self.image_fc = nn.Linear(1280, 512)

        # Load only the MiniLM framework
        config = AutoConfig.from_pretrained("nreimers/MiniLM-L6-H384-uncased")
        self.text_model = AutoModel.from_config(config)
        self.text_fc = nn.Linear(384, 512)

        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(512 * 2, num_labels)

    def forward(self, image, input_ids, attention_mask):

        img_feat = self.image_model(image)


        img_feat = nn.functional.adaptive_avg_pool2d(img_feat, 1)
        img_feat = img_feat.view(img_feat.size(0), -1)
        img_feat = self.image_fc(img_feat)

        text_output = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
        text_feat = self.text_fc(text_output.last_hidden_state[:, 0, :])

        fused = torch.cat((img_feat, text_feat), dim=1)
        fused = self.dropout(fused)
        out = self.classifier(fused)
        return torch.sigmoid(out)

In [None]:
# Initialize Model
model_test = PreMultiModalClassifier(num_labels=20)

# Quantized Model
quantized_model_test = torch.quantization.quantize_dynamic(
    model_test, {torch.nn.Linear}, dtype=torch.qint8
)
# Load the quantized model state
quantized_model_test.load_state_dict(torch.load('model/quantized_model.pth', map_location='cpu'))


  device=storage.device,


<All keys matched successfully>

### 3.2 Load test set

In [None]:
input_file = 'COMP5329S1A2Dataset/test.csv'
output_file = 'process/test_cleaned.csv'
with open(input_file, "r", encoding="utf-8") as fin, open(output_file, "w", encoding="utf-8") as fout:
    for line in fin:

        comma1 = line.find(',')
        if comma1 == -1:
            fout.write(line)
            continue

        part1 = line[:comma1+1]
        part2 = line[comma1+1:]
        part2_no_comma = part2.replace(',', '')
        fout.write(part1 + part2_no_comma)

In [None]:
class MultimodalDataset(Dataset):
    def __init__(self, csv_path, image_dir, num_classes=20, max_length=128, is_train=True):
        self.data = pd.read_csv(csv_path, quotechar='"', on_bad_lines='skip')
        self.image_dir = image_dir
        self.num_classes = num_classes
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.max_length = max_length
        self.is_train = is_train

        if self.is_train:
            self.transform = transforms.Compose([
                transforms.Resize((256, 256)),
                transforms.RandomResizedCrop(224),
                transforms.RandomHorizontalFlip(),
                transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
            ])
        else:
            self.transform = transforms.Compose([
                transforms.Resize((224, 224)),
                transforms.ToTensor(),
                transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                     std=[0.229, 0.224, 0.225])
            ])


    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        row = self.data.iloc[idx]
        img_path = os.path.join(self.image_dir, row['ImageID'])
        image = self.transform(Image.open(img_path).convert('RGB'))

        caption = str(row['Caption'])
        text = self.tokenizer(caption, truncation=True, padding='max_length',
                              max_length=self.max_length, return_tensors='pt')
        input_ids = text['input_ids'].squeeze(0)
        attention_mask = text['attention_mask'].squeeze(0)
        if self.is_train:
            label_indices = list(map(int, str(row['Labels']).split()))
            labels = torch.zeros(self.num_classes)
            labels[label_indices] = 1.0

            return {
                'image': image,
                'input_ids': input_ids,
                'attention_mask': attention_mask,
                'labels': labels
            }
        else:
            return {
                'image': image,
                'input_ids': input_ids,
                'attention_mask': attention_mask
            }


In [None]:

image_dir = 'COMP5329S1A2Dataset/data'

# Process the test set
test_df = pd.read_csv('process/test_cleaned.csv')
test_df['Caption'] = test_df['Caption'].str.replace('.', '', regex=False).str.lower()

# Load into DataLoader
test_set = MultimodalDataset(csv_path='process/test_cleaned.csv', image_dir=image_dir, num_classes=20, is_train=False)
test_loader = DataLoader(test_set, batch_size=16, shuffle=False)


### 3.3 Prediction

In [None]:
quantized_model_test.eval()
preds = []

with torch.no_grad():
    for batch in test_loader:
        image = batch['image'].to('cpu')
        input_ids = batch['input_ids'].to('cpu')
        attention_mask = batch['attention_mask'].to('cpu')
        outputs = quantized_model_test(image, input_ids, attention_mask).cpu().numpy()
        preds.append((outputs > 0.5).astype(int))
preds = np.vstack(preds)

# Convert prediction results to labels
pred_labels = []
for pred in preds:
    pred_labels.append(' '.join(map(str, np.where(pred == 1)[0] + 1)))  # +1 is because the index starts from 1

torch.cuda.empty_cache()



### 3.4 Save Prediction Results

In [None]:
test_df['PredictedLabels'] = pred_labels
pred_df = test_df[['ImageID', 'PredictedLabels']]
pred_df.columns = ['ImageID', 'Labels']
pred_df.to_csv('Predicted_labels.csv', index=False, header=True, index_label=False)

# save to txt file
# with open('Predicted_labels.txt', 'w') as f:
#     for index, row in pred_df.iterrows():
#         f.write(f"{row['ImageID']},{row['Labels']}\n")



## 4. Ablation Experiment

### 4.1 Only Image Model

In [None]:
class ImageOnlyClassifier(nn.Module):
    def __init__(self, num_labels):
        super(ImageOnlyClassifier, self).__init__()
        resnet = models.efficientnet_b0(pretrained=True)
        self.image_model = resnet.features
        self.image_fc = nn.Linear(1280, 512)
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(512, num_labels)

    def forward(self, image):
        img_feat = self.image_model(image)
        img_feat = nn.functional.adaptive_avg_pool2d(img_feat, 1)
        img_feat = img_feat.view(img_feat.size(0), -1)
        img_feat = self.image_fc(img_feat)
        img_feat = self.dropout(img_feat)
        out = self.classifier(img_feat)
        return torch.sigmoid(out)

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_image = ImageOnlyClassifier(num_labels=num_labels).to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = nn.BCELoss()



In [None]:
def train_loop_image(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for batch in dataloader:
        image = batch['image'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(image)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)


def evaluate_image(model, dataloader, device, threshold=0.5):
    model.eval()
    preds, trues = [], []
    with torch.no_grad():
        for batch in dataloader:
            image = batch['image'].to(device)
            labels = batch['labels'].cpu().numpy()
            outputs = model(image).cpu().numpy()
            preds.append((outputs > threshold).astype(int))
            trues.append(labels)
    preds = np.vstack(preds)
    trues = np.vstack(trues)
    return f1_score(trues, preds, average='micro')


In [None]:

start_time_image = time.time()
for epoch in range(8):
    loss_image = train_loop_image(model_image, train_loader, optimizer, criterion, device)
    train_f1_image = evaluate_image(model_image, train_loader, device)
    val_f1_image = evaluate_image(model_image, val_loader, device)
    print(f"EfficientNet: Epoch {epoch+1} - Loss: {loss_image:.4f}, Train F1: {train_f1_image:.4f}, Val F1: {val_f1_image:.4f}")
end_time_image = time.time()

torch.cuda.empty_cache()

EfficientNet: Epoch 1 - Loss: 0.6938, Train F1: 0.1100, Val F1: 0.1109
EfficientNet: Epoch 2 - Loss: 0.6937, Train F1: 0.1074, Val F1: 0.1080
EfficientNet: Epoch 3 - Loss: 0.6938, Train F1: 0.1077, Val F1: 0.1082
EfficientNet: Epoch 4 - Loss: 0.6938, Train F1: 0.1098, Val F1: 0.1092
EfficientNet: Epoch 5 - Loss: 0.6938, Train F1: 0.1048, Val F1: 0.1048
EfficientNet: Epoch 6 - Loss: 0.6937, Train F1: 0.1055, Val F1: 0.1066
EfficientNet: Epoch 7 - Loss: 0.6938, Train F1: 0.1118, Val F1: 0.1118
EfficientNet: Epoch 8 - Loss: 0.6938, Train F1: 0.1042, Val F1: 0.1057


In [None]:
new_row_image = pd.DataFrame([{'Model_name': 'EfficientNet', 'Loss': loss_image, 'Train F1': train_f1_image, 'Val F1': val_f1_image, 'Time': end_time_image - start_time_image}])
all_models =  pd.concat([all_models, new_row_image], ignore_index=True)

### 4.2 Only Text Model

In [None]:
class TextOnlyClassifier(nn.Module):
    def __init__(self, num_labels):
        super(TextOnlyClassifier, self).__init__()
        self.text_model = AutoModel.from_pretrained('nreimers/MiniLM-L6-H384-uncased')
        self.text_fc = nn.Linear(384, 512)
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(512, num_labels)

    def forward(self, input_ids, attention_mask):
        text_output = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
        text_feat = self.text_fc(text_output.last_hidden_state[:, 0, :])
        text_feat = self.dropout(text_feat)
        out = self.classifier(text_feat)
        return torch.sigmoid(out)

In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_text = TextOnlyClassifier(num_labels=num_labels).to(device)
optimizer = AdamW(model.parameters(), lr=2e-5)
criterion = nn.BCELoss()

In [None]:
def train_loop_text(model, dataloader, optimizer, criterion, device):
    model.train()
    total_loss = 0
    for batch in dataloader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        optimizer.zero_grad()
        outputs = model(input_ids, attention_mask)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        total_loss += loss.item()
    return total_loss / len(dataloader)


def evaluate_text(model, dataloader, device, threshold=0.5):
    model.eval()
    preds, trues = [], []
    with torch.no_grad():
        for batch in dataloader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].cpu().numpy()
            outputs = model(input_ids, attention_mask).cpu().numpy()
            preds.append((outputs > threshold).astype(int))
            trues.append(labels)
    preds = np.vstack(preds)
    trues = np.vstack(trues)
    return f1_score(trues, preds, average='micro')


In [None]:
start_time_text = time.time()
for epoch in range(8):
    loss_text = train_loop_text(model_text, train_loader, optimizer, criterion, device)
    train_f1_text = evaluate_text(model_text, train_loader, device)
    val_f1_text = evaluate_text(model_text, val_loader, device)
    print(f"MiniL: Epoch {epoch+1} - Loss: {loss_text:.4f}, Train F1: {train_f1_text:.4f}, Val F1: {val_f1_text:.4f}")
end_time_text = time.time()

torch.cuda.empty_cache()

MiniL: Epoch 1 - Loss: 0.6748, Train F1: 0.2374, Val F1: 0.2374
MiniL: Epoch 2 - Loss: 0.6748, Train F1: 0.2374, Val F1: 0.2374
MiniL: Epoch 3 - Loss: 0.6749, Train F1: 0.2374, Val F1: 0.2374
MiniL: Epoch 4 - Loss: 0.6748, Train F1: 0.2374, Val F1: 0.2374
MiniL: Epoch 5 - Loss: 0.6750, Train F1: 0.2374, Val F1: 0.2374
MiniL: Epoch 6 - Loss: 0.6750, Train F1: 0.2374, Val F1: 0.2374
MiniL: Epoch 7 - Loss: 0.6750, Train F1: 0.2374, Val F1: 0.2374
MiniL: Epoch 8 - Loss: 0.6750, Train F1: 0.2374, Val F1: 0.2374


In [None]:
new_row_text = pd.DataFrame([{'Model_name': 'MiniLM', 'Loss': loss_text, 'Train F1': train_f1_text, 'Val F1': val_f1_text, 'Time': end_time_text - start_time_text}])
all_models = pd.concat([all_models, new_row_text], ignore_index=True)

### 4.3 Ablation Experiment Results

In [None]:
all_models_final = all_models.copy()

all_models_final['Train F1'] = all_models_final['Train F1'].apply(lambda x: round(x, 4))
all_models_final['Val F1'] = all_models_final['Val F1'].apply(lambda x: round(x, 4))
all_models_final['Loss'] = all_models_final['Loss'].apply(lambda x: round(x, 4))

all_models_final['Time'] = all_models_final['Time'].apply(lambda x: round(x / 60, 1))


In [None]:
all_models_final

Unnamed: 0,Model_name,Loss,Train F1,Val F1,Time
0,EfficientNet + MiniLM,0.0635,0.8871,0.8351,42.5
1,EfficientNet,0.6938,0.1042,0.1057,39.3
2,MiniLM,0.675,0.2374,0.2374,37.2


## 5. Hyperparameter Selection

### 5.1 Redefine the model framework to facilitate passing hyperparameters

In [None]:
class GridMultiModalClassifier(nn.Module):
    def __init__(self, num_labels, cell_size=512, dropout_rate=0.3):
        super(GridMultiModalClassifier, self).__init__()
        resnet = models.efficientnet_b0(pretrained=True)
        self.image_model = resnet.features
        self.image_fc = nn.Linear(1280, cell_size)  # (batch, 1280)

        self.text_model = AutoModel.from_pretrained('nreimers/MiniLM-L6-H384-uncased')
        self.text_fc = nn.Linear(384, cell_size)

        self.dropout = nn.Dropout(dropout_rate)
        self.classifier = nn.Linear(cell_size * 2, num_labels)

    def forward(self, image, input_ids, attention_mask):

        img_feat = self.image_model(image)
        img_feat = nn.functional.adaptive_avg_pool2d(img_feat, 1)
        img_feat = img_feat.view(img_feat.size(0), -1)
        img_feat = self.image_fc(img_feat)

        text_output = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
        text_feat = self.text_fc(text_output.last_hidden_state[:, 0, :])

        fused = torch.cat((img_feat, text_feat), dim=1)
        fused = self.dropout(fused)
        out = self.classifier(fused)
        return torch.sigmoid(out)

### 5.2 Hyperparameter Selection

In [None]:
# parameter grid
param_grid = {
    'cell_size': [256, 512, 1024],
    'dropout_rate': [0.3, 0.5],
}

In [None]:
Grid_results = pd.DataFrame(columns=['Cell_size', 'Dropout_rate', 'Loss', 'Train F1', 'Val F1', 'Time'])

for cell_size in param_grid['cell_size']:
    for dropout_rate in param_grid['dropout_rate']:
        print('-' * 50)
        print(f"Training with cell_size={cell_size}, dropout_rate={dropout_rate}")
        model = GridMultiModalClassifier(num_labels=num_labels, cell_size=cell_size, dropout_rate=dropout_rate).to(device)
        optimizer = AdamW(model.parameters(), lr=2e-5)
        criterion = nn.BCELoss()

        start_time_grid = time.time()
        for epoch in range(8):
            loss_grid = train_loop(model, train_loader, optimizer, criterion, device)
            train_f1_grid = evaluate(model, train_loader, device)
            val_f1_grid = evaluate(model, val_loader, device)
            print(f"Grid Search: Epoch {epoch+1} - Loss: {loss_grid:.4f}, Train F1: {train_f1_grid:.4f}, Val F1: {val_f1_grid:.4f}")
        end_time_grid = time.time()

        torch.cuda.empty_cache()

        new_row_grid = pd.DataFrame([{'Cell_size': cell_size, 'Dropout_rate': dropout_rate, 'Loss': loss_grid, 'Train F1': train_f1_grid, 'Val F1': val_f1_grid, 'Time': end_time_grid - start_time_grid}])
        Grid_results = pd.concat([Grid_results, new_row_grid], ignore_index=True)

Grid_results_final = Grid_results.copy()

--------------------------------------------------
Training with cell_size=256, dropout_rate=0.3




Grid Search: Epoch 1 - Loss: 0.1886, Train F1: 0.7069, Val F1: 0.6980
Grid Search: Epoch 2 - Loss: 0.1100, Train F1: 0.8036, Val F1: 0.7960
Grid Search: Epoch 3 - Loss: 0.0916, Train F1: 0.8241, Val F1: 0.8117
Grid Search: Epoch 4 - Loss: 0.0828, Train F1: 0.8469, Val F1: 0.8272
Grid Search: Epoch 5 - Loss: 0.0773, Train F1: 0.8529, Val F1: 0.8286
Grid Search: Epoch 6 - Loss: 0.0734, Train F1: 0.8623, Val F1: 0.8279
Grid Search: Epoch 7 - Loss: 0.0694, Train F1: 0.8751, Val F1: 0.8357
Grid Search: Epoch 8 - Loss: 0.0657, Train F1: 0.8811, Val F1: 0.8347
--------------------------------------------------
Training with cell_size=256, dropout_rate=0.5


  Grid_results = pd.concat([Grid_results, new_row_grid], ignore_index=True)


Grid Search: Epoch 1 - Loss: 0.1988, Train F1: 0.7127, Val F1: 0.7082
Grid Search: Epoch 2 - Loss: 0.1158, Train F1: 0.7958, Val F1: 0.7861
Grid Search: Epoch 3 - Loss: 0.0960, Train F1: 0.8233, Val F1: 0.8109
Grid Search: Epoch 4 - Loss: 0.0871, Train F1: 0.8385, Val F1: 0.8264
Grid Search: Epoch 5 - Loss: 0.0813, Train F1: 0.8515, Val F1: 0.8282
Grid Search: Epoch 6 - Loss: 0.0765, Train F1: 0.8606, Val F1: 0.8298
Grid Search: Epoch 7 - Loss: 0.0728, Train F1: 0.8710, Val F1: 0.8329
Grid Search: Epoch 8 - Loss: 0.0693, Train F1: 0.8756, Val F1: 0.8323
--------------------------------------------------
Training with cell_size=512, dropout_rate=0.3




Grid Search: Epoch 1 - Loss: 0.1798, Train F1: 0.7143, Val F1: 0.7087
Grid Search: Epoch 2 - Loss: 0.1074, Train F1: 0.8046, Val F1: 0.7957
Grid Search: Epoch 3 - Loss: 0.0891, Train F1: 0.8296, Val F1: 0.8176
Grid Search: Epoch 4 - Loss: 0.0812, Train F1: 0.8424, Val F1: 0.8241
Grid Search: Epoch 5 - Loss: 0.0760, Train F1: 0.8520, Val F1: 0.8285
Grid Search: Epoch 6 - Loss: 0.0712, Train F1: 0.8645, Val F1: 0.8329
Grid Search: Epoch 7 - Loss: 0.0681, Train F1: 0.8753, Val F1: 0.8355
Grid Search: Epoch 8 - Loss: 0.0646, Train F1: 0.8804, Val F1: 0.8359
--------------------------------------------------
Training with cell_size=512, dropout_rate=0.5




Grid Search: Epoch 1 - Loss: 0.1831, Train F1: 0.7249, Val F1: 0.7200
Grid Search: Epoch 2 - Loss: 0.1090, Train F1: 0.8108, Val F1: 0.8014
Grid Search: Epoch 3 - Loss: 0.0915, Train F1: 0.8287, Val F1: 0.8176
Grid Search: Epoch 4 - Loss: 0.0831, Train F1: 0.8451, Val F1: 0.8245
Grid Search: Epoch 5 - Loss: 0.0777, Train F1: 0.8548, Val F1: 0.8297
Grid Search: Epoch 6 - Loss: 0.0737, Train F1: 0.8628, Val F1: 0.8332
Grid Search: Epoch 7 - Loss: 0.0701, Train F1: 0.8706, Val F1: 0.8343
Grid Search: Epoch 8 - Loss: 0.0666, Train F1: 0.8777, Val F1: 0.8300
--------------------------------------------------
Training with cell_size=1024, dropout_rate=0.3




Grid Search: Epoch 1 - Loss: 0.1690, Train F1: 0.7453, Val F1: 0.7401
Grid Search: Epoch 2 - Loss: 0.1007, Train F1: 0.8139, Val F1: 0.8056
Grid Search: Epoch 3 - Loss: 0.0856, Train F1: 0.8352, Val F1: 0.8210
Grid Search: Epoch 4 - Loss: 0.0782, Train F1: 0.8502, Val F1: 0.8292
Grid Search: Epoch 5 - Loss: 0.0735, Train F1: 0.8599, Val F1: 0.8360
Grid Search: Epoch 6 - Loss: 0.0693, Train F1: 0.8745, Val F1: 0.8367
Grid Search: Epoch 7 - Loss: 0.0657, Train F1: 0.8763, Val F1: 0.8310
Grid Search: Epoch 8 - Loss: 0.0624, Train F1: 0.8851, Val F1: 0.8336
--------------------------------------------------
Training with cell_size=1024, dropout_rate=0.5




Grid Search: Epoch 1 - Loss: 0.1726, Train F1: 0.7150, Val F1: 0.7090
Grid Search: Epoch 2 - Loss: 0.1048, Train F1: 0.8064, Val F1: 0.7983
Grid Search: Epoch 3 - Loss: 0.0878, Train F1: 0.8302, Val F1: 0.8142
Grid Search: Epoch 4 - Loss: 0.0805, Train F1: 0.8481, Val F1: 0.8293
Grid Search: Epoch 5 - Loss: 0.0751, Train F1: 0.8599, Val F1: 0.8308
Grid Search: Epoch 6 - Loss: 0.0709, Train F1: 0.8660, Val F1: 0.8371
Grid Search: Epoch 7 - Loss: 0.0676, Train F1: 0.8753, Val F1: 0.8357
Grid Search: Epoch 8 - Loss: 0.0643, Train F1: 0.8796, Val F1: 0.8326


### 5.3 Hyperparameter Selection Results

In [None]:
Grid_results_final['Train F1'] = Grid_results_final['Train F1'].apply(lambda x: round(x, 4))
Grid_results_final['Val F1'] = Grid_results_final['Val F1'].apply(lambda x: round(x, 4))
Grid_results_final['Loss'] = Grid_results_final['Loss'].apply(lambda x: round(x, 4))
Grid_results_final['Time'] = Grid_results_final['Time'].apply(lambda x: round(x / 60, 1))
Grid_results_final

Unnamed: 0,Cell_size,Dropout_rate,Loss,Train F1,Val F1,Time
0,256,0.3,0.0657,0.8811,0.8347,42.4
1,256,0.5,0.0693,0.8756,0.8323,42.4
2,512,0.3,0.0646,0.8804,0.8359,42.4
3,512,0.5,0.0666,0.8777,0.83,42.5
4,1024,0.3,0.0624,0.8851,0.8336,42.4
5,1024,0.5,0.0643,0.8796,0.8326,42.3
