# COMP5329 Assignment 2 (Group 15)

In [None]:
# Mount Google Drive so the notebook can reach the project folder (Colab only).
from google.colab import drive
drive.mount('/content/drive/')

# Make the assignment folder the working directory; the relative paths used by
# later cells (model/, process/, COMP5329S1A2Dataset/) are resolved against it.
%cd /content/drive/MyDrive/COMP5329_A2/

In [2]:

import os
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from PIL import Image
from torchvision import transforms, models
from transformers import BertTokenizer
from torch.optim import AdamW
from sklearn.metrics import f1_score
import numpy as np
from transformers import AutoModel, AutoConfig
import zipfile

# Preferred compute device: CUDA when available, otherwise CPU.
# NOTE(review): the visible inference cells pass the literal 'cpu' instead of
# this variable, so it appears unused in this notebook — confirm before removing.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

  from .autonotebook import tqdm as notebook_tqdm


## 1. Prediction

### 1.1 Load Model

In [3]:
class PreMultiModalClassifier(nn.Module):
    """Two-branch image + text classifier for multi-label prediction.

    Only the network *architecture* is built here (no pretrained weights are
    downloaded); the parameters are expected to come from a saved state dict.

    Args:
        num_labels: size of the output vector (one score per label slot).
    """

    def __init__(self, num_labels):
        super().__init__()

        # Image branch: EfficientNet-B0 convolutional trunk, randomly initialised.
        backbone = models.efficientnet_b0(pretrained=False)
        self.image_model = backbone.features
        self.image_fc = nn.Linear(1280, 512)

        # Text branch: MiniLM built from its config only, so no weights are fetched.
        text_config = AutoConfig.from_pretrained("nreimers/MiniLM-L6-H384-uncased")
        self.text_model = AutoModel.from_config(text_config)
        self.text_fc = nn.Linear(384, 512)

        # Fusion head operating on the concatenated 512 + 512 features.
        self.dropout = nn.Dropout(0.3)
        self.classifier = nn.Linear(512 * 2, num_labels)

    def forward(self, image, input_ids, attention_mask):
        """Return per-label sigmoid scores for a batch.

        Args:
            image: batch of image tensors fed to the convolutional trunk.
            input_ids: token ids for the text encoder.
            attention_mask: attention mask matching ``input_ids``.

        Returns:
            Tensor of shape (batch, num_labels) with values in (0, 1).
        """
        # Convolutional feature map -> global average pool -> flat vector.
        feature_map = self.image_model(image)
        pooled = nn.functional.adaptive_avg_pool2d(feature_map, 1)
        pooled = pooled.view(pooled.shape[0], -1)
        image_features = self.image_fc(pooled)

        # The hidden state of the first token is used as the sentence embedding.
        encoder_out = self.text_model(input_ids=input_ids, attention_mask=attention_mask)
        first_token = encoder_out.last_hidden_state[:, 0, :]
        text_features = self.text_fc(first_token)

        combined = torch.cat([image_features, text_features], dim=1)
        logits = self.classifier(self.dropout(combined))
        return torch.sigmoid(logits)

In [4]:
# Build the float architecture; its random weights are placeholders only.
model_test = PreMultiModalClassifier(num_labels=20)

# Apply dynamic int8 quantization to the nn.Linear layers so the module
# layout matches the quantized checkpoint loaded below.
quantized_model_test = torch.quantization.quantize_dynamic(
    model_test,
    {torch.nn.Linear},
    dtype=torch.qint8,
)

# Restore the saved quantized parameters onto the CPU.
quantized_state = torch.load('model/quantized_model.pth', map_location='cpu')
quantized_model_test.load_state_dict(quantized_state)


  device=storage.device,


<All keys matched successfully>

### 1.2 Load test set

In [5]:
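# Optional one-off step: extract the dataset archive into the working directory
# (replace 'filename.zip' with the actual archive name before uncommenting).
# with zipfile.ZipFile('filename.zip', 'r') as zip_ref:
#     zip_ref.extractall('.')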

In [6]:
# Rewrite the raw test CSV so that every line contains at most one comma:
# the first comma (the ImageID / Caption separator) is kept and every later
# comma, i.e. those inside the caption text, is removed.
input_file = 'COMP5329S1A2Dataset/test.csv'
output_file = 'process/test_cleaned.csv'

# Create the output folder if needed so the cell also works on a fresh
# checkout (open(..., "w") does not create missing directories).
output_dir = os.path.dirname(output_file)
if output_dir:
    os.makedirs(output_dir, exist_ok=True)

with open(input_file, "r", encoding="utf-8") as fin, open(output_file, "w", encoding="utf-8") as fout:
    for line in fin:
        # partition() yields (line, '', '') when there is no comma, so lines
        # without a separator are written back unchanged.
        head, separator, remainder = line.partition(',')
        fout.write(head + separator + remainder.replace(',', ''))

In [7]:
class MultimodalDataset(Dataset):
    """Image + caption dataset backed by a CSV file.

    Every row provides an ``ImageID`` (file name inside ``image_dir``) and a
    ``Caption``. When ``is_train`` is true, a space-separated ``Labels`` column
    is also read and converted to a multi-hot vector of length ``num_classes``
    in which label ``k`` sets position ``k``.

    Args:
        csv_path: path of the CSV file to read (malformed lines are skipped).
        image_dir: directory that contains the image files.
        num_classes: length of the multi-hot label vector.
        max_length: number of tokens each caption is padded / truncated to.
        is_train: enables augmentation and label output.
    """

    def __init__(self, csv_path, image_dir, num_classes=20, max_length=128, is_train=True):
        self.data = pd.read_csv(csv_path, quotechar='"', on_bad_lines='skip')
        self.image_dir = image_dir
        self.num_classes = num_classes
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.max_length = max_length
        self.is_train = is_train

        if self.is_train:
            # Training: resize, then random crop / flip / colour jitter.
            spatial_steps = [
                transforms.Resize((256, 256)),
                transforms.RandomResizedCrop(224),
                transforms.RandomHorizontalFlip(),
                transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
            ]
        else:
            # Evaluation: deterministic resize only.
            spatial_steps = [transforms.Resize((224, 224))]

        # Both modes finish with tensor conversion and channel-wise normalisation.
        tensor_steps = [
            transforms.ToTensor(),
            transforms.Normalize(mean=[0.485, 0.456, 0.406],
                                 std=[0.229, 0.224, 0.225]),
        ]
        self.transform = transforms.Compose(spatial_steps + tensor_steps)

    def __len__(self):
        """Number of rows read from the CSV."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return one sample as a dict.

        Keys are ``image``, ``input_ids`` and ``attention_mask``; a ``labels``
        entry is added when the dataset is in training mode.
        """
        record = self.data.iloc[idx]

        image_file = os.path.join(self.image_dir, record['ImageID'])
        image = self.transform(Image.open(image_file).convert('RGB'))

        encoded = self.tokenizer(
            str(record['Caption']),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        # The tokenizer returns a batch of one; drop that leading dimension.
        sample = {
            'image': image,
            'input_ids': encoded['input_ids'].squeeze(0),
            'attention_mask': encoded['attention_mask'].squeeze(0),
        }
        if not self.is_train:
            return sample

        # Multi-hot target: each label id marks the position with the same index.
        target = torch.zeros(self.num_classes)
        target[[int(token) for token in str(record['Labels']).split()]] = 1.0
        sample['labels'] = target
        return sample


In [8]:

image_dir = 'COMP5329S1A2Dataset/data'
test_csv_path = 'process/test_cleaned.csv'

# In-memory copy of the test table with captions lower-cased and full stops
# removed; later cells use it for the ImageID column of the submission file.
# NOTE(review): MultimodalDataset re-reads the CSV from disk, so this caption
# normalisation does not reach the model inputs — confirm that matches how the
# training captions were prepared.
test_df = pd.read_csv(test_csv_path).assign(
    Caption=lambda frame: frame['Caption'].str.replace('.', '', regex=False).str.lower()
)

# Evaluation-mode dataset (no augmentation, no labels), served in file order.
test_set = MultimodalDataset(
    csv_path=test_csv_path,
    image_dir=image_dir,
    num_classes=20,
    is_train=False,
)
test_loader = DataLoader(test_set, batch_size=16, shuffle=False)


### 1.3 Prediction

In [9]:
# Run the quantized model over the test set and threshold the sigmoid scores.
# All tensors stay on the CPU, where the checkpoint was loaded.
quantized_model_test.eval()
preds = []

with torch.no_grad():
    for batch in test_loader:
        scores = quantized_model_test(
            batch['image'].to('cpu'),
            batch['input_ids'].to('cpu'),
            batch['attention_mask'].to('cpu'),
        ).cpu().numpy()
        # A label is predicted when its score exceeds 0.5.
        preds.append((scores > 0.5).astype(int))
preds = np.vstack(preds)

# Convert each multi-hot row back to a space-separated label string.
# MultimodalDataset encodes label k at position k (labels[label_indices] = 1.0),
# so the position IS the label id. The previous "+ 1" shifted every predicted
# label by one and could emit a label id equal to num_classes, which the
# encoding cannot represent.
pred_labels = [' '.join(map(str, np.flatnonzero(row))) for row in preds]

torch.cuda.empty_cache()



### 1.4 Save Prediction Results

In [10]:
# Attach the decoded predictions to the test table (row order matches the
# unshuffled DataLoader), then export the two submission columns.
test_df['PredictedLabels'] = pred_labels
pred_df = test_df[['ImageID', 'PredictedLabels']].rename(columns={'PredictedLabels': 'Labels'})

# Header row is written, the DataFrame index is not.
pred_df.to_csv('Predicted_labels.csv', index=False)

# save to txt file
# with open('Predicted_labels.txt', 'w') as f:
#     for index, row in pred_df.iterrows():
#         f.write(f"{row['ImageID']},{row['Labels']}\n")

