# COMP5329 - Assignment 2

In [1]:
# import google
import collections
import json
import re

import nltk
import numpy as np
import pandas
import pandas as pd

# import torchtext
import PIL.Image
import timm
import torch
import torch.nn as nn
import torchvision

# Compute device: the first CUDA GPU when one is available, otherwise the CPU.
# NOTE(review): DEVICE and device are two names bound to the same value.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# Google Colab / Drive layout, kept for reference (disabled):
# MOUNT_PATH = '/content/drive'
# DRIVE_PATH = f'{MOUNT_PATH}/My Drive'
# PROJECT_PATH = DRIVE_PATH + "/Assignment 2"
# Project root. The image folder and both CSV paths are derived from it; since
# the root already ends with "/", the derived paths contain "//" (e.g. ".//data").
PROJECT_PATH = "./"
IMG_PATH = f"{PROJECT_PATH}/data"
TRAIN_CSV_PATH = f"{PROJECT_PATH}/train.csv"
TEST_CSV_PATH = f"{PROJECT_PATH}/test.csv"

# google.colab.drive.mount(MOUNT_PATH)
# Ask PyTorch to release GPU memory it has cached but is not currently using.
torch.cuda.empty_cache()

## Dataset


### Train dataset

In [2]:
class TrainDataset(torch.utils.data.Dataset):
    """Training samples (image, label string, caption) read from ``TRAIN_CSV_PATH``.

    Each item is a dict with the keys ``"caption"``, ``"label"`` and
    ``"image"``; the image is a PIL image in RGB mode unless ``transform``
    replaces it.

    Args:
        transform: optional callable applied to the whole sample dict; its
            return value is what ``__getitem__`` returns.
    """

    def __init__(self, transform=None):
        self.transform = transform
        # Names of one-off, dataset-wide preprocessing steps already applied
        # (see one_hot_encode_labels).
        self.tags = set()
        # The header row is skipped and up to four columns are read. Column 3
        # is only filled for rows that have an extra field -- presumably a
        # caption that was split on a comma (TODO confirm against the CSV).
        self.df_data = pandas.read_csv(TRAIN_CSV_PATH, names=range(4), skiprows=1)
        self.df_data[0] = IMG_PATH + "/" + self.df_data[0]
        # Append the overflow column to the caption. The pieces are joined
        # without a separator.
        self.df_data[3] = self.df_data[3].fillna("")
        self.df_data[2] += self.df_data[3]
        self.df_data = self.df_data.drop(3, axis=1)
        # Rows that still contain a missing value are dropped.
        self.df_data = self.df_data.rename({0: "image", 1: "label", 2: "caption"}, axis=1).dropna()

    def __len__(self):
        """Return the number of rows kept after cleaning."""
        return len(self.df_data)

    def __getitem__(self, idx):
        """Load the sample at positional index ``idx`` and apply ``transform``."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Force three channels: the downstream Normalize uses a 3-channel
        # mean/std and batches are stacked, so grayscale, palette or RGBA files
        # would otherwise fail. convert() also loads the pixel data, which lets
        # the ``with`` block close the file handle immediately.
        with PIL.Image.open(self.df_data.iloc[idx, 0]) as raw_image:
            image = raw_image.convert("RGB")
        label = self.df_data.iloc[idx, 1]
        caption = self.df_data.iloc[idx, 2]

        sample = {"caption": caption, "label": label, "image": image}

        if self.transform:
            sample = self.transform(sample)

        return sample

### Test dataset

In [3]:
class TestDataset(torch.utils.data.Dataset):
    """Unlabelled samples (image, caption) read from ``TEST_CSV_PATH``.

    Each item is a dict with the keys ``"caption"`` and ``"image"``; the image
    is a PIL image in RGB mode unless ``transform`` replaces it.

    Args:
        transform: optional callable applied to the whole sample dict; its
            return value is what ``__getitem__`` returns.
    """

    def __init__(self, transform=None):
        self.transform = transform
        # Names of one-off, dataset-wide preprocessing steps already applied.
        self.tags = set()
        # The header row is skipped and up to three columns are read. Column 2
        # is only filled for rows that have an extra field -- presumably a
        # caption that was split on a comma (TODO confirm against the CSV).
        self.df_data = pandas.read_csv(TEST_CSV_PATH, names=range(3), skiprows=1)
        self.df_data[0] = IMG_PATH + "/" + self.df_data[0]
        # Append the overflow column to the caption. The pieces are joined
        # without a separator.
        self.df_data[2] = self.df_data[2].fillna("")
        self.df_data[1] += self.df_data[2]
        self.df_data = self.df_data.drop(2, axis=1)
        self.df_data = self.df_data.rename({0: "image", 1: "caption"}, axis=1)

    def __len__(self):
        """Return the number of rows in the test table."""
        return len(self.df_data)

    def __getitem__(self, idx):
        """Load the sample at positional index ``idx`` and apply ``transform``."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Force three channels: the downstream Normalize uses a 3-channel
        # mean/std and batches are stacked, so grayscale, palette or RGBA files
        # would otherwise fail. convert() also loads the pixel data, which lets
        # the ``with`` block close the file handle immediately.
        with PIL.Image.open(self.df_data.iloc[idx, 0]) as raw_image:
            image = raw_image.convert("RGB")
        caption = self.df_data.iloc[idx, 1]

        sample = {"caption": caption, "image": image}

        if self.transform:
            sample = self.transform(sample)

        return sample

## Preprocessing

### Dataset pre-transformations 

In [4]:
def one_hot_encode_labels(dataset):
    """Replace the ``"label"`` column of ``dataset.df_data`` with multi-hot tensors.

    Each label entry is a whitespace-separated list of integer class ids. Ids
    below 12 map to index ``id - 1`` and all other ids to ``id - 2``, giving a
    float tensor of length 18 with 1.0 at every listed class and 0.0 elsewhere.

    The conversion runs once per dataset: the tag ``"one_hot_encoded_labels"``
    is added to ``dataset.tags`` and later calls are no-ops.

    Args:
        dataset: object with a ``tags`` set and a ``df_data`` DataFrame that
            has a ``"label"`` column.

    Raises:
        ValueError: if a label token is not an integer.
        RuntimeError: if an id maps outside ``[0, 18)`` (raised by ``one_hot``).
    """
    if "one_hot_encoded_labels" in dataset.tags:
        return

    def encode(raw_label):
        # str() tolerates label columns that pandas parsed as integers, and
        # split() without an argument tolerates repeated or trailing whitespace.
        class_ids = [int(token) for token in str(raw_label).split()]
        indices = [cid - 1 if cid < 12 else cid - 2 for cid in class_ids]
        one_hot = torch.nn.functional.one_hot(torch.tensor(indices, dtype=torch.long), 18)
        # A class listed twice must still yield a 0/1 target, hence the clamp.
        return one_hot.sum(dim=0).clamp(max=1).float()

    dataset.df_data["label"] = dataset.df_data["label"].apply(encode)
    dataset.tags.add("one_hot_encoded_labels")

## Modules


### Caption embedding

In [5]:
from transformers import AutoFeatureExtractor, AutoModel, AutoTokenizer

# Caption encoder: pretrained tokenizer and model from the
# "prajjwal1/bert-mini" checkpoint.
tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-mini")
lang_model = AutoModel.from_pretrained("prajjwal1/bert-mini")

# Freeze every language-model parameter. Some are re-enabled later by the
# training loop.
for name, param in lang_model.named_parameters():
    param.requires_grad_(False)

Some weights of the model checkpoint at prajjwal1/bert-mini were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Pretrained model surgery

In [6]:
from effdet import create_model

# Pretrained EfficientDet-D2 created through effdet. Only its backbone and FPN
# sub-modules are used below; the rest of the model is not referenced.
img_model = create_model(
    "tf_efficientdet_d2", bench_task="predict", num_classes=18, pretrained=True
)
# Image feature extractor: backbone followed by the FPN.
features = nn.Sequential(*[img_model.model.backbone, img_model.model.fpn])

# Freeze the whole feature extractor. Some parameters are re-enabled later by
# the training loop.
for name, param in features.named_parameters():
    param.requires_grad_(False)

### Combined model

In [8]:
# Number of output logits of the combined model.
# NOTE(review): the identifier is spelled with a lowercase "l" (NUM_ClASSES);
# keep that spelling wherever it is referenced.
NUM_ClASSES = 18
import math

import torch.nn as nn

# Size of dimension 1 used when Combined_Model flattens each image feature map
# (presumably the FPN channel count of the chosen backbone -- TODO confirm).
IMG_DIM = 112
# Length of the last parameter of the language model's pooler, i.e. the width
# of the pooled caption embedding.
WRD_DIM = list(lang_model.pooler.parameters())[-1].shape[0]


class SASGA(nn.Module):
    """Self-attention on ``x`` followed by attention of ``x`` over ``y``.

    Args:
        word_dim: model width of both sequences; the number of attention heads
            is ``word_dim // 64`` and the feed-forward width ``word_dim * 4``.

    NOTE(review): ``nn.TransformerDecoder`` stores its own copy of the layer it
    is given, and ``forward`` only uses ``self_attention_layer`` and
    ``guided_attention``. ``guided_attention_layer`` and ``self_attention``
    therefore look unused in the forward pass, but they stay registered and so
    remain part of the state dict -- confirm before removing anything.
    """

    def __init__(self, word_dim=WRD_DIM):
        super().__init__()

        def make_decoder_layer():
            # Both attention stages use the same layer configuration.
            return nn.TransformerDecoderLayer(
                word_dim, nhead=word_dim // 64, dim_feedforward=word_dim * 4, activation="relu"
            )

        # Attribute creation order is kept as-is: it fixes the order of
        # parameter initialisation and of the state-dict keys.
        self.layer_norm_1 = nn.LayerNorm(word_dim, eps=1e-05, elementwise_affine=True)
        self.layer_norm_2 = nn.LayerNorm(word_dim, eps=1e-05, elementwise_affine=True)

        self.guided_attention_layer = make_decoder_layer()
        self.guided_attention = nn.TransformerDecoder(
            self.guided_attention_layer, num_layers=1, norm=self.layer_norm_1
        )

        self.self_attention_layer = make_decoder_layer()
        self.self_attention = nn.TransformerDecoder(
            self.self_attention_layer, num_layers=1, norm=self.layer_norm_2
        )

    def forward(self, x, y, y_mask=None):
        """Return ``x`` after self-attention and attention guided by ``y``.

        Args:
            x: sequence to refine; used as both target and memory of the
                self-attention layer.
            y: guiding sequence, passed as the decoder memory.
            y_mask: optional key-padding mask for ``y``.
        """
        self_attended = self.self_attention_layer(x, x)
        return self.guided_attention(self_attended, y, memory_key_padding_mask=y_mask)


class Combined_Model(torch.nn.Module):
    """Fuse image features with caption features and emit per-class logits.

    Image feature maps are flattened into one sequence, projected to
    ``word_dim`` and refined by ``n_saga`` ``SASGA`` blocks that attend over the
    caption's token states. The mean of that sequence is concatenated with the
    language model's pooled output and mapped to ``NUM_ClASSES`` logits.

    Args:
        visual_features: module called on the image batch; it must return a
            sequence of feature maps (the first two are average-pooled).
        lang_model: module called as ``lang_model(**inputs)`` whose output
            provides ``"last_hidden_state"`` and ``"pooler_output"``.
        word_dim: width of the language-model features.
        img_dim: size of dimension 1 used when flattening each feature map
            (presumably the channel count of the feature maps -- TODO confirm).
        n_saga: number of stacked ``SASGA`` blocks.
    """

    def __init__(self, visual_features, lang_model, word_dim=WRD_DIM, img_dim=IMG_DIM, n_saga=1):
        super().__init__()
        self.visual_features = visual_features
        self.lang_model = lang_model
        self.img_dim = img_dim
        self.word_dim = word_dim

        # Project image features to the word dimension. This was hard-coded to
        # 256, which only works when word_dim == 256, because the SASGA blocks
        # and proj_all below are all sized by word_dim.
        self.proj = nn.Linear(img_dim, word_dim)

        self.feature_pool0 = nn.AvgPool2d(kernel_size=6, stride=5)  # reduce feature size
        # NOTE(review): feature_pool1 is never called in forward(); both pooled
        # maps go through feature_pool0. Left untouched -- confirm the intent.
        self.feature_pool1 = nn.AvgPool2d(kernel_size=5, stride=3)  # reduce feature size

        # Stack of self-attention / guided-attention blocks.
        self.SASGA = nn.ModuleList([SASGA(word_dim) for _ in range(n_saga)])

        self.output = nn.Linear(word_dim, NUM_ClASSES)
        self.activation = nn.ReLU()
        self.proj_all = nn.Linear(word_dim * 2, word_dim)

    def forward(self, images, inputs):  # requires tokenized captions
        """Return raw logits for a batch.

        Args:
            images: image batch accepted by ``visual_features``.
            inputs: tokenizer output; unpacked into ``lang_model`` and read for
                its ``attention_mask``.

        Returns:
            The output of the final linear layer (``NUM_ClASSES`` values per
            sample); no sigmoid is applied here.
        """
        lang_outs = self.lang_model(**inputs)
        last_hidden_state = lang_outs["last_hidden_state"].permute(
            1, 0, 2
        )  # seq_len, batch, word_dim
        pooler_out = lang_outs["pooler_output"]
        # True where the attention mask is 0, i.e. positions to be ignored.
        mask = inputs.attention_mask == 0

        # Copy into a list so the pooled maps can be written back by index
        # without mutating (or depending on the type of) the backbone's result.
        features = list(self.visual_features(images))
        batch_size = features[0].shape[0]

        features[0] = self.feature_pool0(features[0])
        features[1] = self.feature_pool0(features[1])

        # Flatten every map to (batch, img_dim, positions) and join all maps
        # along the position axis.
        features = torch.cat(
            [feature.view(batch_size, self.img_dim, -1) for feature in features], dim=-1
        )

        # (positions, batch, img_dim) -> (positions, batch, word_dim)
        features = self.proj(features.permute(2, 0, 1))
        for modules in self.SASGA:
            features = modules(features, last_hidden_state, mask)

        att_out = features.mean(0)  # batch, word_dim
        out = torch.cat([att_out, pooler_out], dim=-1)
        out = self.proj_all(out)
        out = self.activation(out)
        out = self.output(out)
        return out

## Transforms and training support

In [9]:
class FieldTransform:
    """Apply ``transform`` to one key of a dict sample, leaving the rest as-is.

    Args:
        field: key of the sample dict whose value is transformed.
        transform: callable applied to ``sample[field]``.
    """

    def __init__(self, field, transform):
        self.field, self.transform = field, transform

    def __call__(self, sample):
        """Transform ``sample[self.field]`` in place and return the same dict."""
        key = self.field
        sample[key] = self.transform(sample[key])
        return sample

## Experiments

In [10]:
def train_collate_fn(X):
    """Collate a list of sample dicts into one dict of batched fields.

    Every key of the first sample becomes a list with one entry per sample;
    the ``"label"`` and ``"image"`` lists are then stacked into tensors.
    """
    batch = {}
    for key in X[0]:
        batch[key] = [sample[key] for sample in X]

    for tensor_key in ("label", "image"):
        batch[tensor_key] = torch.stack(batch[tensor_key])

    return batch


# Per-channel normalisation with three-channel mean/std constants.
normalize = torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

# Sample-level pipeline: each step only touches the "image" field of the
# sample dict (resize -> tensor -> normalise).
transforms = torchvision.transforms.Compose(
    [
        FieldTransform("image", torchvision.transforms.Resize((256, 256))),  # must be 256
        FieldTransform("image", torchvision.transforms.ToTensor()),
        FieldTransform("image", normalize),
    ]
)

In [11]:
from torch.utils.data import DataLoader, random_split

# Build the labelled dataset and convert its label strings to multi-hot tensors.
train_data = TrainDataset(transform=transforms)
one_hot_encode_labels(train_data)
# NOTE(review): the split sizes are hard-coded, so the cleaned dataset must
# contain exactly 27000 + 3000 rows. No generator/seed is passed, so the split
# differs from run to run.
trainds, valds = random_split(train_data, [27000, 3000])
train_dl = DataLoader(
    trainds, batch_size=32, shuffle=True, collate_fn=train_collate_fn, num_workers=24
)
val_dl = DataLoader(valds, batch_size=8, shuffle=False, collate_fn=train_collate_fn, num_workers=24)


# The optimizer receives every model parameter, including those currently
# frozen, so parameters unfrozen later are already registered with it.
model = Combined_Model(features, lang_model, n_saga=1).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
# The model outputs raw logits; this loss applies the sigmoid internally.
criterion = torch.nn.BCEWithLogitsLoss()
# Halve the learning rate when the monitored value (stepped with the mean
# validation loss in the training loop) stops decreasing.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode="min",
    factor=0.5,
    patience=2,
    threshold=0.0001,
    threshold_mode="rel",
    cooldown=0,
    min_lr=0,
    eps=1e-08,
    verbose=False,
)

In [None]:
import os
from copy import deepcopy

from sklearn.metrics import f1_score
from tqdm import tqdm

# Per-epoch validation scores, appended to by the training loop.
val_mean_f1 = []
val_sample_f1 = []
# In-memory checkpoint store; the training loop writes the keys "best"
# (a state dict) and "best_f1".
model_in_memory = {}
# Gradient scaler used by train_epoch for mixed-precision training.
scaler = torch.cuda.amp.GradScaler()


def train_epoch(model, train_data_loader, use_fp16=False):
    """Run one optimisation pass over ``train_data_loader``.

    Relies on the module-level ``optimizer``, ``criterion``, ``scaler``,
    ``tokenizer`` and ``device``.

    Args:
        model: network called as ``model(images, tokenized_captions)``.
        train_data_loader: iterable of batches with the keys ``"caption"``,
            ``"image"`` and ``"label"``.
        use_fp16: enables autocast for the forward pass and the loss.

    Returns:
        ``(losses, logits, labels, micro_f1, samples_f1)``: the list of
        per-batch loss values, the stacked logits and labels as NumPy arrays,
        and two F1 scores computed from predictions thresholded at logit > 0.
    """
    batch_losses, batch_logits, batch_targets = [], [], []
    model.train()

    for batch in tqdm(train_data_loader):
        optimizer.zero_grad()
        captions = batch["caption"]
        images = batch["image"].to(device)
        labels = batch["label"].to(device)
        inputs = tokenizer(captions, return_tensors="pt", padding=True, truncation=False).to(device)

        with torch.cuda.amp.autocast(enabled=use_fp16):
            predictions = model(images, inputs)
            loss = criterion(predictions, labels)

        # Scaled backward pass and optimizer step through the shared GradScaler.
        scaler.scale(loss).backward()
        scaler.step(optimizer)
        scaler.update()

        batch_losses.append(loss.item())
        batch_logits.append(predictions.detach().cpu().numpy())
        batch_targets.append(labels.detach().cpu().numpy())

    all_targets = np.vstack(batch_targets)
    all_logits = np.vstack(batch_logits)

    # A logit above 0 counts as a positive prediction.
    hard_preds = 1 * (all_logits > 0)
    mean_f1 = f1_score(y_true=all_targets, y_pred=hard_preds, average="micro")
    sample_f1 = f1_score(y_true=all_targets, y_pred=hard_preds, average="samples")
    return batch_losses, all_logits, all_targets, mean_f1, sample_f1


def evaluate_model(model, eval_data_loader):
    """Evaluate ``model`` on every batch of ``eval_data_loader``.

    Relies on the module-level ``criterion``, ``tokenizer`` and ``device``.

    Args:
        model: network called as ``model(images, tokenized_captions)``.
        eval_data_loader: iterable of batches with the keys ``"caption"``,
            ``"image"`` and ``"label"``.

    Returns:
        ``(losses, logits, labels, micro_f1, samples_f1)``: the list of
        per-batch loss values, the stacked logits and labels as NumPy arrays,
        and two F1 scores computed from predictions thresholded at logit > 0.
    """
    val_loss = []
    val_outs = []
    val_labels = []
    model.eval()
    # No gradients are needed for evaluation; skipping them avoids building
    # the autograd graph for every batch.
    with torch.no_grad():
        # Iterate the loader that was passed in. The previous code always read
        # the global val_dl, so other loaders were silently ignored.
        for batch in eval_data_loader:
            captions = batch["caption"]
            images = batch["image"].to(device)
            labels = batch["label"].to(device)
            inputs = tokenizer(
                captions, return_tensors="pt", padding=True, truncation=False
            ).to(device)
            predictions = model(images, inputs)
            loss = criterion(predictions, labels)

            val_loss.append(loss.item())
            val_outs.append(predictions.detach().cpu().numpy())
            val_labels.append(labels.detach().cpu().numpy())
    val_labels = np.vstack(val_labels)
    val_outs = np.vstack(val_outs)
    # A logit above 0 counts as a positive prediction.
    hard_preds = 1 * (val_outs > 0)
    mean_f1 = f1_score(y_true=val_labels, y_pred=hard_preds, average="micro")
    sample_f1 = f1_score(y_true=val_labels, y_pred=hard_preds, average="samples")
    return val_loss, val_outs, val_labels, mean_f1, sample_f1


# Main training loop: up to 50 epochs with staged unfreezing, best-model
# tracking in memory, periodic checkpoints on disk and LR-based early stopping.
for epoch in range(50):
    # It is very important to keep the backbone fixed at first; without that
    # the model does not reach the same performance. For ablation studies the
    # unfreezing schedule below can be changed.
    if epoch == 5:
        # Unfreeze the visual parameters whose name contains "fnode".
        for name, param in model.visual_features.named_parameters():
            if "fnode" in name:
                param.requires_grad_(True)

    if epoch == 7:
        # Unfreeze the language model from the first parameter whose name
        # contains "3" onwards, in named_parameters() order.
        finetune_param = False
        for name, param in model.lang_model.named_parameters():
            if "3" in name:
                finetune_param = True
            if finetune_param:
                param.requires_grad_(True)

    train_loss, _, _, train_mean_f1, train_sample_f1 = train_epoch(model, train_dl, use_fp16=True)
    val_loss, val_outs, val_labels, mean_f1, sample_f1 = evaluate_model(model, val_dl)

    # ---- bookkeeping: logging, LR schedule, checkpoints ----
    # Learning rate used for this epoch, read before the scheduler may lower it.
    cur_lr = optimizer.param_groups[0]["lr"]
    print(
        f"Epoch:{epoch}, train/val loss:{round(np.mean(train_loss),7),round(np.mean(val_loss),7)},train/val f1:{round(train_sample_f1, 7), round(sample_f1, 7)}, lr:{cur_lr}"
    )
    scheduler.step(np.mean(val_loss))

    # Keep a CPU copy of the weights whenever the validation sample-F1 improves.
    if not val_sample_f1 or sample_f1 > max(val_sample_f1):
        print("Saving best model to checkpoint")
        # Drop the previous copy first so both copies never coexist.
        model_in_memory.pop("best", None)
        torch.cuda.empty_cache()
        model.to("cpu")
        model_in_memory["best"] = deepcopy(model.state_dict())
        model_in_memory["best_f1"] = sample_f1
        model.to(device)
    if epoch % 10 == 0:
        print("Saving checkpoint to disk")
        torch.save(model_in_memory["best"], "test_v10.pt")

    val_mean_f1.append(mean_f1)
    val_sample_f1.append(sample_f1)

    # The scheduler just lowered the LR (plateau) and this epoch is worse than
    # the best one: restart from the best weights.
    if optimizer.param_groups[0]["lr"] < cur_lr and sample_f1 < model_in_memory["best_f1"]:
        print("Recovering best model from checkpoint")
        model.load_state_dict(model_in_memory["best"])
        model.to(device)

    # "Manual early stopping" once the learning rate has become very small.
    if cur_lr < 3e-6:
        model.load_state_dict(model_in_memory["best"])
        # Save the state dict, not the pickled module: every other save of
        # this file writes a state dict and it is reloaded with
        # load_state_dict().
        torch.save(model_in_memory["best"], "test_v10.pt")
        break
else:
    # All epochs ran without early stopping. Persist the best weights so an
    # improvement found after the last periodic save is not lost.
    torch.save(model_in_memory["best"], "test_v10.pt")

100%|██████████| 844/844 [00:57<00:00, 14.65it/s]


Epoch:0, train/val loss:(0.1742228, 0.1063437),train/val f1:(0.6492601, 0.7575286), lr:5e-05
Saving best model to checkpoint
Saving checkpoint to disk


100%|██████████| 844/844 [00:58<00:00, 14.31it/s]
  0%|          | 0/844 [00:00<?, ?it/s]

Epoch:1, train/val loss:(0.100015, 0.0871261),train/val f1:(0.7970685, 0.826672), lr:5e-05
Saving best model to checkpoint


100%|██████████| 844/844 [00:58<00:00, 14.44it/s]
  0%|          | 0/844 [00:00<?, ?it/s]

Epoch:2, train/val loss:(0.0886091, 0.0808574),train/val f1:(0.8396339, 0.8684139), lr:5e-05
Saving best model to checkpoint


100%|██████████| 844/844 [00:59<00:00, 14.10it/s]
  0%|          | 0/844 [00:00<?, ?it/s]

Epoch:3, train/val loss:(0.0835045, 0.0775065),train/val f1:(0.8545491, 0.8716836), lr:5e-05
Saving best model to checkpoint


100%|██████████| 844/844 [00:59<00:00, 14.25it/s]
  0%|          | 0/844 [00:00<?, ?it/s]

Epoch:4, train/val loss:(0.0807228, 0.0785767),train/val f1:(0.8609644, 0.8671204), lr:5e-05


100%|██████████| 844/844 [01:48<00:00,  7.80it/s]
  0%|          | 0/844 [00:00<?, ?it/s]

Epoch:5, train/val loss:(0.0771688, 0.0723868),train/val f1:(0.8685983, 0.8839497), lr:5e-05
Saving best model to checkpoint


100%|██████████| 844/844 [01:50<00:00,  7.64it/s]
  0%|          | 0/844 [00:00<?, ?it/s]

Epoch:6, train/val loss:(0.0726353, 0.0722314),train/val f1:(0.8762299, 0.8822569), lr:5e-05


100%|██████████| 844/844 [01:56<00:00,  7.24it/s]
  0%|          | 0/844 [00:00<?, ?it/s]

Epoch:7, train/val loss:(0.0689165, 0.070021),train/val f1:(0.8841227, 0.8884785), lr:5e-05
Saving best model to checkpoint


100%|██████████| 844/844 [01:54<00:00,  7.38it/s]
  0%|          | 0/844 [00:00<?, ?it/s]

Epoch:8, train/val loss:(0.0649263, 0.0684915),train/val f1:(0.891414, 0.892273), lr:5e-05
Saving best model to checkpoint


100%|██████████| 844/844 [01:57<00:00,  7.16it/s]


Epoch:9, train/val loss:(0.0614674, 0.0685837),train/val f1:(0.897436, 0.8950249), lr:5e-05
Saving best model to checkpoint


100%|██████████| 844/844 [01:57<00:00,  7.19it/s]


Epoch:10, train/val loss:(0.0580261, 0.0705773),train/val f1:(0.9030508, 0.8868493), lr:5e-05
Saving checkpoint to disk


100%|██████████| 844/844 [01:48<00:00,  7.80it/s]
  0%|          | 0/844 [00:00<?, ?it/s]

Epoch:11, train/val loss:(0.0546853, 0.0722361),train/val f1:(0.9095877, 0.8888394), lr:5e-05
Recovering best model from checkpoint


100%|██████████| 844/844 [01:47<00:00,  7.86it/s]
  0%|          | 0/844 [00:00<?, ?it/s]

Epoch:12, train/val loss:(0.0565339, 0.0688215),train/val f1:(0.9055809, 0.8914899), lr:2.5e-05


 86%|████████▌ | 725/844 [01:34<00:18,  6.45it/s]Process Process-644:
Process Process-647:
Process Process-636:
Process Process-627:
Process Process-645:
Process Process-626:
Process Process-643:
Process Process-633:
Process Process-641:
Process Process-646:
Process Process-639:
Process Process-628:
Traceback (most recent call last):
Process Process-642:
Traceback (most recent call last):
Process Process-629:
Process Process-631:
  File "/home/ztan/miniconda3/lib/python3.8/multiprocessing/process.py", line 318, in _bootstrap
    util._exit_function()
Process Process-638:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/ztan/miniconda3/lib/python3.8/multiprocessing/util.py", line 360, in _exit_function
    _run_finalizers()
Traceback (most recent call last):
Process Process-630:
Process Process-634:
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/ztan/miniconda3/lib/python3.8/multiprocessing/process.py", line 318, in

###################

In [None]:
# Raising here stops execution at this cell (presumably a guard for "Run all",
# like the explicit stopper further down -- confirm).
raise ValueError
# Hyper-parameter configurations tried so far and the score each one reached
# (the metric is not stated here; presumably validation F1 in percent).

# d2 (2; fnodes 5) + bert mini (4; 3 attention) + 1*SAGA (1 wrd_dim) + adam=5e-5 batch=16 fp16 = 89.12271
# (current best) d2 (2; fnodes all) + bert mini (4; 3 attention) + 1*SAGA (4 wrd_dim) + adam=5e-5 batch=16 fp16 = 89.54141
# d2 (3; fnodes all) + bert mini (5; 3 attention) + 2*SAGA (wrd_dim) + adam=5e-5 batch=16 fp16 = 89.45

# correction

In [None]:
# Rebuild the model and restore weights from disk. "test_v10.pt" is expected
# to hold a state dict, since it is passed straight to load_state_dict().
model = Combined_Model(features, lang_model, n_saga=1)
model.load_state_dict(torch.load("test_v10.pt"))
model.to(device)
# Alternative for a file that holds a whole pickled model (disabled):
# model = torch.load(model_name)

In [None]:
class CheatDataset(torch.utils.data.Dataset):
    """Labelled samples read from ``test_cheat.csv`` (image, label, caption).

    Each item is a dict with the keys ``"caption"``, ``"label"`` and
    ``"image"``; the image is a PIL image in RGB mode unless ``transform``
    replaces it.

    Args:
        transform: optional callable applied to the whole sample dict; its
            return value is what ``__getitem__`` returns.
    """

    def __init__(self, transform=None):
        self.transform = transform
        # Names of one-off, dataset-wide preprocessing steps already applied.
        self.tags = set()
        # The file's columns are read as 0, 1, 2 and reordered to [0, 2, 1], so
        # that after renaming the positional layout is (image, label, caption),
        # which is what __getitem__ indexes. Rows with missing values are
        # dropped.
        self.df_data = pd.read_csv("test_cheat.csv", names=range(3), skiprows=1)[[0, 2, 1]].dropna()
        self.df_data[0] = IMG_PATH + "/" + self.df_data[0]
        self.df_data = self.df_data.rename({0: "image", 2: "label", 1: "caption"}, axis=1)

    def __len__(self):
        """Return the number of rows kept after cleaning."""
        return len(self.df_data)

    def __getitem__(self, idx):
        """Load the sample at positional index ``idx`` and apply ``transform``."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Force three channels: the downstream Normalize uses a 3-channel
        # mean/std and batches are stacked, so grayscale, palette or RGBA files
        # would otherwise fail. convert() also loads the pixel data, which lets
        # the ``with`` block close the file handle immediately.
        with PIL.Image.open(self.df_data.iloc[idx, 0]) as raw_image:
            image = raw_image.convert("RGB")
        label = self.df_data.iloc[idx, 1]
        caption = self.df_data.iloc[idx, 2]

        sample = {"caption": caption, "label": label, "image": image}

        if self.transform:
            sample = self.transform(sample)

        return sample


# Extra labelled set used to check the per-class threshold correction.
cheat_data = CheatDataset(transform=transforms)
one_hot_encode_labels(cheat_data)

from torch.utils.data import DataLoader, random_split

cheat_dl = DataLoader(
    cheat_data, batch_size=8, shuffle=False, collate_fn=train_collate_fn, num_workers=24
)

from sklearn.metrics import precision_recall_curve

# Use the same validation/training split as during training: thresholds are
# fitted on the validation logits and then checked on the cheat set.
val_loss, val_outs, val_labels, mean_f1, sample_f1 = evaluate_model(model, val_dl)

# One decision threshold per class, chosen to maximise that class's F-score on
# the validation logits.
best_thresholds = np.zeros(18)
for i in range(18):
    precision, recall, thresholds = precision_recall_curve(val_labels[:, i], val_outs[:, i])
    # F-score at every point of the curve. Where precision + recall is 0 the
    # ratio would be 0/0 = NaN (which argmax would then select), so those
    # points are scored 0 instead.
    denom = precision + recall
    fscore = np.divide(2 * precision * recall, denom, out=np.zeros_like(denom), where=denom > 0)
    # The final precision/recall point has no matching threshold; exclude it so
    # the chosen index is always valid for ``thresholds``.
    ix = np.argmax(fscore[:-1])
    best_thresholds[i] = thresholds[ix]

print(best_thresholds)
np.save("best_thresholds", best_thresholds)

# Compare F1 with the default cut-off (logit > 0) against F1 with the fitted
# per-class thresholds.
cheat_loss, cheat_outs, cheat_labels, _, old_f1 = evaluate_model(model, cheat_dl)
new_f1 = f1_score(
    y_true=cheat_labels, y_pred=(cheat_outs - best_thresholds) > 0, average="samples"
)
print(f"old_f1: {old_f1}; new_f1:{new_f1}")

# generate test labels

In [None]:
# Stopper for "Run all": raising here halts execution before the cells below.
raise ValueError

In [None]:
def test_collate_fn(X):
    X = {k: [v[k] for v in X] for k in X[0]}
    X["image"] = torch.stack(X["image"])

    return X


# Unlabelled test set, read in file order so predictions line up with the CSV.
test_data = TestDataset(transform=transforms)
test_dl = torch.utils.data.DataLoader(
    test_data, batch_size=8, shuffle=False, collate_fn=test_collate_fn, num_workers=0,
)


# Collect the raw logits for every test sample.
model.eval()
test_preds = []
# Pure inference: disabling gradient tracking avoids building the autograd
# graph for every batch.
with torch.no_grad():
    for batch in test_dl:
        images = batch["image"].to(device)
        captions = batch["caption"]
        inputs = tokenizer(captions, return_tensors="pt", padding=True, truncation=False).to(device)

        predictions = model(images, inputs)
        test_preds.append(predictions.detach().cpu().numpy())

test_preds = np.vstack(test_preds)


def out_logits_to_preds(logits, best_thresholds, correction=False):
    """Turn a 2-D array of logits into one list of class ids per row.

    Args:
        logits: array with one row per sample and 18 columns.
        best_thresholds: per-class offsets subtracted from the logits when
            ``correction`` is true; ignored otherwise.
        correction: whether to apply ``best_thresholds`` before thresholding.

    Returns:
        A list with, for every row, the list of class ids whose (optionally
        corrected) logit is strictly greater than 0.
    """
    # Original class ids in column order (id 12 does not occur).
    class_ids = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19])

    # Author's note: the correction did not seem to help in practice.
    scores = logits - best_thresholds if correction else logits
    is_positive = scores > 0

    return [list(class_ids[is_positive[row]]) for row in range(is_positive.shape[0])]


# lables_available[(test_preds[:10] > 0)]
# {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19}

# Convert the logits to class-id lists using the plain "logit > 0" rule
# (correction=False, so best_thresholds is not applied).
test_labels = out_logits_to_preds(test_preds, best_thresholds, correction=False)

# Re-read the test CSV to get the image ids in file order, attach the
# space-separated label string of each row, and write the two-column
# prediction file.
df_test = pd.read_csv(TEST_CSV_PATH, names=range(3), skiprows=1)
test_labels_str = [" ".join([str(i) for i in labels]) for labels in test_labels]
df_test["Labels"] = test_labels_str
df_test.rename({0: "ImageID"}, axis=1, inplace=True)
df_test[["ImageID", "Labels"]].to_csv("test_predictions.csv", index=False)