# COMP5329 - Assignment 2

In [1]:
# import google
import collections
import json
import re

import nltk
import numpy as np
import pandas
import pandas as pd

# import torchtext
import PIL.Image
import torch
import torch.nn as nn
import torchvision

# Run on the first CUDA GPU when one is available, otherwise on the CPU.
# Later cells use both spellings, so both names are bound to the same device.
DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = DEVICE

# Colab / Google Drive layout (disabled; the notebook runs from a local folder):
# MOUNT_PATH = '/content/drive'
# DRIVE_PATH = f'{MOUNT_PATH}/My Drive'
# PROJECT_PATH = DRIVE_PATH + "/Assignment 2"
PROJECT_PATH = "./"
TRAIN_CSV_PATH = f"{PROJECT_PATH}/train.csv"
TEST_CSV_PATH = f"{PROJECT_PATH}/test.csv"
IMG_PATH = f"{PROJECT_PATH}/data"

# google.colab.drive.mount(MOUNT_PATH)
# Release cached GPU memory left over from a previous run of the notebook.
torch.cuda.empty_cache()

## Dataset


### Train dataset

In [2]:
class TrainDataset(torch.utils.data.Dataset):
    """Image / label / caption samples listed in the training CSV.

    The table is kept in ``self.df_data`` with the columns ``image`` (file
    path below ``IMG_PATH``), ``label`` and ``caption``.
    """

    def __init__(self, transform=None):
        """Read ``TRAIN_CSV_PATH``.

        Args:
            transform: optional callable applied to every sample dict
                returned by ``__getitem__``.
        """
        self.transform = transform
        self.tags = set()

        # The header row is skipped and four positional columns are read; the
        # fourth column (empty when absent) is appended to the caption text.
        table = pandas.read_csv(TRAIN_CSV_PATH, names=range(4), skiprows=1)
        table[0] = IMG_PATH + "/" + table[0]
        table[3] = table[3].fillna("")
        table[2] = table[2] + table[3]
        table = table.drop(columns=3)
        table = table.rename(columns={0: "image", 1: "label", 2: "caption"})
        # Rows that still miss an image, label or caption are discarded.
        self.df_data = table.dropna()

    def __len__(self):
        """Number of rows kept after loading."""
        return len(self.df_data)

    def __getitem__(self, idx):
        """Return the ``caption`` / ``label`` / ``image`` dict for row ``idx``.

        The image is opened with PIL; the whole dict is passed through
        ``self.transform`` when one was supplied.
        """
        position = idx.tolist() if torch.is_tensor(idx) else idx
        table = self.df_data

        picture = PIL.Image.open(table.iloc[position, 0])
        sample = {
            "caption": table.iloc[position, 2],
            "label": table.iloc[position, 1],
            "image": picture,
        }
        return self.transform(sample) if self.transform else sample

### Test dataset

In [3]:
class TestDataset(torch.utils.data.Dataset):
    """Image / caption samples listed in the (unlabelled) test CSV.

    The table is kept in ``self.df_data`` with the columns ``image`` (file
    path below ``IMG_PATH``) and ``caption``.
    """

    def __init__(self, transform=None):
        """Read ``TEST_CSV_PATH``.

        Args:
            transform: optional callable applied to every sample dict
                returned by ``__getitem__``.
        """
        self.transform = transform
        self.tags = set()

        # The header row is skipped and three positional columns are read; the
        # third column (empty when absent) is appended to the caption text.
        table = pandas.read_csv(TEST_CSV_PATH, names=range(3), skiprows=1)
        table[0] = IMG_PATH + "/" + table[0]
        table[2] = table[2].fillna("")
        table[1] = table[1] + table[2]
        table = table.drop(columns=2)
        # No dropna() here: every row of the CSV is kept.
        self.df_data = table.rename(columns={0: "image", 1: "caption"})

    def __len__(self):
        """Number of rows in the test table."""
        return len(self.df_data)

    def __getitem__(self, idx):
        """Return the ``caption`` / ``image`` dict for row ``idx``.

        The image is opened with PIL; the whole dict is passed through
        ``self.transform`` when one was supplied.
        """
        position = idx.tolist() if torch.is_tensor(idx) else idx
        table = self.df_data

        picture = PIL.Image.open(table.iloc[position, 0])
        sample = {"caption": table.iloc[position, 1], "image": picture}
        return self.transform(sample) if self.transform else sample

## Preprocessing

### Dataset pre-transformations 

In [4]:
def one_hot_encode_labels(dataset):
    """Replace the ``label`` column of ``dataset.df_data`` by multi-hot tensors.

    Each label string is a space-separated list of integer class ids. Ids
    below 12 map to index ``id - 1`` and all other ids to ``id - 2``, giving a
    float tensor of length 18 per row. The conversion happens in place and
    only once: the tag ``"one_hot_encoded_labels"`` is recorded in
    ``dataset.tags`` and a second call is a no-op.
    """
    marker = "one_hot_encoded_labels"
    if marker in dataset.tags:
        return

    def encode(label_text):
        # Translate raw class ids into contiguous indices, then sum the
        # one-hot rows into a single vector.
        class_ids = []
        for token in label_text.split(" "):
            value = int(token)
            class_ids.append(value - 1 if value < 12 else value - 2)
        hot = torch.nn.functional.one_hot(torch.tensor(class_ids), 18)
        return hot.sum(axis=0).float()

    dataset.df_data["label"] = dataset.df_data["label"].apply(encode)
    dataset.tags.add(marker)

## Modules


### Caption embedding

In [None]:
from transformers import AutoFeatureExtractor, AutoModel, AutoTokenizer

# Caption encoder: tokenizer and model are loaded from the same checkpoint.
CAPTION_CHECKPOINT = "prajjwal1/bert-mini"
tokenizer = AutoTokenizer.from_pretrained(CAPTION_CHECKPOINT)
lang_model = AutoModel.from_pretrained(CAPTION_CHECKPOINT)

# Start with every language-model weight frozen; the training cell re-enables
# gradients for the parameters it wants to fine-tune.
for weight in lang_model.parameters():
    weight.requires_grad_(False)



### Pretrained model surgery

In [None]:
from efficientnet_pytorch import EfficientNet

# {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19}
# Image backbone: pretrained EfficientNet-B2.
eff_net = EfficientNet.from_pretrained("efficientnet-b2")

# Start with every backbone weight frozen; the training cell re-enables
# gradients for the parameters it wants to fine-tune.
for weight in eff_net.parameters():
    weight.requires_grad_(False)


class Flattened(nn.Module):
    """Run a pretrained backbone as a flat sequence of its child modules.

    The direct children of ``pretrained_model`` are collected in registration
    order into ``module_list``; a child that is an ``nn.ModuleList`` is
    expanded into its members, named ``"<child>_<index>"``. The child named
    ``"_fc"`` is dropped. ``module_names[i]`` is the name of
    ``module_list[i]``.

    The wrapped model must have a child named ``"_swish"`` (otherwise
    ``__init__`` raises ``ValueError``).
    """

    def __init__(self, pretrained_model):
        super().__init__()
        self.module_list = nn.ModuleList()
        self.module_names = []
        for name, child in pretrained_model.named_children():
            if name == "_fc":
                continue  # get rid of last layer
            if isinstance(child, nn.ModuleList):
                for idx, member in enumerate(child):
                    self.module_list.append(member)
                    self.module_names.append(f"{name}_{idx}")
            else:
                self.module_list.append(child)
                self.module_names.append(name)
        # Position of the shared activation inside module_list.
        self._swish_idx = self.module_names.index("_swish")

    def forward(self, img_batch):
        """Apply the collected modules in order.

        Returns:
            ``(x, all_outs)`` where ``x`` is the output of the last applied
            module and ``all_outs`` maps each applied module's name to its raw
            output (for names containing ``"bn"`` that is the value *before*
            the activation).
        """
        all_outs = {}
        swish = self.module_list[self._swish_idx]
        x = img_batch
        for name, module in zip(self.module_names, self.module_list):
            if name == "_swish":
                # The activation is not run at its own position in the list;
                # it is applied after every "bn" module below.
                continue
            if name == "_dropout":
                # Collapse to (-1, channels) before dropout.
                x = module(x.view(-1, x.shape[1]))
            else:
                x = module(x)

            all_outs[name] = x
            if "bn" in name:
                x = swish(x)
        return x, all_outs


# Wrap the frozen EfficientNet so that per-module activations can be read by name.
features = Flattened(eff_net)

In [None]:
# from torchinfo import summary
# summary(features.to("cuda"), (1, 3, 224, 224))

In [None]:
# for name, child in features.named_children():
#     print(name)

### Combined model

In [None]:
# Number of output logits (the spelling of the name is kept as-is because
# other cells may refer to it).
NUM_ClASSES = 18
import math

import torch.nn as nn


class Combined_Model(torch.nn.Module):
    """Multi-label classifier that lets image regions attend to caption tokens.

    The ``"_conv_head"`` activation of ``visual_features`` is projected to
    ``word_dim`` channels, every spatial position becomes one query, and a
    single transformer decoder layer attends from those queries to the
    caption token states. The mean over positions is concatenated with the
    language model's pooled output and mapped to ``NUM_ClASSES`` logits.

    Args:
        visual_features: module returning ``(output, activations)`` where
            ``activations["_conv_head"]`` has ``img_dim`` channels.
        lang_model: module called as ``lang_model(**inputs)`` whose result
            provides ``"last_hidden_state"`` and ``"pooler_output"``; both
            must have ``word_dim`` features for the layer sizes to match.
        word_dim: feature size of the caption states and attention layers.
        img_dim: channel count of the ``"_conv_head"`` activation.
        num_decoder_layers: accepted for backward compatibility but unused;
            both decoders are built with exactly one layer. Wiring it in
            would change the parameter set and break existing checkpoints.
    """

    def __init__(
        self, visual_features, lang_model, word_dim=256, img_dim=1408, num_decoder_layers=1
    ):
        super().__init__()
        self.visual_features = visual_features
        self.lang_model = lang_model
        self.img_dim = img_dim
        self.word_dim = word_dim

        # 1x1 convolution bringing image channels down to the caption width.
        self.bottle_neck = nn.Conv2d(in_channels=img_dim, out_channels=word_dim, kernel_size=1)
        # One LayerNorm instance is shared as the final norm of both decoders.
        self.layer_norm = nn.LayerNorm(word_dim, eps=1e-05, elementwise_affine=True)

        # SA-GA ##############################
        self.guided_attention_layer = nn.TransformerDecoderLayer(
            word_dim, nhead=4, dim_feedforward=2 * word_dim, dropout=0.1, activation="relu"
        )
        self.guided_attention = nn.TransformerDecoder(
            self.guided_attention_layer, num_layers=1, norm=self.layer_norm
        )
        # The self-attention modules are not used by forward() (see the note
        # there) but stay registered so saved state_dicts keep loading.
        self.self_attention_layer = nn.TransformerDecoderLayer(
            word_dim, 4, dim_feedforward=2 * word_dim, dropout=0.1, activation="relu"
        )
        self.self_attention = nn.TransformerDecoder(
            self.self_attention_layer, num_layers=1, norm=self.layer_norm
        )
        # SA-GA ##############################

        self.output = nn.Linear(word_dim * 2, NUM_ClASSES)

    def forward(self, images, inputs):  # requires tokenized captions
        """Return logits of shape ``(batch, NUM_ClASSES)``.

        Args:
            images: image batch passed unchanged to ``visual_features``.
            inputs: tokenizer output; it is unpacked into ``lang_model`` and
                its ``attention_mask`` attribute marks real tokens with
                non-zero values.
        """
        lang_outs = self.lang_model(**inputs)
        # (batch, tokens, dim) -> (tokens, batch, dim): the decoder is sequence-first.
        last_hidden_state = lang_outs["last_hidden_state"].permute(1, 0, 2)
        pooler_out = lang_outs["pooler_output"]
        mask = inputs.attention_mask == 0  # True where a token is padding

        _, intermediate_outputs = self.visual_features(images)
        image_query = self.bottle_neck(intermediate_outputs["_conv_head"])
        # (batch, dim, H, W) -> (H*W, batch, dim). Flattening the spatial axes
        # replaces the former hard-coded 49, which was only valid for 7x7 maps.
        image_query = image_query.flatten(2).permute(2, 0, 1)

        # NOTE(review): the original code ran self.self_attention_layer on
        # image_query here and then overwrote the result, so self-attention
        # never influenced the output. The dead call is dropped; feeding its
        # output into the guided attention would change the model and needs
        # retraining.

        # guided attention: image positions query the caption tokens
        att_out = self.guided_attention(
            image_query, last_hidden_state, memory_key_padding_mask=mask
        )
        # (positions, batch, dim) -> (batch, dim, positions) -> mean over positions
        att_out = att_out.permute(1, 2, 0).mean(-1)

        out = torch.cat([att_out, pooler_out], dim=-1)
        return self.output(out)

In [None]:
# for name, _ in model.module_list.named_parameters():
#     print(name, _.requires_grad)

## Transforms and training support

In [None]:
class FieldTransform:
    """Apply a transform to a single entry of a sample dict.

    Calling the instance replaces ``sample[field]`` with
    ``transform(sample[field])`` in place and returns the same dict.
    """

    def __init__(self, field, transform):
        self.field, self.transform = field, transform

    def __call__(self, sample):
        key = self.field
        sample[key] = self.transform(sample[key])
        return sample

## Experiments

In [None]:
def train_collate_fn(X):
    """Collate a list of sample dicts into one dict of batched fields.

    ``[{key: val, ...}, ...]`` becomes ``{key: [val, ...], ...}`` (keys are
    taken from the first sample); the ``"label"`` and ``"image"`` lists are
    then stacked into tensors, every other field stays a plain list.
    """
    batch = {}
    for key in X[0]:
        batch[key] = [sample[key] for sample in X]
    for tensor_key in ("label", "image"):
        batch[tensor_key] = torch.stack(batch[tensor_key])
    return batch


# Per-channel normalisation applied after conversion to a tensor
# (presumably the statistics the pretrained backbone expects; confirm).
normalize = torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

# Image pipeline, applied in this order to the "image" entry of each sample:
# resize to 224x224, convert to tensor, normalise.
_image_steps = (
    torchvision.transforms.Resize((224, 224)),
    torchvision.transforms.ToTensor(),
    normalize,
)
transforms = torchvision.transforms.Compose(
    [FieldTransform("image", step) for step in _image_steps]
)

In [None]:
# Build the labelled dataset and convert its label strings to multi-hot tensors.
train_data = TrainDataset(transform=transforms)
# vocabularise_caption(train_data, vocab)
one_hot_encode_labels(train_data)

from torch.utils.data import DataLoader, random_split

# Hold out 3000 samples for validation. The training size is derived from the
# dataset length (27000 when the dataset has its expected 30000 rows) so the
# split keeps working if rows were dropped while loading.
n_val = 3000
trainds, valds = random_split(train_data, [len(train_data) - n_val, n_val])

train_dl = DataLoader(
    trainds, batch_size=64, shuffle=True, collate_fn=train_collate_fn, num_workers=24
)

val_dl = DataLoader(valds, batch_size=8, shuffle=False, collate_fn=train_collate_fn, num_workers=24)

# model = Combined().to(DEVICE)


# NOTE(review): Combined_Model builds its decoders with num_layers=1 regardless
# of num_decoder_layers, so the value 3 has no effect here.
model = Combined_Model(features, lang_model, num_decoder_layers=3).to(device)


from sam import SAM

# Sharpness-aware optimisation is kept as a disabled alternative:
# base_optimizer = torch.optim.Adam  # define an optimizer for the "sharpness-aware" update
# optimizer = SAM(model.parameters(), base_optimizer, lr=1e-3)
optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)
# Multi-label objective: independent sigmoid / binary cross-entropy per class.
criterion = torch.nn.BCEWithLogitsLoss()

# Halve the learning rate when the monitored value stops decreasing.
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode="min",
    factor=0.5,
    patience=5,
    threshold=0.0001,
    threshold_mode="rel",
    cooldown=5,
    min_lr=0,
    eps=1e-08,
    verbose=False,
)

In [None]:
# for name, param in model.module_list.named_parameters():
#     print(name)
# raise ValueError

In [None]:
import os
from copy import deepcopy

from sklearn.metrics import f1_score
from tqdm import tqdm

# os.environ["TOKENIZERS_PARALLELISM"] = "true"


val_mean_f1 = []  # micro-averaged validation F1, one entry per epoch
val_sample_f1 = []  # sample-averaged validation F1, one entry per epoch
model_in_memory = {}  # "best" -> CPU copy of the best state_dict seen so far

# scaler = torch.cuda.amp.GradScaler()
for epoch in range(500):
    train_loss = []
    train_outs = []
    train_lables = []
    # NOTE(review): model.train() is left disabled as in the original. Because
    # model.eval() is called below and never reverted, every epoch after the
    # first trains with the model in eval mode. Confirm this is intended.
    #     model.train()

    # Before the first batch, re-enable gradients for the tail of the image
    # backbone: every parameter from the first one whose dotted name has the
    # component "22" onward.
    if epoch == 0:
        finetune_param = False
        for name, param in model.visual_features.module_list.named_parameters():
            if "22" in name.split("."):  # or 21, required grad for after the attentions
                finetune_param = True
            if finetune_param:
                print(name)
                param.requires_grad_(True)

    # Likewise for the caption encoder: every parameter from the first one
    # whose name contains both "3" and "attention" onward.
    if epoch == 0:
        finetune_param = False
        for name, param in model.lang_model.named_parameters():
            if "3" in name and "attention" in name:
                finetune_param = True
            if finetune_param:
                print(name)
                param.requires_grad_(True)

    for i, batch in enumerate(tqdm(train_dl)):
        optimizer.zero_grad()
        captions = batch["caption"]
        images = batch["image"].to(device)
        labels = batch["label"].to(device)
        inputs = tokenizer(captions, return_tensors="pt", padding=True, truncation=False).to(device)

        predictions = model(images, inputs)
        loss = criterion(predictions, labels)
        loss.backward()
        optimizer.step()
        # Disabled alternatives kept for reference:
        #   SAM:  optimizer.first_step(zero_grad=True), then a second
        #         forward/backward followed by optimizer.second_step(zero_grad=True)
        #   AMP:  torch.cuda.amp.autocast() around the forward pass with
        #         scaler.scale(loss).backward(); scaler.step(...); scaler.update()
        train_loss.append(loss.item())

    val_loss = []
    val_outs = []
    val_labels = []

    model.eval()
    # Validation needs no gradients; disabling autograd avoids building graphs.
    with torch.no_grad():
        for i, batch in enumerate(val_dl):

            captions = batch["caption"]
            images = batch["image"].to(DEVICE)
            labels = batch["label"].to(DEVICE)
            inputs = tokenizer(captions, return_tensors="pt", padding=True, truncation=False).to(
                device
            )

            predictions = model(images, inputs)
            loss = criterion(predictions, labels)
            val_loss.append(loss.item())
            val_outs.append(predictions.detach().cpu().numpy())
            val_labels.append(labels.detach().cpu().numpy())
    val_labels = np.vstack(val_labels)
    val_outs = np.vstack(val_outs)
    # A logit above 0 counts as a positive prediction.
    mean_f1 = f1_score(y_true=val_labels, y_pred=1 * (val_outs > 0), average="micro")  # mean f1
    sample_f1 = f1_score(y_true=val_labels, y_pred=1 * (val_outs > 0), average="samples")  # sample f1

    cur_lr = optimizer.param_groups[0]["lr"]
    print(
        f"Epoch:{epoch}, train/val loss:{round(np.mean(train_loss),7),round(np.mean(val_loss),7)},sample/mean f1:{round(sample_f1, 7), round(mean_f1, 7)}, lr:{cur_lr}"
    )
    scheduler.step(np.mean(val_loss))

    # Keep a CPU snapshot whenever the sample-averaged F1 improves.
    if len(val_sample_f1) == 0 or sample_f1 > max(val_sample_f1):
        print("saving best")
        # Drop the previous snapshot first (no-op when there is none yet).
        model_in_memory.pop("best", None)
        torch.cuda.empty_cache()
        model.to("cpu")
        model_in_memory["best"] = deepcopy(model.state_dict())
        model.to(device)
    # The best snapshot is written to disk every tenth epoch.
    if epoch % 10 == 0:
        torch.save(model_in_memory["best"], "test_v8.pt")

    val_mean_f1.append(mean_f1)
    val_sample_f1.append(sample_f1)

###################

In [None]:
# Scratch: run the caption encoder on the tokenised batch left in `inputs` by an
# earlier cell and keep both outputs for the shape experiments below.
outs = lang_model(**inputs)
last_hidden_state = outs.last_hidden_state
pooler_output = outs.pooler_output

In [None]:
# Scratch: run the flattened backbone on the `images` batch left by an earlier cell.
image_feats, all_outs = features(images)

In [None]:
# Display the names of the recorded intermediate activations.
all_outs.keys()

In [None]:
# Display the shape of the activation that Combined_Model feeds into its bottleneck.
all_outs["_conv_head"].shape

In [None]:
# NOTE: `comp` is assigned in the following cell, so this cell only works after
# that one has been executed.
comp.shape, last_hidden_state.shape

In [None]:
# Scratch prototype of the guided-attention path used in Combined_Model:
# project the conv-head activation to 256 channels and let its 49 spatial
# positions attend to the caption token states.
bottle_neck = nn.Conv2d(in_channels=1408, out_channels=256, kernel_size=1).to(device)
norm = nn.LayerNorm(256, eps=1e-05, elementwise_affine=True)
decoder_layer = nn.TransformerDecoderLayer(
    256, 8, dim_feedforward=1024, dropout=0.1, activation="relu"
)
# .to(device) also moves `norm`, which is registered as the decoder's final norm.
decoder = nn.TransformerDecoder(decoder_layer, num_layers=1, norm=norm).to(device)

# (batch, 256, 7, 7) -> (49, batch, 256): sequence-first queries.
comp = bottle_neck(all_outs["_conv_head"]).view(-1, 256, 49).permute(2, 0, 1)
# The decoder is sequence-first, so the caption states are permuted to
# (tokens, batch, dim) exactly as Combined_Model.forward does; passing them
# batch-first mismatched the batch axis of `comp` and of the padding mask.
decoder(
    comp,
    last_hidden_state.permute(1, 0, 2),
    memory_key_padding_mask=(inputs.attention_mask == 0),
).shape

In [None]:
import torch
import torch.nn as nn

# One random 3x224x224 image; this rebinds the global `images` used by the cells below.
images = torch.randn(1, 3, 224, 224)

In [None]:
from efficientnet_pytorch import EfficientNet

# Fresh copy of the pretrained backbone for the equivalence check below.
eff_net = EfficientNet.from_pretrained("efficientnet-b2")

# Freeze every weight of this copy.
for weight in eff_net.parameters():
    weight.requires_grad_(False)


class Flattened(nn.Module):
    """Run a pretrained backbone as a flat sequence of its child modules.

    The direct children of ``pretrained_model`` are collected in registration
    order into ``module_list``; a child that is an ``nn.ModuleList`` is
    expanded into its members, named ``"<child>_<index>"``. The child named
    ``"_fc"`` is dropped. ``module_names[i]`` is the name of
    ``module_list[i]``.

    The wrapped model must have a child named ``"_swish"`` (otherwise
    ``__init__`` raises ``ValueError``).
    """

    def __init__(self, pretrained_model):
        super().__init__()
        self.module_list = nn.ModuleList()
        self.module_names = []
        for name, child in pretrained_model.named_children():
            if name == "_fc":
                continue  # get rid of last layer
            if isinstance(child, nn.ModuleList):
                for idx, member in enumerate(child):
                    self.module_list.append(member)
                    self.module_names.append(f"{name}_{idx}")
            else:
                self.module_list.append(child)
                self.module_names.append(name)
        # Position of the shared activation inside module_list.
        self._swish_idx = self.module_names.index("_swish")

    def forward(self, img_batch):
        """Apply the collected modules in order.

        Returns:
            ``(x, all_outs)`` where ``x`` is the output of the last applied
            module and ``all_outs`` maps each applied module's name to its raw
            output (for names containing ``"bn"`` that is the value *before*
            the activation).
        """
        all_outs = {}
        swish = self.module_list[self._swish_idx]
        x = img_batch
        for name, module in zip(self.module_names, self.module_list):
            if name == "_swish":
                # The activation is not run at its own position in the list;
                # it is applied after every "bn" module below.
                continue
            if name == "_dropout":
                # Collapse to (-1, channels) before dropout.
                x = module(x.view(-1, x.shape[1]))
            else:
                x = module(x)

            all_outs[name] = x
            if "bn" in name:
                x = swish(x)
        return x, all_outs


# Put the backbone in eval mode before comparing outputs.
eff_net.eval()
eff_net  # bare expression: has no effect in the middle of a cell
# Wrap the same backbone instance and record its per-module activations.
f = Flattened(eff_net)
x, all_outs = f(images)

In [None]:
# Outputs captured by the forward hooks, keyed by the name given to get_activation.
activation = {}


def get_activation(name):
    """Return a forward hook that stores the module output in ``activation[name]``."""

    def hook(_module, _inputs, result):
        # Returning nothing leaves the module's output unchanged.
        activation[name] = result

    return hook


# Hook every direct child of the backbone; members of a ModuleList child are
# hooked individually under "<child>_<index>", the naming Flattened also uses.
for child_name, child in eff_net.named_children():
    if not isinstance(child, nn.ModuleList):
        child.register_forward_hook(get_activation(child_name))
        continue
    for idx, member in enumerate(child):
        member.register_forward_hook(get_activation(f"{child_name}_{str(idx)}"))


# One forward pass of the original model fills `activation` through the hooks.
output = eff_net(images)

In [None]:
# Check that Flattened reproduces the activations captured by the hooks.
for key in activation.keys():
    # "_swish" is never recorded by Flattened, and hooked modules that
    # Flattened drops (it skips "_fc") have no counterpart in all_outs;
    # indexing them used to raise KeyError.
    if key == "_swish" or key not in all_outs:
        continue
    print(key)
    print((all_outs[key] == activation[key]).all())

In [None]:
# eff_net.to("cuda")
# f.to("cuda")

# %timeit -n10 -r5 eff_net(images.cuda())
# %timeit -n10 -r5 f(images.cuda())

In [None]:
# Display the shape of the token-id tensor in the tokenised batch left in `inputs`.
inputs.input_ids.shape

In [None]:
from torchinfo import summary

# Layer-by-layer summary of the flattened backbone for a single 224x224 RGB image.
batch_size = 1
summary(f, input_size=(batch_size, 3, 224, 224))

In [None]:
# List every recorded activation together with its tensor shape.
for layer_name, tensor in all_outs.items():
    print(layer_name, tensor.shape)

In [None]:
# Scratch arithmetic (evaluates to 1328); presumably a channel-count estimate
# from the shapes printed above.
208 * 3 + 352 * 2

# correction

In [None]:
# Rebuild the model with default arguments and load saved weights into it.
# NOTE(review): the training cell writes its checkpoint to "test_v8.pt", while
# this cell reads "test.pt"; confirm which file is intended.
# model_name = "test.pt"
# model = LUKE_CRF(tag_to_ix, lang_model=lang_model, hidden_dim=1024)
model = Combined_Model(features, lang_model)
model.load_state_dict(torch.load("test.pt"))
model.to(device)
# model = torch.load(model_name)

In [None]:
# import numpy as np
# from sklearn.metrics import roc_curve, precision_recall_curve

# model = torch.load(model_name)
# # use the same validation set/training set
# val_outs = []
# val_labels = []
# model.eval()
# for i, batch in enumerate(val_dl):

#     captions = batch["caption"]
#     images = batch["image"].to(DEVICE)
#     labels = batch["label"].to(DEVICE)

#     inputs = tokenizer(captions, return_tensors="pt", padding=True, truncation=False)
#     input_ids = inputs["input_ids"].to(device)
#     attention_mask = inputs["attention_mask"].to(device)

#     # predictions = model(images, captions)
#     predictions = model(images, input_ids, attention_mask)

#     val_outs.append(predictions.detach().cpu().numpy())
#     val_labels.append(labels.detach().cpu().numpy())
# val_labels = np.vstack(val_labels)
# val_outs = np.vstack(val_outs)

# best_thresholds = np.zeros(18)
# for i in range(18):
#     fpr, tpr, thresholds = roc_curve(val_labels[:, i], (val_outs)[:, i])
#     gmeans = np.sqrt(tpr * (1 - fpr))
#     ix = np.argmax(gmeans)
#     best_thresholds[i] = thresholds[ix]

In [None]:
# cheat
class CheatDataset(torch.utils.data.Dataset):
    """Labelled samples read from ``test_cheat.csv``.

    The table is kept in ``self.df_data`` with the columns, in this order,
    ``image`` (file path below ``IMG_PATH``), ``label`` and ``caption``.
    """

    def __init__(self, transform=None):
        """Read ``test_cheat.csv``.

        Args:
            transform: optional callable applied to every sample dict
                returned by ``__getitem__``.
        """
        self.transform = transform
        self.tags = set()

        raw = pd.read_csv("test_cheat.csv", names=range(3), skiprows=1)
        # Reorder so the third CSV column comes second, then drop incomplete rows.
        table = raw[[0, 2, 1]].dropna()
        table[0] = IMG_PATH + "/" + table[0]
        self.df_data = table.rename(columns={0: "image", 2: "label", 1: "caption"})

    def __len__(self):
        """Number of complete rows."""
        return len(self.df_data)

    def __getitem__(self, idx):
        """Return the ``caption`` / ``label`` / ``image`` dict for row ``idx``.

        The image is opened with PIL; the whole dict is passed through
        ``self.transform`` when one was supplied.
        """
        position = idx.tolist() if torch.is_tensor(idx) else idx
        table = self.df_data

        picture = PIL.Image.open(table.iloc[position, 0])
        sample = {
            "caption": table.iloc[position, 2],
            "label": table.iloc[position, 1],
            "image": picture,
        }
        return self.transform(sample) if self.transform else sample


# Build the labelled "cheat" dataset with the same image pipeline as training
# and convert its label strings to multi-hot tensors.
cheat_data = CheatDataset(transform=transforms)
# vocabularise_caption(train_data, vocab)
one_hot_encode_labels(cheat_data)

from torch.utils.data import DataLoader, random_split

# train_data.__len__() == 30000
# Fixed order (shuffle=False); the training collate function is reused because
# these samples also carry labels.
cheat_dl = DataLoader(
    cheat_data, batch_size=32, shuffle=False, collate_fn=train_collate_fn, num_workers=24
)

from sklearn.metrics import precision_recall_curve

# Collect logits and labels over the whole cheat set.
val_outs = []
val_labels = []
model.eval()
# Pure inference: disable autograd so no graphs are kept alive per batch.
with torch.no_grad():
    for i, batch in enumerate(cheat_dl):

        captions = batch["caption"]
        images = batch["image"].to(device)
        labels = batch["label"].to(device)
        inputs = tokenizer(captions, return_tensors="pt", padding=True, truncation=False).to(
            device
        )

        predictions = model(images, inputs)

        val_outs.append(predictions.detach().cpu().numpy())
        val_labels.append(labels.detach().cpu().numpy())
# One row per sample, one column per class.
val_labels = np.vstack(val_labels)
val_outs = np.vstack(val_outs)


# For every class pick the logit threshold that maximises F1 on the cheat set.
best_thresholds = np.zeros(18)
for i in range(18):
    precision, recall, thresholds = precision_recall_curve(val_labels[:, i], val_outs[:, i])
    # The curve has one more point than there are thresholds (the final
    # precision=1 / recall=0 point); drop it so indices line up.
    precision, recall = precision[:-1], recall[:-1]
    denominator = precision + recall
    # Define F1 as 0 where precision + recall == 0 instead of producing NaN,
    # which np.argmax would otherwise select.
    fscore = np.divide(
        2 * precision * recall,
        denominator,
        out=np.zeros_like(denominator),
        where=denominator > 0,
    )
    # locate the index of the largest f score
    ix = np.argmax(fscore)
    best_thresholds[i] = thresholds[ix]

print(best_thresholds)
np.save("best_thresholds", best_thresholds)

# generate test labels

In [None]:
# Stopper for "Run All": raising here keeps the cells below (test-set
# prediction and submission file) from running unless started manually.
raise ValueError

In [None]:
def test_collate_fn(X):
    # convert [{key: val, ...}, ...]
    # to [key: [val, ...],  ...}
    X = {k: [v[k] for v in X] for k in X[0]}
    X["image"] = torch.stack(X["image"])

    return X


test_data = TestDataset(transform=transforms)
# Fixed order so predictions line up with the rows of the test CSV.
test_dl = torch.utils.data.DataLoader(
    test_data, batch_size=16, shuffle=False, collate_fn=test_collate_fn, num_workers=0,
)


# save predictions
model.eval()
test_preds = []
# Pure inference: disable autograd so no graphs are kept alive per batch.
with torch.no_grad():
    for i, batch in enumerate(test_dl):
        captions = batch["caption"]
        # Use the images of THIS batch. The loop previously never read
        # batch["image"] and fed the model whatever `images` an earlier cell
        # had left in the global namespace.
        images = batch["image"].to(device)
        inputs = tokenizer(captions, return_tensors="pt", padding=True, truncation=False).to(
            device
        )

        predictions = model(images, inputs)
        test_preds.append(predictions.detach().cpu().numpy())

# One row of logits per test sample.
test_preds = np.vstack(test_preds)


def out_logits_to_preds(logits, best_thresholds):
    """Turn a 2-D array of logits into a list of class-id lists, one per row.

    A class is predicted when its logit is greater than 0; column ``j`` maps
    to the ``j``-th entry of the available ids (1..19 without 12).

    NOTE(review): only the *shape* of ``best_thresholds`` is used (an array of
    zeros of that shape is subtracted), so the tuned threshold values have no
    effect. This is preserved as-is; confirm whether it is intentional.
    """
    lables_available = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19])
    cutoff = np.zeros_like(best_thresholds)
    is_positive = (logits - cutoff) > 0
    return [list(lables_available[row]) for row in is_positive]


# lables_available[(test_preds[:10] > 0)]
# {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19}

# Convert the test logits to class-id lists, one list per test row.
test_labels = out_logits_to_preds(test_preds, best_thresholds)

# Re-read the test CSV for the image ids and attach the predictions as a
# space-separated string per row.
df_test = pd.read_csv(TEST_CSV_PATH, names=range(3), skiprows=1)
test_labels_str = [" ".join(map(str, labels)) for labels in test_labels]
df_test["Labels"] = test_labels_str
df_test.rename(columns={0: "ImageID"}, inplace=True)

# Only the id and label columns go into the submission file.
submission_columns = ["ImageID", "Labels"]
df_test[submission_columns].to_csv("test_predictions.csv", index=False)