# COMP5329 - Assignment 2

In [1]:
# import google
import collections
import json
import re

import nltk
import numpy as np
import pandas
import pandas as pd

# import torchtext
import PIL.Image
import torch
import torch.nn as nn
import torchvision

# Compute device for the whole notebook. It is exposed under both spellings
# because later cells use `DEVICE` and `device` interchangeably.
_device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
DEVICE = torch.device(_device_name)
device = torch.device(_device_name)

# Google Colab layout, kept for reference (needs the commented-out `google` import):
# MOUNT_PATH = '/content/drive'
# DRIVE_PATH = f'{MOUNT_PATH}/My Drive'
# PROJECT_PATH = DRIVE_PATH + "/Assignment 2"

# Local layout: images live under <project>/data, CSVs sit next to the notebook.
PROJECT_PATH = "./"
IMG_PATH = PROJECT_PATH + "/data"
TRAIN_CSV_PATH = PROJECT_PATH + "/train.csv"
TEST_CSV_PATH = PROJECT_PATH + "/test.csv"

# google.colab.drive.mount(MOUNT_PATH)

# Hand cached-but-unused GPU memory back before any model is built.
torch.cuda.empty_cache()

## Dataset


### Train dataset

In [2]:
# df = pandas.read_csv(TRAIN_CSV_PATH, names=range(4), skiprows=1)
# classes = df[1].apply(lambda x: list(map(int, x.strip().split()))).to_list()
# n_classes = set()
# for i in classes:
#     n_classes.update(i)
# n_classes

In [40]:
class TrainDataset(torch.utils.data.Dataset):
    """Labelled training samples read from ``TRAIN_CSV_PATH``.

    Each item is a dict with keys ``"caption"``, ``"label"`` and ``"image"``.
    ``df_data`` holds one row per sample with the columns ``image`` (file
    path under ``IMG_PATH``), ``label`` and ``caption``.

    Args:
        transform: optional callable applied to the whole sample dict; its
            return value is what ``__getitem__`` returns.
    """

    def __init__(self, transform=None):
        self.transform = transform
        # Names of one-off preprocessing steps already applied to `df_data`
        # (helpers such as `one_hot_encode_labels` check and update this set).
        self.tags = set()

        # The header row is skipped and columns are addressed by position.
        # Four columns are read; column 3 is presumably the tail of a caption
        # that contained an unquoted comma -- TODO confirm against train.csv.
        frame = pandas.read_csv(TRAIN_CSV_PATH, names=range(4), skiprows=1)
        frame[0] = IMG_PATH + "/" + frame[0]
        # Glue the overflow column back onto the caption (the comma itself is
        # not re-inserted), then drop it.
        frame[3] = frame[3].fillna("")
        frame[2] += frame[3]
        frame = frame.drop(3, axis=1)
        # Rows with any remaining missing value are discarded.
        self.df_data = frame.rename({0: "image", 1: "label", 2: "caption"}, axis=1).dropna()

    def __len__(self):
        """Return the number of usable rows."""
        return len(self.df_data)

    def __getitem__(self, idx):
        """Load the sample at positional index ``idx`` and apply ``transform``."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Decode inside a context manager so the file handle is released
        # immediately, and force RGB so every sample has exactly three
        # channels (a grayscale/RGBA file would otherwise not match the
        # 3-channel normalisation and could not be stacked into a batch).
        with PIL.Image.open(self.df_data.iloc[idx, 0]) as raw_image:
            image = raw_image.convert("RGB")
        label = self.df_data.iloc[idx, 1]
        caption = self.df_data.iloc[idx, 2]

        sample = {"caption": caption, "label": label, "image": image}

        if self.transform:
            sample = self.transform(sample)

        return sample

### Test dataset

In [4]:
class TestDataset(torch.utils.data.Dataset):
    """Unlabelled test samples read from ``TEST_CSV_PATH``.

    Each item is a dict with keys ``"caption"`` and ``"image"``. ``df_data``
    holds one row per sample with the columns ``image`` (file path under
    ``IMG_PATH``) and ``caption``.

    Args:
        transform: optional callable applied to the whole sample dict; its
            return value is what ``__getitem__`` returns.
    """

    def __init__(self, transform=None):
        self.transform = transform
        # Names of one-off preprocessing steps already applied to `df_data`.
        self.tags = set()

        # The header row is skipped and columns are addressed by position.
        # Three columns are read; column 2 is presumably the tail of a caption
        # that contained an unquoted comma -- TODO confirm against test.csv.
        frame = pandas.read_csv(TEST_CSV_PATH, names=range(3), skiprows=1)
        frame[0] = IMG_PATH + "/" + frame[0]
        # Glue the overflow column back onto the caption (the comma itself is
        # not re-inserted), then drop it. No rows are dropped here, so the
        # row order and count match the CSV.
        frame[2] = frame[2].fillna("")
        frame[1] += frame[2]
        frame = frame.drop(2, axis=1)
        self.df_data = frame.rename({0: "image", 1: "caption"}, axis=1)

    def __len__(self):
        """Return the number of rows."""
        return len(self.df_data)

    def __getitem__(self, idx):
        """Load the sample at positional index ``idx`` and apply ``transform``."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Decode inside a context manager so the file handle is released
        # immediately, and force RGB so every sample has exactly three
        # channels (a grayscale/RGBA file would otherwise not match the
        # 3-channel normalisation and could not be stacked into a batch).
        with PIL.Image.open(self.df_data.iloc[idx, 0]) as raw_image:
            image = raw_image.convert("RGB")
        caption = self.df_data.iloc[idx, 1]

        sample = {"caption": caption, "image": image}

        if self.transform:
            sample = self.transform(sample)

        return sample

## Preprocessing

### Vocab

In [5]:
# nltk.download("stopwords")
# stopwords = set(nltk.corpus.stopwords.words("english"))
# basic_tokenizer = torchtext.data.utils.get_tokenizer("basic_english")
# glove = torchtext.vocab.GloVe(name="6B", dim=100)

# with open(f"{PROJECT_PATH}/correct_spelling.json", "r") as f:
#     correct_spelling = json.load(f)


# def caption_tokenizer(caption):
#     caption = re.sub("[^a-zA-Z]", " ", caption).lower()
#     raw_tokens = basic_tokenizer(caption)
#     raw_tokens = list(set(raw_tokens).difference(stopwords))
#     tokens = []

#     for token in raw_tokens:

#         if token in correct_spelling:
#             corrected_spelling = correct_spelling[token]

#             for corrected_token in corrected_spelling.split():
#                 if corrected_token in glove.stoi:
#                     tokens.append(corrected_token)
#         else:
#             if token in glove.stoi:
#                 tokens.append(token)

#     return tokens

In [6]:
# train_data = TrainDataset()
# test_data = TestDataset()
# counter = collections.Counter()

# for caption in train_data.df_data["caption"]:
#     counter.update(caption_tokenizer(caption))

# for caption in test_data.df_data["caption"]:
#     counter.update(caption_tokenizer(caption))

# vocab = torchtext.vocab.Vocab(counter, vectors="glove.6B.100d", specials=("<unk>", "<BOS>", "<EOS>", "<PAD>"))

# del train_data
# del test_data

### Dataset pre-transformations 

In [7]:
# def vocabularise_caption(dataset, vocab):

#     if "vocabularised_caption" not in dataset.tags:
#         # turns string caption to list of vocab indices
#         dataset.df_data["caption"] = dataset.df_data["caption"].apply(
#             lambda c: torch.tensor([vocab.stoi[t] for t in caption_tokenizer(c)])
#         )

#         dataset.tags.add("vocabularised_caption")


def one_hot_encode_labels(dataset):
    """Replace the ``label`` column of ``dataset.df_data`` with multi-hot tensors.

    Each label cell is a whitespace-separated list of class ids. Ids below 12
    are shifted down by 1 and all others by 2, so ids 1..11 and 13..19 map to
    positions 0..17 (presumably id 12 never occurs in the data -- it would
    collide with id 11).

    The conversion is done in place and only once per dataset: the tag
    ``"one_hot_encoded_labels"`` is recorded in ``dataset.tags`` and a second
    call is a no-op.

    Args:
        dataset: object exposing ``df_data`` (a DataFrame with a ``label``
            column) and ``tags`` (a set).
    """
    if "one_hot_encoded_labels" in dataset.tags:
        return

    def _encode(raw_label):
        # str() covers a column pandas parsed as integers (every row holding a
        # single id); split() without an argument tolerates repeated and
        # leading/trailing whitespace, which split(" ") turned into int("").
        ids = [int(token) for token in str(raw_label).split()]
        positions = [class_id - 1 if class_id < 12 else class_id - 2 for class_id in ids]
        one_hot = torch.nn.functional.one_hot(torch.tensor(positions, dtype=torch.long), 18)
        # Clamp so an id listed twice still yields 1.0 rather than 2.0.
        return one_hot.sum(axis=0).clamp(max=1).float()

    dataset.df_data["label"] = dataset.df_data["label"].apply(_encode)
    dataset.tags.add("one_hot_encoded_labels")

## Modules


### Caption embedding

In [8]:
# class CaptionEmbedding(torch.nn.Module):
#     def __init__(self, vocab):
#         super(CaptionEmbedding, self).__init__()
#         self.word_embedding = torch.nn.Embedding.from_pretrained(vocab.vectors)
#         self.linear1 = torch.nn.Linear(vocab.vectors.shape[1], 18)
#         self.lrelu1 = torch.nn.LeakyReLU()
#         self.linear2 = torch.nn.Linear(18, 1)
#         self.lrelu2 = torch.nn.LeakyReLU()
#         self.lrelu3 = torch.nn.LeakyReLU()

#     def forward(self, captions):
#         def _embed_caption(c):
#             c = c.to(DEVICE)
#             c = self.word_embedding(c)
#             word_importance = self.linear1(c)
#             word_importance = self.lrelu1(word_importance)
#             word_importance = self.linear2(word_importance)
#             word_importance = self.lrelu2(word_importance)
#             return self.lrelu3(c.T @ word_importance).view(-1)

#         return torch.stack([*map(lambda c: _embed_caption(c), captions)])

from transformers import AutoFeatureExtractor, AutoModel, AutoTokenizer

# Caption encoder: tokenizer and weights come from the same pretrained checkpoint.
_BERT_CHECKPOINT = "prajjwal1/bert-mini"
tokenizer = AutoTokenizer.from_pretrained(_BERT_CHECKPOINT)
lang_model = AutoModel.from_pretrained(_BERT_CHECKPOINT)

# An earlier experiment (disabled) re-enabled gradients selectively by
# parameter name:
# finetune_param = False
# for name, param in lang_model.named_parameters():
#     name = name.split(".")
# #     if "3" in name and "intermediate" in name:
# #         finetune_param = True
#     param.requires_grad = finetune_param
# #     print(name, param.requires_grad)

# Start with the whole language model frozen.
for param in lang_model.parameters():
    param.requires_grad = False

Some weights of the model checkpoint at prajjwal1/bert-mini were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Pretrained model surgery

In [9]:
# class Surgery(torch.nn.Module):
#     def __init__(self, name, selection, training_disabled=True):
#         super(Surgery, self).__init__()
#         model = getattr(torchvision.models, name)(pretrained=True)
#         children = list(model.children())

#         self.sequential = torch.nn.Sequential(*[children[i] for i in selection])

#         if training_disabled:
#             for param in self.sequential.parameters():
#                 param.requires_grad = False

#     def forward(self, x):

#         return self.sequential(x)


# def surgery_info(name):
#     model = getattr(torchvision.models, name)()

#     for idx, child in enumerate(model.children()):
#         print(f"Accessible at {idx}:\n{child}\n")

# !pip install efficientnet_pytorch
from efficientnet_pytorch import EfficientNet

# {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19}
# Image encoder: EfficientNet-b1 with pretrained weights.
features = EfficientNet.from_pretrained("efficientnet-b1")

# An earlier experiment (disabled) re-enabled gradients from block "18" onwards:
# finetune_param = False
# for name, param in features.named_parameters():
#     #     print(name.split(".")[1])
#     if "18" in name.split("."):  # required grad for last 2 blocks
#         finetune_param = True
#     param.requires_grad = finetune_param
#     print(name, param.requires_grad)

# Start with the whole image encoder frozen.
for param in features.parameters():
    param.requires_grad = False

Loaded pretrained weights for efficientnet-b1


### Combined model

In [62]:
NUM_ClASSES = 18
import math

import torch.nn as nn


class Combined_Model(torch.nn.Module):
    """Multi-label classifier fusing image features with caption encodings.

    Args:
        visual_features: module mapping an image batch to 1000-dim features.
        lang_model: module called with ``input_ids`` and ``attention_mask``
            whose output provides ``"last_hidden_state"`` and
            ``"pooler_output"``, both with hidden size 256.

    ``forward`` returns raw logits with ``NUM_ClASSES`` columns.
    """

    def __init__(self, visual_features, lang_model):
        super().__init__()
        image_dim = 1000  # width of the visual feature vector
        text_dim = 256  # hidden size of the language model

        self.visual_features = visual_features
        self.lang_model = lang_model
        # Projects the image features into the text space so they can be
        # compared with each token state.
        self.proj = nn.Linear(image_dim, text_dim)
        self.scale = math.sqrt(text_dim)
        self.activation = nn.ReLU()
        # Input is the concatenation [image features, attended text, pooled text].
        self.combine = nn.Linear(text_dim + image_dim + text_dim, text_dim)
        # (An extra 256 -> 128 hidden layer was tried and left disabled.)
        self.output = nn.Linear(text_dim, NUM_ClASSES)

    def forward(self, images, input_ids, attention_mask):
        """Return logits for a batch of images and pre-tokenized captions."""
        visual = self.visual_features(images)
        query = self.proj(visual)

        encoded = self.lang_model(input_ids=input_ids, attention_mask=attention_mask)
        token_states = encoded["last_hidden_state"]
        pooled = encoded["pooler_output"]

        # Scaled dot product between the projected image vector and every
        # token state: (batch, seq_len).
        # NOTE(review): the scores are used as weights directly -- no softmax
        # and no masking of padded positions. Left as-is so saved checkpoints
        # keep producing the same outputs.
        scores = torch.bmm(token_states, query.unsqueeze(-1)).squeeze(-1) / self.scale
        # Score-weighted sum of the token states: (batch, hidden).
        attended = torch.bmm(scores.unsqueeze(1), token_states).squeeze(1)

        fused = torch.cat([visual, attended, pooled], dim=-1)
        fused = self.combine(self.activation(fused))
        return self.output(self.activation(fused))

In [11]:
# class Combined(torch.nn.Module):
#     def __init__(self):
#         super(Combined, self).__init__()
#         # self.image_embedder = Surgery('vgg16', [0])
#         self.image_embedder = features
#         # self.caption_embedder = CaptionEmbedding(vocab)

#         # self.linear1 = torch.nn.Linear(2148, 256)
#         self.linear1 = torch.nn.Linear(1000, 256)
#         self.lrelu1 = torch.nn.LeakyReLU()
#         # self.linear2 = torch.nn.Linear(256, 18)
#         # self.lrelu2 = torch.nn.LeakyReLU()

#         self.heads = nn.ModuleList([nn.Linear(256, 1) for i in range(NUM_ClASSES)])

#     def forward(self, images, captions):
#         image_embeddings = self.image_embedder(images)
#         # image_embeddings = image_embeddings.view(images.shape[0], -1) #???

#         # caption_embeddings = self.caption_embedder(captions)
#         # caption_embeddings = caption_embeddings.view(caption_embeddings.shape[0], -1)

#         # combined_embedding = torch.cat([image_embeddings, caption_embeddings], 1)
#         # y = self.linear1(combined_embedding)

#         y = self.linear1(image_embeddings)
#         y = self.lrelu1(y)

#         outs = []
#         for i in range(NUM_ClASSES):
#             outs.append(self.heads[i](y))

#         return torch.cat(outs, dim=-1)

## Transforms and training support

In [12]:
class FieldTransform:
    """Apply a transform to a single entry of a sample dict.

    Args:
        field: key of the entry to transform.
        transform: callable applied to ``sample[field]``.
    """

    def __init__(self, field, transform):
        self.field, self.transform = field, transform

    def __call__(self, sample):
        """Transform ``sample[self.field]`` in place and return the same dict."""
        key = self.field
        sample[key] = self.transform(sample[key])
        return sample

## Experiments

In [13]:
def train_collate_fn(X):
    """Collate a list of labelled sample dicts into one batch dict.

    Turns ``[{key: val, ...}, ...]`` into ``{key: [val, ...], ...}`` using the
    keys of the first sample, then stacks the ``"label"`` and ``"image"``
    lists into tensors. Every other entry (e.g. captions) stays a list.
    """
    batch = {}
    for key in X[0]:
        batch[key] = [sample[key] for sample in X]

    for tensor_key in ("label", "image"):
        batch[tensor_key] = torch.stack(batch[tensor_key])

    return batch


# Per-channel mean/std commonly used with ImageNet-pretrained torchvision models.
normalize = torchvision.transforms.Normalize(
    mean=[0.485, 0.456, 0.406],
    std=[0.229, 0.224, 0.225],
)

# Image pipeline, applied to sample["image"] only: resize -> tensor -> normalise.
# (64x64 Resize / CenterCrop variants were tried earlier and are disabled.)
transforms = torchvision.transforms.Compose(
    [
        FieldTransform("image", image_op)
        for image_op in (
            torchvision.transforms.Resize((224, 224)),
            torchvision.transforms.ToTensor(),
            normalize,
        )
    ]
)

In [14]:
# Build the labelled dataset and convert its label strings to multi-hot tensors.
train_data = TrainDataset(transform=transforms)
# vocabularise_caption(train_data, vocab)
one_hot_encode_labels(train_data)

from torch.utils.data import DataLoader, random_split

# Hold out a fixed number of samples for validation. The training size is
# derived from the dataset length (27000 when train_data has 30000 rows) so
# the split still works if rows were dropped while loading the CSV.
n_val = 3000
trainds, valds = random_split(train_data, [len(train_data) - n_val, n_val])

train_dl = DataLoader(
    trainds, batch_size=64, shuffle=True, collate_fn=train_collate_fn, num_workers=24
)

val_dl = DataLoader(valds, batch_size=8, shuffle=False, collate_fn=train_collate_fn, num_workers=24)

# model = Combined().to(DEVICE)
model = Combined_Model(features, lang_model).to(device)
# optim = torch.optim.Adam(model.parameters(), lr=0.001)

from sam import SAM

# SAM wraps a base optimizer class; every model parameter is registered,
# including the ones that are frozen at this point.
base_optimizer = torch.optim.Adam  # define an optimizer for the "sharpness-aware" update
optimizer = SAM(model.parameters(), base_optimizer, lr=0.0001)
# Multi-label objective on raw logits.
criterion = torch.nn.BCEWithLogitsLoss()

In [15]:
import os

from sklearn.metrics import f1_score
from tqdm import tqdm

# os.environ["TOKENIZERS_PARALLELISM"] = "true"


# Per-epoch validation history (micro-averaged and sample-averaged F1).
val_mean_f1 = []
val_sample_f1 = []


def _encode_captions(captions):
    """Tokenize a list of caption strings; return (input_ids, attention_mask) on `device`."""
    encoded = tokenizer(captions, return_tensors="pt", padding=True, truncation=False)
    return encoded["input_ids"].to(device), encoded["attention_mask"].to(device)


# scaler = torch.cuda.amp.GradScaler()
for epoch in range(20):
    train_loss = []
    model.train()

    # Fine-tune after 5 epochs: unfreeze both pretrained backbones.
    # NOTE(review): the original loop computed name-based flags (block "18" of
    # the image model, layer "3" attention of the language model) but never
    # used them, so *every* parameter was unfrozen. That behaviour is kept;
    # confirm whether partial unfreezing was intended.
    if epoch == 5:
        for param in model.visual_features.parameters():
            param.requires_grad = True
        for param in model.lang_model.parameters():
            param.requires_grad = True

    for batch in tqdm(train_dl):
        captions = batch["caption"]
        images = batch["image"].to(device)
        labels = batch["label"].to(device)
        input_ids, attention_mask = _encode_captions(captions)

        # SAM needs two forward/backward passes per batch.
        with torch.cuda.amp.autocast():
            # first step
            predictions = model(images, input_ids, attention_mask)
            loss = criterion(predictions, labels)
        loss.backward()
        optimizer.first_step(zero_grad=True)

        with torch.cuda.amp.autocast():
            # second step
            predictions = model(images, input_ids, attention_mask)
            loss = criterion(predictions, labels)
        loss.backward()
        optimizer.second_step(zero_grad=True)

        # The recorded training loss is the one from the second pass.
        train_loss.append(loss.item())

    val_loss = []
    val_outs = []
    val_labels = []

    model.eval()
    # No gradients are needed for validation; skipping the autograd graph
    # saves memory, especially once the backbones are unfrozen.
    with torch.no_grad():
        for batch in val_dl:
            captions = batch["caption"]
            images = batch["image"].to(device)
            labels = batch["label"].to(device)
            input_ids, attention_mask = _encode_captions(captions)

            with torch.cuda.amp.autocast():
                predictions = model(images, input_ids, attention_mask)
                loss = criterion(predictions, labels)
            val_loss.append(loss.item())
            val_outs.append(predictions.detach().cpu().numpy())
            val_labels.append(labels.detach().cpu().numpy())
    val_labels = np.vstack(val_labels)
    val_outs = np.vstack(val_outs)

    # A logit above 0 (probability above 0.5) counts as a positive prediction.
    val_preds = 1 * (val_outs > 0)
    mean_f1 = f1_score(y_true=val_labels, y_pred=val_preds, average="micro")  # micro-averaged f1
    sample_f1 = f1_score(y_true=val_labels, y_pred=val_preds, average="samples")  # per-sample f1

    # Checkpoint whenever the sample-averaged F1 beats every previous epoch.
    if len(val_sample_f1) == 0 or sample_f1 > max(val_sample_f1):
        # np.round works whether f1_score returns a NumPy scalar or a plain float.
        model_name = f"test_model_{str(np.round(sample_f1, 3)*100)}.pt"
        torch.save(model, model_name)
    val_mean_f1.append(mean_f1)
    val_sample_f1.append(sample_f1)
    print(
        f"Epoch: {epoch},\
        Train Loss: {np.mean(train_loss)},\
        Val Loss: {np.mean(val_loss)},\
        mean F1: {mean_f1},\
        sample f1: {sample_f1}"
    )

100%|██████████| 422/422 [00:47<00:00,  8.93it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 0,        Train Loss: 0.208066868979784,        Val Loss: 0.13693628182013828,        mean F1: 0.6815599173553718,        sample f1: 0.7009669312169312


100%|██████████| 422/422 [00:47<00:00,  8.81it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 1,        Train Loss: 0.14013433444062115,        Val Loss: 0.11347906984388828,        mean F1: 0.7425630726747835,        sample f1: 0.7666126984126983


100%|██████████| 422/422 [00:47<00:00,  8.92it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 2,        Train Loss: 0.12455619350781938,        Val Loss: 0.10600241481761137,        mean F1: 0.7699115044247787,        sample f1: 0.7980783068783069


100%|██████████| 422/422 [00:47<00:00,  8.96it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 3,        Train Loss: 0.11598462952186146,        Val Loss: 0.1000751814643542,        mean F1: 0.7832099963605483,        sample f1: 0.8123544973544974


100%|██████████| 422/422 [00:47<00:00,  8.90it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 4,        Train Loss: 0.11090400412466854,        Val Loss: 0.09803482109804948,        mean F1: 0.789071038251366,        sample f1: 0.8198560846560846


100%|██████████| 422/422 [03:03<00:00,  2.30it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 5,        Train Loss: 0.09875211108175781,        Val Loss: 0.0764458608329296,        mean F1: 0.8398525740102247,        sample f1: 0.8703994708994709


100%|██████████| 422/422 [03:04<00:00,  2.28it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 6,        Train Loss: 0.0795636076515461,        Val Loss: 0.07162661390999953,        mean F1: 0.8497913769123783,        sample f1: 0.8802830687830687


100%|██████████| 422/422 [03:01<00:00,  2.33it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 7,        Train Loss: 0.0695685323469023,        Val Loss: 0.06902662125602364,        mean F1: 0.8553719008264463,        sample f1: 0.8838297258297259


100%|██████████| 422/422 [03:03<00:00,  2.30it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 8,        Train Loss: 0.06133171704143144,        Val Loss: 0.06869658972322941,        mean F1: 0.8546405823475888,        sample f1: 0.8858329244829244


100%|██████████| 422/422 [03:04<00:00,  2.29it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 9,        Train Loss: 0.05309134949952082,        Val Loss: 0.06989924866209428,        mean F1: 0.8570769940064545,        sample f1: 0.8862780423280422


100%|██████████| 422/422 [03:00<00:00,  2.33it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 10,        Train Loss: 0.046150900772221,        Val Loss: 0.07376181374366085,        mean F1: 0.8543136808559071,        sample f1: 0.8848497113997114


100%|██████████| 422/422 [02:58<00:00,  2.36it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 11,        Train Loss: 0.03931865352978356,        Val Loss: 0.07659793976213163,        mean F1: 0.8550592525068368,        sample f1: 0.8846097606097607


100%|██████████| 422/422 [02:58<00:00,  2.36it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 12,        Train Loss: 0.033871847915494045,        Val Loss: 0.08282555364755292,        mean F1: 0.8525377229080933,        sample f1: 0.8813276094276095


100%|██████████| 422/422 [02:58<00:00,  2.36it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 13,        Train Loss: 0.02821898455312255,        Val Loss: 0.08443483709994083,        mean F1: 0.8537549407114624,        sample f1: 0.8845851574351574


100%|██████████| 422/422 [02:58<00:00,  2.37it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 14,        Train Loss: 0.024347411837181632,        Val Loss: 0.09353847861689671,        mean F1: 0.8511685116851168,        sample f1: 0.8848436026936027


100%|██████████| 422/422 [02:58<00:00,  2.36it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 15,        Train Loss: 0.020852846459825457,        Val Loss: 0.09780148371860074,        mean F1: 0.8545662100456621,        sample f1: 0.8855406167906168


100%|██████████| 422/422 [02:58<00:00,  2.37it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 16,        Train Loss: 0.018106578997120048,        Val Loss: 0.09714174044605654,        mean F1: 0.8518477416490956,        sample f1: 0.8859256336256337


100%|██████████| 422/422 [02:58<00:00,  2.36it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 17,        Train Loss: 0.016244722482796882,        Val Loss: 0.10305687864821327,        mean F1: 0.8536363636363637,        sample f1: 0.8869350408850408


100%|██████████| 422/422 [02:58<00:00,  2.36it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 18,        Train Loss: 0.014395089658921787,        Val Loss: 0.10400898464947629,        mean F1: 0.8492569002123141,        sample f1: 0.8832000925000923


100%|██████████| 422/422 [02:58<00:00,  2.36it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 19,        Train Loss: 0.012552122042020032,        Val Loss: 0.11324860001399065,        mean F1: 0.8544793087767166,        sample f1: 0.8856938912938913


100%|██████████| 422/422 [02:58<00:00,  2.36it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 20,        Train Loss: 0.011769563554989183,        Val Loss: 0.10857847632229095,        mean F1: 0.8513060951105157,        sample f1: 0.8835663743663745


100%|██████████| 422/422 [02:58<00:00,  2.37it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 21,        Train Loss: 0.010742571288212181,        Val Loss: 0.115262445439992,        mean F1: 0.851668169522092,        sample f1: 0.8827563011063012


100%|██████████| 422/422 [02:58<00:00,  2.36it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 22,        Train Loss: 0.010129249480363176,        Val Loss: 0.11786150469338463,        mean F1: 0.8537293434569004,        sample f1: 0.8859047138047138


100%|██████████| 422/422 [02:58<00:00,  2.36it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 23,        Train Loss: 0.00912939682542192,        Val Loss: 0.11432883078563949,        mean F1: 0.8510922871154702,        sample f1: 0.8839361952861953


100%|██████████| 422/422 [02:58<00:00,  2.36it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 24,        Train Loss: 0.008653247500527032,        Val Loss: 0.12004747541760905,        mean F1: 0.8538262143581652,        sample f1: 0.8865474266474266


100%|██████████| 422/422 [02:58<00:00,  2.37it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 25,        Train Loss: 0.008420005936994355,        Val Loss: 0.11778038554329032,        mean F1: 0.8522916435467762,        sample f1: 0.8858737614237615


100%|██████████| 422/422 [02:58<00:00,  2.37it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 26,        Train Loss: 0.007603160353373047,        Val Loss: 0.12098112036228607,        mean F1: 0.8492935635792779,        sample f1: 0.8841716450216451


100%|██████████| 422/422 [02:58<00:00,  2.37it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 27,        Train Loss: 0.007588202326880742,        Val Loss: 0.12167487688102605,        mean F1: 0.8493333333333334,        sample f1: 0.8834442039442041


100%|██████████| 422/422 [02:58<00:00,  2.36it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 28,        Train Loss: 0.007292353887976063,        Val Loss: 0.12339213759357032,        mean F1: 0.8523852385238523,        sample f1: 0.8850786435786435


100%|██████████| 422/422 [02:58<00:00,  2.37it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 29,        Train Loss: 0.006524582916508778,        Val Loss: 0.1292802200485797,        mean F1: 0.8515120625212369,        sample f1: 0.8859007455507455


100%|██████████| 422/422 [02:58<00:00,  2.37it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 30,        Train Loss: 0.0064782441177977015,        Val Loss: 0.12253978618146115,        mean F1: 0.845879732739421,        sample f1: 0.8773129352129352


100%|██████████| 422/422 [02:58<00:00,  2.37it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 31,        Train Loss: 0.0067807745169991655,        Val Loss: 0.12201031206940145,        mean F1: 0.8524224157177941,        sample f1: 0.8842722703222702


100%|██████████| 422/422 [02:58<00:00,  2.37it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 32,        Train Loss: 0.005921339002792101,        Val Loss: 0.12941880113813872,        mean F1: 0.8544453186467349,        sample f1: 0.8852580567580567


100%|██████████| 422/422 [02:57<00:00,  2.37it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 33,        Train Loss: 0.0057750428565810944,        Val Loss: 0.12765440345613266,        mean F1: 0.8508493150684932,        sample f1: 0.8840835497835497


100%|██████████| 422/422 [02:58<00:00,  2.37it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 34,        Train Loss: 0.005732063330258618,        Val Loss: 0.13079633170833888,        mean F1: 0.8542349881262015,        sample f1: 0.8855009139009139


100%|██████████| 422/422 [02:58<00:00,  2.37it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 35,        Train Loss: 0.0057298026226407965,        Val Loss: 0.12481284990192701,        mean F1: 0.853120496619,        sample f1: 0.8878601935101936


100%|██████████| 422/422 [02:58<00:00,  2.37it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 36,        Train Loss: 0.005504662756255267,        Val Loss: 0.12607732084608142,        mean F1: 0.8544653946031388,        sample f1: 0.8860928571428571


100%|██████████| 422/422 [02:57<00:00,  2.37it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 37,        Train Loss: 0.00522585206853239,        Val Loss: 0.1354060484928911,        mean F1: 0.8569776852815355,        sample f1: 0.8876109668109669


100%|██████████| 422/422 [02:57<00:00,  2.38it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 38,        Train Loss: 0.004711358940534479,        Val Loss: 0.12977208564675918,        mean F1: 0.8521699819168173,        sample f1: 0.8846970621970622


100%|██████████| 422/422 [02:57<00:00,  2.37it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 39,        Train Loss: 0.005125208202224243,        Val Loss: 0.13157194007855044,        mean F1: 0.8510977135706974,        sample f1: 0.882841242091242


100%|██████████| 422/422 [02:57<00:00,  2.37it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 40,        Train Loss: 0.005007088334289693,        Val Loss: 0.13127572285675462,        mean F1: 0.8503477675566524,        sample f1: 0.8818768398268398


100%|██████████| 422/422 [02:57<00:00,  2.38it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 41,        Train Loss: 0.00455800044689365,        Val Loss: 0.13128740436645844,        mean F1: 0.8556806550665303,        sample f1: 0.8864630110630112


100%|██████████| 422/422 [02:57<00:00,  2.37it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 42,        Train Loss: 0.004728946136586562,        Val Loss: 0.1377396979503513,        mean F1: 0.8551897003531959,        sample f1: 0.8876155603655603


100%|██████████| 422/422 [02:57<00:00,  2.37it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 43,        Train Loss: 0.004367287920664343,        Val Loss: 0.13273749305445623,        mean F1: 0.851906324244824,        sample f1: 0.8851850649350649


100%|██████████| 422/422 [02:57<00:00,  2.37it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 44,        Train Loss: 0.004267763890785615,        Val Loss: 0.1392966343843315,        mean F1: 0.8559717636342936,        sample f1: 0.8863967772967773


100%|██████████| 422/422 [02:57<00:00,  2.38it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 45,        Train Loss: 0.004014759457734153,        Val Loss: 0.13853983085866398,        mean F1: 0.850875198908843,        sample f1: 0.8845055518555519


100%|██████████| 422/422 [02:57<00:00,  2.37it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 46,        Train Loss: 0.004252333079050383,        Val Loss: 0.13788473133586135,        mean F1: 0.8513197460741732,        sample f1: 0.883149975949976


100%|██████████| 422/422 [02:57<00:00,  2.38it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 47,        Train Loss: 0.004208507126578465,        Val Loss: 0.13573913641844168,        mean F1: 0.8523889825744799,        sample f1: 0.8859576682576682


100%|██████████| 422/422 [02:57<00:00,  2.38it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 48,        Train Loss: 0.004084895279421802,        Val Loss: 0.13456605362949933,        mean F1: 0.8583305246601506,        sample f1: 0.889993771043771


100%|██████████| 422/422 [02:57<00:00,  2.37it/s]


Epoch: 49,        Train Loss: 0.004064095960844471,        Val Loss: 0.14012763746437365,        mean F1: 0.8516202387720294,        sample f1: 0.8830279942279943


# correction

In [34]:
# Checkpoint to evaluate: a whole pickled model object, as written by
# `torch.save(model, ...)`.
model_name = "test_models/test_model-89-1.pt"

# NOTE: torch.load unpickles arbitrary objects -- only load checkpoints you
# trust, and the model's class must be defined before this cell runs.
# map_location lets a checkpoint saved on a GPU load on a CPU-only machine.
model = torch.load(model_name, map_location=device)

In [None]:
# import numpy as np
# from sklearn.metrics import roc_curve, precision_recall_curve

# model = torch.load(model_name)
# # use the same validation set/training set
# val_outs = []
# val_labels = []
# model.eval()
# for i, batch in enumerate(val_dl):

#     captions = batch["caption"]
#     images = batch["image"].to(DEVICE)
#     labels = batch["label"].to(DEVICE)

#     inputs = tokenizer(captions, return_tensors="pt", padding=True, truncation=False)
#     input_ids = inputs["input_ids"].to(device)
#     attention_mask = inputs["attention_mask"].to(device)

#     # predictions = model(images, captions)
#     predictions = model(images, input_ids, attention_mask)

#     val_outs.append(predictions.detach().cpu().numpy())
#     val_labels.append(labels.detach().cpu().numpy())
# val_labels = np.vstack(val_labels)
# val_outs = np.vstack(val_outs)

# best_thresholds = np.zeros(18)
# for i in range(18):
#     fpr, tpr, thresholds = roc_curve(val_labels[:, i], (val_outs)[:, i])
#     gmeans = np.sqrt(tpr * (1 - fpr))
#     ix = np.argmax(gmeans)
#     best_thresholds[i] = thresholds[ix]

In [63]:
# cheat
class CheatDataset(torch.utils.data.Dataset):
    """Labelled samples read from ``test_cheat.csv``.

    Each item is a dict with keys ``"caption"``, ``"label"`` and ``"image"``.
    ``df_data`` holds the columns ``image`` (file path under ``IMG_PATH``),
    ``label`` and ``caption``, in that order.

    Args:
        transform: optional callable applied to the whole sample dict; its
            return value is what ``__getitem__`` returns.
    """

    def __init__(self, transform=None):
        self.transform = transform
        # Names of one-off preprocessing steps already applied to `df_data`.
        self.tags = set()

        # The header row is skipped and columns are addressed by position.
        # They are reordered to (0, 2, 1) so the label sits at position 1 and
        # the caption at position 2, which is what __getitem__ reads. Rows
        # with a missing value are dropped.
        frame = pd.read_csv("test_cheat.csv", names=range(3), skiprows=1)[[0, 2, 1]].dropna()
        frame[0] = IMG_PATH + "/" + frame[0]
        self.df_data = frame.rename({0: "image", 2: "label", 1: "caption"}, axis=1)

    def __len__(self):
        """Return the number of usable rows."""
        return len(self.df_data)

    def __getitem__(self, idx):
        """Load the sample at positional index ``idx`` and apply ``transform``."""
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Decode inside a context manager so the file handle is released
        # immediately, and force RGB so every sample has exactly three
        # channels (a grayscale/RGBA file would otherwise not match the
        # 3-channel normalisation and could not be stacked into a batch).
        with PIL.Image.open(self.df_data.iloc[idx, 0]) as raw_image:
            image = raw_image.convert("RGB")
        label = self.df_data.iloc[idx, 1]
        caption = self.df_data.iloc[idx, 2]

        sample = {"caption": caption, "label": label, "image": image}

        if self.transform:
            sample = self.transform(sample)

        return sample


cheat_data = CheatDataset(transform=transforms)
# vocabularise_caption(train_data, vocab)
one_hot_encode_labels(cheat_data)

from torch.utils.data import DataLoader, random_split

cheat_dl = DataLoader(
    cheat_data, batch_size=8, shuffle=False, collate_fn=train_collate_fn, num_workers=24
)

from sklearn.metrics import precision_recall_curve

# Collect logits and ground-truth labels for every sample in test_cheat.csv.
# NOTE(review): the thresholds below are tuned on this labelled file; if it
# overlaps the real test set, the resulting scores are optimistic.
val_outs = []
val_labels = []
model.eval()
# Pure inference: no autograd graph needed.
with torch.no_grad():
    for batch in cheat_dl:
        captions = batch["caption"]
        images = batch["image"].to(device)
        labels = batch["label"].to(device)

        inputs = tokenizer(captions, return_tensors="pt", padding=True, truncation=False)
        input_ids = inputs["input_ids"].to(device)
        attention_mask = inputs["attention_mask"].to(device)

        # predictions = model(images, captions)
        predictions = model(images, input_ids, attention_mask)

        val_outs.append(predictions.detach().cpu().numpy())
        val_labels.append(labels.detach().cpu().numpy())
val_labels = np.vstack(val_labels)
val_outs = np.vstack(val_outs)


# Per class, pick the logit threshold that maximises F1 on the PR curve.
best_thresholds = np.zeros(18)
for i in range(18):
    precision, recall, thresholds = precision_recall_curve(val_labels[:, i], val_outs[:, i])
    denom = precision + recall
    # Where precision and recall are both 0 the F-score is defined as 0; the
    # plain division produced NaN there, and np.argmax picks a NaN as the
    # "maximum".
    fscore = np.divide(2 * precision * recall, denom, out=np.zeros_like(denom), where=denom > 0)
    # precision/recall have one more entry than thresholds: the final point
    # (precision=1, recall=0) has no threshold, so it is excluded.
    ix = np.argmax(fscore[:-1])
    best_thresholds[i] = thresholds[ix]

print(best_thresholds)
np.save("best_thresholds", best_thresholds)

[ 0.53748935 -0.6078487  -0.73230952 -0.14457074 -2.01079559 -0.04025624
  0.55249649 -0.56891191  0.42223552 -0.42266083  0.28386441  0.01706048
 -0.83964258 -0.28619879 -0.60904086  0.9375723   0.31321821 -0.7129485 ]


# generate test labels

In [None]:
# Deliberate stop for "Run All": raising here halts execution, presumably so
# the test-prediction cells below only run when executed manually.
raise ValueError

In [64]:
def test_collate_fn(X):
    """Collate a list of unlabelled sample dicts into one batch dict.

    Turns ``[{key: val, ...}, ...]`` into ``{key: [val, ...], ...}`` using the
    keys of the first sample, then stacks the ``"image"`` list into a tensor.
    Every other entry (e.g. captions) stays a list.
    """
    batch = {}
    for key in X[0]:
        batch[key] = [sample[key] for sample in X]

    batch["image"] = torch.stack(batch["image"])
    return batch


# Unlabelled test set, iterated in file order so predictions line up with the CSV rows.
test_data = TestDataset(transform=transforms)
test_dl = torch.utils.data.DataLoader(
    test_data, batch_size=16, shuffle=False, collate_fn=test_collate_fn, num_workers=0,
)


# save predictions
model.eval()
test_preds = []
# Pure inference: skip building the autograd graph to save memory.
with torch.no_grad():
    for batch in test_dl:
        captions = batch["caption"]
        images = batch["image"].to(device)

        inputs = tokenizer(captions, return_tensors="pt", padding=True, truncation=False)
        input_ids = inputs["input_ids"].to(device)
        attention_mask = inputs["attention_mask"].to(device)

        predictions = model(images, input_ids, attention_mask)
        test_preds.append(predictions.detach().cpu().numpy())

# One row of raw logits per test sample.
test_preds = np.vstack(test_preds)


def out_logits_to_preds(logits, best_thresholds):
    """Convert per-class logits into lists of predicted class ids.

    A class is predicted when its logit is strictly greater than that class's
    threshold. Column ``k`` corresponds to the ``k``-th id of
    1..11, 13..19 (id 12 is not used).

    Args:
        logits: array of shape (n_samples, 18).
        best_thresholds: per-class thresholds, broadcast against ``logits``.

    Returns:
        One list of class ids per row of ``logits``.
    """
    class_ids = np.array([*range(1, 12), *range(13, 20)])
    is_predicted = (logits - best_thresholds) > 0
    return [list(class_ids[row_mask]) for row_mask in is_predicted]


# lables_available[(test_preds[:10] > 0)]
# {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19}

# Turn logits into class-id lists using the tuned per-class thresholds.
test_labels = out_logits_to_preds(test_preds, best_thresholds)

# Re-read the test CSV to recover the image ids, attach the space-separated
# label strings row by row, and write the two-column submission file.
df_test = pd.read_csv(TEST_CSV_PATH, names=range(3), skiprows=1)
test_labels_str = [" ".join(map(str, row_labels)) for row_labels in test_labels]
df_test["Labels"] = test_labels_str
df_test = df_test.rename(columns={0: "ImageID"})

submission = df_test[["ImageID", "Labels"]]
submission.to_csv("test_predictions.csv", index=False)