# COMP5329 - Assignment 2

In [1]:
# import google
import collections
import json
import re

import nltk
import numpy as np
import pandas
import pandas as pd

# import torchtext
import PIL.Image
import torch
import torch.nn as nn
import torchvision

DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# MOUNT_PATH = '/content/drive'
# DRIVE_PATH = f'{MOUNT_PATH}/My Drive'
# PROJECT_PATH = DRIVE_PATH + "/Assignment 2"
PROJECT_PATH = "./"
IMG_PATH = f"{PROJECT_PATH}/data"
TRAIN_CSV_PATH = f"{PROJECT_PATH}/train.csv"
TEST_CSV_PATH = f"{PROJECT_PATH}/test.csv"

# google.colab.drive.mount(MOUNT_PATH)

## Dataset


### Train dataset

In [2]:
# df = pandas.read_csv(TRAIN_CSV_PATH, names=range(4), skiprows=1)
# classes = df[1].apply(lambda x: list(map(int, x.strip().split()))).to_list()
# n_classes = set()
# for i in classes:
#     n_classes.update(i)
# n_clsses

In [3]:
class TrainDataset(torch.utils.data.Dataset):
    def __init__(self, transform=None):
        self.transform = transform
        self.tags = set()
        self.df_data = pandas.read_csv(TRAIN_CSV_PATH, names=range(4), skiprows=1)
        self.df_data[0] = IMG_PATH + "/" + self.df_data[0]
        self.df_data[3] = self.df_data[3].fillna("")
        self.df_data[2] += self.df_data[3]
        self.df_data = self.df_data.drop(3, axis=1)
        self.df_data = self.df_data.rename({0: "image", 1: "label", 2: "caption"}, axis=1)

    def __len__(self):
        return len(self.df_data)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        image = PIL.Image.open(self.df_data.iloc[idx, 0])
        label = self.df_data.iloc[idx, 1]
        caption = self.df_data.iloc[idx, 2]

        sample = {"caption": caption, "label": label, "image": image}

        if self.transform:
            sample = self.transform(sample)

        return sample

### Test dataset

In [4]:
class TestDataset(torch.utils.data.Dataset):
    def __init__(self, transform=None):
        self.transform = transform
        self.tags = set()
        self.df_data = pandas.read_csv(TEST_CSV_PATH, names=range(3), skiprows=1)
        self.df_data[0] = IMG_PATH + "/" + self.df_data[0]
        self.df_data[2] = self.df_data[2].fillna("")
        self.df_data[1] += self.df_data[2]
        self.df_data = self.df_data.drop(2, axis=1)
        self.df_data = self.df_data.rename({0: "image", 1: "caption"}, axis=1)

    def __len__(self):
        return len(self.df_data)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()

        image = PIL.Image.open(self.df_data.iloc[idx, 0])
        caption = self.df_data.iloc[idx, 1]

        sample = {"caption": caption, "image": image}

        if self.transform:
            sample = self.transform(sample)

        return sample

## Preprocessing

### Vocab

In [5]:
# nltk.download("stopwords")
# stopwords = set(nltk.corpus.stopwords.words("english"))
# basic_tokenizer = torchtext.data.utils.get_tokenizer("basic_english")
# glove = torchtext.vocab.GloVe(name="6B", dim=100)

# with open(f"{PROJECT_PATH}/correct_spelling.json", "r") as f:
#     correct_spelling = json.load(f)


# def caption_tokenizer(caption):
#     caption = re.sub("[^a-zA-Z]", " ", caption).lower()
#     raw_tokens = basic_tokenizer(caption)
#     raw_tokens = list(set(raw_tokens).difference(stopwords))
#     tokens = []

#     for token in raw_tokens:

#         if token in correct_spelling:
#             corrected_spelling = correct_spelling[token]

#             for corrected_token in corrected_spelling.split():
#                 if corrected_token in glove.stoi:
#                     tokens.append(corrected_token)
#         else:
#             if token in glove.stoi:
#                 tokens.append(token)

#     return tokens

In [6]:
# train_data = TrainDataset()
# test_data = TestDataset()
# counter = collections.Counter()

# for caption in train_data.df_data["caption"]:
#     counter.update(caption_tokenizer(caption))

# for caption in test_data.df_data["caption"]:
#     counter.update(caption_tokenizer(caption))

# vocab = torchtext.vocab.Vocab(counter, vectors="glove.6B.100d", specials=("<unk>", "<BOS>", "<EOS>", "<PAD>"))

# del train_data
# del test_data

### Dataset pre-transformations 

In [7]:
# def vocabularise_caption(dataset, vocab):

#     if "vocabularised_caption" not in dataset.tags:
#         # turns string caption to list of vocab indices
#         dataset.df_data["caption"] = dataset.df_data["caption"].apply(
#             lambda c: torch.tensor([vocab.stoi[t] for t in caption_tokenizer(c)])
#         )

#         dataset.tags.add("vocabularised_caption")


def one_hot_encode_labels(dataset):
    if "one_hot_encoded_labels" not in dataset.tags:
        dataset.df_data["label"] = dataset.df_data["label"].apply(
            lambda l: torch.nn.functional.one_hot(
                torch.tensor([int(i) - 1 if int(i) < 12 else int(i) - 2 for i in l.split(" ")]), 18
            )
            .sum(axis=0)
            .float()
        )

        dataset.tags.add("one_hot_encoded_labels")

## Modules


### Caption embedding

In [8]:
# class CaptionEmbedding(torch.nn.Module):
#     def __init__(self, vocab):
#         super(CaptionEmbedding, self).__init__()
#         self.word_embedding = torch.nn.Embedding.from_pretrained(vocab.vectors)
#         self.linear1 = torch.nn.Linear(vocab.vectors.shape[1], 18)
#         self.lrelu1 = torch.nn.LeakyReLU()
#         self.linear2 = torch.nn.Linear(18, 1)
#         self.lrelu2 = torch.nn.LeakyReLU()
#         self.lrelu3 = torch.nn.LeakyReLU()

#     def forward(self, captions):
#         def _embed_caption(c):
#             c = c.to(DEVICE)
#             c = self.word_embedding(c)
#             word_importance = self.linear1(c)
#             word_importance = self.lrelu1(word_importance)
#             word_importance = self.linear2(word_importance)
#             word_importance = self.lrelu2(word_importance)
#             return self.lrelu3(c.T @ word_importance).view(-1)

#         return torch.stack([*map(lambda c: _embed_caption(c), captions)])

from transformers import AutoFeatureExtractor, AutoModel, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("prajjwal1/bert-tiny")
lang_model = AutoModel.from_pretrained("prajjwal1/bert-tiny")
lang_model.requires_grad = False

Some weights of the model checkpoint at prajjwal1/bert-tiny were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


### Pretrained model surgery

In [9]:
# class Surgery(torch.nn.Module):
#     def __init__(self, name, selection, training_disabled=True):
#         super(Surgery, self).__init__()
#         model = getattr(torchvision.models, name)(pretrained=True)
#         children = list(model.children())

#         self.sequential = torch.nn.Sequential(*[children[i] for i in selection])

#         if training_disabled:
#             for param in self.sequential.parameters():
#                 param.requires_grad = False

#     def forward(self, x):

#         return self.sequential(x)


# def surgery_info(name):
#     model = getattr(torchvision.models, name)()

#     for idx, child in enumerate(model.children()):
#         print(f"Accessible at {idx}:\n{child}\n")

# !pip install efficientnet_pytorch
from efficientnet_pytorch import EfficientNet

# {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19}
features = EfficientNet.from_pretrained("efficientnet-b0")

require_grad = False
for name, param in features.named_parameters():
    #     print(name.split(".")[1])
    if name.split(".")[1] == "14":  # required grad for last 2 blocks
        require_grad = True
    param.require_grad = require_grad

Loaded pretrained weights for efficientnet-b0


### Combined model

In [10]:
NUM_ClASSES = 18
import math

import torch.nn as nn


class Combined_Model(torch.nn.Module):
    def __init__(self, visual_features, lang_model):
        super().__init__()
        self.visual_features = visual_features
        self.lang_model = lang_model
        self.proj = nn.Linear(1000, 128)  # (feature.shape, hidden.shape)
        self.scale = math.sqrt(128)
        self.activation = nn.ReLU()
        self.out = nn.Linear(256, NUM_ClASSES)

    def forward(self, images, input_ids, attention_mask):  # requires tokenized captions
        images = self.visual_features(images)
        images = self.proj(images)

        lang_outs = self.lang_model(input_ids=input_ids, attention_mask=attention_mask)
        hidden = lang_outs["last_hidden_state"]

        attention_weights = (
            torch.bmm(hidden, images.unsqueeze(-1)).squeeze(-1) / self.scale
        )  # (batch, seq_len, 1)
        attention_out = torch.bmm(attention_weights.unsqueeze(1), hidden).squeeze(1)

        return self.out(self.activation(torch.cat([images, attention_out], dim=-1)))

In [11]:
# class Combined(torch.nn.Module):
#     def __init__(self):
#         super(Combined, self).__init__()
#         # self.image_embedder = Surgery('vgg16', [0])
#         self.image_embedder = features
#         # self.caption_embedder = CaptionEmbedding(vocab)

#         # self.linear1 = torch.nn.Linear(2148, 256)
#         self.linear1 = torch.nn.Linear(1000, 256)
#         self.lrelu1 = torch.nn.LeakyReLU()
#         # self.linear2 = torch.nn.Linear(256, 18)
#         # self.lrelu2 = torch.nn.LeakyReLU()

#         self.heads = nn.ModuleList([nn.Linear(256, 1) for i in range(NUM_ClASSES)])

#     def forward(self, images, captions):
#         image_embeddings = self.image_embedder(images)
#         # image_embeddings = image_embeddings.view(images.shape[0], -1) #???

#         # caption_embeddings = self.caption_embedder(captions)
#         # caption_embeddings = caption_embeddings.view(caption_embeddings.shape[0], -1)

#         # combined_embedding = torch.cat([image_embeddings, caption_embeddings], 1)
#         # y = self.linear1(combined_embedding)

#         y = self.linear1(image_embeddings)
#         y = self.lrelu1(y)

#         outs = []
#         for i in range(NUM_ClASSES):
#             outs.append(self.heads[i](y))

#         return torch.cat(outs, dim=-1)

## Transforms and training support

In [12]:
class FieldTransform(object):
    def __init__(self, field, transform):
        self.field = field
        self.transform = transform

    def __call__(self, sample):
        sample[self.field] = self.transform(sample[self.field])
        return sample

## Experiments

In [13]:
def train_collate_fn(X):
    # convert [{key: val, ...}, ...]
    # to [key: [val, ...],  ...}
    X = {k: [v[k] for v in X] for k in X[0]}
    X["label"] = torch.stack(X["label"])
    X["image"] = torch.stack(X["image"])

    return X


normalize = torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

transforms = torchvision.transforms.Compose(
    [
        #   FieldTransform('image', torchvision.transforms.Resize((64, 64))),
        #   FieldTransform('image', torchvision.transforms.CenterCrop(64)),
        FieldTransform("image", torchvision.transforms.Resize((224, 224))),
        FieldTransform("image", torchvision.transforms.ToTensor()),
        FieldTransform("image", normalize),
    ]
)

In [14]:
train_data = TrainDataset(transform=transforms)
# vocabularise_caption(train_data, vocab)
one_hot_encode_labels(train_data)

# model = Combined().to(DEVICE)
model = Combined_Model(features, lang_model).to(device)
optim = torch.optim.Adam(model.parameters(),lr = 0.0001)
criterion = torch.nn.BCEWithLogitsLoss()

# train_data.__len__() == 30000
trainds, valds = torch.utils.data.random_split(train_data, [27000, 3000])

train_dl = torch.utils.data.DataLoader(
    trainds, batch_size=64, shuffle=True, collate_fn=train_collate_fn, num_workers=24,
)

val_dl = torch.utils.data.DataLoader(
    valds, batch_size=8, shuffle=False, collate_fn=train_collate_fn, num_workers=24
)

In [15]:
from sklearn.metrics import f1_score
from tqdm import tqdm

val_f1 = []
scaler = torch.cuda.amp.GradScaler()
for epoch in range(50):
    train_loss = []
    train_outs = []
    train_lables = []

    model.train()
    for i, batch in enumerate(tqdm(train_dl)):
        optim.zero_grad()
        captions = batch["caption"]
        images = batch["image"].to(device)
        labels = batch["label"].to(device)

        inputs = tokenizer(
            captions, return_tensors="pt", padding=True, truncation=True, max_length=40
        )
        input_ids = inputs["input_ids"].to(device)
        attention_mask = inputs["attention_mask"].to(device)

        with torch.cuda.amp.autocast():
            predictions = model(images, input_ids, attention_mask)
            loss = criterion(predictions, labels)

        #
        scaler.scale(loss).backward()
        scaler.step(optim)
        scaler.update()
#         train_loss.append(scaled_loss.item())
        train_loss.append(loss.item())

    val_loss = []
    val_outs = []
    val_labels = []

    
    model.eval()
    for i, batch in enumerate(val_dl):

        captions = batch["caption"]
        images = batch["image"].to(DEVICE)
        labels = batch["label"].to(DEVICE)

        inputs = tokenizer(
            captions, return_tensors="pt", padding=True, truncation=True, max_length=40
        )
        input_ids = inputs["input_ids"].to(device)
        attention_mask = inputs["attention_mask"].to(device)
        
        #         predictions = model(images, captions)
        predictions = model(images, input_ids, attention_mask)
        loss = criterion(predictions, labels)
        val_loss.append(loss.item())
        val_outs.append(predictions.detach().cpu().numpy())
        val_labels.append(labels.detach().cpu().numpy())
    val_labels = np.vstack(val_labels)
    val_outs = np.vstack(val_outs)
    f1 = f1_score(y_true=val_labels, y_pred=1 * (val_outs > 0), average="samples")
    if len(val_f1) == 0 or f1 > min(val_f1):
        torch.save(model, "test_model.pt")
    val_f1.append(f1)
    print(
        f"Epoch: {epoch}, Train Loss: {np.mean(train_loss)}, Val Loss: {np.mean(val_loss)}, F1: {f1}"
    )

100%|██████████| 422/422 [01:13<00:00,  5.76it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 0, Train Loss: 0.22427690230415895, Val Loss: 0.12526776771247386, F1: 0.7864962962962964


100%|██████████| 422/422 [01:12<00:00,  5.85it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 1, Train Loss: 0.1131653900112586, Val Loss: 0.10221032429983219, F1: 0.8331309523809522


100%|██████████| 422/422 [01:11<00:00,  5.88it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 2, Train Loss: 0.09509854064592253, Val Loss: 0.09455411384006342, F1: 0.8480767195767197


100%|██████████| 422/422 [01:10<00:00,  5.98it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 3, Train Loss: 0.08483526234236939, Val Loss: 0.08992218871042132, F1: 0.861554761904762


100%|██████████| 422/422 [01:11<00:00,  5.87it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 4, Train Loss: 0.07662882341592797, Val Loss: 0.08635167007272442, F1: 0.8667267195767197


100%|██████████| 422/422 [01:10<00:00,  6.02it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 5, Train Loss: 0.07057392773311144, Val Loss: 0.08404278780147434, F1: 0.8681457671957672


100%|██████████| 422/422 [01:09<00:00,  6.06it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 6, Train Loss: 0.0646725934687384, Val Loss: 0.08311533896438777, F1: 0.8723773689273691


100%|██████████| 422/422 [01:09<00:00,  6.07it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 7, Train Loss: 0.0588071371177079, Val Loss: 0.08415715181455016, F1: 0.8749452380952382


100%|██████████| 422/422 [01:12<00:00,  5.81it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 8, Train Loss: 0.05412490576752837, Val Loss: 0.08620182118875284, F1: 0.8789531746031746


100%|██████████| 422/422 [01:14<00:00,  5.66it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 9, Train Loss: 0.048907259399715755, Val Loss: 0.09145594121453662, F1: 0.8737526455026454


100%|██████████| 422/422 [01:15<00:00,  5.57it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 10, Train Loss: 0.04360284597140635, Val Loss: 0.09222581857210026, F1: 0.8726945406445407


100%|██████████| 422/422 [01:12<00:00,  5.83it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 11, Train Loss: 0.03874325933888272, Val Loss: 0.09946337987730901, F1: 0.872916354016354


100%|██████████| 422/422 [01:12<00:00,  5.82it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 12, Train Loss: 0.03416787921914487, Val Loss: 0.10385541129702082, F1: 0.8728761183261182


100%|██████████| 422/422 [01:12<00:00,  5.81it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 13, Train Loss: 0.02976147650401174, Val Loss: 0.10992494471340129, F1: 0.8724391534391535


100%|██████████| 422/422 [01:09<00:00,  6.05it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 14, Train Loss: 0.026196358921844954, Val Loss: 0.12005773759734197, F1: 0.8726223184223184


100%|██████████| 422/422 [01:10<00:00,  5.99it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 15, Train Loss: 0.023428243818865, Val Loss: 0.11936209173717846, F1: 0.8696636123136122


100%|██████████| 422/422 [01:12<00:00,  5.81it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 16, Train Loss: 0.020133718538155418, Val Loss: 0.13438180018123239, F1: 0.8727236411736411


100%|██████████| 422/422 [01:13<00:00,  5.75it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 17, Train Loss: 0.018348399080750998, Val Loss: 0.1355632109933067, F1: 0.869273088023088


100%|██████████| 422/422 [01:11<00:00,  5.91it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 18, Train Loss: 0.016556678688533215, Val Loss: 0.13548857188162705, F1: 0.8683131553631553


100%|██████████| 422/422 [01:13<00:00,  5.75it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 19, Train Loss: 0.014070803247131761, Val Loss: 0.15062384279700927, F1: 0.8723885521885523


100%|██████████| 422/422 [01:12<00:00,  5.80it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 20, Train Loss: 0.013033692448585342, Val Loss: 0.15661603413498962, F1: 0.87152506012506


100%|██████████| 422/422 [01:12<00:00,  5.83it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 21, Train Loss: 0.011878777906945267, Val Loss: 0.1648251425541627, F1: 0.8715428811928811


100%|██████████| 422/422 [01:12<00:00,  5.78it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 22, Train Loss: 0.010526084644561059, Val Loss: 0.17017447080144968, F1: 0.8698165223665224


100%|██████████| 422/422 [01:13<00:00,  5.78it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 23, Train Loss: 0.010215512394929366, Val Loss: 0.17221969422104302, F1: 0.8711769841269841


100%|██████████| 422/422 [01:13<00:00,  5.77it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 24, Train Loss: 0.00871878245876334, Val Loss: 0.1781954098175047, F1: 0.8728740740740741


100%|██████████| 422/422 [01:23<00:00,  5.06it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 25, Train Loss: 0.008612048817865807, Val Loss: 0.18048335141444113, F1: 0.870763011063011


100%|██████████| 422/422 [01:13<00:00,  5.74it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 26, Train Loss: 0.007925413751569731, Val Loss: 0.18820414627275509, F1: 0.8711962962962961


100%|██████████| 422/422 [01:15<00:00,  5.58it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 27, Train Loss: 0.007264070913953689, Val Loss: 0.18933897789897552, F1: 0.8713803511303513


100%|██████████| 422/422 [01:14<00:00,  5.69it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 28, Train Loss: 0.006933129749534502, Val Loss: 0.19522928214371496, F1: 0.8724820586820586


100%|██████████| 422/422 [01:12<00:00,  5.84it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 29, Train Loss: 0.006368257390526653, Val Loss: 0.1958871137296422, F1: 0.8705836940836941


100%|██████████| 422/422 [01:14<00:00,  5.64it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 30, Train Loss: 0.00633046254780479, Val Loss: 0.19528370608649373, F1: 0.8702512746512745


100%|██████████| 422/422 [01:13<00:00,  5.78it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 31, Train Loss: 0.006027726213968032, Val Loss: 0.211394925245307, F1: 0.875670731120731


100%|██████████| 422/422 [01:12<00:00,  5.86it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 32, Train Loss: 0.005599334182618274, Val Loss: 0.1933275813755851, F1: 0.8731588263588262


100%|██████████| 422/422 [01:09<00:00,  6.08it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 33, Train Loss: 0.0054251866053965855, Val Loss: 0.22523601199168963, F1: 0.873152477152477


100%|██████████| 422/422 [01:12<00:00,  5.84it/s]
  0%|          | 0/422 [00:00<?, ?it/s]

Epoch: 34, Train Loss: 0.005229638509771795, Val Loss: 0.21473534053738694, F1: 0.8720251322751322


 91%|█████████ | 385/422 [01:06<00:05,  6.20it/s]Process Process-1688:
Process Process-1689:
Process Process-1687:
Process Process-1685:
Process Process-1693:
Process Process-1694:
Process Process-1692:
Process Process-1682:
Process Process-1690:
Process Process-1691:
Traceback (most recent call last):
Traceback (most recent call last):
Traceback (most recent call last):
  File "/home/ztan/miniconda3/lib/python3.8/multiprocessing/process.py", line 318, in _bootstrap
    util._exit_function()
Traceback (most recent call last):
  File "/home/ztan/miniconda3/lib/python3.8/multiprocessing/process.py", line 318, in _bootstrap
    util._exit_function()
Traceback (most recent call last):
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7efcc3f3a1f0>
Traceback (most recent call last):
  File "/home/ztan/miniconda3/lib/python3.8/site-packages/torch/utils/data/dataloader.py", line 1203, in __del__
    self._shutdown_workers()
  File "/home/ztan/miniconda3/lib/python3.8

Traceback (most recent call last):
  File "/home/ztan/miniconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 3437, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-15-9680e2e8347f>", line 30, in <module>
    scaler.step(optim)
  File "/home/ztan/miniconda3/lib/python3.8/site-packages/torch/cuda/amp/grad_scaler.py", line 320, in step
    if not sum(v.item() for v in optimizer_state["found_inf_per_device"].values()):
  File "/home/ztan/miniconda3/lib/python3.8/site-packages/torch/cuda/amp/grad_scaler.py", line 320, in <genexpr>
    if not sum(v.item() for v in optimizer_state["found_inf_per_device"].values()):
KeyboardInterrupt

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/ztan/miniconda3/lib/python3.8/site-packages/IPython/core/interactiveshell.py", line 2061, in showtraceback
    stb = value._render_traceback_()
AttributeError: 'KeyboardInterrupt' obj

TypeError: object of type 'NoneType' has no len()

In [None]:
# stopper for run all
raise ValueError

In [None]:
import torch

model = torch.load("test_model.pt")

def test_collate_fn(X):
    # convert [{key: val, ...}, ...]
    # to [key: [val, ...],  ...}
    X = {k: [v[k] for v in X] for k in X[0]}
    X["image"] = torch.stack(X["image"])

    return X


test_data = TestDataset(transform=transforms)
test_dl = torch.utils.data.DataLoader(
    test_data, batch_size=16, shuffle=False, collate_fn=test_collate_fn, num_workers=0,
)


In [None]:
# save predictions
model.eval()
test_preds = []
for i, batch in enumerate(test_dl):
    captions = batch["caption"]
    images = batch["image"].to(DEVICE)

    predictions = model(images, input_ids, attention_mask)
    test_preds.append(predictions.detach().cpu().numpy())

test_preds = np.vstack(test_preds)

def out_logits_to_preds(logits):
    labels = []
    logits = logits > 0
    lables_available = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19])
    for i in range(logits.shape[0]):
        labels.append(list(lables_available[logits[i]]))
    return labels


# lables_available[(test_preds[:10] > 0)]
# {1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17, 18, 19}
test_labels = out_logits_to_preds(test_preds)

df_test = pd.read_csv(TEST_CSV_PATH, names=range(3), skiprows=1)
test_labels_str = [" ".join([str(i) for i in labels]) for labels in test_labels]
df_test["Labels"] = test_labels_str
df_test.rename({0: "ImageID"}, axis=1, inplace=True)

df_test[["ImageID", "Labels"]].to_csv("test_predictions.csv", index=False)

### Just label model

In [None]:
def weight1(dataset):
    one_hot_encode_labels(dataset)
    labels = torch.stack(dataset.df_data["label"].to_list())
    counts = labels.sum(axis=0)
    weights = labels @ (1 / counts)
    return weights / weights.sum()

In [None]:
def weight2(dataset):
    one_hot_encode_labels(dataset)
    labels = torch.stack(dataset.df_data["label"].to_list())
    dataset.df_data = dataset.df_data[(labels.sum(axis=1) == 1).numpy()]
    dataset.df_data = dataset.df_data.reset_index(drop=True)
    labels = torch.stack(dataset.df_data["label"].to_list())
    counts = labels.sum(axis=0)
    weights = labels @ (1 / counts)
    return weights / weights.sum()

In [None]:
def weight3(dataset):
    one_hot_encode_labels(dataset)
    labels = torch.stack(dataset.df_data["label"].to_list())
    sums = torch.sum(labels * 2 ** torch.arange(18), axis=1)
    uniques, counts = torch.unique(sums, return_counts=True)
    p = 1 / counts
    w = torch.zeros_like(sums)

    for i in range(len(uniques)):
        w[sums == uniques[i]] = p[i]

    return w / w.sum()

In [None]:
class CaptionEmbedding(torch.nn.Module):
    def __init__(self, vocab):
        super(CaptionEmbedding, self).__init__()
        self.word_embedding = torch.nn.Embedding.from_pretrained(vocab.vectors)
        self.linear1 = torch.nn.Linear(vocab.vectors.shape[1], 54)
        self.linear2 = torch.nn.Linear(54, 18)
        self.linear3 = torch.nn.Linear(18, 1)

    def forward(self, captions):
        def _embed_caption(c):
            c = c.to(DEVICE)
            c = self.word_embedding(c)
            word_importance = torch.nn.functional.leaky_relu(self.linear1(c))
            word_importance = torch.nn.functional.leaky_relu(self.linear2(word_importance))
            word_importance = torch.nn.functional.leaky_relu(self.linear3(word_importance))
            return (c.T @ word_importance).view(-1)

        return torch.stack([*map(lambda c: _embed_caption(c), captions)])

In [None]:
class AlexJustLabelModel(torch.nn.Module):
    def __init__(self):
        super(AlexJustLabelModel, self).__init__()
        self.caption_embedder = CaptionEmbedding(vocab)

        self.linear1 = torch.nn.Linear(100, 54)
        self.linear2 = torch.nn.Linear(54, 36)
        self.linear3 = torch.nn.Linear(100, 18)
        self.dropout = torch.nn.Dropout(0.05)

    def forward(self, captions):
        caption_embeddings = self.caption_embedder(captions)
        y = self.dropout(caption_embeddings)
        # y = torch.nn.functional.leaky_relu(self.linear1(y))
        # y = torch.nn.functional.leaky_relu(self.linear2(y))
        y = self.linear3(y)
        return y

In [None]:
def train_collate_fn(X):
    # convert [{key: val, ...}, ...]
    # to [key: [val, ...],  ...}
    X = {k: [v[k] for v in X] for k in X[0]}
    X["label"] = torch.stack(X["label"])

    return X


train_data = TrainDataset(transform=transforms)
vocabularise_caption(train_data, vocab)
one_hot_encode_labels(train_data)
weights = weight2(train_data)
sampler = torch.utils.data.sampler.WeightedRandomSampler(weights, len(weights))

dataloader = torch.utils.data.DataLoader(
    train_data, batch_size=32, num_workers=2, collate_fn=train_collate_fn, sampler=sampler
)

model = AlexJustLabelModel().to(DEVICE)
model.train()
optim = torch.optim.Adam(model.parameters())
criterion = torch.nn.BCEWithLogitsLoss()

for e in range(10):
    print(f"EPOCH {e}")

    for i, batch in enumerate(dataloader):
        optim.zero_grad()
        captions = batch["caption"]
        labels = batch["label"].to(DEVICE)

        predictions = model(captions)
        loss = criterion(predictions, labels)
        loss.backward()
        optim.step()

        print(loss)

In [None]:
def train_collate_fn(X):
    # convert [{key: val, ...}, ...]
    # to [key: [val, ...],  ...}
    X = {k: [v[k] for v in X] for k in X[0]}
    X["label"] = torch.stack(X["label"])

    return X


train_data = TrainDataset(transform=transforms)
vocabularise_caption(train_data, vocab)
one_hot_encode_labels(train_data)
weights = weight3(train_data)
sampler = torch.utils.data.sampler.WeightedRandomSampler(weights, len(weights))

dataloader = torch.utils.data.DataLoader(
    train_data, batch_size=32, num_workers=2, collate_fn=train_collate_fn, sampler=sampler
)

model.eval()

for i, batch in enumerate(dataloader):

    captions = batch["caption"]
    labels = batch["label"].to(DEVICE)

    predictions = torch.nn.functional.sigmoid(model(captions))

    for p in range(len(predictions)):
        print(labels[p])
        print(predictions[p])
        print()

In [None]:
def train_collate_fn(X):
    # convert [{key: val, ...}, ...]
    # to [key: [val, ...],  ...}
    X = {k: [v[k] for v in X] for k in X[0]}

    return X


train_data = TestDataset(transform=transforms)
vocabularise_caption(train_data, vocab)

dataloader = torch.utils.data.DataLoader(
    train_data, batch_size=32, num_workers=2, collate_fn=train_collate_fn
)

for e in range(10):
    print(f"EPOCH {e}")

    for i, batch in enumerate(dataloader):
        captions = batch["caption"]
        predictions = model(captions)
        for p in torch.nn.functional.sigmoid(predictions):
            print(p)

## Evaluation

In [None]:
import sklearn.metrics
import sklearn.preprocessing

mlb = sklearn.preprocessing.MultiLabelBinarizer([1, 2, 3, 4, 5])
y_true = mlb.fit_transform([{1, 2}, {3}])
y_pred = mlb.fit_transform([{1, 3}, {3}])
sklearn.metrics.f1_score(y_pred, y_true, average="samples")