# COMP5329 - Assignment 2

In [1]:
import google
import pandas
import torch 
import torchvision
import torchtext
import PIL.Image
import json
import nltk
import re
import collections

DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
MOUNT_PATH = '/content/drive'
DRIVE_PATH = f'{MOUNT_PATH}/My Drive'
PROJECT_PATH = f'{DRIVE_PATH}/Assignment 2'
IMG_PATH = f'{PROJECT_PATH}/data'
TRAIN_CSV_PATH = f'{PROJECT_PATH}/train.csv'
TEST_CSV_PATH = f'{PROJECT_PATH}/test.csv'

google.colab.drive.mount(MOUNT_PATH)

Mounted at /content/drive


## Dataset


### Train dataset

In [2]:
class TrainDataset(torch.utils.data.Dataset):

  def __init__(self, transform=None):
    self.transform = transform
    self.tags = set()
    self.df_data = pandas.read_csv(TRAIN_CSV_PATH, names=range(4), skiprows=1)
    self.df_data[0] = IMG_PATH + '/' + self.df_data[0]
    self.df_data[3] = self.df_data[3].fillna('')
    self.df_data[2] += self.df_data[3]
    self.df_data = self.df_data.drop(3, axis=1)
    self.df_data = self.df_data.rename({
      0: 'image',
      1: 'label',
      2: 'caption'
    }, axis=1)
    

  def __len__(self):
    return len(self.df_data)


  def __getitem__(self, idx):
    if torch.is_tensor(idx):
      idx = idx.tolist()

    image = PIL.Image.open(self.df_data.iloc[idx, 0])
    label = self.df_data.iloc[idx, 1]
    caption = self.df_data.iloc[idx, 2]

    sample = {'caption': caption, 'label': label, 'image': image}

    if self.transform:
      sample = self.transform(sample)

    return sample

### Test dataset

In [3]:
class TestDataset(torch.utils.data.Dataset):

  def __init__(self, transform=None):
    self.transform = transform
    self.tags = set()
    self.df_data = pandas.read_csv(TEST_CSV_PATH, names=range(3), skiprows=1)
    self.df_data[0] = IMG_PATH + '/' + self.df_data[0]
    self.df_data[2] = self.df_data[2].fillna('')
    self.df_data[1] += self.df_data[2]
    self.df_data = self.df_data.drop(2, axis=1)
    self.df_data = self.df_data.rename({
      0: 'image',
      1: 'caption'
    }, axis=1)
    

  def __len__(self):
    return len(self.df_data)


  def __getitem__(self, idx):
    if torch.is_tensor(idx):
      idx = idx.tolist()

    image = PIL.Image.open(self.df_data.iloc[idx, 0])
    caption = self.df_data.iloc[idx, 1]

    sample = {'caption': caption, 'image': image}

    if self.transform:
      sample = self.transform(sample)

    return sample

## Preprocessing

### Vocab

In [4]:
nltk.download('stopwords')
stopwords = set(nltk.corpus.stopwords.words('english'))
basic_tokenizer = torchtext.data.utils.get_tokenizer('basic_english')
glove = torchtext.vocab.GloVe(name='6B', dim=100)

with open(f'{PROJECT_PATH}/correct_spelling.json', 'r') as f:
    correct_spelling = json.load(f)
    

def caption_tokenizer(caption):
  caption = re.sub('[^a-zA-Z]', ' ', caption).lower()
  raw_tokens = basic_tokenizer(caption)
  raw_tokens = list(set(raw_tokens).difference(stopwords))
  tokens = []
  
  for token in raw_tokens:
    
    if token in correct_spelling:
      corrected_spelling = correct_spelling[token]
      
      for corrected_token in corrected_spelling.split():
        if corrected_token in glove.stoi:
          tokens.append(corrected_token)
    else:
      if token in glove.stoi:
          tokens.append(token)

  return tokens

[nltk_data] Downloading package stopwords to /root/nltk_data...


.vector_cache/glove.6B.zip: 0.00B [00:00, ?B/s]

[nltk_data]   Unzipping corpora/stopwords.zip.


.vector_cache/glove.6B.zip:   8%|▊         | 64.8M/862M [00:07<01:31, 8.70MB/s]


KeyboardInterrupt: ignored

In [None]:
train_data = TrainDataset()
test_data = TestDataset()
counter = collections.Counter()

for caption in train_data.df_data['caption']:
  counter.update(caption_tokenizer(caption))

for caption in test_data.df_data['caption']:
  counter.update(caption_tokenizer(caption))

vocab = torchtext.vocab.Vocab(
  counter,
  vectors='glove.6B.100d',
  specials=('<unk>', '<BOS>', '<EOS>', '<PAD>')
)

del train_data
del test_data

### Dataset pre-transformations 

In [5]:
def vocabularise_caption(dataset, vocab):

  if 'vocabularised_caption' not in dataset.tags:
    # turns string caption to list of vocab indices
    dataset.df_data['caption'] = dataset.df_data['caption'].apply(
      lambda c: torch.tensor([vocab.stoi[t] for t in caption_tokenizer(c)])
    )

    dataset.tags.add('vocabularised_caption')

def one_hot_encode_labels(dataset):
  if 'one_hot_encoded_labels' not in dataset.tags:
    dataset.df_data['label'] = dataset.df_data['label'].apply(
      lambda l: torch.nn.functional.one_hot(
        torch.tensor([
          int(i)-1 if int(i)<12 else int(i)-2 for i in l.split(' ')
        ]), 18).sum(axis=0).float()
    )

    dataset.tags.add('one_hot_encoded_labels')

## Modules


### Caption embedding

In [None]:
class CaptionEmbedding(torch.nn.Module):

  def __init__(self, vocab):
    super(CaptionEmbedding, self).__init__()
    self.word_embedding = torch.nn.Embedding.from_pretrained(vocab.vectors)
    self.linear1 = torch.nn.Linear(vocab.vectors.shape[1], 18)
    self.lrelu1 = torch.nn.LeakyReLU()
    self.linear2 = torch.nn.Linear(18, 1)
    self.lrelu2 = torch.nn.LeakyReLU()
    self.lrelu3 = torch.nn.LeakyReLU()


  def forward(self, captions):

    def _embed_caption(c):
      c = c.to(DEVICE)
      c = self.word_embedding(c)
      word_importance = self.linear1(c)
      word_importance = self.lrelu1(word_importance)
      word_importance = self.linear2(word_importance)
      word_importance = self.lrelu2(word_importance)
      return self.lrelu3(c.T @ word_importance).view(-1)

    return torch.stack([*map(lambda c: _embed_caption(c), captions)])

### Pretrained model surgery

In [None]:
class Surgery(torch.nn.Module):
  
  def __init__(self, name, selection, training_disabled=True):
    super(Surgery, self).__init__()
    model = getattr(torchvision.models, name)(pretrained=True)
    children = list(model.children())

    self.sequential = torch.nn.Sequential(*[
      children[i] for i in selection
    ])
    
    if training_disabled:
      for param in self.sequential.parameters():
        param.requires_grad = False

  def forward(self, x):
    
    return self.sequential(x)


def surgery_info(name):
  model = getattr(torchvision.models, name)()

  for idx, child in enumerate(model.children()):
    print(f'Accessible at {idx}:\n{child}\n')

In [8]:
!pip install efficientnet_pytorch
from efficientnet_pytorch import EfficientNet
model = EfficientNet.from_pretrained('efficientnet-b0', num_classes=18).to(DEVICE)

Loaded pretrained weights for efficientnet-b0


In [9]:
require_grad = False
for name, param in model.named_parameters():
    #     print(name.split(".")[1])
    if name.split(".")[1] == "14":  # required grad for last 2 blocks
        require_grad = True
    param.require_grad = require_grad

### Combined model

In [None]:
NUM_ClASSES = 18
import torch.nn as nn

class Combined(torch.nn.Module):

  def __init__(self):
    super(Combined, self).__init__()
    # self.image_embedder = Surgery('vgg16', [0])
    self.image_embedder = features
    # self.caption_embedder = CaptionEmbedding(vocab)

    # self.linear1 = torch.nn.Linear(2148, 256)
    self.linear1 = torch.nn.Linear(1000, 256)
    self.lrelu1 = torch.nn.LeakyReLU()
    # self.linear2 = torch.nn.Linear(256, 18)
    # self.lrelu2 = torch.nn.LeakyReLU()

    self.heads = nn.ModuleList([nn.Linear(256, 1) for i in range(NUM_ClASSES)])
    
  
  def forward(self, images, captions):
    image_embeddings = self.image_embedder(images)
    # image_embeddings = image_embeddings.view(images.shape[0], -1) #???

    # caption_embeddings = self.caption_embedder(captions)
    # caption_embeddings = caption_embeddings.view(caption_embeddings.shape[0], -1)

    # combined_embedding = torch.cat([image_embeddings, caption_embeddings], 1)
    # y = self.linear1(combined_embedding)

    y = self.linear1(image_embeddings)
    y = self.lrelu1(y)

    outs = []
    for i in range(NUM_ClASSES):
        outs.append(self.heads[i](y))
    
    return torch.cat(outs, dim=-1)

## Transforms and training support

In [10]:
class FieldTransform(object):

  def __init__(self, field, transform):
    self.field = field
    self.transform = transform

  
  def __call__(self, sample):
    sample[self.field] = self.transform(sample[self.field])
    return sample

## Experiments

In [11]:
def train_collate_fn(X):
  # convert [{key: val, ...}, ...]
  # to [key: [val, ...],  ...}
  X = {k: [v[k] for v in X] for k in X[0]}
  X['label'] = torch.stack(X['label'])
  X['image'] = torch.stack(X['image'])
  
  return X

normalize = torchvision.transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])

transforms = torchvision.transforms.Compose([
#   FieldTransform('image', torchvision.transforms.Resize((64, 64))),
#   FieldTransform('image', torchvision.transforms.CenterCrop(64)),
  FieldTransform('image', torchvision.transforms.Resize((224,224))),  
  FieldTransform('image', torchvision.transforms.ToTensor()),
  FieldTransform('image', normalize),
])

In [12]:
train_data = TrainDataset(transform=transforms)
# vocabularise_caption(train_data, vocab)
one_hot_encode_labels(train_data)

dataloader = torch.utils.data.DataLoader(
  train_data,
  batch_size=32,
  shuffle=True,
  num_workers=2,
  collate_fn=train_collate_fn
)

# model = Combined().to(DEVICE)
optim = torch.optim.Adam(model.parameters())
criterion = torch.nn.BCEWithLogitsLoss()

In [15]:
train_losses = []
for i, batch in enumerate(dataloader):
  optim.zero_grad()
  captions = batch['caption']
  images = batch['image'].to(DEVICE)
  labels = batch['label'].to(DEVICE)

  predictions = model(images)
  loss = criterion(predictions, labels)
  loss.backward()
  optim.step()
  train_losses.append(loss.item())
#   print(loss)

tensor(0.7228, device='cuda:0', grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
tensor(0.6538, device='cuda:0', grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
tensor(0.6021, device='cuda:0', grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
tensor(0.5358, device='cuda:0', grad_fn=<BinaryCrossEntropyWithLogitsBackward>)
tensor(0.4665, device='cuda:0', grad_fn=<BinaryCrossEntropyWithLogitsBackward>)


KeyboardInterrupt: ignored

### Just label model

In [None]:
def weight1(dataset):
  one_hot_encode_labels(dataset)
  labels = torch.stack(dataset.df_data['label'].to_list())
  counts = labels.sum(axis=0)
  weights = labels @ (1 / counts)
  return weights / weights.sum()

In [None]:
def weight2(dataset):
  one_hot_encode_labels(dataset)
  labels = torch.stack(dataset.df_data['label'].to_list())
  dataset.df_data = dataset.df_data[(labels.sum(axis=1) == 1).numpy()]
  dataset.df_data = dataset.df_data.reset_index(drop=True)
  labels = torch.stack(dataset.df_data['label'].to_list())
  counts = labels.sum(axis=0)
  weights = labels @ (1 / counts)
  return weights / weights.sum()

In [None]:
def weight3(dataset):
  one_hot_encode_labels(dataset)
  labels = torch.stack(dataset.df_data['label'].to_list())
  sums = torch.sum(labels * 2**torch.arange(18), axis=1)
  uniques, counts = torch.unique(sums, return_counts=True)
  p = 1 / counts
  w = torch.zeros_like(sums)

  for i in range(len(uniques)):
    w[sums == uniques[i]] = p[i]

  return w / w.sum()

In [None]:
class CaptionEmbedding(torch.nn.Module):

  def __init__(self, vocab):
    super(CaptionEmbedding, self).__init__()
    self.word_embedding = torch.nn.Embedding.from_pretrained(vocab.vectors)
    self.linear1 = torch.nn.Linear(vocab.vectors.shape[1], 54)
    self.linear2 = torch.nn.Linear(54, 18)
    self.linear3 = torch.nn.Linear(18, 1)


  def forward(self, captions):

    def _embed_caption(c):
      c = c.to(DEVICE)
      c = self.word_embedding(c)
      word_importance = torch.nn.functional.leaky_relu(self.linear1(c))
      word_importance = torch.nn.functional.leaky_relu(self.linear2(word_importance))
      word_importance = torch.nn.functional.leaky_relu(self.linear3(word_importance))
      return (c.T @ word_importance).view(-1)

    return torch.stack([*map(lambda c: _embed_caption(c), captions)])

In [None]:
class AlexJustLabelModel(torch.nn.Module):

  def __init__(self):
    super(AlexJustLabelModel, self).__init__()
    self.caption_embedder = CaptionEmbedding(vocab)

    self.linear1 = torch.nn.Linear(100, 54)
    self.linear2 = torch.nn.Linear(54, 36)
    self.linear3 = torch.nn.Linear(100, 18)
    self.dropout = torch.nn.Dropout(0.05)
  
  def forward(self, captions):
    caption_embeddings = self.caption_embedder(captions)
    y = self.dropout(caption_embeddings)
    #y = torch.nn.functional.leaky_relu(self.linear1(y))
    #y = torch.nn.functional.leaky_relu(self.linear2(y))
    y = self.linear3(y)
    return y

In [None]:
def train_collate_fn(X):
  # convert [{key: val, ...}, ...]
  # to [key: [val, ...],  ...}
  X = {k: [v[k] for v in X] for k in X[0]}
  X['label'] = torch.stack(X['label'])
  
  return X


train_data = TrainDataset(transform=transforms)
vocabularise_caption(train_data, vocab)
one_hot_encode_labels(train_data)
weights = weight2(train_data)
sampler = torch.utils.data.sampler.WeightedRandomSampler(weights, len(weights))

dataloader = torch.utils.data.DataLoader(
  train_data,
  batch_size=32,
  num_workers=2,
  collate_fn=train_collate_fn,
  sampler=sampler
)

model = AlexJustLabelModel().to(DEVICE)
model.train()
optim = torch.optim.Adam(model.parameters())
criterion = torch.nn.BCEWithLogitsLoss()

for e in range(10):
  print(f'EPOCH {e}')

  for i, batch in enumerate(dataloader):
    optim.zero_grad()
    captions = batch['caption']
    labels = batch['label'].to(DEVICE)

    predictions = model(captions)
    loss = criterion(predictions, labels)
    loss.backward()
    optim.step()

    print(loss)

In [None]:
def train_collate_fn(X):
  # convert [{key: val, ...}, ...]
  # to [key: [val, ...],  ...}
  X = {k: [v[k] for v in X] for k in X[0]}
  X['label'] = torch.stack(X['label'])
  
  return X


train_data = TrainDataset(transform=transforms)
vocabularise_caption(train_data, vocab)
one_hot_encode_labels(train_data)
weights = weight3(train_data)
sampler = torch.utils.data.sampler.WeightedRandomSampler(weights, len(weights))

dataloader = torch.utils.data.DataLoader(
  train_data,
  batch_size=32,
  num_workers=2,
  collate_fn=train_collate_fn,
  sampler=sampler
)

model.eval()

for i, batch in enumerate(dataloader):

    captions = batch['caption']
    labels = batch['label'].to(DEVICE)

    predictions = torch.nn.functional.sigmoid(model(captions))

    for p in range(len(predictions)):
      print(labels[p])
      print(predictions[p])
      print()
    



tensor([0., 0., 1., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.],
       device='cuda:0')
tensor([8.8039e-04, 5.7867e-05, 8.3729e-04, 4.8270e-04, 1.4365e-02, 2.2701e-03,
        5.8817e-05, 1.0935e-03, 1.5089e-04, 3.9024e-04, 4.7024e-04, 8.4411e-04,
        3.0371e-03, 1.0395e-04, 1.0163e-03, 1.7564e-06, 2.3156e-04, 2.0784e-05],
       device='cuda:0', grad_fn=<SelectBackward>)

tensor([0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.],
       device='cuda:0')
tensor([5.4053e-04, 1.0790e-05, 3.1568e-03, 2.7270e-03, 9.5689e-01, 2.4573e-04,
        1.8184e-04, 5.9412e-02, 2.2587e-05, 3.0209e-05, 2.0881e-05, 1.5983e-05,
        6.2746e-05, 4.5761e-07, 1.6164e-04, 4.9506e-06, 9.2768e-06, 1.6719e-07],
       device='cuda:0', grad_fn=<SelectBackward>)

tensor([0., 0., 0., 0., 1., 1., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.],
       device='cuda:0')
tensor([1.5122e-04, 5.0239e-06, 1.2762e-04, 8.4366e-06, 4.7365e-01, 2.1145e-05,
        3.2185e-05, 2

KeyboardInterrupt: ignored

In [None]:
def train_collate_fn(X):
  # convert [{key: val, ...}, ...]
  # to [key: [val, ...],  ...}
  X = {k: [v[k] for v in X] for k in X[0]}
  
  return X


train_data = TestDataset(transform=transforms)
vocabularise_caption(train_data, vocab)

dataloader = torch.utils.data.DataLoader(
  train_data,
  batch_size=32,
  num_workers=2,
  collate_fn=train_collate_fn
)

for e in range(10):
  print(f'EPOCH {e}')

  for i, batch in enumerate(dataloader):
    captions = batch['caption']
    predictions = model(captions)
    for p in torch.nn.functional.sigmoid(predictions):
      print(p)

EPOCH 0


Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f59fec3d830>
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1324, in __del__
    self._shutdown_workers()
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1316, in _shutdown_workers
    if w.is_alive():
  File "/usr/lib/python3.7/multiprocessing/process.py", line 151, in is_alive
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f59fec3d830>
    assert self._parent_pid == os.getpid(), 'can only test a child process'
AssertionError: can only test a child process
Traceback (most recent call last):
  File "/usr/local/lib/python3.7/dist-packages/torch/utils/data/dataloader.py", line 1324, in __del__
Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f59fec3d830>
Traceback (most recent call last):
    self._shutdown_workers()
  File "/usr/local/lib/python3

tensor([4.0399e-01, 1.4926e-01, 4.2590e-01, 7.6673e-02, 2.7809e-04, 1.0540e-01,
        2.8719e-04, 2.3134e-01, 1.1400e-03, 1.0011e-02, 2.8609e-01, 5.6659e-02,
        2.7999e-03, 1.3109e-01, 1.5632e-01, 3.4401e-02, 1.7741e-03, 9.5525e-05],
       device='cuda:0', grad_fn=<UnbindBackward>)
tensor([0.9942, 0.2448, 0.5761, 0.0202, 0.0708, 0.2392, 0.1317, 0.3884, 0.2030,
        0.2413, 0.1557, 0.0885, 0.0401, 0.2047, 0.0917, 0.0306, 0.2024, 0.0196],
       device='cuda:0', grad_fn=<UnbindBackward>)
tensor([0.9936, 0.1644, 0.5412, 0.0624, 0.0015, 0.0301, 0.0016, 0.1260, 0.0255,
        0.0452, 0.0157, 0.0230, 0.0158, 0.6194, 0.0544, 0.0313, 0.0017, 0.0014],
       device='cuda:0', grad_fn=<UnbindBackward>)
tensor([9.3726e-01, 6.5657e-01, 5.9527e-01, 4.2178e-02, 1.0211e-02, 2.9580e-01,
        6.0213e-02, 3.7934e-01, 4.8535e-02, 5.7849e-01, 9.3162e-02, 1.3028e-01,
        1.2301e-01, 5.5286e-02, 4.9762e-02, 2.5826e-04, 8.6864e-01, 2.3259e-01],
       device='cuda:0', grad_fn=<UnbindBackwar

KeyboardInterrupt: ignored

## Evaluation

In [None]:
# test_data = TestDataset(transform=transforms)
# # vocabularise_caption(train_data, vocab)
# # one_hot_encode_labels(train_data)

# test_dataloader = torch.utils.data.DataLoader(
#   test_data,
#   batch_size=32,
#   shuffle=False,
#   num_workers=2,
#   collate_fn=train_collate_fn
# )

In [None]:
y_preds = []
model.eval()
# training acc
for i, batch in enumerate(dataloader):
#   optim.zero_grad()
  captions = batch['caption']
  images = batch['image'].to(DEVICE)
#   labels = batch['label'].to(DEVICE)

  predictions = model(images, captions)
#   loss = criterion(predictions, labels)
#   loss.backward()
#   optim.step()
  y_preds.append(predictions)

In [None]:
y_preds[0][0]

tensor([  3.4958,  -5.6405,  -5.7197,  -9.4492, -10.5480,  -9.3610,  -8.4963,
         -9.2781, -11.4059,  -7.8615,  -7.9696, -12.5368, -10.4251,  -5.3012,
         -5.8542,  -4.1120,   0.1493,  -9.6091], device='cuda:0',
       grad_fn=<SelectBackward>)

In [None]:
TRAIN_CSV_PATH

In [None]:
import sklearn.metrics
import sklearn.preprocessing

mlb = sklearn.preprocessing.MultiLabelBinarizer([1, 2, 3, 4, 5])
y_true = mlb.fit_transform([{1, 2}, {3}])
y_pred = mlb.fit_transform([{1, 3}, {3}])
sklearn.metrics.f1_score(y_pred, y_true, average='samples')

0.75