In [1]:
from transformers import (
    VisionTextDualEncoderModel,
    VisionTextDualEncoderProcessor,
    AutoImageProcessor,
    BertForSequenceClassification,
    BertTokenizer,
    VisionTextDualEncoderConfig,
    ViTForImageClassification,
    AutoModelForSequenceClassification)

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torch.nn.utils.rnn import pad_sequence

from PIL import Image
import json
from sklearn import preprocessing
import pandas as pd
import os
from os import walk
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.preprocessing import LabelEncoder
import numpy as np
from tqdm.auto import tqdm

Data feed in: set of image filenames, set of corresponding text, set of corresponding labels

to do:
- try to pretrain a model? https://github.com/NielsRogge/Transformers-Tutorials
- Reqs.txt
- Proper configs and classification head for BERT - finetune bert first as a sequence classification and pass this as a local model?
- Fix loss function. Doesn't work with either one hot or categorical, stuck with the default func.
- Change the dataset function to accept 'one hot or not' param instead of commenting out
- Change padding in dataloader to label count not hard coded


# Data Reading
As usual, these are my paths and you would need to update to your own. Or build a cleverer way to do it - whichever you prefer. Uncommented next line for your own path.

In [None]:
#path = 'this is the path for the images in train'

In [34]:
path = r'X:\PhD\SemEval Task4\Data\subtask2a_images\train_images\train_images'
images = [os.path.join(dirpath,f) for (dirpath, dirnames, filenames) in os.walk(path) for f in filenames]
images_df = pd.DataFrame(images, columns=['filepath'])
images_df['image'] = images_df['filepath'].str.split('\\').str[-1]

df = pd.read_json(r'X:\PhD\SemEval Task4\Data\annotations\data\subtask2a\train.json') # your path
df = pd.merge(df, images_df, on='image')

Two types of labels are currently used - integers and one hot encoding. One hot is requried for BCE w/ logits.

In [35]:
labels = df['labels'].values

unique_labels = []

for i in labels:
    for x in i:
        unique_labels.append(x)
unique_labels = list(set(unique_labels))

le = LabelEncoder()
le.fit(unique_labels)

# try a one-hot as well
ml = MultiLabelBinarizer()

df['one_hot_labels'] = ml.fit_transform(df['labels'])[:,1:].tolist()
df['labels'] = df['labels'].apply(le.transform) # actually you can do the above on this as well, but I didn't change it

In [36]:
# dataloader is better without a dataframe, so this time we're going with lists

images = [str(i) for i in df['filepath'].values]
texts = [str(i) for i in df['text'].astype(str).values.tolist()]
labels = df['labels'].values
one_hot_labels = df['one_hot_labels'].values

## Validation

In [5]:
path = r'X:\PhD\SemEval Task4\Data\subtask2a_images\validation_images\validation_images'
images = [os.path.join(dirpath,f) for (dirpath, dirnames, filenames) in os.walk(path) for f in filenames]
images_df = pd.DataFrame(images, columns=['filepath'])
images_df['image'] = images_df['filepath'].str.split('\\').str[-1]

df_val = pd.read_json(r'X:\PhD\SemEval Task4\Data\annotations\data\subtask2a\validation.json')
df_val = pd.merge(df_val, images_df, on='image')

val_labels = df_val['labels'].values

unique_labels = []

for i in val_labels:
    for x in i:
        unique_labels.append(x)
unique_labels = list(set(unique_labels))

le = LabelEncoder()
le.fit(unique_labels)

# try a one-hot as well
ml = MultiLabelBinarizer()

df_val['one_hot_labels'] = ml.fit_transform(df_val['labels'])[:,1:].tolist()
df_val['labels'] = df_val['labels'].apply(le.transform)

images_val = [str(i) for i in df_val['filepath'].values]
texts_val = [str(i) for i in df_val['text'].astype(str).values.tolist()]
labels_val = df_val['labels'].values
one_hot_labels_val = df_val['one_hot_labels'].values

## Dataset

In [37]:
class VisionTextDataset(torch.utils.data.Dataset):
    def __init__(self, img, txt, lbs, processor):
        self.image = img
        self.text = txt
        self.labels = lbs
        self.processor = processor

    def __len__(self):
        return len(self.image)

    def __getitem__(self, idx):
        image = self.image[idx]
        text = self.text[idx]
        label = self.labels[idx]

        encoding = self.processor(text, Image.open(image).convert('RGB'), is_split_into_words=True, 
                                  padding="max_length", truncation=True, return_tensors="pt")
        encoding = {k:v.squeeze() for k,v in encoding.items()}

        ### comment out below for not one-hot labels ####

        def pad_tensor(t):
             t = torch.tensor(t)
             padding = 22 - t.size()[0] # change to label count
             t = torch.nn.functional.pad(t, (0, padding))
             return t
            
        encoding["labels"] = label
        encoding["labels"] = pad_tensor(label)

         ### comment out above for not one-hot labels ####

        

        # uncomment for one hot
        #encoding["labels"] = torch.tensor(label)

        return encoding

Test version of this is using BERT and ViT. Should change to something else as both are bad at the task.

In [38]:
#https://huggingface.co/transformers/v3.0.2/main_classes/tokenizer.html

tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', use_fast=False)
image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
processor = VisionTextDualEncoderProcessor(image_processor, tokenizer) # image processor and tokenizer here

train_dataset = VisionTextDataset(img=images, txt=texts, lbs=labels, processor=processor)
encoded = train_dataset[0]
encoded

{'input_ids': tensor([  101,  2023,  2003,  2339,  2057,  1005,  2128,  2489,  1032,  1050,
          1032, 23961, 24158,  2003,  2339,  2057,  1005,  2128,  3647,  1032,
          1050,   102,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,   

In [31]:
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)

batch = next(iter(train_dataloader))
for k,v in batch.items():
  print(k, v.shape)

input_ids torch.Size([16, 512])
token_type_ids torch.Size([16, 512])
attention_mask torch.Size([16, 512])
pixel_values torch.Size([16, 3, 224, 224])
labels torch.Size([16, 22])


In [9]:
val_dataset = VisionTextDataset(img=images_val, txt=texts_val, lbs=labels_val, processor=processor)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=True)

# Training

class transformers.PretrainedConfig


Parameters for fine-tuning tasks

architectures (List[str], optional) — Model architectures that can be used with the model pretrained weights.
finetuning_task (str, optional) — Name of the task used to fine-tune the model. This can be used when converting from an original (TensorFlow or PyTorch) checkpoint.
id2label (Dict[int, str], optional) — A map from index (for instance prediction index, or target index) to label.
label2id (Dict[str, int], optional) — A map from label to index for the model.
num_labels (int, optional) — Number of labels to use in the last layer added to the model, typically for a classification task.
task_specific_params (Dict[str, Any], optional) — Additional keyword arguments to store for the current task.
problem_type (str, optional) — Problem type for XxxForSequenceClassification models. Can be one of "regression", "single_label_classification" or "multi_label_classification".

In [10]:
len(unique_labels)

22

In [11]:
text_model = BertForSequenceClassification.from_pretrained(
    "bert-base-uncased", # Use the 12-layer BERT model, with an uncased vocab.
    num_labels = 22,
    problem_type="multi_label_classification") # The number of output labels--2 for binary classification.

image_model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224', num_labels=22, ignore_mismatched_sizes=True)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([22]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([22, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [43]:
config = VisionTextDualEncoderConfig.from_vision_text_configs(image_model.config, text_model.config)
model = VisionTextDualEncoderModel(config=config)

In [44]:
model.parameters

<bound method Module.parameters of VisionTextDualEncoderModel(
  (vision_model): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermedia

In [23]:
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

model.train()
for epoch in range(5):  # loop over the dataset multiple times
    print(f"Epoch: {epoch}")
    for batch in tqdm(train_dataloader):
        # get the inputs;
        batch = {k:v.to(device) for k,v in batch.items()}

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(input_ids=batch['input_ids'], 
                        token_type_ids=batch['token_type_ids'],
                        attention_mask=batch['attention_mask'], 
                        pixel_values=batch['pixel_values'],
                       return_loss = True)
        
        labels = batch['labels']
        loss, logits = outputs.loss, outputs.logits_per_image
        print("Loss:", loss.item())
        loss.backward()
        optimizer.step()
    
    model.eval()
    with torch.no_grad():
        print(f"Validation: {epoch}")
        for batch in tqdm(val_dataloader):
            batch = {k:v.to(device) for k,v in batch.items()}
            outputs = model(input_ids=batch['input_ids'], 
                        token_type_ids=batch['token_type_ids'],
                        attention_mask=batch['attention_mask'], 
                        pixel_values=batch['pixel_values'],
                       return_loss = True)
            labels = batch['labels']
            loss, logits = outputs.loss, outputs.logits_per_image
            print("Loss:", loss.item())
            torch.cuda.empty_cache()

Epoch: 0


  0%|          | 0/32 [00:00<?, ?it/s]

Loss: 2.7751808166503906
Loss: 2.775420665740967
Loss: 2.777583599090576
Loss: 2.773996114730835
Loss: 2.779356002807617
Loss: 2.7746291160583496
Loss: 2.77052640914917
Loss: 2.772876739501953
Loss: 2.7782959938049316
Loss: 2.7724967002868652
Loss: 2.773310422897339
Loss: 2.7742972373962402
Loss: 2.7758469581604004
Loss: 2.7707138061523438
Loss: 2.77187442779541
Loss: 2.775981903076172
Loss: 2.773958206176758
Loss: 2.773177146911621
Loss: 2.769627809524536
Loss: 2.774658203125
Loss: 2.7754101753234863
Loss: 2.7730846405029297
Loss: 2.77384614944458
Loss: 2.772264003753662
Loss: 2.768883228302002
Loss: 2.7738752365112305
Loss: 2.771636962890625
Loss: 2.772867202758789
Loss: 2.775146961212158
Loss: 2.7732410430908203
Loss: 2.7751519680023193
Loss: 1.3916306495666504
Validation: 0


  0%|          | 0/32 [00:00<?, ?it/s]

Loss: 2.772672176361084
Loss: 2.7727346420288086
Loss: 2.772688150405884
Loss: 2.7727203369140625
Loss: 2.7727675437927246
Loss: 2.7727439403533936
Loss: 2.7727174758911133
Loss: 2.772777557373047
Loss: 2.7727270126342773
Loss: 2.7727015018463135
Loss: 2.77260684967041
Loss: 2.7727503776550293
Loss: 2.7726922035217285
Loss: 2.7730939388275146
Loss: 2.7727577686309814
Loss: 2.7730915546417236
Loss: 2.7727818489074707
Loss: 2.7728257179260254
Loss: 2.7727999687194824
Loss: 2.7727620601654053
Loss: 2.772846221923828
Loss: 2.77278995513916
Loss: 2.772728443145752
Loss: 2.7729830741882324
Loss: 2.7728424072265625
Loss: 2.7727584838867188
Loss: 2.772745132446289
Loss: 2.7727410793304443
Loss: 2.7727527618408203
Loss: 2.772728443145752
Loss: 2.77274751663208
Loss: 1.387087106704712
Epoch: 1


  0%|          | 0/32 [00:00<?, ?it/s]

Loss: 2.772669553756714
Loss: 2.772737503051758
Loss: 2.772662878036499
Loss: 2.7725720405578613
Loss: 2.7725517749786377
Loss: 2.772498607635498
Loss: 2.7724294662475586
Loss: 2.7726259231567383
Loss: 2.7725095748901367
Loss: 2.7725961208343506
Loss: 2.7725725173950195
Loss: 2.7726118564605713
Loss: 2.7724037170410156
Loss: 2.772563934326172
Loss: 2.772632122039795
Loss: 2.772474765777588
Loss: 2.772512912750244
Loss: 2.7725844383239746
Loss: 2.772174119949341
Loss: 2.7724781036376953
Loss: 2.7724967002868652
Loss: 2.772408962249756
Loss: 2.7726283073425293
Loss: 2.7729992866516113
Loss: 2.772859573364258
Loss: 2.7723751068115234
Loss: 2.7723541259765625
Loss: 2.7724380493164062
Loss: 2.7724151611328125
Loss: 2.772456645965576
Loss: 2.7726283073425293
Loss: 1.3863894939422607
Validation: 1


  0%|          | 0/32 [00:00<?, ?it/s]

Loss: 2.7726528644561768
Loss: 2.7724432945251465
Loss: 2.772304058074951
Loss: 2.7731902599334717
Loss: 2.7725586891174316
Loss: 2.7728071212768555
Loss: 2.7728374004364014
Loss: 2.7731199264526367
Loss: 2.772249937057495
Loss: 2.7732996940612793
Loss: 2.7728142738342285
Loss: 2.772268772125244
Loss: 2.7727017402648926
Loss: 2.7729077339172363
Loss: 2.7724108695983887
Loss: 2.7727136611938477
Loss: 2.7724609375
Loss: 2.7727599143981934
Loss: 2.772067070007324
Loss: 2.772838592529297
Loss: 2.772514581680298
Loss: 2.77232027053833
Loss: 2.7725155353546143
Loss: 2.77262020111084
Loss: 2.772892475128174
Loss: 2.772512912750244
Loss: 2.7723917961120605
Loss: 2.773008346557617
Loss: 2.772019386291504
Loss: 2.7725117206573486
Loss: 2.7724452018737793
Loss: 1.3863768577575684
Epoch: 2


  0%|          | 0/32 [00:00<?, ?it/s]

Loss: 2.7727980613708496
Loss: 2.772094249725342
Loss: 2.771908760070801
Loss: 2.771920680999756
Loss: 2.771993637084961
Loss: 2.7693285942077637
Loss: 2.771723747253418
Loss: 2.76641583442688
Loss: 2.821023464202881
Loss: 2.7757468223571777
Loss: 2.773881435394287
Loss: 2.7729673385620117
Loss: 2.7727973461151123
Loss: 2.7726454734802246
Loss: 2.772643804550171
Loss: 2.7727279663085938
Loss: 2.7727456092834473
Loss: 2.7726128101348877
Loss: 2.7726058959960938
Loss: 2.77262282371521
Loss: 2.7727174758911133
Loss: 2.772616386413574
Loss: 2.772675037384033
Loss: 2.7726993560791016
Loss: 2.7726259231567383
Loss: 2.772615432739258
Loss: 2.772603988647461
Loss: 2.7726082801818848
Loss: 2.772594451904297
Loss: 2.772623300552368
Loss: 2.7726168632507324
Loss: 1.3863545656204224
Validation: 2


  0%|          | 0/32 [00:00<?, ?it/s]

Loss: 2.7725839614868164
Loss: 2.772569179534912
Loss: 2.7726125717163086
Loss: 2.772552251815796
Loss: 2.7725753784179688
Loss: 2.7727112770080566
Loss: 2.772584915161133
Loss: 2.77258563041687
Loss: 2.772670269012451
Loss: 2.7726211547851562
Loss: 2.7726173400878906
Loss: 2.7726221084594727
Loss: 2.7726352214813232
Loss: 2.772585391998291
Loss: 2.772608518600464
Loss: 2.7725915908813477
Loss: 2.7726194858551025
Loss: 2.772636890411377
Loss: 2.7725648880004883
Loss: 2.772613763809204
Loss: 2.7725882530212402
Loss: 2.772603988647461
Loss: 2.7725915908813477
Loss: 2.772645950317383
Loss: 2.7725830078125
Loss: 2.7726073265075684
Loss: 2.772613286972046
Loss: 2.772630214691162
Loss: 2.772617816925049
Loss: 2.7726635932922363
Loss: 2.772597551345825
Loss: 1.386338472366333
Epoch: 3


  0%|          | 0/32 [00:00<?, ?it/s]

Loss: 2.7725889682769775
Loss: 2.7725942134857178
Loss: 2.7726328372955322
Loss: 2.772606134414673


KeyboardInterrupt: 

## Save Model

In [None]:
torch.save(model.state_dict(), 'add your path')
torch.save(model, 'add your path')

## With Labels

This section tries to use a different loss function, but has been unsuccessful with the one hot encoding and categorical labels. Will return to this and try to get it working as the previous model doesn't really learn anything so likely nothing is being updated properly.

I think the way forward is to construct the models initiating them from the EncoderModel/Model from Config function (see few sections below for a BERT ensemble example).

In [None]:
optimizer = torch.optim.Adam(model.parameters(), lr=5e-5)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# one hot CE loss if we use one hot labels, this one seems to work but not great

def one_hot_ce_loss(outputs, targets):
    criterion = nn.CrossEntropyLoss()
    _, labels = torch.max(targets, dim=1)
    return criterion(outputs, labels)

model.train()
for epoch in range(5):  
    print(f"Epoch: {epoch}")
    for itr, batch in tqdm(train_dataloader):
        # get the inputs;
        batch = {k:v.to(device) for k,v in batch.items()}

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        outputs = model(input_ids=batch['input_ids'], 
                        token_type_ids=batch['token_type_ids'],
                        attention_mask=batch['attention_mask'], 
                        pixel_values=batch['pixel_values'],
                       return_loss = True)
        
        labels = batch['labels']
        int_loss, logits = outputs.loss, outputs.logits_per_image
        criterion = torch.nn.BCEWithLogitsLoss()
        loss = criterion(logits, labels)
        print("Loss:", loss)
        loss.backward()
        optimizer.step()
    
    # model.eval()
    # with torch.no_grad():
    #     print(f"Validation: {epoch}")
    #     for batch in tqdm(val_dataloader):
    #         batch = {k:v.to(device) for k,v in batch.items()}
    #         outputs = model(input_ids=batch['input_ids'], 
    #                     token_type_ids=batch['token_type_ids'],
    #                     attention_mask=batch['attention_mask'], 
    #                     pixel_values=batch['pixel_values'],
    #                    return_loss = True)
    #         labels = batch['labels']
    #         loss, logits = outputs.loss, outputs.logits_per_image
    #         print("Loss:", loss.item())
    #         torch.cuda.empty_cache()

In [None]:
# just experimenting with what comes out of the model for the loss function

for itr in tqdm(train_dataloader):
        # get the inputs;
        batch = {k:v for k,v in batch.items()}

        outputs = model(input_ids=batch['input_ids'], 
                        token_type_ids=batch['token_type_ids'],
                        attention_mask=batch['attention_mask'], 
                        pixel_values=batch['pixel_values'],
                       return_loss = True)
        int_loss, logits = outputs.loss, outputs.logits_per_image
        #print(logits)
        criterion = nn.MSELoss()
        labels = itr['labels']
        #score = logits.argmax(1)
        #print(score)
        print(logits)
        print(labels)
        loss = criterion(logits, labels)
        #print(loss)
        break

# Some Other Helper Code
Aside from the data functions, here's another bit of code that uses a ensemble of Bert models which can be modified for a multimodal task. It's simpler than the previous code and should be easier to use and modify. This is for next sentence prediction but generally you can see how models are constructed in PyTorch

In [None]:
class BertEnsembleForNextSentencePrediction(BertPreTrainedModel):
	    def __init__(self, config, *args, **kwargs):
	        super().__init__(config)
	
	        self.n_models = kwargs["n_models"]
	
	        self.bert_model_1 = BertModel(config)
	        self.bert_model_2 = BertModel(config)
	
	        self.cls = nn.Linear(self.n_models * self.config.hidden_size, 2)
	        self.init_weights()
	
	    def forward(
	            self,
	            input_ids=None,
	            attention_mask=None,
	            token_type_ids=None,
	            position_ids=None,
	            head_mask=None,
	            inputs_embeds=None,
	            next_sentence_label=None,
	    ):
	        outputs = []
            input_ids_1 = input_ids[0]
	        attention_mask_1 = attention_mask[0]
	        token_type_ids_1 = token_type_ids[0]
	        outputs.append(self.bert_model_1(input_ids_1,
	                                         attention_mask=attention_mask_1,
	                                         token_type_ids=token_type_ids_1))
	
	        input_ids_2 = input_ids[1]
	        attention_mask_2 = attention_mask[1]
	        token_type_ids_2 = token_type_ids[1]
	        outputs.append(self.bert_model_2(input_ids_2,
	                                         attention_mask=attention_mask_2,
	                                         token_type_ids=token_type_ids_2))
	
	        # just get the [CLS] embeddings
	        last_hidden_states = torch.cat([output[1] for output in outputs], dim=1)
	        logits = self.cls(last_hidden_states)
	
	        # crossentropyloss: https://pytorch.org/docs/stable/nn.html#crossentropyloss
	        if next_sentence_label is not None:
	            loss_fct = nn.CrossEntropyLoss(ignore_index=-1)
	            next_sentence_loss = loss_fct(logits.view(-1, 2), next_sentence_label.view(-1))
	            return next_sentence_loss, logits
	        else:
	            return logits

self.bert_model_1 = BertModel(config)        
self.bert_model_2 = BertModel(config)         
self.cls = nn.Linear(self.n_models * self.config.hidden_size, 2)