A simpler Text + Image Encoder model as a baseline. Uses BERT + VGG16 initiailly, but can be changed to anything else. At present VGG16 has a linear layer, so it's not easy to access the output features in the model. However if the vision model is updated, the code can be changed in the ensemble model structure to accept the output layer of a different model. Note as well the input size would need to be changed in the dataloader.

TO DO:
- Model selection for NLP and Vision
- Dataloader for model selection
- Ensemble model for model selection

In [1]:
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MultiLabelBinarizer
from PIL import Image
from tqdm.notebook import tqdm
import numpy as np

from argparse import Namespace

from transformers import (
    BertTokenizer,
    BertModel,
    BertForSequenceClassification)

import torch
import torchvision
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn.functional as F
from torchmetrics.classification import BinaryAccuracy, BinaryF1Score, MulticlassF1Score, MultilabelConfusionMatrix



# Data
Same junk from the VisionTextEncoder notebook, but this time using the binary classification as an experiment.

TO DO:
- Better data loading, just copying and pasting atm

In [2]:
path = r'X:\PhD\SemEval Task4\Data\subtask2b_images\train'
images = [os.path.join(dirpath,f) for (dirpath, dirnames, filenames) in os.walk(path) for f in filenames]
images_df = pd.DataFrame(images, columns=['filepath'])
images_df['image'] = images_df['filepath'].str.split('\\').str[-1]

df = pd.read_json(r'X:\PhD\SemEval Task4\Data\annotations\data\subtask2b\train.json')
df = pd.merge(df, images_df, on='image')
df.fillna(' ', inplace=True)

path_val = r'X:\PhD\SemEval Task4\Data\subtask2b_images\val'
images_val = [os.path.join(dirpath,f) for (dirpath, dirnames, filenames) in os.walk(path_val) for f in filenames]
images_df_val = pd.DataFrame(images_val, columns=['filepath'])
images_df_val['image'] = images_df_val['filepath'].str.split('\\').str[-1]

df_val = pd.read_json(r'X:\PhD\SemEval Task4\Data\annotations\data\subtask2b\val.json')
df_val = pd.merge(df_val, images_df_val, on='image')

le = LabelEncoder()

df['encoded_labels'] = le.fit_transform(df['label']).tolist()
df_val['encoded_labels'] = le.fit_transform(df_val['label']).tolist()

In [3]:
# lists

images = [str(i) for i in df['filepath'].values]
texts = [str(i) for i in df['text'].astype(str).values.tolist()]
labels = df['encoded_labels'].values

images_val = [str(i) for i in df_val['filepath'].values]
texts_val = [str(i) for i in df_val['text'].astype(str).values.tolist()]
labels_val = df_val['encoded_labels'].values

## DataLoader

In [11]:
# transformations for the raw images
# change size here for different model

model_transforms = transforms.Compose([transforms.Resize((224,224)), transforms.ToTensor()])

In [8]:
class VisionTextDataset(torch.utils.data.Dataset):
    def __init__(self, img, txt, lbs, tokenizer, n_classes, transform):
        self.image = img
        self.text = txt
        self.labels = lbs
        self.tokenizer = tokenizer
        self.n_classes = n_classes
        self.transforms = transform  

    def __len__(self):
        return len(self.image)

    def __getitem__(self, idx):
        text = self.text[idx]
        image = self.image[idx]

        text_encoded = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            return_token_type_ids=False,
            padding='max_length',
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )
        
        
        image = Image.open(image).convert('RGB')
        image = self.transforms(image)

        label = torch.tensor(self.labels[idx])

        sample = {'input_ids': text_encoded['input_ids'],
                  'attention_mask': text_encoded['attention_mask'],
                  'image': image,
                  "label": label}
        sample = {k:v.squeeze() for k,v in sample.items()}

        return sample

PyTorch has a problem stacking multiple inputs, the function below converts the data to a list but the model training has to change for this to work. For now batch_size = 1 allows this to work, or sometimes it works depending on the inputs. The collate_fn custom input is good enough but produces the inputs as a list which are more difficult to work with. See more: https://discuss.pytorch.org/t/making-custom-image-to-image-dataset-using-collate-fn-and-dataloader/55951

def collate_fn(batch):
    input_ids = [item['input_ids'] for item in batch]
    attn_mask = [item['attention_mask'] for item in batch]
    img = [item['image'] for item in batch]
    target = [item['label'] for item in batch]
    return [input_ids, attn_mask, img, target] -->

Probably the truncation for the text is the problem, seems to work for now but doesn't always work.

In [9]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True, use_fast=False)
train_dataset = VisionTextDataset(img=images, txt=texts, lbs=labels, tokenizer=tokenizer, n_classes=2, transform=model_transforms)

In [10]:
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
batch = next(iter(train_dataloader))
for k, v in batch.items():
    print(k, v.size())

input_ids torch.Size([16, 512])
attention_mask torch.Size([16, 512])
image torch.Size([16, 3, 224, 224])
label torch.Size([16])


In [11]:
val_dataset = VisionTextDataset(img=images_val, txt=texts_val, lbs=labels_val, tokenizer=tokenizer, n_classes=2, transform=model_transforms)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=True)

# Model

In [4]:
text_model = BertModel.from_pretrained('bert-base-uncased', return_dict=True)
vision_model = torchvision.models.vgg16(pretrained=True)

#num_ftrs = vision_model.classifier[0].in_features
#vision_model.classifier[0] = nn.Linear(num_ftrs, args.n_classes)



In [5]:
class EnsembleModel(nn.Module):
    def __init__(self, image_model, text_model, num_classes):
        super(EnsembleModel, self).__init__()
        self.image_model = image_model
        self.text_model = text_model
        
        self.fc = nn.Linear(
            1000 +                             # this is hardcoded to 1000 for VGG16, as last layer is linear
            text_model.config.hidden_size,     # usual method is something like model.fc.out_features
            num_classes
        )

    def forward(self, image_features, text_features, attention_masks):
        img_feat = self.image_model(image_features)
        txt_feat_bert = self.text_model(text_features, attention_mask=attention_masks).last_hidden_state.mean(dim=1)

        combined_feat = torch.cat((img_feat, txt_feat_bert), dim=1)
        return self.fc(combined_feat)

In [6]:
ensemble_model = EnsembleModel(vision_model, text_model, 2)

# Simple Model (Binary Classification - Subtask2b)

## Test Loop

In [53]:
# test loop

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
ensemble_model.to(device)

criterion = nn.CrossEntropyLoss()
learning_rate = 1e-5
optimizer = optim.Adam(ensemble_model.parameters(), lr=learning_rate)
n_epochs = 5

actual, prediction = [], []

for epoch in range(n_epochs):
    print(f"Epoch: {epoch}")
    train_losses = []
    ensemble_model.train()
    optimizer.zero_grad()

    for batch in tqdm(train_dataloader):
        # get the inputs;
        batch = {k:v.to(device) for k,v in batch.items()}
        
        # forward + backward + optimize
        outputs = ensemble_model(text_features=batch['input_ids'],
                                 attention_masks=batch['attention_mask'],
                       image_features=batch['image'])
        label = batch['label']

        loss = criterion(outputs, label)
        
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

Epoch: 0


  0%|          | 0/75 [00:00<?, ?it/s]

Epoch [1/5], Loss: 0.2545
Epoch: 1


  0%|          | 0/75 [00:00<?, ?it/s]

Epoch [2/5], Loss: 0.0064
Epoch: 2


  0%|          | 0/75 [00:00<?, ?it/s]

Epoch [3/5], Loss: 0.0014
Epoch: 3


  0%|          | 0/75 [00:00<?, ?it/s]

Epoch [4/5], Loss: 0.0019
Epoch: 4


  0%|          | 0/75 [00:00<?, ?it/s]

Epoch [5/5], Loss: 0.0003


## Training and Val Loop

In [44]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
criterion = nn.CrossEntropyLoss()
learning_rate = 1e-5
optimizer = optim.Adam(ensemble_model.parameters(), lr=learning_rate)

def train_epoch(ensemble_model, train_dataloader):
    actuals, predictions = [], []
    train_loss = 0
    train_acc = 0
    
    ensemble_model.to(device)
    ensemble_model.train()
    
    for batch in tqdm(train_dataloader):
        # get the inputs;
        batch = {k:v.to(device) for k,v in batch.items()}
            
        # forward + backward + optimize
        outputs = ensemble_model(text_features=batch['input_ids'],
                                 attention_masks=batch['attention_mask'],
                                 image_features=batch['image'])
        label = batch['label']
    
        loss = criterion(outputs, label)

        actuals.extend(label.cpu().numpy().astype(int))
        predictions.extend(F.softmax(outputs, 1).cpu().detach().numpy())
        
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        train_loss += loss.item()

    predictions = np.array(predictions)
    predicted_labels = predictions.argmax(1)
    accuracy = (predicted_labels == actuals).mean()

    return train_loss/len(train_dataloader), accuracy

In [45]:
def val_epoch(ensemble_model, val_dataloader):
    
    actuals, predictions = [], []
    val_loss = 0
    val_acc = 0
    
    ensemble_model.to(device)
    ensemble_model.eval()
    with torch.no_grad():    
    
        for batch in tqdm(val_dataloader):
            # get the inputs;
            batch = {k:v.to(device) for k,v in batch.items()}
                
            # forward + backward + optimize
            outputs = ensemble_model(text_features=batch['input_ids'],
                                     attention_masks=batch['attention_mask'],
                                     image_features=batch['image'])
            label = batch['label']
        
            cur_val_loss = criterion(outputs, label)
    
            actuals.extend(label.cpu().numpy().astype(int))
            predictions.extend(F.softmax(outputs, 1).cpu().detach().numpy())
    
            val_loss += cur_val_loss.item()
            

    predictions = np.array(predictions)
    predicted_labels = predictions.argmax(1)
    accuracy = (predicted_labels == actuals).mean()

    return val_loss/len(val_dataloader), accuracy

In [46]:
NUM_EPOCHS = 10
history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}

In [49]:
for epoch in range(NUM_EPOCHS):

    train_loss, train_acc = train_epoch(ensemble_model=ensemble_model, train_dataloader=train_dataloader)
    print(f"\n Epoch:{epoch + 1} / {NUM_EPOCHS},train loss:{train_loss:.5f}, train acc: {train_acc:.5f}")
    val_loss, val_acc = val_epoch(ensemble_model=ensemble_model, val_dataloader=val_dataloader)

    print(f"\n Epoch:{epoch + 1} / {NUM_EPOCHS} Val loss:{val_loss:.5f}, Val acc:{val_acc:.5f}")


    history['train_loss'].append(train_loss)
    history['train_acc'].append(train_acc)
    history['val_loss'].append(val_loss)
    history['val_acc'].append(val_acc)

  0%|          | 0/75 [00:00<?, ?it/s]


 Epoch:1 / 10,train loss:0.10252, train acc: 0.95833


  0%|          | 0/10 [00:00<?, ?it/s]


 Epoch:1 / 10 Val loss:0.63411, Val acc:0.77333


  0%|          | 0/75 [00:00<?, ?it/s]


 Epoch:2 / 10,train loss:0.02184, train acc: 0.99917


  0%|          | 0/10 [00:00<?, ?it/s]


 Epoch:2 / 10 Val loss:0.80614, Val acc:0.80000


  0%|          | 0/75 [00:00<?, ?it/s]


 Epoch:3 / 10,train loss:0.00761, train acc: 0.99917


  0%|          | 0/10 [00:00<?, ?it/s]


 Epoch:3 / 10 Val loss:0.66051, Val acc:0.80667


  0%|          | 0/75 [00:00<?, ?it/s]


 Epoch:4 / 10,train loss:0.00544, train acc: 1.00000


  0%|          | 0/10 [00:00<?, ?it/s]


 Epoch:4 / 10 Val loss:0.83755, Val acc:0.77333


  0%|          | 0/75 [00:00<?, ?it/s]


 Epoch:5 / 10,train loss:0.00224, train acc: 1.00000


  0%|          | 0/10 [00:00<?, ?it/s]


 Epoch:5 / 10 Val loss:1.01516, Val acc:0.80667


  0%|          | 0/75 [00:00<?, ?it/s]


 Epoch:6 / 10,train loss:0.00112, train acc: 1.00000


  0%|          | 0/10 [00:00<?, ?it/s]


 Epoch:6 / 10 Val loss:0.94118, Val acc:0.79333


  0%|          | 0/75 [00:00<?, ?it/s]


 Epoch:7 / 10,train loss:0.00088, train acc: 1.00000


  0%|          | 0/10 [00:00<?, ?it/s]


 Epoch:7 / 10 Val loss:0.96382, Val acc:0.78667


  0%|          | 0/75 [00:00<?, ?it/s]


 Epoch:8 / 10,train loss:0.00054, train acc: 1.00000


  0%|          | 0/10 [00:00<?, ?it/s]


 Epoch:8 / 10 Val loss:0.95280, Val acc:0.79333


  0%|          | 0/75 [00:00<?, ?it/s]


 Epoch:9 / 10,train loss:0.00036, train acc: 1.00000


  0%|          | 0/10 [00:00<?, ?it/s]


 Epoch:9 / 10 Val loss:0.98356, Val acc:0.77333


  0%|          | 0/75 [00:00<?, ?it/s]


 Epoch:10 / 10,train loss:0.00037, train acc: 1.00000


  0%|          | 0/10 [00:00<?, ?it/s]


 Epoch:10 / 10 Val loss:1.20003, Val acc:0.78000


In [51]:
torch.save(ensemble_model, 'Bert+VGG16_ensemble')
torch.save(ensemble_model.state_dict(),'Bert+VGG16_weights.pth')

## Model Structure for Reuse

TO DO
Rewrite the dataloader and ensemble model to accept different models, similiar to Unimodal baselines.

# Simple Model - Multilabel Classification (Subtask2a)

## Data
NOTE: There is ONE instance in the dataset where no technique is recorded. This is in the training data: prop_meme_24430.png. For now I am going to reclassify it as No Technique but have some concerns about how underrepresented this class is throughout the dataset.

In [7]:
path = r'X:\PhD\SemEval Task4\Data\subtask2a_images\train_images\train_images'
images = [os.path.join(dirpath,f) for (dirpath, dirnames, filenames) in os.walk(path) for f in filenames]
images_df = pd.DataFrame(images, columns=['filepath'])
images_df['image'] = images_df['filepath'].str.split('\\').str[-1]

df = pd.read_json(r'X:\PhD\SemEval Task4\Data\annotations\data\subtask2a\train.json')
df = pd.merge(df, images_df, on='image')

# reclassify the no technique
index = df[df['labels'].apply(lambda x: len(x)) == 0].index.values.astype(int)[0]
df.loc[index, 'labels']=['No Technique']

In [8]:
labels = df['labels'].values
unique_labels = []

for i in labels:
    for x in i:
        unique_labels.append(x)
unique_labels = list(set(unique_labels))

le = LabelEncoder()
le.fit(unique_labels)

# try a one-hot as well
ml = MultiLabelBinarizer()

df['one_hot_labels'] = ml.fit_transform(df['labels'])[:,1:].tolist()
df['encoded_labels'] = df['labels'].apply(le.transform)

images = [str(i) for i in df['filepath'].values]
texts = [str(i) for i in df['text'].astype(str).values.tolist()]
labels = df['encoded_labels'].values
one_hot_labels = df['one_hot_labels'].values

In [9]:
path = r'X:\PhD\SemEval Task4\Data\subtask2a_images\validation_images\validation_images'
images = [os.path.join(dirpath,f) for (dirpath, dirnames, filenames) in os.walk(path) for f in filenames]
images_df = pd.DataFrame(images, columns=['filepath'])
images_df['image'] = images_df['filepath'].str.split('\\').str[-1]

df_val = pd.read_json(r'X:\PhD\SemEval Task4\Data\annotations\data\subtask2a\validation.json')
df_val = pd.merge(df_val, images_df, on='image')

# reclassify the no technique - there isn't one in val
# index = df_val[df_val['labels'].apply(lambda x: len(x)) == 0].index.values.astype(int)[0]
# df_val.loc[index, 'labels']=['No Technique']

val_labels = df_val['labels'].values

le = LabelEncoder()
le.fit(unique_labels)

# try a one-hot as well
ml = MultiLabelBinarizer()

df_val['one_hot_labels'] = ml.fit_transform(df_val['labels'])[:,1:].tolist()
df_val['encoded_labels'] = df_val['labels'].apply(le.transform)

images_val = [str(i) for i in df_val['filepath'].values]
texts_val = [str(i) for i in df_val['text'].astype(str).values.tolist()]
labels_val =df_val['encoded_labels'].values
one_hot_labels_val = df_val['one_hot_labels'].values

In [10]:
# labels need to be padded
# unless onehot is used

model_transforms = transforms.Compose([transforms.Resize((224,224)), transforms.ToTensor()])

class VisionTextDataset_Multi(torch.utils.data.Dataset):
    def __init__(self, img, txt, lbs, tokenizer, n_classes, transform):
        self.image = img
        self.text = txt
        self.labels = lbs
        self.tokenizer = tokenizer
        self.n_classes = n_classes
        self.transforms = transform

    def __len__(self):
        return len(self.image)

    def __getitem__(self, idx):
        text = self.text[idx]
        image = self.image[idx]        
            
        text_encoded = self.tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            return_token_type_ids=False,
            padding='max_length',
            pad_to_max_length=True,
            return_attention_mask=True,
            return_tensors='pt',
            truncation=True
        )

        
        def pad_tensor(t):
            t = torch.tensor(t)
            padding = self.n_classes - t.size()[0]
            t = torch.nn.functional.pad(t,(0,padding))
            return t
        
        
        image = Image.open(image).convert('RGB')
        image = self.transforms(image)

        label = pad_tensor((self.labels[idx]))

        sample = {'input_ids': text_encoded['input_ids'],
                  'attention_mask': text_encoded['attention_mask'],
                  'image': image,
                  "label": label}
        sample = {k:v.squeeze() for k,v in sample.items()}

        return sample

In [11]:
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased", do_lower_case=True, use_fast=False)
train_dataset = VisionTextDataset_Multi(img=images, txt=texts, lbs=one_hot_labels, tokenizer=tokenizer, n_classes=23, transform=model_transforms)

train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=False)
batch = next(iter(train_dataloader))
for k, v in batch.items():
    print(k, v.size(), v.dtype)

input_ids torch.Size([16, 512]) torch.int64
attention_mask torch.Size([16, 512]) torch.int64
image torch.Size([16, 3, 224, 224]) torch.float32
label torch.Size([16, 23]) torch.int64


In [12]:
val_dataset = VisionTextDataset_Multi(img=images_val, txt=texts_val, lbs=one_hot_labels_val, tokenizer=tokenizer, n_classes=23, transform=model_transforms)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=True)

## Training
Just reuse the same ensemble, but BCE with logits instead for one hot, and change label to label.float()

TO DO: Accuracy. Should be calculating when each 1 is matched and then average over the samples.

In [15]:
ensemble_model_multi = EnsembleModel(vision_model, text_model, 23)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
criterion = nn.BCEWithLogitsLoss()
learning_rate = 1e-5
optimizer = optim.Adam(ensemble_model.parameters(), lr=learning_rate)
metric = BinaryAccuracy()
f1 = BinaryF1Score()

def train_epoch(ensemble_model, train_dataloader):
    
    actuals, predictions = [], []
    train_loss = 0
    train_acc = 0
    
    ensemble_model.to(device)
    ensemble_model.train()
    
    for batch in tqdm(train_dataloader):
        # get the inputs;
        batch = {k:v.to(device) for k,v in batch.items()}
            
        # forward + backward + optimize
        outputs = ensemble_model(text_features=batch['input_ids'],
                                 attention_masks=batch['attention_mask'],
                                 image_features=batch['image'])
        label = batch['label']
    
        loss = criterion(outputs, label.float()) # label.float()

        actuals.extend(label.cpu().numpy().astype(int))
        predictions.extend(F.softmax(outputs, 1).cpu().detach().numpy().astype(float))
        
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

        train_loss += loss.item()
  
    pred = torch.tensor(predictions)
    act = torch.tensor(actuals)
    accuracy = metric(pred, act)
    f1_score = f1(pred,act)

    return train_loss/len(train_dataloader), accuracy, f1_score

def val_epoch(ensemble_model, val_dataloader):
    
    actuals, predictions = [], []
    val_loss = 0
    val_acc = 0
    
    ensemble_model.to(device)
    ensemble_model.eval()
    with torch.no_grad():    
    
        for batch in tqdm(val_dataloader):
            # get the inputs;
            batch = {k:v.to(device) for k,v in batch.items()}
                
            # forward + backward + optimize
            outputs = ensemble_model(text_features=batch['input_ids'],
                                     attention_masks=batch['attention_mask'],
                                     image_features=batch['image'])
            label = batch['label']
        
            cur_val_loss = criterion(outputs, label.float()) # label.float
    
            actuals.extend(label.cpu().numpy().astype(int))
            predictions.extend(F.softmax(outputs, 1).cpu().detach().numpy().astype(float))
    
            val_loss += cur_val_loss.item()

    pred = torch.tensor(predictions)
    act = torch.tensor(actuals)
    accuracy = metric(pred, act)
    f1_score = f1(pred,act)
  
    return val_loss/len(val_dataloader), accuracy, f1_score

In [16]:
n_epochs = 10
history = {'train_loss': [], 'train_acc': [], 'val_loss': [], 'val_acc': []}

for epoch in range(n_epochs):

    train_loss, train_acc, train_f1 = train_epoch(ensemble_model=ensemble_model_multi, train_dataloader=train_dataloader)
    print(f"\n Epoch:{epoch + 1} / {n_epochs},train loss:{train_loss:.5f},train acc: {train_acc:.5f}, train f1: {train_f1:.5f}")
    val_loss, val_acc, val_f1 = val_epoch(ensemble_model=ensemble_model_multi, val_dataloader=val_dataloader)

    print(f"\n Epoch:{epoch + 1} / {n_epochs} val loss:{val_loss:.5f}, val acc:{val_acc:.5f}, val f1:{val_f1:.5f}")


    history['train_loss'].append(train_loss)
    history['train_acc'].append(train_acc)
    history['val_loss'].append(val_loss)
    history['val_acc'].append(val_acc)

  0%|          | 0/32 [00:00<?, ?it/s]


 Epoch:1 / 10,train loss:0.38184,train acc: 0.90261, train f1: 0.05405


  0%|          | 0/32 [00:00<?, ?it/s]


 Epoch:1 / 10 val loss:0.32424, val acc:0.89878, val f1:0.00342


  0%|          | 0/32 [00:00<?, ?it/s]


 Epoch:2 / 10,train loss:0.26192,train acc: 0.90313, train f1: 0.05753


  0%|          | 0/32 [00:00<?, ?it/s]


 Epoch:2 / 10 val loss:0.31462, val acc:0.89722, val f1:0.00672


  0%|          | 0/32 [00:00<?, ?it/s]


 Epoch:3 / 10,train loss:0.23705,train acc: 0.90661, train f1: 0.12255


  0%|          | 0/32 [00:00<?, ?it/s]


 Epoch:3 / 10 val loss:0.31971, val acc:0.89452, val f1:0.02256


  0%|          | 0/32 [00:00<?, ?it/s]


 Epoch:4 / 10,train loss:0.21708,train acc: 0.91061, train f1: 0.19688


  0%|          | 0/32 [00:00<?, ?it/s]


 Epoch:4 / 10 val loss:0.31947, val acc:0.89513, val f1:0.05039


  0%|          | 0/32 [00:00<?, ?it/s]


 Epoch:5 / 10,train loss:0.19308,train acc: 0.91617, train f1: 0.27844


  0%|          | 0/32 [00:00<?, ?it/s]


 Epoch:5 / 10 val loss:0.32516, val acc:0.89287, val f1:0.05954


  0%|          | 0/32 [00:00<?, ?it/s]


 Epoch:6 / 10,train loss:0.16592,train acc: 0.92235, train f1: 0.36260


  0%|          | 0/32 [00:00<?, ?it/s]


 Epoch:6 / 10 val loss:0.35322, val acc:0.88809, val f1:0.06672


  0%|          | 0/32 [00:00<?, ?it/s]


 Epoch:7 / 10,train loss:0.13926,train acc: 0.92609, train f1: 0.40725


  0%|          | 0/32 [00:00<?, ?it/s]


 Epoch:7 / 10 val loss:0.36222, val acc:0.89000, val f1:0.08796


  0%|          | 0/32 [00:00<?, ?it/s]


 Epoch:8 / 10,train loss:0.11799,train acc: 0.92939, train f1: 0.45209


  0%|          | 0/32 [00:00<?, ?it/s]


 Epoch:8 / 10 val loss:0.44378, val acc:0.88052, val f1:0.08278


  0%|          | 0/32 [00:00<?, ?it/s]


 Epoch:9 / 10,train loss:0.09709,train acc: 0.93191, train f1: 0.48042


  0%|          | 0/32 [00:00<?, ?it/s]


 Epoch:9 / 10 val loss:0.48728, val acc:0.88009, val f1:0.09692


  0%|          | 0/32 [00:00<?, ?it/s]


 Epoch:10 / 10,train loss:0.07298,train acc: 0.93583, train f1: 0.51890


  0%|          | 0/32 [00:00<?, ?it/s]


 Epoch:10 / 10 val loss:0.47236, val acc:0.88261, val f1:0.10477


In [17]:
torch.save(ensemble_model, 'Bert+VGG16_ensemble_subtask2a')
torch.save(ensemble_model.state_dict(),'Bert+VGG16_subtask2a_weights.pth')

# Other Models - free to use if you can get it working
Better method, but model gets upset with the layers in the final part of the classifier and I can't remember how to fix it.

Other layers not included but will add back in

- self.bn = nn.BatchNorm1d(value) < batchnorm
- self.dropout = nn.Dropout(drop_prob) < dropout
- self.classify = nn.Linear(in_features = 512, out_features = num_classes) < linear layer

In [29]:
args = Namespace()

args.img_embed_pool_type = "avg"
args.num_image_embeds = 2
args.hidden_size = 768
args.img_hidden_size = 4098
args.n_classes = len(list(set(labels))) # taken from our label dataset

In [35]:
# text encoder

class BertEncoder(nn.Module):
    def __init__(self, args):
        super(BertEncoder, self).__init__()
        self.args = args
        self.bert = BertModel.from_pretrained("bert-base-uncased") # change the pretrained model here
        self.drop = nn.Dropout(p=0.1)
        self.out = nn.Linear(args.hidden_size, 512)
        self.out2 = nn.Linear(512, args.n_classes)

    def forward(self, input_ids, attention_mask):
        outputs = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        output = self.drop(outputs.pooler_output)
        output = self.out(output)
        output = self.out2(output)
        return output

# image encoder
# see https://pytorch.org/vision/main/models/generated/torchvision.models.vgg16.html

class ImageEncoder(nn.Module):
    def __init__(self, args):
        super(ImageEncoder, self).__init__()
        self.args = args
        model = torchvision.models.vgg16(pretrained=True) # change the pretrained model here, or weights
        modules = list(model.children())[:-2]
        self.model = nn.Sequential(*modules)

        pool_func = (
            nn.AdaptiveAvgPool2d
            if args.img_embed_pool_type == "avg"
            else nn.AdaptiveMaxPool2d
        )

        if args.num_image_embeds in [1, 2, 3, 5, 7]:
            self.pool = pool_func((args.num_image_embeds, 1))
        elif args.num_image_embeds == 4:
            self.pool = pool_func((2, 2))
        elif args.num_image_embeds == 6:
            self.pool = pool_func((3, 2))
        elif args.num_image_embeds == 8:
            self.pool = pool_func((4, 2))
        elif args.num_image_embeds == 9:
            self.pool = pool_func((3, 3))

    def forward(self, x):
        # Bx3x224x224 -> Bx2048x7x7 -> Bx2048xN -> BxNx2048
        out = self.pool(self.model(x))
        out = torch.flatten(out, start_dim=2)
        out = out.transpose(1, 2).contiguous()
        return out  # BxNx2048

In [40]:
# concatenating the model outputs

class MultimodalConcatBertClf(nn.Module):
    def __init__(self, args):
        super(MultimodalConcatBertClf, self).__init__()
        self.args = args
        self.txtenc = BertEncoder(args)
        self.imgenc = ImageEncoder(args)

        last_size = args.hidden_size + (args.img_hidden_size * args.num_image_embeds)
        self.clf = nn.ModuleList()

        self.clf.append(nn.Linear(last_size, args.n_classes))

    def forward(self, txt, mask, img):
        txt = self.txtenc(txt, mask)
        img = self.imgenc(img)   
        
        out = torch.cat((txt, img), dim=1)
        out = self.concat(out)
        for layer in self.clf:
            out = layer(out)
        return out

In [41]:
model = MultimodalConcatBertClf(args)
model.parameters

<bound method Module.parameters of MultimodalConcatBertClf(
  (txtenc): BertEncoder(
    (bert): BertModel(
      (embeddings): BertEmbeddings(
        (word_embeddings): Embedding(30522, 768, padding_idx=0)
        (position_embeddings): Embedding(512, 768)
        (token_type_embeddings): Embedding(2, 768)
        (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
        (dropout): Dropout(p=0.1, inplace=False)
      )
      (encoder): BertEncoder(
        (layer): ModuleList(
          (0-11): 12 x BertLayer(
            (attention): BertAttention(
              (self): BertSelfAttention(
                (query): Linear(in_features=768, out_features=768, bias=True)
                (key): Linear(in_features=768, out_features=768, bias=True)
                (value): Linear(in_features=768, out_features=768, bias=True)
                (dropout): Dropout(p=0.1, inplace=False)
              )
              (output): BertSelfOutput(
                (dense): Linear(in_feat

## FusionNet Module

In [704]:
# where text and image model are the same ones defined

class FusionNet(nn.Module):
    
    def __init__(self, num_classes, drop_prob = 0.1):
        super(FusionNet, self).__init__()
        self.text_model =text_model()
        self.image_model = vision_model()
        
        self.pooler = nn.Linear(in_features=768, out_features=768)
        
        self.concat = nn.Linear(in_features=768+2048, out_features= 512)
        
        self.bn = nn.BatchNorm1d(512)
        self.bn1 = nn.BatchNorm1d(768)
        self.bn2 = nn.BatchNorm1d(2048)
    
        self.dropout = nn.Dropout(drop_prob)
        
        self.classify = nn.Linear(in_features = 512, out_features = num_classes)
        
        
    def forward(self, text_features, image_features):
        text_features = self.text_model(text_features)
        image_features = self.image_model(image_features)
        
        text_features = torch.tanh(self.pooler(text_features))
        text_features = self.dropout(text_features)

        text_features = self.bn1(text_features)
        image_features = self.bn2(image_features)
      
        fused =  torch.cat((text_features, image_features), dim=1)
      
        x = self.concat(fused)
  
        x = F.tanh(self.bn(x))          
  
        x = F.tanh(self.classify(x)) 
  

        return x