In [22]:
# Initiate Neptune experiment
# Comment out lines beginning with neptune. in this notebook if not using Neptune for ML experiment logging

neptune.init(project_qualified_name=os.environ["NEPTUNE_PROJECT"], api_token=os.environ["NEPTUNE_API_TOKEN"])

neptune.create_experiment(name="22012021_01", 
                          params=training_params,
                          tags=["multimodal", "80-20", "607-samples"],
                          upload_source_files=["SemEval-subtask3.ipynb"])

NVMLError: NVML Shared Library Not Found - GPU usage metrics may not be reported.


https://ui.neptune.ai/kruttikanadig/sandbox/e/SAN-38


Experiment(SAN-38)

In [23]:
import os
import pandas as pd
import numpy as np
import logging
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from torch import cuda
import transformers
from transformers import BertTokenizer, BertModel, BertConfig, FeatureExtractionPipeline, pipeline, RobertaTokenizer, RobertaModel, XLNetModel, XLNetTokenizer
from pprint import pprint
import random
import neptune
import itertools
import subprocess
import logging
import json
from sklearn.model_selection import train_test_split
import torchvision.transforms as transforms
import torchvision.datasets as datasets
import torchvision.models as models
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from sklearn import metrics
from PIL import Image
import sentencepiece as spm
device = "cuda" if cuda.is_available() else "cpu"

In [24]:
train_path = "data/training_set_task3/training_set_task3.txt"
dev_path = "data/dev_set_task3/dev_set_task3.txt"

In [25]:
def set_seed(seed):
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    np.random.seed(seed)
    random.seed(seed)

set_seed(1)

In [26]:
# Define function to read data and convert each item's text labels into binary labels
def read_data(path, inference=False):

    with open (path, "r") as f:
        data_raw = f.read()
        data_json = json.loads(data_raw)
        df = pd.DataFrame(data_json)
        
    if inference == False:
        all_labels = list(df["labels"].explode().unique())
        all_labels.pop(1)
        
        def encode_labels(labels):
            encoded = [1 if l in labels else 0 for l in all_labels]
            return encoded
        
        df["label_list"] = df["labels"].map(encode_labels)
        return df,  all_labels
    else:
        return df

In [27]:
df, all_labels = read_data(train_path)
df.head(3)

Unnamed: 0,id,labels,text,image,label_list
0,128,"[Black-and-white Fallacy/Dictatorship, Name ca...",THERE ARE ONLY TWO GENDERS\n\nFEMALE \n\nMALE\n,128_image.png,"[1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
1,189,"[Transfer, Reductio ad hitlerum, Smears]",This is not an accident!,189_image.png,"[0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..."
2,96,"[Loaded Language, Name calling/Labeling, Smear...",SO BERNIE BROS HAVEN'T COMMITTED VIOLENCE EH?\...,96_image.png,"[0, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, ..."


In [28]:
# Define model parameters and info that will be logged in Neptune
training_params = {"model": "Fusion",
                   "epochs": 1,
                   "optimizer": "Adam",
                   "learning_rate": 0.0001,
                   "train_batch_size": 8,
                   "val_batch_size": 8,
                   "max_len": 200,
                   "shuffle":False,
                   "num_workers": 0,
                   "loss_fn": "BCEWithLogitsLoss",
                   "metric1": "train_f1_micro",
                   "metric2": "train_f1_macro",
                   "metric3": "val_f1_micro",
                   "metric4": "val_f1_macro",
                   "dropout": 0.2,
                   "language_model": "bert",
                   "vision_model": "resnet18"
                  }

In [29]:
# Define dataset class for loading the training data

# Image transformation function
transformations=transforms.Compose([
                    transforms.Resize((224,224)), 
                    transforms.ToTensor(),
                    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])

class CustomDataset(torch.utils.data.Dataset):
    def __init__(self, img_dir, transform, df, tokenizer, max_len):
        self.img_dir=img_dir
        self.transform=transform
        self.img_names=df["image"].values
        self.tokenizer=tokenizer
        self.max_len=max_len
        self.text = df["text"].values
        self.targets = df["label_list"].values
        
    def __len__(self):
        return len(self.text)
        
    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())
        
        # The inputs below are required by BERT. See Huggingface's BERT documentation
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            padding="max_length",
            max_length=self.max_len,
            return_token_type_ids=True
        )
        
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        
        # For each img_name in img_names, load the corresponding img from the img folder
        img = Image.open(os.path.join(self.img_dir, self.img_names[index])).convert('RGB')
        # Return the transformed RGB image
        if self.transform is not None:
            img_vector=self.transform(img)
        
        # For each item in the dataset, returns the item's text and image information together in the form of ids, mask, token_type_ids, img_vector
        # Also return the item's true labels (targets)
        
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'targets': torch.tensor(self.targets[index], dtype=torch.float),
            'img_vector': img_vector
        }
    

In [30]:
# Creating the dataset and dataloader

dataset = CustomDataset(img_dir="data/training_set_task3",
                        transform=transformations,
                        df=df, 
                        tokenizer=BertTokenizer.from_pretrained('bert-base-uncased'), 
                        max_len=training_params["max_len"])

dataloader = DataLoader(dataset, 
                        batch_size=training_params["train_batch_size"],
                        num_workers=training_params["num_workers"],
                        worker_init_fn=random.seed(1))

In [31]:
# Build multimodal model fusing BERT and ResNet

class FusionModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # BERT layers - language model
        self.bert = transformers.BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True, return_dict=True)
        self.dropout = nn.Dropout(0.2)
        self.bert_features = nn.Sequential(nn.Linear(768, 256), # Transform 768 dim Bert vectors into 256
                                nn.LogSoftmax(dim=1))
        # Resnet layers - vision model
        self.resnet = models.resnet18(pretrained=True)
        for param in self.resnet.parameters():
            param.requires_grad = False
        # Replace the default ResNet classifier layer 
        classifier_input = self.resnet.fc.in_features
        classifier = nn.Sequential(nn.Linear(classifier_input, 256), # Transform 512 dim Resnet vectors into 256
                             nn.LogSoftmax(dim=1)) 
        self.resnet.fc = classifier
        self.fusion_classifier = nn.Linear(512, 22) # 22 labels
        
    def forward(self, ids, mask, token_type_ids, img_vector): # dataloader items
        # Apply language model
        bert_outputs = self.bert(ids, mask, token_type_ids)
        bert_outputs = self.bert_features(self.dropout(bert_outputs[1]))
        # Apply vision model
        resnet_outputs = self.resnet(img_vector)
#         print(resnet_outputs.size())
        # Concatenate
        fused = torch.cat([bert_outputs, resnet_outputs], dim=1)
        return self.fusion_classifier(fused)

In [32]:
model = FusionModel()
model.to(device)

FusionModel(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)


In [34]:
# Shuffle and randomly split dataset into 80:20 training and validation sets
train_size = int(0.8 * len(dataset))
valid_size = len(dataset) - train_size
train_dataset, valid_dataset = torch.utils.data.random_split(dataset, [train_size, valid_size])
training_loader = torch.utils.data.DataLoader(train_dataset, batch_size=8)
validation_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=8)

In [39]:
# Model training function

def train(epochs):
    
    def loss_fn(outputs, targets):
        return torch.nn.BCEWithLogitsLoss()(outputs, targets)
    
    def calculate_metrics(outputs, targets):
        f1_micro = metrics.f1_score(targets, outputs, average="micro")
        f1_macro = metrics.f1_score(targets, outputs, average="macro")
        return f1_micro, f1_macro
    
    def make_preds(outputs, targets, fin_outputs, fin_targets):
        outputs = (np.array(torch.sigmoid(outputs).cpu().detach().numpy().tolist()) >= 0.5).astype(int).tolist()
        targets = np.array(targets.cpu().detach().numpy().tolist()).astype(int).tolist()
        fin_targets.extend(targets)
        fin_outputs.extend(outputs)
        return fin_outputs, fin_targets
    
    def validate(model, validation_loader):
        val_targets=[]
        val_outputs=[]
        model.eval()
        
        for _,data in enumerate(validation_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            img_vector = data["img_vector"].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids, img_vector)
            loss = loss_fn(outputs, targets)
            print("Validation Loss: {}".format(loss.item()))
            neptune.log_metric("Validation epoch", epoch+1)
            neptune.log_metric("Validation batch", _)
            neptune.log_metric("Validation loss", loss.item())
            
            val_outputs, val_targets = make_preds(outputs, targets, val_outputs, val_targets)
        
        return val_outputs, val_targets

        
    # Train model
            
    optimizer = torch.optim.Adam(params=model.parameters(), lr=training_params["learning_rate"])
    model.train()
    train_targets=[]
    train_outputs=[]
    
    for epoch in range(epochs):
        print("Epoch {}".format(epoch+1))
        
        for _,data in enumerate(training_loader, 0):
            optimizer.zero_grad()

            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float)
            img_vector = data["img_vector"].to(device, dtype = torch.float)
            
            outputs = model(ids, mask, token_type_ids, img_vector)
            
            loss = loss_fn(outputs, targets)
            print("Train Loss: {}".format(loss.item()))
            neptune.log_metric("Train epoch", epoch+1)
            neptune.log_metric("Train batch", _)
            neptune.log_metric("Train loss", loss.item())
            
            
            loss.backward()
            optimizer.step()
            
            if epoch == (epochs-1):
                train_outputs, train_targets = make_preds(outputs, targets, train_outputs, train_targets)
                
        val_outputs, val_targets = validate(model, validation_loader)
    
    
    train_f1_micro, train_f1_macro = calculate_metrics(train_outputs, train_targets)
    print("Train F1 Micro score: ", train_f1_micro)
    print("Train F1 Macro score: ", train_f1_macro)
    neptune.log_metric("Train f1_micro", train_f1_micro)
    neptune.log_metric("Train f1_macro", train_f1_macro)
    
    val_f1_micro, val_f1_macro = calculate_metrics(val_outputs, val_targets)
    print("Validation F1 Micro score: ", val_f1_micro)
    print("Validation F1 Macro score: ", val_f1_macro)
    neptune.log_metric("Validation f1_micro", val_f1_micro)
    neptune.log_metric("Validation f1_macro", val_f1_macro)
    
    # The next chunk of code is for prediction analysis and debugging
    
    t_outputs = [0]*22
    t_targets = [0]*22
    for i in range(0, len(train_outputs)):
        t_outputs = list(map(np.add, t_outputs, train_outputs[i]))
        t_targets = list(map(np.add, t_targets, train_targets[i]))
        
    v_outputs = [0]*22
    v_targets = [0]*22
    for i in range(0, len(val_outputs)):
        v_outputs = list(map(np.add, v_outputs, val_outputs[i]))
        v_targets = list(map(np.add, v_targets, val_targets[i]))
    
    print("True train labels")
    print(t_targets)
    print("Train predictions per label")
    print(t_outputs)
    print("True validation labels")
    print(v_targets)
    print("Validation predictions per label")
    print(v_outputs)
    
    return model

In [40]:
# Train the model
model = train(epochs=training_params["epochs"])

Epoch 1
Train Loss: 0.25940197706222534
Train Loss: 0.35454556345939636
Train Loss: 0.21527276933193207
Train Loss: 0.28943657875061035
Train Loss: 0.3271448016166687
Train Loss: 0.23761028051376343
Train Loss: 0.3069416284561157
Train Loss: 0.31029272079467773
Train Loss: 0.2723110318183899
Train Loss: 0.3150988221168518
Train Loss: 0.23621167242527008
Train Loss: 0.16730831563472748
Train Loss: 0.29132020473480225
Train Loss: 0.2430778443813324
Train Loss: 0.323095440864563
Train Loss: 0.2036229521036148
Train Loss: 0.20329636335372925
Train Loss: 0.24246688187122345
Train Loss: 0.2264920175075531
Train Loss: 0.41025158762931824
Train Loss: 0.294776052236557
Train Loss: 0.2263338565826416
Train Loss: 0.37485063076019287
Train Loss: 0.21027569472789764
Train Loss: 0.25249454379081726
Train Loss: 0.26419007778167725
Train Loss: 0.1692648082971573
Train Loss: 0.22729559242725372
Train Loss: 0.20760971307754517
Train Loss: 0.2601621150970459
Train Loss: 0.26518046855926514
Train Loss: 0.

In [None]:
# Models can be large - comment this out if not required
def save_model(PATH):
    torch.save(model.state_dict(), PATH)

save_model(PATH="state_dict_model.pt")

In [None]:
# For SemEval - modify as required
def generate_prediction_file(df, preds):
    
    # Define function to get original text labels from predicted binary labels
    def get_labels(pred_list):
        labels = [all_labels[idx] for idx, pred in enumerate(pred_list) if pred == 1]
        return labels
    
    df["preds"] = preds
    df["pred_labels"] = df["preds"].map(get_labels)
    preds_df = df[["id", "pred_labels"]].copy()
    preds_df.rename(columns={"pred_labels":"labels"}, inplace=True)
    preds_json = preds_df.to_json(orient="records")
    with open('preds.txt', 'w') as f:
        f.write(preds_json)
    print("Predictions file saved")

In [None]:
generate_prediction_file(train_df, preds)

In [None]:
neptune.log_artifact("preds.txt")

In [None]:
# neptune.log_artifact("state_dict_model.pt")

In [None]:
# Load saved model
PATH="state_dict_model.pt"
model = BERTClass()
model.to(device)
model.load_state_dict(torch.load(PATH))

In [None]:
# Define dataset class for loading test data

class TestDataset(torch.utils.data.Dataset):
    def __init__(self, img_dir, transform, df, tokenizer, max_len):
        self.img_dir=img_dir
        self.transform=transform
        self.img_names=df["image"].values
        self.tokenizer=tokenizer
        self.max_len=max_len
        self.text = df["text"].values
        
    def __len__(self):
        return len(self.text)
        
    def __getitem__(self, index):
        text = str(self.text[index])
        text = " ".join(text.split())

        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            padding="max_length",
            max_length=self.max_len,
            return_token_type_ids=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']
        token_type_ids = inputs["token_type_ids"]
        
        # For each img_name in img_names, load the corresponding img from the img folder
        img = Image.open(os.path.join(self.img_dir, self.img_names[index])).convert('RGB')
        # Return the transformed RGB image
        if self.transform is not None:
            img_vector=self.transform(img)
            
        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'token_type_ids': torch.tensor(token_type_ids, dtype=torch.long),
            'img_vector': img_vector
        }
    

In [None]:
# Load dev set
dev_df = read_data(dev_path, inference=True)

dev_set = TestDataset(img_dir="data/dev_set_task3",
                        transform=transformations,
                        df=dev_df, 
                        tokenizer=BertTokenizer.from_pretrained('bert-base-uncased'), 
                        max_len=training_params["max_len"])

dev_loader = DataLoader(dev_set, 
                             batch_size=training_params["train_batch_size"],
                             num_workers=training_params["num_workers"],
                             worker_init_fn=random.seed(1))

In [None]:
def test(testing_loader, model):
    model.eval()
    fin_outputs=[]
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            token_type_ids = data['token_type_ids'].to(device, dtype = torch.long)
            img_vector = data["img_vector"].to(device, dtype = torch.float)
            outputs = model(ids, mask, token_type_ids, img_vector)
            outputs = (np.array(torch.sigmoid(outputs).cpu().detach().numpy().tolist()) >= 0.5).astype(int).tolist()
            fin_outputs.extend(outputs)
    return fin_outputs


In [None]:
dev_preds = test(dev_loader, model)

In [None]:
generate_prediction_file(dev_df, dev_preds)

In [None]:
neptune.log_artifact("preds.txt")

In [None]:
neptune.stop()