In [None]:
%%capture
!pip install wget
!pip install transformers

In [None]:
import sys
from google.colab import drive

# Mount Google Drive so that the project libraries, checkpoints and data are reachable.
drive.mount('/content/drive')

# Make the project-local helper modules stored on Drive importable.
sys.path.append('/content/drive/MyDrive/project/libs')

# Locations of the saved checkpoints and of the training / test splits.
model_path, training_path, test_path = (
    '/content/drive/MyDrive/project/models',
    '/content/drive/MyDrive/project/dataset/training',
    '/content/drive/MyDrive/project/dataset/test',
)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import torch
import numpy as np
from transformers import AutoTokenizer, AutoModel
from torch import nn
from torch.optim import Adam
from tqdm import tqdm

from IPython.display import Image, display
import PIL.Image
import io
from processing_image import Preprocess
from visualizing_image import SingleImageViz
from modeling_frcnn import GeneralizedRCNN
from utils import Config
import utils

In [None]:
# Read the raw training annotations (tab-separated; the file's own header row is
# replaced by the explicit column names below).
train_labels = pd.read_csv(
    f"{training_path}/training.csv",
    sep='\t',
    header=0,
    names=[
        'file_name',
        'misogynous',
        'shaming',
        'stereotype',
        'objectification',
        'violence',
        'text',
    ],
)

# Keep only the image file name, the binary label and the text, then persist that view.
train_labels = train_labels[['file_name', 'misogynous', 'text']]
train_labels.to_csv(f"{training_path}/labels.csv", index=None, sep='\t')
train_labels.shape

(10000, 3)

In [None]:
# Texts of the test items (header row replaced by explicit column names).
test_data = pd.read_csv(
    f"{test_path}/test.csv",
    sep='\t',
    header=0,
    names=['file_name', 'text'],
)

# Gold labels of the test items; this file is read without a header row.
test_labels = pd.read_csv(
    f"{test_path}/test_labels.txt",
    sep='\t',
    header=None,
    names=['file_name', "misogynous", "shaming", "stereotype", "objectification", "violence"],
)

# Join labels and texts on the file name and keep the same three columns as the training frame.
test_labels = test_labels.merge(test_data, on='file_name')[['file_name', 'misogynous', 'text']]
test_labels.to_csv(f"{test_path}/labels.csv", index=None, sep='\t')
test_labels.shape

(1000, 3)

In [None]:
# Reload the reduced training labels from the file this notebook writes
# (training_path + '/labels.csv'). The previous hard-coded
# '.../dataset/training_labels.csv' is never produced by any cell here, so a
# fresh "Restart & Run All" depended on a file created outside the notebook.
train_labels = pd.read_csv(training_path+'/labels.csv', sep='\t')
train_labels.shape

(10000, 3)

In [None]:
# Reload the reduced test labels from the file this notebook writes
# (test_path + '/labels.csv'). The previous hard-coded
# '.../dataset/test_labels.csv' is never produced by any cell here, so a
# fresh "Restart & Run All" depended on a file created outside the notebook.
test_labels = pd.read_csv(test_path+'/labels.csv', sep='\t')
test_labels.shape

(1000, 3)

In [None]:
%%capture
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
frcnn_cfg = Config.from_pretrained("unc-nlp/frcnn-vg-finetuned")
frcnn = GeneralizedRCNN.from_pretrained("unc-nlp/frcnn-vg-finetuned", config=frcnn_cfg).to(device)
image_preprocess = Preprocess(frcnn_cfg)

In [None]:
# DISABLED: the whole training pre-processing pass below is wrapped in a bare
# triple-quoted string, so none of it executes; only the trailing print() runs.
# The intent of the disabled code is to cache, per training row, the Faster R-CNN
# "roi_features" and the tokenizer outputs in a pickled DataFrame.
#
# NOTE(review) - problems to fix before re-enabling it:
#   * `row[file_name]` uses an unquoted name that is not defined anywhere visible;
#     presumably `row['file_name']` was meant.
#   * `train_preprocessed` is replaced by an empty DataFrame right after
#     `processed_files` is collected, so rows that are skipped as "already processed"
#     are not carried over into the pickle that gets written.
#   * The cache is read from training_path + '/preprocessed_cpu.pickle' but written to
#     '.../dataset/training_preprocessed.pickle' - confirm which file is intended.
#   * `DataFrame.append` was removed in pandas 2.0; collect dicts in a list and build
#     the frame once instead.
'''
train_preprocessed = pd.read_pickle(training_path+'/preprocessed_cpu.pickle')
processed_files = set(train_preprocessed['file_name'])

use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
train_preprocessed = pd.DataFrame(columns=['file_name', 'misogynous', 'text', 'visual_embeds', 'input_ids', 'token_type_ids', 'attention_mask'])
for index, row in tqdm(train_labels.iterrows(), total=train_labels.shape[0]):
  if row[file_name] in processed_files:
    continue
  images, sizes, scales_yx = image_preprocess(training_path+'/'+row['file_name'])
  output_dict = frcnn(
      images.to(device),
      sizes.to(device),
      scales_yx=scales_yx.to(device),
      padding="max_detections",
      max_detections=512,
      return_tensors="pt",
  )
  image_preprocessed = output_dict.get("roi_features")
  text_preprocessed = tokenizer(row['text'], padding='max_length', max_length = 512, truncation=True, return_tensors="pt")
  train_preprocessed = train_preprocessed.append(
      {
       'file_name': row['file_name'], 
       'misogynous': row['misogynous'],
       'text': row['text'],
       'visual_embeds': image_preprocessed,
       'input_ids': text_preprocessed['input_ids'],
       'token_type_ids': text_preprocessed['token_type_ids'],
       'attention_mask': text_preprocessed['attention_mask']
      },
      ignore_index=True
  )
  if index%500 == 0:
      train_preprocessed.to_pickle('/content/drive/MyDrive/project/dataset/training_preprocessed.pickle')
      print(f"Process saved: {index}")
train_preprocessed.to_pickle('/content/drive/MyDrive/project/dataset/training_preprocessed.pickle')
'''
# Presumably here so the cell does not echo the string literal above as its output.
print()




In [None]:
# DISABLED: the test-set pre-processing pass below is wrapped in a bare triple-quoted
# string, so none of it executes; only the trailing print() runs.
# The intent of the disabled code is to collect, per test row, the Faster R-CNN
# "roi_features" and the tokenizer outputs in a DataFrame.
#
# NOTE(review) - before re-enabling it:
#   * The final `to_pickle` line is itself commented out inside the string, so the
#     result would only live in memory.
#   * `DataFrame.append` was removed in pandas 2.0; collect dicts in a list and build
#     the frame once instead.
'''
use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
test_preprocessed = pd.DataFrame(columns=['file_name', 'misogynous', 'text', 'visual_embeds', 'input_ids', 'token_type_ids', 'attention_mask'])
for index, row in tqdm(test_labels.iterrows(), total=test_labels.shape[0]):
  images, sizes, scales_yx = image_preprocess(test_path+'/'+row['file_name'])
  output_dict = frcnn(
      images.to(device),
      sizes.to(device),
      scales_yx=scales_yx.to(device),
      padding="max_detections",
      max_detections=512,
      return_tensors="pt",
  )
  image_preprocessed = output_dict.get("roi_features")
  text_preprocessed = tokenizer(row['text'], padding='max_length', max_length = 512, truncation=True, return_tensors="pt")
  test_preprocessed = test_preprocessed.append(
      {
       'file_name': row['file_name'], 
       'misogynous': row['misogynous'],
       'text': row['text'],
       'visual_embeds': image_preprocessed,
       'input_ids': text_preprocessed['input_ids'],
       'token_type_ids': text_preprocessed['token_type_ids'],
       'attention_mask': text_preprocessed['attention_mask']
      },
      ignore_index=True
  )
#test_preprocessed.to_pickle('/content/drive/MyDrive/project/dataset/test_preprocessed.pickle')
'''
# Presumably here so the cell does not echo the string literal above as its output.
print()




In [None]:
#train_preprocessed = pd.read_pickle(training_path+'/preprocessed_cpu.pickle')
#test_preprocessed = pd.read_pickle('/content/drive/MyDrive/project/dataset/test_preprocessed.pickle')

In [None]:
# Maps the stringified value of the 'misogynous' column to its integer class id.
labels = {'0': 0, '1': 1}


class Dataset(torch.utils.data.Dataset):
    """Map-style dataset over a label frame.

    Each item is a triple ``(file_name, text, label)`` where ``file_name`` and
    ``text`` are taken verbatim from the frame and ``label`` is a NumPy scalar
    array holding the class id looked up in ``labels``.
    """

    def __init__(self, df):
        """Copy the 'file_name', 'misogynous' and 'text' columns of ``df`` into lists."""
        self.images = list(df['file_name'])
        self.labels = [labels[key] for key in map(str, df['misogynous'])]
        self.texts = list(df['text'])

    def classes(self):
        """Return the list of integer class ids, one per row."""
        return self.labels

    def __len__(self):
        """Number of rows in the dataset."""
        return len(self.labels)

    def get_batch_labels(self, idx):
        """Class id(s) at ``idx`` wrapped in a NumPy array."""
        selected = self.labels[idx]
        return np.array(selected)

    def get_batch_texts(self, idx):
        """Text entry at ``idx``."""
        return self.texts[idx]

    def get_batch_images(self, idx):
        """Image file name at ``idx``."""
        return self.images[idx]

    def __getitem__(self, idx):
        """Return ``(file_name, text, label)`` for position ``idx``."""
        return (
            self.get_batch_images(idx),
            self.get_batch_texts(idx),
            self.get_batch_labels(idx),
        )

In [None]:
class VisualBertClassifier(nn.Module):
    """VisualBERT encoder followed by dropout and a 2-way linear classification head.

    Args:
        dropout: Dropout probability applied to the pooled encoder output
            before the linear head.
    """

    def __init__(self, dropout=0.5):
        super().__init__()

        self.model = AutoModel.from_pretrained("uclanlp/visualbert-vqa-coco-pre")
        self.dropout = nn.Dropout(dropout)
        # Size the head from the encoder's own configuration instead of hard-coding
        # 768, so the classifier stays consistent with whatever checkpoint is loaded.
        self.linear = nn.Linear(self.model.config.hidden_size, 2)

    def forward(self, inputs):
        """Return raw (un-normalised) class logits.

        Args:
            inputs: Dict of keyword arguments forwarded unchanged to the encoder.

        Returns:
            The output of the linear head applied to the encoder's
            ``pooler_output``; no softmax/sigmoid is applied here.
        """
        pooled_output = self.model(**inputs)['pooler_output']
        dropout_output = self.dropout(pooled_output)
        return self.linear(dropout_output)

In [None]:
def _build_visualbert_inputs(image_file, text, device):
    """Encode one (image, text) pair into the keyword dict passed to the classifier.

    Uses the notebook-level ``tokenizer``, ``image_preprocess`` and ``frcnn`` objects.

    Args:
        image_file: Path of the image to run through the Faster R-CNN detector.
        text: Text to tokenize (padded / truncated to 512 tokens).
        device: Torch device on which all returned tensors are placed.

    Returns:
        Dict with the textual inputs, the detector's "roi_features" as
        ``visual_embeds``, and all-ones visual token-type / attention masks.
    """
    text_preprocessed = tokenizer(text, padding='max_length', max_length=512, truncation=True, return_tensors="pt")
    images, sizes, scales_yx = image_preprocess(image_file)
    # The detector is not part of the optimizer, so no gradients are needed through it.
    with torch.no_grad():
        output_dict = frcnn(
            images.to(device),
            sizes.to(device),
            scales_yx=scales_yx.to(device),
            padding="max_detections",
            max_detections=512,
            return_tensors="pt",
        )
    visual_embeds = output_dict.get("roi_features").squeeze(1).to(device)
    visual_shape = visual_embeds.shape[:-1]
    return {
        "input_ids": text_preprocessed['input_ids'].squeeze(1).to(device),
        "token_type_ids": text_preprocessed['token_type_ids'].squeeze(1).to(device),
        "attention_mask": text_preprocessed['attention_mask'].squeeze(1).to(device),
        "visual_embeds": visual_embeds,
        "visual_token_type_ids": torch.ones(visual_shape, dtype=torch.long).to(device),
        "visual_attention_mask": torch.ones(visual_shape, dtype=torch.long).to(device),
        "output_attentions": False,
        "output_hidden_states": False,
    }


def train(model, train_data, val_data, learning_rate, epochs):
    """Fine-tune ``model`` and save one checkpoint per epoch.

    Args:
        model: Classifier returning one row of 2 class logits per example.
        train_data: Frame with 'file_name', 'misogynous' and 'text' columns used for training.
        val_data: Frame with the same columns used for validation. Its images are
            also read from ``training_path``.
        learning_rate: Learning rate for the Adam optimizer.
        epochs: Number of passes over ``train_data``.

    Side effects:
        Prints per-epoch loss / accuracy and writes ``{model_path}/{epoch}.bin``
        after every epoch. The model is left in eval mode on return.
    """
    # Local names deliberately differ from the function name to avoid shadowing it.
    train_set, val_set = Dataset(train_data), Dataset(val_data)

    train_dataloader = torch.utils.data.DataLoader(train_set, batch_size=1, shuffle=True)
    val_dataloader = torch.utils.data.DataLoader(val_set, batch_size=1)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    # The head emits two logits per example and the targets are integer class ids,
    # which is the contract of CrossEntropyLoss. BCELoss requires probabilities with
    # the same shape as the target and raises on this (N, 2) vs (N,) combination.
    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=learning_rate)

    model = model.to(device)
    criterion = criterion.to(device)

    for epoch_num in range(epochs):

        total_acc_train = 0
        total_loss_train = 0

        # Enable dropout for the optimisation pass.
        model.train()
        for train_image, train_text, train_label in tqdm(train_dataloader):

            train_label = train_label.to(device).long()
            train_inputs = _build_visualbert_inputs(training_path+'/'+train_image[0], train_text[0], device)
            output = model(train_inputs)

            batch_loss = criterion(output, train_label)
            total_loss_train += batch_loss.item()
            total_acc_train += (output.argmax(dim=1) == train_label).sum().item()

            optimizer.zero_grad()
            batch_loss.backward()
            optimizer.step()

        total_acc_val = 0
        total_loss_val = 0

        # Disable dropout so validation metrics are deterministic.
        model.eval()
        with torch.no_grad():

            for val_image, val_text, val_label in val_dataloader:

                val_label = val_label.to(device).long()
                val_inputs = _build_visualbert_inputs(training_path+'/'+val_image[0], val_text[0], device)
                output = model(val_inputs)

                batch_loss = criterion(output, val_label)
                total_loss_val += batch_loss.item()
                total_acc_val += (output.argmax(dim=1) == val_label).sum().item()

        print(
            f'Epochs: {epoch_num + 1} | Train Loss: {total_loss_train / len(train_data): .3f} | Train Accuracy: {total_acc_train / len(train_data): .3f} | Val Loss: {total_loss_val / len(val_data): .3f} | Val Accuracy: {total_acc_val / len(val_data): .3f}')
        torch.save(model.state_dict(), f"{model_path}/{epoch_num + 1}.bin")
                  

In [None]:
def evaluate(model, test_data):
    """Print the accuracy of ``model`` on ``test_data``.

    Args:
        model: Classifier returning one row of class logits per example.
        test_data: Frame with 'file_name', 'misogynous' and 'text' columns; images
            are read from ``test_path``.

    Side effects:
        Moves the model to the available device, switches it to eval mode and
        prints ``Test Accuracy``. Nothing is returned.
    """
    test_dataloader = torch.utils.data.DataLoader(Dataset(test_data), batch_size=1)

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model = model.to(device)

    # Do not rely on the caller having disabled dropout: with it active the
    # predictions (and therefore the reported accuracy) would be randomised.
    model.eval()

    total_acc_test = 0
    with torch.no_grad():

        for test_image, test_text, test_label in tqdm(test_dataloader):

            test_label = test_label.to(device)
            text_preprocessed = tokenizer(test_text[0], padding='max_length', max_length=512, truncation=True, return_tensors="pt")

            images, sizes, scales_yx = image_preprocess(test_path+'/'+test_image[0])
            output_dict = frcnn(
                images.to(device),
                sizes.to(device),
                scales_yx=scales_yx.to(device),
                padding="max_detections",
                max_detections=512,
                return_tensors="pt",
            )
            visual_embeds = output_dict.get("roi_features").squeeze(1).to(device)
            visual_shape = visual_embeds.shape[:-1]
            test_inputs = {
                "input_ids": text_preprocessed['input_ids'].squeeze(1).to(device),
                "token_type_ids": text_preprocessed['token_type_ids'].squeeze(1).to(device),
                "attention_mask": text_preprocessed['attention_mask'].squeeze(1).to(device),
                "visual_embeds": visual_embeds,
                "visual_token_type_ids": torch.ones(visual_shape, dtype=torch.long).to(device),
                "visual_attention_mask": torch.ones(visual_shape, dtype=torch.long).to(device),
                "output_attentions": False,
                "output_hidden_states": False,
            }

            output = model(test_inputs)

            # Count the example as correct when the arg-max class equals the label.
            total_acc_test += (output.argmax(dim=1) == test_label).sum().item()

    print(f'Test Accuracy: {total_acc_test / len(test_data): .3f}')

In [None]:
np.random.seed(112)

# Shuffle the first 1000 training rows (fixed seed) and cut them 90 % / 10 %.
# Positional `iloc` slicing replaces `np.split(DataFrame, ...)`, which goes through
# the deprecated `DataFrame.swapaxes`; the resulting frames are the same.
train_subset = train_labels.head(1000).sample(frac=1, random_state=42)
n_train = int(.9*len(train_subset))
df_train, df_val = train_subset.iloc[:n_train], train_subset.iloc[n_train:]

print(len(df_train),len(df_val))

900 100


In [None]:
# Hyper-parameters of the fine-tuning run.
LR = 1e-6
EPOCHS = 3

# Fresh classifier, trained on the split built above.
model = VisualBertClassifier()
train(model, df_train, df_val, learning_rate=LR, epochs=EPOCHS)

Some weights of the model checkpoint at uclanlp/visualbert-vqa-coco-pre were not used when initializing VisualBertModel: ['cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing VisualBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing VisualBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
100%|██████████| 900

Epochs: 1 | Train Loss:  0.694 | Train Accuracy:  0.488 | Val Loss:  0.684 | Val Accuracy:  0.550


100%|██████████| 900/900 [19:46<00:00,  1.32s/it]


Epochs: 2 | Train Loss:  0.696 | Train Accuracy:  0.477 | Val Loss:  0.692 | Val Accuracy:  0.570


100%|██████████| 900/900 [19:45<00:00,  1.32s/it]


Epochs: 3 | Train Loss:  0.693 | Train Accuracy:  0.489 | Val Loss:  0.708 | Val Accuracy:  0.550


In [None]:
# Rebuild the classifier and restore the weights saved after the third epoch.
# The checkpoint is mapped onto the CPU when it is read.
model = VisualBertClassifier()
epoch3_state = torch.load(f"{model_path}/3.bin", map_location=torch.device('cpu'))
model.load_state_dict(epoch3_state)

# Switch to inference mode before scoring.
model.eval()
print()

Downloading:   0%|          | 0.00/631 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/428M [00:00<?, ?B/s]

Some weights of the model checkpoint at uclanlp/visualbert-vqa-coco-pre were not used when initializing VisualBertModel: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias']
- This IS expected if you are initializing VisualBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing VisualBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).





In [None]:
# Score the restored checkpoint on the first 100 test rows only.
evaluate(model=model, test_data=test_labels.head(100))

100%|██████████| 100/100 [02:50<00:00,  1.71s/it]

Test Accuracy:  0.530



