In [None]:
import torch
import os
import numpy as np
import pandas as pd
import pickle
import time
import tensorflow as tf

# Uncomment the two lines below to print the total CUDA memory of device 0 (in GiB)
# t = torch.cuda.get_device_properties(0).total_memory
# print(t/(1024*1024*1024))

### Flag: set `DATA_READY = False` if the data has not been downloaded and prepared yet

In [None]:
# When False, Preprocessor.download() fetches the dataset/GloVe archives and the
# "Prepare data" cell writes the derived pickles; when True both steps are skipped.
DATA_READY = True

## Utility functions

In [None]:
def create_dict(keys, val1s, val2s =None, val3s = None, top = False):
    """Build a dictionary keyed by ``str(key)``.

    Args:
        keys: iterable of identifiers; each one is converted with ``str``.
        val1s: values paired with ``keys``. With ``top=True`` they become the
            dictionary values directly; otherwise they are stored under
            ``'text'`` as NumPy arrays.
        val2s: labels paired with ``keys`` (only read when ``top`` is False);
            stored under ``'label'`` after ``int`` conversion.
        val3s: mapping indexed by ``str(key)`` (only read when ``top`` is
            False); the looked-up value is stored under ``'top'`` as a NumPy
            array.
        top: selects the flat ``{key: value}`` layout instead of the nested
            ``{'text', 'top', 'label'}`` records.

    Returns:
        dict: the flat or nested mapping described above.
    """
    if top:
        # Flat layout: one value per key.
        return {str(k): v for k, v in zip(keys, val1s)}

    # Nested layout: one record per key.
    return {
        str(k): {
            'text': np.array(txt),
            'top': np.array(val3s[str(k)]),
            'label': int(lbl),
        }
        for k, txt, lbl in zip(keys, val1s, val2s)
    }

# Utility: read back an object that was stored as a pickle dump.
def load_file(filename):
    """Unpickle and return the object stored in ``filename``.

    NOTE: ``pickle`` can execute arbitrary code while loading, so only use
    this on files that come from a trusted source.
    """
    with open(filename, 'rb') as fh:
        return pickle.load(fh)
    
# Utility: store an object on disk as a pickle dump.
def save_file(filename, obj):
    """Pickle ``obj`` into ``filename``, overwriting any existing file."""
    with open(filename, 'wb') as fh:
        pickle.dump(obj, fh)

## Tokenizer class for text data

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Fixed length (in tokens) that TokenizerWrap pads/truncates every sequence to.
seq_len = 80

class TokenizerWrap(Tokenizer):
    """Keras ``Tokenizer`` with vocabulary statistics and a padding helper.

    Attributes set by ``__init__``:
        seq_len: length every sequence is padded/truncated to by
            ``texts_to_tokens``.
        index_to_word: inverse of ``word_index`` (token id -> word).
        tokens: token-id sequences of the texts the vocabulary was built from.
        num_tokens: length of each sequence in ``tokens``.
        max_tokens: ``int(mean + 2 * std)`` of ``num_tokens``. This is only a
            statistic; padding uses ``seq_len``, not this value.
        total_tokens: number of entries in the vocabulary.
    """

    def __init__(self, texts, seq_len):
        """Fit the vocabulary on ``texts`` and record length statistics.

        Args:
            texts: list of strings used to build the vocabulary.
            seq_len: target length used later by ``texts_to_tokens``.
        """
        Tokenizer.__init__(self, oov_token = '<unk>')
        self.seq_len = seq_len

        # Create the vocabulary from the texts.
        self.fit_on_texts(texts)
        self.index_to_word = {index: word
                              for word, index in self.word_index.items()}

        self.tokens = self.texts_to_sequences(texts)

        # The number of integer-tokens in each sequence.
        self.num_tokens = [len(x) for x in self.tokens]

        # Length statistic (mean + 2 std). An empty corpus would make the
        # mean NaN and int(NaN) raise, so fall back to 0 in that case.
        if self.num_tokens:
            self.max_tokens = int(np.mean(self.num_tokens)
                                  + 2 * np.std(self.num_tokens))
        else:
            self.max_tokens = 0

        self.total_tokens = len(self.index_to_word)

    def tokens_to_string(self, tokens):
        """Convert a list of integer-tokens to a space-separated string.

        Token id 0 (used for padding by ``texts_to_tokens``) is skipped.
        Raises ``KeyError`` for an id that is not in the vocabulary.
        """
        words = [self.index_to_word[token]
                 for token in tokens
                 if token != 0]
        return " ".join(words)

    def texts_to_tokens(self, text, reverse=False, padding=True):
        """Convert text(s) to an array of token ids.

        Args:
            text: a list of strings, or a single string (which is wrapped in
                a one-element list so it is not tokenized per character).
            reverse: accepted for interface compatibility; it currently has
                no effect on the result.
            padding: when True, every sequence is post-padded with 0 and
                post-truncated to ``self.seq_len``.

        Returns:
            np.ndarray: with ``padding=True`` an array of shape
            ``(len(text), self.seq_len)``. With ``padding=False`` the
            sequences may have different lengths, so prefer ``padding=True``
            when a rectangular array is required.
        """
        # A bare string would otherwise be iterated character by character.
        if isinstance(text, str):
            text = [text]

        tokens = self.texts_to_sequences(text)

        # Pad and truncate sequences to the configured length.
        if padding:
            tokens = pad_sequences(tokens,
                                   maxlen=self.seq_len,
                                   padding='post',
                                   truncating='post')
        tokens = np.array(tokens)
        return tokens

## Preprocessing class to download the required data and build the embedding matrix

In [None]:
class Preprocessor():
    """Downloads the raw archives and builds a GloVe embedding matrix.

    Attributes:
        data_ready: when truthy, ``download()`` does not fetch anything.
        file_glove: path of the GloVe text file read by ``load_embedding()``.
        embeddings_index: word -> vector mapping (set by ``load_embedding``).
        embedding_dim / embedding_vocab: vector size and number of words in
            ``embeddings_index`` (set by ``load_embedding``).
    """

    # (fname, origin) pairs fetched by download() when the data is not ready.
    _ARCHIVES = (
        ('dataset_image_all.zip',
         'https://www.dropbox.com/s/ofmxf7fxyixdw4a/dataset_image_all.zip?dl=1'),
        ('twitter-multi-modal.zip',
         'https://www.dropbox.com/s/n5i5pid134v5rkj/twitter-multi-modal.zip?dl=1'),
        ('glove.twitter.27B.zip',
         'http://nlp.stanford.edu/data/glove.twitter.27B.zip'),
    )

    # Default location of the extracted 100-d GloVe vectors.
    _GLOVE_FILE = './glove.twitter.27B.100d.txt'

    def __init__(self, data_ready):
        self.data_ready = data_ready
        # Set here as well so load_embedding() works even if download()
        # was never called.
        self.file_glove = self._GLOVE_FILE

    def download(self):
        """Fetch and extract the archives into the current working directory.

        Nothing is downloaded when ``self.data_ready`` is truthy. In both
        cases ``self.file_glove`` is (re)set to the default GloVe path.
        """
        self.file_glove = self._GLOVE_FILE
        if self.data_ready:
            return
        for fname, origin in self._ARCHIVES:
            tf.keras.utils.get_file(fname=fname, origin=origin, extract=True, cache_subdir=os.getcwd())

    def load_embedding(self):
        """Parse ``self.file_glove`` into ``self.embeddings_index``.

        Each non-blank line is split into a word and its coefficients.

        Returns:
            tuple: ``(number of words, embedding dimension)``.

        Raises:
            ValueError: if the file contains no embeddings.
        """
        self.embeddings_index = {}
        # Explicit UTF-8: the platform default encoding is not guaranteed to
        # decode non-ASCII tokens.
        with open(self.file_glove, encoding='utf-8') as f:
            for line in f:
                if not line.strip():
                    continue
                word, coefs = line.split(maxsplit=1)
                self.embeddings_index[word] = np.fromstring(coefs, "f", sep=" ")

        if not self.embeddings_index:
            raise ValueError("no embeddings found in {}".format(self.file_glove))

        # Dimension is taken from the first vector that was read.
        self.embedding_dim = len(next(iter(self.embeddings_index.values())))
        self.embedding_vocab = len(self.embeddings_index)
        return self.embedding_vocab, self.embedding_dim

    def get_embedding_matrix(self, tokenizer):
        """Build the embedding matrix for the tokenizer's vocabulary.

        Args:
            tokenizer: object exposing ``total_tokens`` (int) and
                ``word_index`` (word -> row index).

        Returns:
            tuple: ``(embedding_matrix, hits, misses, miss_list)`` where the
            matrix has ``total_tokens + 1`` rows. Row 0, row 1 and the rows
            of words missing from ``embeddings_index`` stay all-zero.
        """
        num_tokens = tokenizer.total_tokens + 1
        hits = 0
        misses = 0
        miss_list = []

        # Prepare embedding matrix
        embedding_matrix = np.zeros((num_tokens, self.embedding_dim))
        for word, i in tokenizer.word_index.items():
            # Index 1 is skipped so its row stays zero (presumably the
            # '<unk>' OOV token of the tokenizer - TODO confirm).
            if i == 1:
                continue
            # dict.get never raises, so a missing word simply yields None.
            embedding_vector = self.embeddings_index.get(word)
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector
                hits += 1
            else:
                # Words not found in the embedding index stay all-zeros.
                miss_list.append(word)
                misses += 1
        return embedding_matrix, hits, misses, miss_list

In [None]:
# download() only fetches the archives when DATA_READY is False; it also records
# the GloVe file path that load_embedding() reads later.
pp = Preprocessor(DATA_READY)
pp.download()

## Prepare data

In [None]:
# Raw pickled splits (id -> record with 'text' and 'label') and the per-image
# attribute words (id -> iterable of strings).
train_data = load_file('./train_data')
valid_data = load_file('./valid_data')
test_data = load_file('./test_data')
image_top = load_file('./image_top')

# Tweet text of every split, in the dictionaries' iteration order.
train_text = [str(rec['text']) for rec in train_data.values()]
valid_text = [str(rec['text']) for rec in valid_data.values()]
test_text = [str(rec['text']) for rec in test_data.values()]
# Attribute words of each image joined into one space-separated string.
top_text = [" ".join(words) for words in image_top.values()]

# Integer labels, aligned with the text lists above.
train_labels = [int(rec['label']) for rec in train_data.values()]
valid_labels = [int(rec['label']) for rec in valid_data.values()]
test_labels = [int(rec['label']) for rec in test_data.values()]

# Identifiers as strings, aligned with the text/label lists.
train_id = list(map(str, train_data.keys()))
valid_id = list(map(str, valid_data.keys()))
test_id = list(map(str, test_data.keys()))
top_id = list(map(str, image_top.keys()))

# Corpus used to fit the word-level tokenizer: all splits plus attribute words.
texts = train_text + valid_text + test_text + top_text

# Persist the derived lists only when the prepared data does not exist yet.
if not DATA_READY:
    for _path, _obj in (
        ('./train_text', train_text),
        ('./train_labels', train_labels),
        ('./train_id', train_id),
        ('./test_text', test_text),
        ('./test_labels', test_labels),
        ('./test_id', test_id),
        ('./valid_text', valid_text),
        ('./valid_labels', valid_labels),
        ('./valid_id', valid_id),
        ('./image_top', image_top),
    ):
        save_file(_path, _obj)
    del _path, _obj

## Load prepared data

In [None]:
# Directory holding the prepared pickles. The default is the original
# location; set the SARCASM_DATA_DIR environment variable to run the notebook
# on another machine without editing every path.
PREPARED_DATA_DIR = os.environ.get("SARCASM_DATA_DIR", "/home/ckm/sarcasm_detection/sarcasm/full")


def _load_prepared(name):
    """Load the pickle called ``name`` from ``PREPARED_DATA_DIR``."""
    return load_file(os.path.join(PREPARED_DATA_DIR, name))


train_text = _load_prepared("train_text")
train_labels = _load_prepared("train_labels")
train_id = _load_prepared("train_id")
image_top = _load_prepared("image_top")
valid_text = _load_prepared("valid_text")
valid_labels = _load_prepared("valid_labels")
valid_id = _load_prepared("valid_id")
test_text = _load_prepared("test_text")
test_labels = _load_prepared("test_labels")
test_id = _load_prepared("test_id")

## Build embedding matrix

In [None]:
# Parse the GloVe file into memory; returns (number of words, vector size).
embedding_vocab_size, embedding_dim = pp.load_embedding()

# `texts` is built in the "Prepare data" cell (all splits plus the image
# attribute strings), so that cell must have been run first.
tokenizer_wrap = TokenizerWrap(texts, seq_len)

# get_embedding_matrix returns (matrix, hits, misses, miss_list): num_not_oov is
# the number of vocabulary words found in GloVe, num_oov the number not found.
embedding_matrix, num_not_oov, num_oov, miss_list = pp.get_embedding_matrix(tokenizer_wrap)
vocab_size, embed_dim = embedding_matrix.shape

### Load the RoBERTa tokenizer

In [None]:
# !pip install transformers
from transformers import RobertaTokenizer
# Pretrained 'roberta-base' tokenizer (despite the "bert" in the name); it is
# passed to TwitterDataset to encode the tweet text and the image attribute words.
tokenizer_bert = RobertaTokenizer.from_pretrained('roberta-base')

## Image processing pipelines

In [None]:
from PIL import Image
from torchvision import transforms

# Deterministic pipeline; passed to the test and validation datasets below.
transform_pipe = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    # presumably the ImageNet channel statistics - TODO confirm
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Randomised pipeline (random crop + horizontal flip); passed to the training dataset.
transform_pipe_train = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    # NOTE(review): every jitter strength is 0, so this step presumably leaves
    # the image unchanged - confirm whether colour augmentation was intended.
    transforms.ColorJitter(brightness=0, contrast=0, saturation=0, hue=0),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# Dataset

In [None]:
import re
class TwitterDataset(torch.utils.data.Dataset):
    """
    Custom dataset that loads an image, applies the transform, and returns it
    together with its label and three token tensors (word-level tokens of the
    tweet, transformer ids of the tweet, transformer ids of the image
    attribute words).
    """
    def __init__(self, img_dir, labels, text, filenames, ocr_text = None, transform=None, tokenizer_wrap = None, tokenizer_bert = None, image_top = None, max_text_len = 360, max_attr_len = 13, pad_token_id = 1):
        """
        img_dir = dir in which images are located
        labels = list containing true (0/1) values
        text = list containing all the texts
        filenames = list containing the names of images (without ".jpg")
        ocr_text = accepted for interface compatibility; not used
        transform = transformer to preprocess images
        tokenizer_wrap = word-level tokenizer exposing texts_to_tokens(list_of_str)
        tokenizer_bert = callable returning a mapping with an 'input_ids' list
        image_top = mapping from int(filename) to an iterable of attribute words
        max_text_len = fixed length of the transformer ids of the tweet
        max_attr_len = fixed length of the transformer ids of the attributes
        pad_token_id = id appended to reach the fixed lengths
        """
        self.img_dir = img_dir
        self.labels = labels
        self.text = text
        self.filenames = filenames
        self.transform = transform
        self.tokenizer_wrap = tokenizer_wrap
        self.tokenizer_bert = tokenizer_bert
        self.image_top = image_top
        self.max_text_len = max_text_len
        self.max_attr_len = max_attr_len
        self.pad_token_id = pad_token_id

    def _encode(self, string, max_len):
        """Return the transformer ids of ``string`` as a tensor of exactly ``max_len`` ids."""
        ids = self.tokenizer_bert(string, truncation=True, max_length=max_len)['input_ids']
        # Defensive slice: guarantees the length even if the tokenizer did
        # not truncate; without it over-long inputs cannot be batched.
        ids = list(ids)[:max_len]
        ids.extend([self.pad_token_id] * (max_len - len(ids)))
        return torch.tensor(ids)

    def __getitem__(self, idx):
        """Return the sample dict ('image', 'label', 'token', 'text', 'attribute') for ``idx``."""
        img_path = os.path.join(
            self.img_dir,
            "{}.jpg".format(self.filenames[idx])
        )

        # Convert to RGB so every image has the 3 channels the normalisation
        # in the transform pipelines expects; the context manager closes the
        # underlying file once the pixel data has been copied.
        with Image.open(img_path) as img_file:
            img = img_file.convert("RGB")

        if self.transform:
            img = self.transform(img)

        # filter emojis from text
        text = re.sub(r'(\s)emoji\w+', '', self.text[idx])

        # text tokens for FiLM pipeline: one padded row, flattened to 1-D
        token = self.tokenizer_wrap.texts_to_tokens([text])
        tokens_tensor = torch.tensor(token.reshape(-1)).long()

        # text tokens for transformer pipeline
        tokens_tensor_text = self._encode(text, self.max_text_len)

        # attribute tokens for transformer pipeline
        attribute = ' '.join(self.image_top[int(self.filenames[idx])])
        tokens_tensor_attribute = self._encode(attribute, self.max_attr_len)

        return {
            "image": img,
            "label": self.labels[idx],
            "token": tokens_tensor,
            "text": tokens_tensor_text,
            "attribute": tokens_tensor_attribute,
        }

    def __len__(self):
        """Number of samples (one per label)."""
        return len(self.labels)

## Make Dataloaders

In [None]:
def _make_dataset(labels, text, filenames, transform):
    """Build a TwitterDataset over ./dataset_image/ with the shared tokenizers and attributes."""
    return TwitterDataset(
        img_dir="./dataset_image/",
        labels = labels,
        text = text,
        filenames = filenames,
        transform=transform,
        tokenizer_wrap=tokenizer_wrap,
        tokenizer_bert=tokenizer_bert,
        image_top = image_top
    )


def _make_loader(dataset, shuffle):
    """Wrap ``dataset`` in a DataLoader that uses the shared batch settings."""
    return torch.utils.data.DataLoader(
        dataset,
        batch_size=BATCH_SIZE,
        pin_memory=True,
        num_workers=8,
        shuffle=shuffle,
        # drop_last is kept as in the original configuration; revisit it if
        # the downstream code tolerates a smaller final batch.
        drop_last=True
    )


# Training uses the randomised transform; test/validation the deterministic one.
train_data_object = _make_dataset(train_labels, train_text, train_id, transform_pipe_train)
test_data_object = _make_dataset(test_labels, test_text, test_id, transform_pipe)
val_data_object = _make_dataset(valid_labels, valid_text, valid_id, transform_pipe)

BATCH_SIZE = 32

# Only the training data is shuffled. Shuffling the evaluation splits has no
# benefit and, combined with drop_last, would make a different random subset
# of samples be dropped from the metrics in every epoch.
train_loader = _make_loader(train_data_object, shuffle=True)
test_loader = _make_loader(test_data_object, shuffle=False)
val_loader = _make_loader(val_data_object, shuffle=False)

dataloaders = {'train': train_loader, 'test': test_loader, 'val': val_loader}

### Verify that the CUDA device is available

In [None]:
# Use the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

torch.backends.cudnn.benchmark = True
torch.backends.cudnn.enabled   = True

print(torch.cuda.device_count())
# Report what was actually selected instead of always claiming a GPU.
print('GPU Allocated' if device.type == 'cuda' else 'No GPU available, running on CPU')

# Default float tensor type matching the selected device.
dtype = torch.cuda.FloatTensor if device.type == 'cuda' else torch.FloatTensor

# Model Preparation

## Co-attention layer

In [None]:
import math

class Co_attention(torch.nn.Module):
    """Co-attention between a sequence ``H`` and a set of attribute vectors ``T``.

    An affinity ``C = tanh(H @ W @ T^T)`` is computed, max-pooled over the
    positions of ``H`` to obtain one weight per row of ``T``, and the rows of
    ``T`` are summed with those weights.

    Args:
        size_in: feature size of ``T`` (and of the returned vectors).
        size_out: feature size of ``H``.
    """

    def __init__(self, size_in, size_out):
        super().__init__()
        self.size_in, self.size_out = size_in, size_out
        # nn.Parameter is a Tensor that's a module parameter.
        self.weights = torch.nn.Parameter(torch.empty(size_out, size_in))

        # initialize weights
        torch.nn.init.kaiming_uniform_(self.weights, a=math.sqrt(5))

        self.tanh = torch.nn.Tanh()

    def forward(self, H, T):
        """
        Args:
            H: tensor of shape (batch, len_h, size_out)
            T: tensor of shape (batch, len_t, size_in)

        Returns:
            Tensor of shape (batch, size_in).
        """
        # Affinity between every position of H and every row of T:
        # (batch, len_h, len_t)
        z_1 = torch.matmul(H, self.weights)
        C = self.tanh(torch.matmul(z_1, T.transpose(1, 2)))

        # Max over the positions of H -> (batch, 1, len_t). Taking the max
        # along the dimension works for any sequence length and batch size,
        # unlike a pooling kernel and reshape tied to fixed sizes.
        alpha, _ = C.max(dim=1, keepdim=True)

        # Weighted sum of the rows of T -> (batch, size_in)
        HT = torch.matmul(alpha, T).squeeze(1)

        return HT

## Transformer Models

In [None]:
from transformers import RobertaModel
from torchvision import models

# Two independent pretrained 'roberta-base' encoders: FilmModel uses one for the
# tweet text and the other for the image attribute words.
bertl_text = RobertaModel.from_pretrained('roberta-base')
bertl_attribute = RobertaModel.from_pretrained('roberta-base')

## Final Model Class

In [None]:
class FilmModel(torch.nn.Module):
    """Two-class classifier over tweet text and image attribute words.

    The tweet ids go through the embeddings and the first encoder layer of
    ``bertl_text``; the attribute ids go through the embeddings and every
    encoder layer of ``bertl_attribute``. A co-attention vector of the two is
    concatenated with the first-position text vector and fed to a linear
    layer with two outputs.
    """

    # define model elements
    def __init__(self):
        super(FilmModel, self).__init__()

        self.bertl_text = bertl_text.to(device)
        self.bertl_attribute = bertl_attribute.to(device)

        # Follow `device` like the encoders above instead of forcing CUDA.
        self.co_attention = Co_attention(768, 768).to(device)

        # forward() concatenates two 768-wide vectors, so the classifier
        # input is 768 + 768 (no 1000-wide image feature is produced).
        self.fc = torch.nn.Linear(768+768, 2)
        # Kept as an attribute for callers that want to squash the outputs
        # themselves; it is no longer applied inside forward().
        self.sigm = torch.nn.Sigmoid()

    # forward propagate input
    def forward(self, X1, X2, X3, X4):
        """
        X1=images (accepted but not used by this model)
        X2=token (tokenizer_wrap) (accepted but not used by this model)
        X3=text (bert) ids, presumably (batch, 360) - TODO confirm
        X4=attribute (bert) ids, presumably (batch, 13) - TODO confirm

        Returns raw class scores (logits) of shape (batch, 2), which is the
        input torch.nn.CrossEntropyLoss expects; the arg-max prediction is
        the same as for sigmoid-squashed scores.
        """
        bert_embed_attribute = self.bertl_attribute.embeddings(input_ids = X4)
        bert_embed_text = self.bertl_text.embeddings(input_ids = X3)

        # NOTE(review): only the first encoder layer is applied to the text
        # while the attributes go through all layers - confirm this is
        # intended. No attention mask is passed to the layers either.
        bert_text = self.bertl_text.encoder.layer[0](bert_embed_text)[0]

        bert_attribute = bert_embed_attribute
        for layer in self.bertl_attribute.encoder.layer:
            bert_attribute = layer(bert_attribute)[0]

        out2 = self.co_attention(bert_text, bert_attribute)

        # Vector at the first sequence position of the text.
        out3 = bert_text[:,0,:]

        out = torch.cat((out2, out3), dim = 1) # (batch_size, 768+768)
        out = self.fc(out) # (batch_size, 2)

        return out

### Building instance of our model

In [None]:
# FilmModel.__init__ already places the encoders and the co-attention layer;
# .to(device) also moves the remaining parameters (the final linear layer).
model = FilmModel()
model = model.to(device)

### Loading optimizers and Loss functions

In [None]:
# Adam with per-group learning rates: the two pretrained encoders are
# fine-tuned with 1e-6, while the classifier and the co-attention layer use
# the default 1e-4. The weight decay of 1e-2 applies to every group.
optimizer = torch.optim.Adam(
    [
        {'params': model.bertl_text.parameters(), 'lr': 1e-6},
        {'params': model.bertl_attribute.parameters(), 'lr': 1e-6},
        {'params': model.fc.parameters()},
        {'params': model.co_attention.parameters()}
    ], lr = 1e-4, weight_decay = 1e-2
)

# Follow `device` like the model does instead of hard-coding CUDA.
loss_fn = torch.nn.CrossEntropyLoss().to(device)

### Runner code

In [None]:
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
import time

num_epochs = 15

tick = time.time()

print(num_epochs)

for epoch in range(0, num_epochs):

    print('-'*10)
    print('Epoch {}/{}'.format(epoch+1, num_epochs))

    for phase in ['train', 'val', 'test']:
        is_train = (phase == 'train')

        running_loss = 0.0
        running_corrects = 0
        num_samples = 0  # samples actually seen in this phase

        y_true = []
        y_pred = []

        if is_train:
            model.train()
        else:
            model.eval()

        for i, batch in enumerate(dataloaders[phase]):
            # Move the batch to the selected device (plain tensors; no
            # Variable wrapper is needed).
            token = batch["token"].to(device)
            images = batch["image"].to(device)
            labels = batch["label"].to(device)
            text = batch["text"].to(device)
            attribute = batch["attribute"].to(device)

            # Build the autograd graph only while training; evaluation
            # phases do not need gradients.
            with torch.set_grad_enabled(is_train):
                scores = model(images, token, text, attribute)
                loss = loss_fn(scores, labels)

            if is_train:
                model.zero_grad()
                loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1)
                optimizer.step()

            batch_count = labels.size(0)
            running_loss += loss.item() * batch_count
            num_samples += batch_count

            _, preds = scores.detach().max(1)
            running_corrects += (preds == labels).sum().item()
            y_pred.extend(preds.tolist())
            y_true.extend(labels.tolist())

            if (i%100 == 0) and is_train:
                print(i, running_loss/num_samples)

        # Average over the samples that were processed rather than assuming
        # every batch holds exactly BATCH_SIZE items.
        epoch_loss = running_loss / max(num_samples, 1)
        epoch_acc = float(running_corrects) / max(num_samples, 1)

        print(confusion_matrix(y_true, y_pred))
        pre = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)

        print(phase + ": F1: {:.4f}, Precision: {:.4f}, Recall : {:.4f}, Accuracy: {:.4f}, Loss: {:.4f}.".format(f1, pre, recall, epoch_acc, epoch_loss))

        # Uncomment the lines below to save the weights of the model
        # if phase == "train":
        #     save_dir = './saved_models/film_roberta_wo_film' + str(epoch+1) + "_" + str(f1)
        #     torch.save({'model_state_dict': model.state_dict(),
        #         'optimizer_state_dict': optimizer.state_dict()}, save_dir)

print("Time taken to compute the results:", time.time() - tick)