In [1]:
class Args():
    def __init__(self):
        self.plot = False
        self.eager = False
        self.mf = 'model.jpg'
        self.tf = 'main-out.txt'
        self.pf = 'training-plot.jpg'
        self.pt = 'Training Multimodal'
        self.data_ready = True
        self.epochs = 20

args = Args()

## imports

In [2]:
import os
import numpy as np
import pandas as pd
import pickle
import time

import tensorflow as tf

In [3]:
tf.config.experimental_run_functions_eagerly(args.eager)

Instructions for updating:
Use `tf.config.run_functions_eagerly` instead of the experimental version.


## directories

## utility functions

In [4]:
def create_dict(keys, val1s, val2s =None, val3s = None, top = False):
    temp = {}
    if (top):
        for key, val in zip(keys, val1s):
            temp[str(key)] = val
    else:
        for key, val1, val2 in zip(keys, val1s, val2s):
            temp[str(key)] = {
                'text':np.array(val1),
                'top':np.array(val3s[str(key)]),
                'label':int(val2)
            }
    return temp

In [5]:
def load_file(filename):
    with open(filename, 'rb') as filehandle:
        ret = pickle.load(filehandle)
        return ret

In [6]:
def save_file(filename, obj):
    with open(filename, 'wb') as filehandle:
        pickle.dump(obj, filehandle)

## Preprocessing

### preprocessing class

In [7]:
class Preprocessor():
    def __init__(self, seq_len, batch_size, data_ready, img_path, img_vector):
        self.seq_len = seq_len
        self.batch_size = batch_size
        self.image_top = None
        self.data_ready = data_ready
        self.img_path = img_path
        self.img_vector = img_vector
        self.resnet50 = tf.keras.applications.ResNet50(weights='imagenet', include_top=False)

    def download(self):
        url_img_dataset = 'https://www.dropbox.com/s/ofmxf7fxyixdw4a/dataset_image_all.zip?dl=1'
        file_img_dataset = 'dataset_image_all.zip'
        if not(self.data_ready):
            tf.keras.utils.get_file(fname=file_img_dataset, origin=url_img_dataset, extract=True, cache_subdir=os.getcwd())
        
        url_dataset = 'https://www.dropbox.com/s/n5i5pid134v5rkj/twitter-multi-modal.zip?dl=1'
        file_dataset = 'twitter-multi-modal.zip'
        if not(self.data_ready):
            tf.keras.utils.get_file(fname=file_dataset, origin=url_dataset, extract=True, cache_subdir=os.getcwd())
    
    
    
        


### loading data

In [9]:
pp = Preprocessor(30, 32, args.data_ready, 'temp', 'temp')
pp.download()

In [10]:
train_data = load_file('/home/sundesh/Documents/git/sarcasm/train_data')
valid_data = load_file('/home/sundesh/Documents/git/sarcasm/valid_data')
test_data = load_file('/home/sundesh/Documents/git/sarcasm/test_data')
image_top = load_file('/home/sundesh/Documents/git/sarcasm/image_top')

# # processing dataset for batch compatability
# train_dataset = pp.prepare_dataset('train.csv')
# valid_dataset = pp.prepare_dataset('valid.csv')
# test_dataset = pp.prepare_dataset('test.csv')

In [12]:
train_text = [str(item['text']) for item in train_data.values()]
valid_text = [str(item['text']) for item in valid_data.values()]
test_text = [str(item['text']) for item in test_data.values()]
top_text = [ " ".join(item) for item in image_top.values()]

train_labels = [int(item['label']) for item in train_data.values()]
valid_labels = [int(item['label']) for item in valid_data.values()]
test_labels = [int(item['label']) for item in test_data.values()]

In [13]:
train_id = [str(item) for item in train_data.keys()]
valid_id = [str(item) for item in valid_data.keys()]
test_id = [str(item) for item in test_data.keys()]
top_id = [ str(item) for item in image_top.keys()]

In [15]:
# train_text, train_labels, train_id = shuffle(train_text, train_labels,train_id)
# valid_text, valid_labels, valid_id = shuffle(valid_text, valid_labels,valid_id)
# test_text, test_labels, test_id = shuffle(test_text, test_labels,test_id)

In [16]:
print(len(train_labels), len(test_labels), len(valid_labels))

19816 2409 2410


### minimizing dataset for test run

In [17]:
train_text = train_text[:500]
train_labels = train_labels[:500]
train_id = train_id[:500]

valid_text = valid_text[:250]
valid_labels = valid_labels[:250]
valid_id = valid_id[:250]

test_text = test_text[:250]
test_labels = test_labels[:250]
test_id = test_id[:250]

In [18]:
train_text = load_file("/home/sundesh/Documents/git/sarcasm/temp/train_text")
train_labels = load_file("/home/sundesh/Documents/git/sarcasm/temp/train_labels")
train_id = load_file("/home/sundesh/Documents/git/sarcasm/temp/train_id")

# valid_text = load_file("./temp/valid_text")
# valid_labels = load_file("./temp/valid_labels")
# valid_id = load_file("./temp/valid_id")

# test_text = load_file("./temp/test_text")
# test_labels = load_file("./temp/test_labels")
# test_id = load_file("./temp/test_id")

# Preprocess text

In [5]:
import torch
from transformers import BertTokenizer, BertConfig
config = BertConfig()
# for more information on what's happening under the hood
import logging
logging.basicConfig(level=logging.INFO)

# Dataset Loader

In [6]:
from PIL import Image
from torchvision import transforms
transform_pipe = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

In [7]:
class TwitterDataset(torch.utils.data.Dataset):
    """
    This is our custom dataset class which will load the images, perform transforms on them,
    and load their corresponding labels.
    """
    def __init__(self, img_dir, labels, text, filenames, transform=None):
        """
        img_dir = dir in which images are located
        labels = list containing true (0/1) values
        text = list containing all the texts
        filenames = list containing the names of images
        transform = transformer to preprocess images
        """
        self.img_dir = img_dir
        self.labels = labels
        self.text = text
        self.filenames = filenames
        self.transform = transform
    
    def __getitem__(self, idx):
        try:
            img_path = os.path.join(
                self.img_dir,
                "{}.jpg".format(self.filenames[idx])
            )
        except Exception as e: 
            print(e)
                
        img = Image.open(img_path)
        
        if self.transform:
            img = self.transform(img)
        
        sample = {
            "image": img,
        }
        
        text = self.text[idx]
        text = '[CLS] ' + text + ' [SEP]'
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        tokenized_text = tokenizer.tokenize(text)
        indexed_tokens = tokenizer.convert_tokens_to_ids(tokenized_text)
        while len(indexed_tokens) < 70:
            indexed_tokens.append(0)
        tokens_tensor = torch.tensor(indexed_tokens)
        
        try:
            sample["label"] = self.labels[idx]
            sample["token"] = tokens_tensor # torch.Size([batch_size, 512])
        except Exception as e:
            print(e)
        
        return sample
    
    def __len__(self):
        return len(self.labels)
        

In [8]:
train_data_object = TwitterDataset(
    img_dir="./dataset_image/",
    labels = train_labels,
    text = train_text,
    filenames = train_id,
    transform=transform_pipe
)

In [9]:
train_loader = torch.utils.data.DataLoader(
    train_data_object,
    batch_size=1,
    )

# MODEL

In [1]:
import FiLMedNet
import FiLMGen

ModuleNotFoundError: No module named 'FiLMedNet'