In [1]:
from utils import preprocess

In [2]:
# NOTE: despite its name, data_dir holds the path of the annotation JSON file,
# not a directory.
data_dir = 'data/annotations/new_annot.json'
# load_data lives in utils.preprocess (not shown in this notebook); the result
# is indexed like a list of per-instance dictionaries in the cells below.
instances = preprocess.load_data(data_dir)

In [3]:
# Display the first loaded instance to check which keys are available.
instances[0]

{'anchor_location': 'Atlanta',
 'instance_id': 'thanksgiving2019_1199840018380001281',
 'event': 'thanksgiving2019',
 'Answer.Q1_A2ZJS73XSSMRTD': 'Yes',
 'Answer.Q2_A2ZJS73XSSMRTD': '4',
 'adjudicated_label': 'Yes',
 'anchor_timestamp': 'Wed Nov 27 23:58:46 +0000 2019',
 'anchor_jsonpath': 'data/json_files/thanksgiving2019_1199840018380001281/anchor_1199840018380001281.json',
 'anchor_tweettext': 'Just a sampling of our Thanksgiving Eve feast. Special shout-out to @FOX5ATLCallaway and @Elepo for organizing it!!!!!! @FOX5Atlanta https://t.co/w6k2p0c46u',
 'anchor_url': 'http://www.cse.unt.edu/~blanco/screenshot/thanksgiving2019_1199840018380001281_anchor_1199840018380001281.png',
 'anchor_imagepath': 'data/image_files/thanksgiving2019_1199840018380001281/anchor_1199840018380001281.jpg',
 'context8_jsonpath': 'data/json_files/thanksgiving2019_1199840018380001281/1199503490923536389.json',
 'context8_tweettext': '@aungeliquefox5 Yes! And so generous !',
 'context8_url': 'http://www.cse.un

In [4]:
# add_bert_output is defined in utils.preprocess (not shown here); presumably it
# attaches BERT outputs for the tweet texts to every instance — TODO confirm.
instances = preprocess.add_bert_output(instances, anchor_only=False)
# Display the first instance again to inspect the result.
instances[0]

In [13]:
import torchvision.models as models
import torch
from PIL import Image
from torchvision import transforms

def add_vgg_output(instances, anchor_only):
    """
    Add the image representation (from VGG16) into each instance dictionary.

    The anchor image of every instance (key 'anchor_imagepath') is encoded and
    stored under 'anchor_vggoutput'. Unless anchor_only is true, every context
    image whose 'context{i}_imagepath' key is present (i in 8..13) is encoded
    as well and stored under 'context{i}_vggoutput'.

    NOTE(review): the complete torchvision VGG16 is applied, so the stored
    tensor is the output of the whole network (final classifier layer
    included), not of a truncated model — confirm this is the intended
    representation.

    :param instances: a list of instance dictionaries; they are updated in place
    :param anchor_only: if true, only the anchor image of each instance is encoded
    :return: the same list of instances, now carrying the VGG16 outputs
    """
    # Use the GPU, if available, to get the image representation
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # get the pretrained VGG16 in inference mode with frozen parameters
    vgg16_model = models.vgg16(pretrained=True)
    for p in vgg16_model.parameters():
        p.requires_grad = False
    vgg16_model = vgg16_model.to(device)
    vgg16_model.eval()

    # The preprocessing pipeline is identical for every image, so build it once.
    image_transform = transforms.Compose([
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
    ])

    def encode_image(filepath):
        """Run one image file through VGG16 and return the output as a CPU tensor."""
        # convert the image into RGB format if it is not RGB; convert() returns a
        # new image, so the file handle can be released right away
        with Image.open(filepath) as image_file:
            input_image = image_file.convert('RGB')

        # unsqueeze(0) adds the batch dimension expected by the model
        input_tensor = image_transform(input_image).unsqueeze(0).to(device)

        with torch.no_grad():
            # move the tensor to the CPU to free GPU memory
            return vgg16_model(input_tensor).to('cpu')

    for instance in instances:

        # the anchor image is always encoded
        instance['anchor_vggoutput'] = encode_image(instance['anchor_imagepath'])

        if anchor_only:
            continue

        # context tweets are numbered 8..13; only those with an image are encoded
        for i in range(8, 14):
            imagekey = f"context{i}_imagepath"
            if imagekey in instance:
                instance[f"context{i}_vggoutput"] = encode_image(instance[imagekey])

    return instances

In [None]:
# Encode the anchor image and every available context image of each instance
# with the add_vgg_output function defined in this notebook.
instances = add_vgg_output(instances, anchor_only=False)

In [None]:
# Display the first instance to inspect the added '*_vggoutput' entries.
instances[0]

In [None]:
# Same call, this time through utils.preprocess; presumably the module version
# of the add_vgg_output function defined in this notebook — TODO confirm.
instances = preprocess.add_vgg_output(instances, anchor_only=False)
# Display the first instance to inspect the result.
instances[0]

## Add more info to the annotation file

In [36]:
import json
import os
import csv
from collections import defaultdict

# Input locations: the annotation file (one JSON object per line), the original
# batch CSV, and the root folder holding the per-instance tweet JSON/image files.
annotation_filepath = 'data/annotations/annotation_context.json'
original_batch_filepath = 'batch_6540.csv'
data_dir = 'data'

# Index the rows of the original batch by their instance id. A defaultdict is
# used so that looking up an unknown instance id yields an empty row.
lookup_dict = defaultdict(dict)
# newline='' is what the csv module requires so that quoted fields containing
# line breaks are parsed correctly.
with open(original_batch_filepath, 'r', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        lookup_dict[row['instance_id']] = row
        
def add_info(lookup_dict, annotation_filepath):
    """
    Merge every annotated instance with the file paths, tweet texts and
    timestamps derived from its row in the original batch.

    For each non-empty JSON line of the annotation file:
      * keys starting with "Input." are renamed to the part after the last
        "."; "Input.*url" keys are dropped (the url columns are taken from the
        batch row instead); all other keys are copied unchanged;
      * for every "*url" column of the batch row the instance gets
        "<prefix>_jsonpath", "<prefix>_tweettext" (the 'full_text' field of the
        tweet JSON file), the url itself and, only if the image file exists,
        "<prefix>_imagepath", where <prefix> is the column name up to its
        first "_";
      * every "*timestamp" column of the batch row is copied unchanged.

    File paths are built below the module-level ``data_dir``.

    :param lookup_dict: mapping from instance id to its row (a dict) of the
        original batch
    :param annotation_filepath: path of the annotation file, one JSON object
        per line
    :return: a list with one merged dictionary per annotated instance
    :raises OSError: if a tweet JSON file referenced by a url column cannot be
        opened
    """
    instances = []
    with open(annotation_filepath, 'r') as jsonfile:
        # Iterate over the file line by line and skip blank lines, so the last
        # record is kept whether or not the file ends with a newline.
        for line in jsonfile:
            line = line.strip()
            if not line:
                continue

            instance = {}
            temp_instance = json.loads(line)
            # remove "Input." in the keys
            for key, value in temp_instance.items():
                if key.startswith("Input."):
                    if not key.endswith("url"):
                        newkey = key.split(".")[-1]
                        instance[newkey] = value
                else:
                    instance[key] = value

            # add image filepath, json filepath, screenshot url, tweet_text, and timestamp
            original_dict = lookup_dict[temp_instance['Input.instance_id']]
            for key, value in original_dict.items():
                if key.endswith("url"):
                    # the last path component of the url is split on "_": its
                    # first two parts name the instance folder, its last part
                    # (minus the extension) is the tweet id
                    screenshot_name = value.split("/")[-1]
                    tweet_id = screenshot_name.split("_")[-1].split(".")[0]
                    instance_folder = '_'.join(screenshot_name.split("_")[:2])
                    # anchor files carry an "anchor_" prefix, context files do not
                    file_stem = f"anchor_{tweet_id}" if 'anchor' in key else f"{tweet_id}"
                    key_prefix = key.split("_")[0]

                    # add json filepath
                    jsonpath = os.path.join(data_dir, 'json_files', instance_folder, f"{file_stem}.json")
                    instance[key_prefix + "_jsonpath"] = jsonpath

                    # add tweet text
                    with open(jsonpath, 'r') as tweetfile:
                        tweet = json.loads(tweetfile.read())
                    instance[key_prefix + "_tweettext"] = tweet['full_text']

                    # add the screenshot url itself
                    instance[key] = value

                    # add image filepath if image exists
                    imagepath = os.path.join(data_dir, 'image_files', instance_folder, f"{file_stem}.jpg")
                    if os.path.isfile(imagepath):
                        instance[key_prefix + "_imagepath"] = imagepath

                if key.endswith("timestamp"):
                    instance[key] = value
            instances.append(instance)
    return instances

# Build the enriched instances from the batch lookup and the annotation file.
instances = add_info(lookup_dict, annotation_filepath)

# Persist them as JSON lines: one serialized instance per line.
new_annot_filename = 'new_annot.json'
with open(new_annot_filename, 'w') as newjson:
    for instance in instances:
        # print appends the line break after each serialized instance
        print(json.dumps(instance), file=newjson)

## Copy the JSON and image files to the loctmp2 folder

In [77]:
import re
import json
import os
from shutil import copyfile

# Source of the saved tweets and destination root for the copies.
# NOTE(review): original_folder is an absolute, machine-specific path.
original_folder = '/media/zhaomin/Zhaomin_SSD/project_repo/emnlp2021/saved_tweets_original'
data_dir = 'data'

# Index the rows of the original batch by their instance id.
original_batch_filepath = 'batch_6540.csv'
lookup_dict = defaultdict(dict)
# newline='' is what the csv module requires so that quoted fields containing
# line breaks are parsed correctly.
with open(original_batch_filepath, 'r', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        lookup_dict[row['instance_id']] = row

# The source event folders are the same for every instance, so list them once.
original_events = os.listdir(original_folder)

annotation_filepath = 'data/annotations/annotation_context.json'
with open(annotation_filepath, 'r') as jsonfile:
    # Iterate line by line and skip blank lines, so the last record is handled
    # whether or not the file ends with a newline.
    for line in jsonfile:
        line = line.strip()
        if not line:
            continue
        instance = json.loads(line)
        instance_id = instance['Input.instance_id']

        # create the destination folders (and missing parents) if they do not exist
        json_dst_dir = os.path.join(data_dir, 'json_files', instance_id)
        image_dst_dir = os.path.join(data_dir, 'image_files', instance_id)
        os.makedirs(json_dst_dir, exist_ok=True)
        os.makedirs(image_dst_dir, exist_ok=True)

        # split the first part of the instance id into event name and year,
        # e.g. "thanksgiving2019" -> "thanksgiving", "2019"
        original_dict = lookup_dict[instance_id]
        event_parts = re.split(r'(\d+)', instance_id.split("_")[0])
        event_name = event_parts[0]
        event_year = event_parts[1] if len(event_parts) > 1 else None

        # find event path; reset for every instance so that a path found for a
        # previous instance is never reused by accident
        original_event_path = None
        for original_event in original_events:
            folder_parts = original_event.split("_")
            if len(folder_parts) < 2:
                continue

            # make sure the event and year are matched
            if folder_parts[0] == event_name and folder_parts[1].split("-")[0] == event_year:
                original_event_path = os.path.join(
                    original_folder,
                    original_event,
                    f"final_tweet_folder_{original_event}",
                    instance_id.split("_")[-1],
                )
                break

        if original_event_path is None:
            raise FileNotFoundError(
                f"no event folder under {original_folder} matches instance {instance_id}"
            )

        for key, value in original_dict.items():
            if not key.endswith("url"):
                continue

            screenshot_name = value.split("/")[-1]
            is_anchor = 'anchor' in key
            if is_anchor:
                # anchor files are named after the second "_"-separated part of
                # the screenshot name and carry an "anchor_" prefix
                real_instance_id = screenshot_name.split("_")[1]
                file_stem = f"anchor_{real_instance_id}"
            else:
                # context files are named after the tweet id, the last
                # "_"-separated part of the screenshot name without extension
                file_stem = screenshot_name.split(".")[0].split("_")[-1]

            # copy json file
            src_jsonfilename = f"{file_stem}.json"
            src_jsonfilepath = os.path.join(original_event_path, src_jsonfilename)
            dst_jsonfilepath = os.path.join(json_dst_dir, src_jsonfilename)
            copyfile(src_jsonfilepath, dst_jsonfilepath)

            # copy image file: always for the anchor (a missing anchor image is
            # an error), for context tweets only if the image exists
            src_imagefilename = f"{file_stem}.jpg"
            src_imagefilepath = os.path.join(original_event_path, src_imagefilename)
            if is_anchor or os.path.isfile(src_imagefilepath):
                dst_imagefilepath = os.path.join(image_dst_dir, src_imagefilename)
                copyfile(src_imagefilepath, dst_imagefilepath)