In [1]:
import time
from utils import preprocess
from utils import learning_helper

In [2]:
# NOTE: despite its name, `data_dir` holds the path of a single annotation JSON file.
data_dir = 'data/annotations/new_annot.json'
# Passed to preprocess.load_data below and reused later in the saved model's filename.
mode = 'anchor_text_only'

# Load the annotated instances for the selected mode.
instances = preprocess.load_data(data_dir, mode)

In [3]:
# Time the BERT step; `instances` is rebound to whatever add_bert_output returns.
# NOTE(review): re-running this cell feeds the already-processed instances back in —
# confirm add_bert_output tolerates that.
start_time = time.time()
instances = preprocess.add_bert_output(instances, anchor_only=True)
end_time = time.time()
# epoch_time turns the two timestamps into the (minutes, seconds) pair printed below.
elapsed_mins, elapsed_secs = learning_helper.epoch_time(start_time, end_time)
print(f"Time spent for BERT: {elapsed_mins}m {elapsed_secs}s")

Time spent for BERT: 0m 40s


In [4]:
# Time the VGG step; `instances` is rebound to whatever add_vgg_output returns.
# NOTE(review): re-running this cell feeds the already-processed instances back in —
# confirm add_vgg_output tolerates that.
start_time = time.time()
instances = preprocess.add_vgg_output(instances, anchor_only=True)
end_time = time.time()
# epoch_time turns the two timestamps into the (minutes, seconds) pair printed below.
elapsed_mins, elapsed_secs = learning_helper.epoch_time(start_time, end_time)
print(f"Time spent for VGG: {elapsed_mins}m {elapsed_secs}s")



Time spent for VGG: 1m 13s


In [5]:
# Partition the instances into train/dev/test subsets.
train_instances, dev_instances, test_instances = preprocess.split_instances(instances)
# Wrap each subset in a loader; all three are built with batch_size=16.
train_loader, dev_loader, test_loader = preprocess.get_data_loader(train_instances, dev_instances, test_instances, batch_size=16)

In [7]:
import torch
from model.anchor_text_only import AnchorTextOnlyModel

# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Instantiate the classifier with its default arguments and move it to that device.
model = AnchorTextOnlyModel().to(device)

In [8]:
# Bare expression: the notebook shows the model's repr (its layer summary) as the cell output.
model

AnchorTextOnlyModel(
  (fc1): Linear(in_features=768, out_features=512, bias=True)
  (fc2): Linear(in_features=512, out_features=2, bias=True)
  (dropout): Dropout(p=0.2, inplace=False)
)

In [11]:
import torch
import torch.nn as nn
import torch.optim as optim

# Adam over all model parameters with a fixed learning rate of 1e-4.
optimizer = optim.Adam(model.parameters(), lr=1e-04)
criterion = nn.CrossEntropyLoss().to(device)

# Label <-> class-index mappings; label_to_idx is handed to the train/evaluate helpers.
label_to_idx = {'Yes': 1, 'No': 0}
idx_to_label = {1: 'Yes', 0: 'No'}  # not used in this cell
    
# Early stopping: train for at most num_epochs, stopping once the dev loss has
# failed to reach a new minimum for `patience` epochs in a row.
num_epochs = 100
patience = 5
best_valid_loss = float('inf')
check_stopping = 0  # consecutive epochs without a new best dev loss
model_name = f'retrained_{mode}_classifier.pkl'
for i in range(num_epochs):

    start_time = time.time()
    train_loss, train_acc = learning_helper.train(model, train_loader, optimizer, criterion, device, label_to_idx)
    dev_loss, dev_acc = learning_helper.evaluate(model, dev_loader, criterion, device, label_to_idx)
    end_time = time.time()

    elapsed_mins, elapsed_secs = learning_helper.epoch_time(start_time, end_time)

    # NOTE(review): the recorded output shows the same loss (0.04) and accuracy (0.68),
    # to two decimals, for train and dev in every epoch — worth confirming the model
    # is actually learning rather than predicting a single class.
    print("-" * 60)
    print(f"Epoch: {i+1} || Epoch Time: {elapsed_mins}m {elapsed_secs}s")
    print(f"Epoch: {i+1} || Train loss: {train_loss:.02f}, Train Acc: {train_acc:.02f}")
    print(f"Epoch: {i+1} || Dev loss: {dev_loss:.02f}, Dev Acc: {dev_acc:.02f}")

    # check if we need to save the model
    if dev_loss < best_valid_loss:
        # Strictly better dev loss: reset the counter and overwrite the checkpoint.
        check_stopping = 0
        best_valid_loss = dev_loss
        # Saves the whole model object rather than a state_dict.
        # NOTE(review): loading it back presumably needs the model class importable
        # under the same module path — confirm this is intended.
        torch.save(model, model_name)
    else:
        check_stopping += 1
        print(f"The loss on development set does not decrease")
        if check_stopping == patience:
            print("The loss on development set does not decrease, stop training!")
            break

------------------------------------------------------------
Epoch: 1 || Epoch Time: 0m 0s
Epoch: 1 || Train loss: 0.04, Train Acc: 0.68
Epoch: 1 || Dev loss: 0.04, Dev Acc: 0.68
------------------------------------------------------------
Epoch: 2 || Epoch Time: 0m 0s
Epoch: 2 || Train loss: 0.04, Train Acc: 0.68
Epoch: 2 || Dev loss: 0.04, Dev Acc: 0.68
------------------------------------------------------------
Epoch: 3 || Epoch Time: 0m 0s
Epoch: 3 || Train loss: 0.04, Train Acc: 0.68
Epoch: 3 || Dev loss: 0.04, Dev Acc: 0.68
------------------------------------------------------------
Epoch: 4 || Epoch Time: 0m 0s
Epoch: 4 || Train loss: 0.04, Train Acc: 0.68
Epoch: 4 || Dev loss: 0.04, Dev Acc: 0.68
------------------------------------------------------------
Epoch: 5 || Epoch Time: 0m 0s
Epoch: 5 || Train loss: 0.04, Train Acc: 0.68
Epoch: 5 || Dev loss: 0.04, Dev Acc: 0.68
------------------------------------------------------------
Epoch: 6 || Epoch Time: 0m 0s
Epoch: 6 || Tr

In [11]:
# Print the installed PyTorch version string.
print(torch.__version__)

1.6.0+cu101


## Add more information to the annotation file

In [36]:
import json
import os
import csv
from collections import defaultdict

# Inputs of the annotation-enrichment step.
annotation_filepath = 'data/annotations/annotation_context.json'
original_batch_filepath = 'batch_6540.csv'
# Root folder that contains the json_files/ and image_files/ trees read by add_info.
data_dir = 'data'

# Index the rows of the original batch CSV by their instance_id.
# A defaultdict is used, so looking up an unknown id yields (and stores) an empty dict.
lookup_dict = defaultdict(dict)
# newline='' is what the csv module requires of the file object: without it,
# line breaks embedded inside quoted fields are not interpreted correctly.
with open(original_batch_filepath, 'r', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        lookup_dict[row['instance_id']] = row
        
def add_info(lookup_dict, annotation_filepath, data_root=None):
    """Merge every annotation record with its row from the original batch.

    `annotation_filepath` is read as JSON Lines (one JSON object per line) and
    one output dict is built per record:

    * keys starting with "Input." lose that prefix, except those ending in
      "url", which are dropped at this stage; all other keys are copied as-is;
    * for every key of the matching `lookup_dict` row that ends in "url", the
      tweet JSON path (`<prefix>_jsonpath`), the tweet's `full_text`
      (`<prefix>_tweettext`), the URL itself and, only when the image file
      exists on disk, `<prefix>_imagepath` are added, where `<prefix>` is the
      part of the key before its first underscore;
    * every key of that row ending in "timestamp" is copied unchanged.

    Args:
        lookup_dict: mapping from instance id to the original batch row (a dict).
        annotation_filepath: path of the JSON Lines annotation file.
        data_root: folder containing the `json_files/` and `image_files/`
            trees. When omitted, the module-level `data_dir` is used, which
            is the behaviour of the original two-argument call.

    Returns:
        A list with one enriched dict per non-blank line, in file order.

    Raises:
        json.JSONDecodeError: if a non-blank line is not valid JSON.
        KeyError: if a record has no 'Input.instance_id', if `lookup_dict` is
            a plain dict without that id, or if a tweet JSON has no 'full_text'.
        OSError: if the annotation file or a referenced tweet JSON cannot be opened.
    """
    if data_root is None:
        data_root = data_dir

    instances = []
    with open(annotation_filepath, 'r') as jsonfile:
        # Iterate the file line by line instead of `read().split("\n")[:-1]`:
        # the slice silently dropped the last record whenever the file did not
        # end with a newline. Blank lines are skipped.
        for line in jsonfile:
            if not line.strip():
                continue
            temp_instance = json.loads(line)

            # remove "Input." in the keys (the Input.*url keys are not copied)
            instance = {}
            for key, value in temp_instance.items():
                if not key.startswith("Input."):
                    instance[key] = value
                elif not key.endswith("url"):
                    instance[key.split(".")[-1]] = value

            # add image filepath, json filepath, screenshot url, tweet_text, and timestamp
            original_dict = lookup_dict[temp_instance['Input.instance_id']]
            for key, value in original_dict.items():
                if key.endswith("url"):
                    filename = value.split("/")[-1]
                    tweet_id = filename.split("_")[-1].split(".")[0]
                    # Files live in a sub-folder named after the first two
                    # underscore-separated parts of the URL's file name.
                    subfolder = '_'.join(filename.split("_")[:2])
                    prefix = key.split("_")[0]
                    # Anchor tweets are stored with an "anchor_" file-name prefix.
                    stem = f"anchor_{tweet_id}" if 'anchor' in key else tweet_id

                    # add json filepath
                    jsonpath = os.path.join(data_root, 'json_files', subfolder, f"{stem}.json")
                    instance[prefix + "_jsonpath"] = jsonpath

                    # add tweet text
                    with open(jsonpath, 'r') as tweetfile:
                        tweet = json.load(tweetfile)
                    instance[prefix + "_tweettext"] = tweet['full_text']

                    # keep the url, and add image filepath if image exists
                    instance[key] = value
                    imagepath = os.path.join(data_root, 'image_files', subfolder, f"{stem}.jpg")
                    if os.path.isfile(imagepath):
                        instance[prefix + "_imagepath"] = imagepath

                if key.endswith("timestamp"):
                    instance[key] = value
            instances.append(instance)
    return instances

# Enrich every annotation record, then persist the result as JSON Lines:
# one serialized instance per line, each terminated by a newline.
new_annot_filename = 'new_annot.json'
instances = add_info(lookup_dict, annotation_filepath)

with open(new_annot_filename, 'w') as newjson:
    for instance in instances:
        newjson.write(json.dumps(instance) + "\n")

## Copy the JSON and image files to the loctmp2 folder

In [77]:
import re
import json
import os
from shutil import copyfile

# Root of the original per-event tweet dump.
# NOTE(review): machine-specific absolute path — point it at your own copy before running.
original_folder = '/media/zhaomin/Zhaomin_SSD/project_repo/emnlp2021/saved_tweets_original'
data_dir = 'data'

# Index the rows of the original batch CSV by their instance_id.
original_batch_filepath = 'batch_6540.csv'
lookup_dict = defaultdict(dict)
# newline='' is what the csv module requires so that line breaks inside quoted
# fields are parsed correctly.
with open(original_batch_filepath, 'r', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        lookup_dict[row['instance_id']] = row

# Nothing is written under original_folder, so list the event folders once
# instead of once per annotation record.
original_events = os.listdir(original_folder)

annotation_filepath = 'data/annotations/annotation_context.json'
with open(annotation_filepath, 'r') as jsonfile:
    # Iterate line by line: `read().split("\n")[:-1]` dropped the last record
    # when the file had no trailing newline. Blank lines are skipped.
    for line in jsonfile:
        if not line.strip():
            continue
        instance = json.loads(line)
        instance_id = instance['Input.instance_id']

        # create the destination folders (and any missing parents) if needed
        json_dst_dir = os.path.join(data_dir, 'json_files', instance_id)
        image_dst_dir = os.path.join(data_dir, 'image_files', instance_id)
        os.makedirs(json_dst_dir, exist_ok=True)
        os.makedirs(image_dst_dir, exist_ok=True)

        # The first "_"-separated part of the instance id is split at its first
        # run of digits into (event name, year); raw string avoids the invalid
        # "\d" escape of a plain literal.
        original_dict = lookup_dict[instance_id]
        id_parts = re.split(r'(\d+)', instance_id.split("_")[0])
        event_name = id_parts[0]
        event_year = id_parts[1] if len(id_parts) > 1 else None

        # find event path; reset per instance so a failed lookup can never
        # silently reuse the folder found for the previous instance
        original_event_path = None
        for original_event in original_events:
            folder_parts = original_event.split("_")
            if len(folder_parts) < 2:
                # not an "<event>_<year>-..." folder (e.g. a stray file)
                continue
            # make sure the event and year are matched
            if folder_parts[0] == event_name and folder_parts[1].split("-")[0] == event_year:
                original_event_path = os.path.join(
                    original_folder,
                    original_event,
                    f"final_tweet_folder_{original_event}",
                    instance_id.split("_")[-1],
                )
                break

        for key, value in original_dict.items():
            if not key.endswith("url"):
                continue
            if original_event_path is None:
                raise FileNotFoundError(
                    f"no folder in {original_folder} matches the event/year of instance {instance_id}"
                )

            if 'anchor' in key:
                real_instance_id = value.split("/")[-1].split("_")[1]
                src_jsonfilename = f"anchor_{real_instance_id}.json"
                src_imagefilename = f"anchor_{real_instance_id}.jpg"
                copyfile(os.path.join(original_event_path, src_jsonfilename),
                         os.path.join(json_dst_dir, src_jsonfilename))
                # NOTE(review): unlike the other tweets, the anchor image is copied
                # unconditionally, so a missing anchor image raises — confirm that
                # every anchor tweet has one.
                copyfile(os.path.join(original_event_path, src_imagefilename),
                         os.path.join(image_dst_dir, src_imagefilename))

            else:
                tweet_id = value.split("/")[-1].split(".")[0].split("_")[-1]

                # copy json file
                src_jsonfilename = f"{tweet_id}.json"
                copyfile(os.path.join(original_event_path, src_jsonfilename),
                         os.path.join(json_dst_dir, src_jsonfilename))

                # copy image file if it exists
                src_imagefilename = f"{tweet_id}.jpg"
                src_imagefilepath = os.path.join(original_event_path, src_imagefilename)
                if os.path.isfile(src_imagefilepath):
                    copyfile(src_imagefilepath, os.path.join(image_dst_dir, src_imagefilename))

## Save the train/dev/test split for replication

In [None]:
import json
from sklearn.model_selection import train_test_split

# Fixed seed so that re-running this cell regenerates the same split instead of
# overwriting the saved one with a different random partition.
split_seed = 42

X = instances
y = [x['adjudicated_label'] for x in instances]
# 80/20 train/test, then 80/20 train/dev on the remaining part
# (64% / 16% / 20% overall); both splits are stratified by the adjudicated label.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.20, stratify=y, random_state=split_seed)
X_train, X_dev, y_train, y_dev = train_test_split(
    X_train, y_train, test_size=0.20, stratify=y_train, random_state=split_seed)

# Only the instance ids are stored; the split is re-applied by id.
split = {'train': [x['instance_id'] for x in X_train],
         'dev': [x['instance_id'] for x in X_dev],
         'test': [x['instance_id'] for x in X_test]}

with open("saved_split", 'w') as splitfile:
    json.dump(split, splitfile)