In [1]:
import time
from utils import preprocess
from utils import learning_helper

In [2]:
# Location of the annotation file handed to the project loader
# (despite the name, this is a file path, not a directory).
data_dir = 'data/annotations/new_annot.json'
# Mode string forwarded to preprocess.load_data; it is reused further down to
# build the checkpoint filename.
mode = 'anchor_text_image'

# Load the annotated instances through the project helper.
instances = preprocess.load_data(data_dir, mode)

In [3]:
# Time the BERT feature extraction step.
start_time = time.time()
# anchor_only=True is forwarded to the helper
# (presumably encodes only the anchor tweet -- confirm in utils.preprocess).
instances = preprocess.add_bert_output(instances, anchor_only=True)
end_time = time.time()
# epoch_time's result is unpacked as whole minutes and remaining seconds.
elapsed_mins, elapsed_secs = learning_helper.epoch_time(start_time, end_time)
print(f"Time spent for BERT: {elapsed_mins}m {elapsed_secs}s")

Time spent for BERT: 0m 41s


In [4]:
# Time the VGG feature extraction step.
start_time = time.time()
# anchor_only=True is forwarded to the helper
# (presumably only the anchor image is encoded -- confirm in utils.preprocess).
instances = preprocess.add_vgg_output(instances, anchor_only=True)
end_time = time.time()
# epoch_time's result is unpacked as whole minutes and remaining seconds.
elapsed_mins, elapsed_secs = learning_helper.epoch_time(start_time, end_time)
print(f"Time spent for VGG: {elapsed_mins}m {elapsed_secs}s")



Time spent for VGG: 1m 17s


In [5]:
# Partition the instances into train / dev / test subsets (the split policy
# lives in utils.preprocess and is not visible here).
train_instances, dev_instances, test_instances = preprocess.split_instances(instances)
# Build one loader per subset; batch_size=16 is forwarded to the helper.
train_loader, dev_loader, test_loader = preprocess.get_data_loader(train_instances, dev_instances, test_instances, batch_size=16)

In [24]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from utils import evaluator
from sklearn.metrics import classification_report
from model.anchor_text_image import AnchorTextImageModel


# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# define the label mapping (string label <-> class index)
label_to_idx = {'Yes': 1, 'No': 0}
idx_to_label = {1: 'Yes', 0: 'No'}

# parameter setting
# NOTE(review): of the values below only learning_rate is referenced in this
# cell; the dimensions and dropout rate are not passed to AnchorTextImageModel()
# here -- confirm the model module defines matching values itself.
bert_feat_dim = 768
vgg_feat_dim = 1000
output_dim = 2
hidden_dim = 4096
dropout_rate = 0.2
learning_rate = 1e-03

# build the model and move it to the GPU if a GPU is available
classifier = AnchorTextImageModel()
classifier = classifier.to(device)

# define the optimizer and the loss function
optimizer = optim.Adam(classifier.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss().to(device)

In [25]:
import copy

# Training schedule: at most `num_epochs` epochs, stopping early once the dev
# loss has failed to improve for `patience` consecutive epochs.
num_epochs = 500
patience = 10
best_valid_loss = float('inf')
check_stopping = 0
model_name = f'retrained_{mode}_classifier.pkl'
# In-memory snapshot of the weights that achieved the best dev loss so far, so
# the final test below scores the selected model rather than the last epoch.
best_state_dict = None
for i in range(num_epochs):

    start_time = time.time()
    # Explicitly (re-)enter training mode every epoch so dropout is active even
    # if the evaluation helper left the model in eval mode.
    classifier.train()
    train_loss, train_acc = learning_helper.train(classifier, train_loader, optimizer, criterion, device, label_to_idx)
    dev_loss, dev_acc = learning_helper.evaluate(classifier, dev_loader, criterion, device, label_to_idx)
    end_time = time.time()

    elapsed_mins, elapsed_secs = learning_helper.epoch_time(start_time, end_time)

    print("-" * 60)
    print(f"Epoch: {i+1} || Epoch Time: {elapsed_mins}m {elapsed_secs}s")
    print(f"Epoch: {i+1} || Train loss: {train_loss:.02f}, Train Acc: {train_acc:.02f}")
    print(f"Epoch: {i+1} || Dev loss: {dev_loss:.02f}, Dev Acc: {dev_acc:.02f}")

    # check if we need to save the model
    if dev_loss < best_valid_loss:
        check_stopping = 0
        best_valid_loss = dev_loss
        # The on-disk checkpoint keeps its original format (whole pickled module).
        torch.save(classifier, model_name)
        best_state_dict = copy.deepcopy(classifier.state_dict())
    else:
        check_stopping += 1
        print("The loss on development set does not decrease")
        # `>=` so the loop still terminates if the counter ever skips `patience`.
        if check_stopping >= patience:
            print("The loss on development set does not decrease, stop training!")
            break

# Report test performance of the best-dev checkpoint, not of whatever weights
# the last (possibly worse) epoch left behind.
if best_state_dict is not None:
    classifier.load_state_dict(best_state_dict)
classifier.eval()
pred_labels = evaluator.test_model(classifier, test_loader, idx_to_label, device)
gold_labels = [x['adjudicated_label'] for x in test_instances]
print('-' * 60)
print(classification_report(gold_labels, pred_labels))

------------------------------------------------------------
Epoch: 1 || Epoch Time: 0m 3s
Epoch: 1 || Train loss: 0.10, Train Acc: 0.62
Epoch: 1 || Dev loss: 0.04, Dev Acc: 0.67
------------------------------------------------------------
Epoch: 2 || Epoch Time: 0m 3s
Epoch: 2 || Train loss: 0.04, Train Acc: 0.67
Epoch: 2 || Dev loss: 0.04, Dev Acc: 0.68
------------------------------------------------------------
Epoch: 3 || Epoch Time: 0m 3s
Epoch: 3 || Train loss: 0.04, Train Acc: 0.66
Epoch: 3 || Dev loss: 0.04, Dev Acc: 0.68
The loss on development set does not decrease
------------------------------------------------------------
Epoch: 4 || Epoch Time: 0m 3s
Epoch: 4 || Train loss: 0.04, Train Acc: 0.68
Epoch: 4 || Dev loss: 0.04, Dev Acc: 0.68
The loss on development set does not decrease
------------------------------------------------------------
Epoch: 5 || Epoch Time: 0m 3s
Epoch: 5 || Train loss: 0.04, Train Acc: 0.68
Epoch: 5 || Dev loss: 0.04, Dev Acc: 0.68
The loss on d

## complicated_nn

In [1]:
import time
import os
from utils import preprocess
from utils import learning_helper

# Location of the annotation file handed to the project loader
# (despite the name, this is a file path, not a directory).
data_dir = 'data/annotations/new_annot.json'
# Mode string forwarded to preprocess.load_data; it is reused further down to
# build the checkpoint filename.
mode = 'all_bert_only'

# Load the annotated instances through the project helper.
instances = preprocess.load_data(data_dir, mode)

In [2]:
# Time the BERT feature extraction step.
start_time = time.time()
# anchor_only=False is forwarded to the helper
# (presumably every tweet of an instance is encoded -- confirm in utils.preprocess).
instances = preprocess.add_bert_output(instances, anchor_only=False)
end_time = time.time()
# epoch_time's result is unpacked as whole minutes and remaining seconds.
elapsed_mins, elapsed_secs = learning_helper.epoch_time(start_time, end_time)
print(f"Time spent for BERT: {elapsed_mins}m {elapsed_secs}s")

100%|██████████| 3494/3494 [04:19<00:00, 13.46it/s]

Time spent for BERT: 4m 44s





In [3]:
# load MPQA lexicon from data/reference/MPQA_Lexicon via the project helper
mpqa_path = os.path.join('data', 'reference', 'MPQA_Lexicon')
mpqa_lexicon = preprocess.load_mpqa(mpqa_path)

# extract additional features (timed; this is the project implementation in
# utils.preprocess, which takes the lexicon as its second argument)
print("Extracting additional features using SpaCy ...")
start_time = time.time()
instances = preprocess.add_additional_features(instances, mpqa_lexicon)
end_time = time.time()
# epoch_time's result is unpacked as whole minutes and remaining seconds.
elapsed_mins, elapsed_secs = learning_helper.epoch_time(start_time, end_time)
print(f"Time spent for SpaCy preprocessing: {elapsed_mins}m {elapsed_secs}s")

Extracting additional features using SpaCy ...


100%|██████████| 3494/3494 [02:33<00:00, 22.82it/s]


Time spent for SpaCy preprocessing: 2m 34s


In [7]:
# Partition the instances into train / dev / test subsets (the split policy
# lives in utils.preprocess and is not visible here).
train_instances, dev_instances, test_instances = preprocess.split_instances(instances)
# Build one loader per subset; batch_size=16 is forwarded to the helper.
train_loader, dev_loader, test_loader = preprocess.get_data_loader(train_instances, dev_instances, test_instances, batch_size=16)

In [6]:
# Sanity check: print the shape of the first feature batch only, then stop.
for x, y in train_loader:
    print(x.shape)
    break

torch.Size([16, 1, 16058])


In [16]:
# Size of dimension 1 of the anchor tweet's additional-feature tensor for the
# first training instance (used later as the model's additional_feat_dim).
train_instances[0]['anchor_addfeattensor'].shape[1]

1526

In [13]:
import copy
import spacy
import os
import torch
from collections import defaultdict, Counter
from sklearn.feature_extraction import DictVectorizer
from tqdm import tqdm


def add_additional_features(instances):
    """Attach hand-crafted surface features to every tweet of every instance.

    For each key of an instance that ends in ``"tweettext"`` (visited in sorted
    order) the text is parsed with spaCy and a feature dict is stored on the
    instance under ``"<prefix>_addfeat"``, where ``<prefix>`` is the part of the
    key before the first underscore.

    Features per tweet: counts of fully upper-case tokens, URLs (tokens starting
    with ``"http"``), exclamation marks, strongly / weakly subjective MPQA
    words, emoji and tokens, plus the descriptions of up to three most frequent
    emoji (``"no.1_emoji"`` .. ``"no.3_emoji"``).

    Args:
        instances: list of instance dicts; each dict is modified in place.

    Returns:
        tuple ``(instances, feat_dicts)``: the same list object, and a flat list
        with one feature dict per processed tweet, in visiting order (instance
        by instance, keys sorted within an instance).
    """
    mpqa_path = os.path.join('data', 'reference', 'MPQA_Lexicon')
    mpqa_lexicon = preprocess.load_mpqa(mpqa_path)
    # Look the two word collections up once instead of once per token.
    strongsubj_words = mpqa_lexicon['strongsubj']
    weaksubj_words = mpqa_lexicon['weaksubj']

    nlp = spacy.load('en_core_web_sm')
    # The 'emoji' component is expected to provide the `_.emoji`, `_.is_emoji`
    # and `_.emoji_desc` extensions read below.
    nlp.add_pipe('emoji', first=True)

    feat_dicts = []
    for instance in tqdm(instances):
        # Sorted so the order is reproducible; consumers that vectorise
        # `feat_dicts` rely on walking the keys in the same order.
        # The key list is materialised before any key is added to the instance.
        keys = sorted(key for key in instance.keys() if key.endswith("tweettext"))
        for key in keys:
            tweet = nlp(instance[key])
            featkey = key.split("_")[0] + "_addfeat"
            addfeat = {}

            # entirely uppercase words
            addfeat['num_uppercasewords'] = sum(1 for token in tweet if token.text.isupper())

            # the number of URLs
            addfeat['num_urls'] = sum(1 for token in tweet if token.text.startswith("http"))

            # the number of exclamation marks
            addfeat['num_exclamationmarks'] = sum(1 for token in tweet if token.text == '!')

            # the number of strongly subjective words in MPQA lexicon
            addfeat['num_strongsubj'] = sum(1 for token in tweet if token.text in strongsubj_words)

            # the number of weakly subjective words in MPQA lexicon
            addfeat['num_weaksubj'] = sum(1 for token in tweet if token.text in weaksubj_words)

            # the number of emoji
            addfeat['num_emoji'] = len(tweet._.emoji)

            # the three most common emoji (in the form of description)
            emoji_descriptions = [token._.emoji_desc for token in tweet if token._.is_emoji]
            for rank, (description, _count) in enumerate(Counter(emoji_descriptions).most_common(3), start=1):
                addfeat[f"no.{rank}_emoji"] = description

            # the number of tokens
            addfeat['num_tokens'] = len(tweet)

            # TODO: spell-check feature. It needs a spell-check component in the
            # pipeline; without one, reading `tweet._.performed_spellCheck`
            # raises AttributeError, so it is not accessed here.

            instance[featkey] = addfeat
            feat_dicts.append(addfeat)

    # Turning the dicts into tensors is left to the caller.
    return instances, feat_dicts

# Deep-copy before extracting features: add_additional_features writes new keys
# into each instance dict, and list.copy() is shallow, so it would share those
# dicts with `instances` and leak the changes into the original data.
temp_instances = copy.deepcopy(instances)
new_instances, feat_dicts = add_additional_features(temp_instances)





HBox(children=(FloatProgress(value=0.0, description='Downloading', max=435779157.0, style=ProgressStyle(descri…

KeyboardInterrupt: 

In [6]:
def temp(instances, feat_dicts):
    """Vectorise the per-tweet feature dicts and attach them as tensors.

    ``feat_dicts`` must hold one dict per ``*tweettext`` key, ordered instance
    by instance with the keys of each instance in sorted order. Each row of the
    vectorised matrix is stored on its instance under
    ``"<prefix>_addfeattensor"`` as a float tensor of shape ``(1, n_features)``.

    Args:
        instances: list of instance dicts; modified in place.
        feat_dicts: flat list of feature dicts aligned with the tweets as
            described above.

    Returns:
        The same ``instances`` list.

    Raises:
        ValueError: if the number of feature dicts does not match the number of
            tweets found in ``instances``.
    """
    # ensure the order is the same as when the feature dicts were produced
    keys_per_instance = [
        sorted(key for key in instance.keys() if key.endswith("tweettext"))
        for instance in instances
    ]
    num_tweets = sum(len(keys) for keys in keys_per_instance)
    if num_tweets != len(feat_dicts):
        raise ValueError(
            f"expected one feature dict per tweet: {num_tweets} tweets but {len(feat_dicts)} feature dicts"
        )

    dv = DictVectorizer(sparse=False, dtype=int)
    feat_vectorized = dv.fit_transform(feat_dicts)

    # Walk the matrix with a running row offset instead of assuming exactly 7
    # tweets per instance, so rows stay aligned for any tweet count.
    row = 0
    for instance, keys in zip(instances, keys_per_instance):
        for key in keys:
            newfeatkey = key.split("_")[0] + "_addfeattensor"
            instance[newfeatkey] = torch.tensor(feat_vectorized[row], dtype=torch.float32).unsqueeze(0)
            row += 1
    return instances

# Vectorise the extracted feature dicts and attach them to the instances.
aa_ins = temp(new_instances, feat_dicts)
# Sanity check: shape of the anchor tweet's feature tensor for the first instance.
aa_ins[0]['anchor_addfeattensor'].shape

torch.Size([1, 1519])

In [20]:
import torch.nn as nn
import torch.nn.functional as F


# Hyper-parameters; module-level because Test reads them when it is constructed.
bert_feat_dim = 768
output_dim = 2
hidden_dim = 2048
dropout_rate = 0.5


class Test(nn.Module):
    """Feed-forward classifier over the concatenated features of 7 tweets.

    The expected size of the input's last dimension is
    ``(bert_feat_dim + additional_feat_dim) * 7``; the network is
    ``Linear -> ReLU -> Dropout -> Linear`` and returns unnormalised class
    scores of size ``output_dim`` along the last dimension.
    """

    def __init__(self, additional_feat_dim=0):
        """Create the layers.

        Args:
            additional_feat_dim: number of extra (non-BERT) features per tweet.
        """
        super().__init__()
        self.additional_feat_dim = additional_feat_dim
        self.bert_feat_dim = bert_feat_dim
        self.hidden_dim = hidden_dim
        self.output_dim = output_dim
        self.dropout_rate = dropout_rate

        # one BERT vector plus the additional features for each of the 7 tweets
        input_dim = (self.bert_feat_dim + self.additional_feat_dim) * 7
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, feat_combined):
        """Map the combined feature tensor to class scores."""
        hidden = F.relu(self.fc1(feat_combined))
        hidden = self.dropout(hidden)
        return self.fc2(hidden)

In [92]:
# Partition the instances into train / dev / test subsets (the split policy
# lives in utils.preprocess and is not visible here).
train_instances, dev_instances, test_instances = preprocess.split_instances(instances)
# Build one loader per subset; batch_size=16 is forwarded to the helper.
train_loader, dev_loader, test_loader = preprocess.get_data_loader(train_instances, dev_instances, test_instances, batch_size=16)

In [284]:
import torch
import torch.nn as nn
import torch.nn.functional as F

# parameter setting (module-level; read by Test when it is constructed)
bert_feat_dim = 768
lstm_dim = 512
output_dim = 2
hidden_dim = 512
dropout_rate = 0.5


class Test(nn.Module):
    """BiLSTM-over-context classifier with a separate anchor pathway.

    Expected input: a 2-D tensor ``(batch, features)`` whose columns are 7
    consecutive BERT vectors of width ``bert_feat_dim`` in the order
    context1, context2, context3, anchor, context4, context5, context6,
    optionally followed by 7 equally wide blocks of additional features in the
    same order.

    The six context vectors are fed to a bidirectional LSTM and average-pooled
    over time. Three projections (pooled LSTM output, pooled LSTM output
    concatenated with the anchor vector, anchor vector alone) are concatenated
    and classified by a two-layer head. The additional features are sliced but
    not used by the network yet.
    """

    # Number of tweets per instance and position of the anchor among them.
    NUM_TWEETS = 7
    ANCHOR_INDEX = 3

    def __init__(self):
        super(Test, self).__init__()
        self.bert_feat_dim = bert_feat_dim
        self.lstm_dim = lstm_dim
        self.output_dim = output_dim
        self.hidden_dim = hidden_dim
        self.dropout_rate = dropout_rate
        self.lstm = nn.LSTM(self.bert_feat_dim, self.lstm_dim, batch_first=True, bidirectional=True)
        # classification head over the three concatenated projections
        self.fc1 = nn.Linear(hidden_dim*3, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        # one projection per pathway, all to hidden_dim
        self.fc_lstm = nn.Linear(self.lstm_dim*2, self.hidden_dim)
        self.fc_combined = nn.Linear(self.lstm_dim*2+self.bert_feat_dim, self.hidden_dim)
        self.fc_anchor = nn.Linear(self.bert_feat_dim, self.hidden_dim)

        self.dropout = nn.Dropout(dropout_rate)
        # Not applied in forward(); kept registered so the parameter set (and
        # therefore any saved state_dict) stays unchanged.
        self.bn = nn.BatchNorm1d(self.lstm_dim*2, affine=True)

    def _split_features(self, feat_combined):
        """Slice the flat feature tensor into per-tweet BERT and additional blocks.

        Returns:
            ``(bert_feats, addfeats)``: two lists of ``NUM_TWEETS`` tensors, each
            of shape ``(batch, 1, width)``.
        """
        bert_total = self.bert_feat_dim * self.NUM_TWEETS
        bert_feats = [
            feat_combined[:, self.bert_feat_dim * i:self.bert_feat_dim * (i + 1)].unsqueeze(1)
            for i in range(self.NUM_TWEETS)
        ]
        # The per-tweet width comes from the FEATURE dimension (shape[1]); using
        # shape[0] would derive it from the batch size.
        addfeat_dim = (feat_combined.shape[1] - bert_total) // self.NUM_TWEETS
        addfeats = [
            feat_combined[:, bert_total + addfeat_dim * i:bert_total + addfeat_dim * (i + 1)].unsqueeze(1)
            for i in range(self.NUM_TWEETS)
        ]
        return bert_feats, addfeats

    def forward(self, feat_combined):
        """Return unnormalised class scores of shape ``(batch, output_dim)``."""
        # the additional features are extracted but not consumed by the model yet
        bert_feats, _addfeats = self._split_features(feat_combined)
        anchor_feat = bert_feats[self.ANCHOR_INDEX]

        # prepare the input of the LSTM: the six context tweets, anchor excluded
        context_feats = bert_feats[:self.ANCHOR_INDEX] + bert_feats[self.ANCHOR_INDEX + 1:]
        lstm_input = torch.cat(context_feats, dim=1)

        # pass the LSTM
        lstm_output, _ = self.lstm(lstm_input)

        # global average pooling over the time steps of the BiLSTM output
        num_steps = lstm_output.shape[1]
        lstm_output = F.avg_pool1d(lstm_output.permute(0, 2, 1), kernel_size=num_steps)
        lstm_output = lstm_output.permute(0, 2, 1).squeeze(1)

        combined_out = torch.cat((lstm_output, anchor_feat.squeeze(1)), dim=1)

        # project each pathway to hidden_dim
        lstm_output = self.fc_lstm(lstm_output)
        combined_out = self.fc_combined(combined_out)
        anchor_feat = self.fc_anchor(anchor_feat).squeeze(1)

        all_combined = torch.cat((lstm_output, combined_out, anchor_feat), dim=1)

        # pass the fully-connected layer(s)
        out = self.fc1(all_combined)
        out = F.relu(out)
        out = self.dropout(out)
        out = self.fc2(out)

        return out

In [21]:
import torch
import torch.optim as optim

# define the label mapping (string label <-> class index)
label_to_idx = {'Yes': 1, 'No': 0}
idx_to_label = {1: 'Yes', 0: 'No'}

# define some global parameters
# NOTE(review): num_epochs and patience are assigned again in the training cell
# below, and batch_size is not referenced in this cell.
num_epochs = 1000
batch_size = 16
patience = 10
learning_rate = 1e-04

# Run on the GPU when one is visible to PyTorch, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# The per-tweet additional-feature width is read from the first training instance.
additional_feat_dim = train_instances[0]['anchor_addfeattensor'].shape[1]
# NOTE(review): this call needs the Test variant whose constructor accepts
# additional_feat_dim; two classes named Test are defined in this notebook.
classifier = Test(additional_feat_dim=additional_feat_dim)
classifier = classifier.to(device)

# define the optimizer and the loss function
optimizer = optim.Adam(classifier.parameters(), lr=learning_rate)
criterion = nn.CrossEntropyLoss().to(device)

In [22]:
from utils import evaluator
from sklearn.metrics import classification_report

# Training schedule: at most `num_epochs` epochs, stopping early once the dev
# loss has failed to improve for `patience` consecutive epochs.
num_epochs = 500
patience = 10
best_valid_loss = float('inf')
check_stopping = 0
model_name = f'retrained_{mode}_classifier.pt'
# Becomes True once at least one checkpoint has been written by this run.
saved_checkpoint = False
for i in range(num_epochs):

    start_time = time.time()
    # (Re-)enter training mode at the start of every epoch so dropout is active
    # even if the evaluation helper switched the model to eval mode.
    classifier.train()
    train_loss, train_acc = learning_helper.train(classifier, train_loader, optimizer, criterion, device, label_to_idx)
    dev_loss, dev_acc = learning_helper.evaluate(classifier, dev_loader, criterion, device, label_to_idx)
    end_time = time.time()

    elapsed_mins, elapsed_secs = learning_helper.epoch_time(start_time, end_time)

    print("-" * 60)
    print(f"Epoch: {i+1} || Epoch Time: {elapsed_mins}m {elapsed_secs}s")
    print(f"Epoch: {i+1} || Train loss: {train_loss:.02f}, Train Acc: {train_acc:.02f}")
    print(f"Epoch: {i+1} || Dev loss: {dev_loss:.02f}, Dev Acc: {dev_acc:.02f}")

    # check if we need to save the model
    if dev_loss < best_valid_loss:
        check_stopping = 0
        best_valid_loss = dev_loss
        torch.save(classifier.state_dict(), model_name)
        saved_checkpoint = True
    else:
        check_stopping += 1
        print("The loss on development set does not decrease")
        # `>=` so the loop still terminates if the counter ever skips `patience`.
        if check_stopping >= patience:
            print("The loss on development set does not decrease, stop training!")
            break

# Report test performance of the best-dev checkpoint, not of whatever weights
# the last (possibly worse) epoch left behind.
if saved_checkpoint:
    classifier.load_state_dict(torch.load(model_name, map_location=device))
classifier.eval()
pred_labels = evaluator.test_model(classifier, test_loader, idx_to_label, device)
gold_labels = [x['adjudicated_label'] for x in test_instances]
print('-' * 60)
print(classification_report(gold_labels, pred_labels))

------------------------------------------------------------
Epoch: 1 || Epoch Time: 0m 1s
Epoch: 1 || Train loss: 0.04, Train Acc: 0.65
Epoch: 1 || Dev loss: 0.04, Dev Acc: 0.68
------------------------------------------------------------
Epoch: 2 || Epoch Time: 0m 1s
Epoch: 2 || Train loss: 0.04, Train Acc: 0.66
Epoch: 2 || Dev loss: 0.04, Dev Acc: 0.68
------------------------------------------------------------
Epoch: 3 || Epoch Time: 0m 1s
Epoch: 3 || Train loss: 0.04, Train Acc: 0.68
Epoch: 3 || Dev loss: 0.04, Dev Acc: 0.68
The loss on development set does not decrease
------------------------------------------------------------
Epoch: 4 || Epoch Time: 0m 1s
Epoch: 4 || Train loss: 0.04, Train Acc: 0.69
Epoch: 4 || Dev loss: 0.04, Dev Acc: 0.68
The loss on development set does not decrease
------------------------------------------------------------
Epoch: 5 || Epoch Time: 0m 1s
Epoch: 5 || Train loss: 0.04, Train Acc: 0.69
Epoch: 5 || Dev loss: 0.04, Dev Acc: 0.68
The loss on d

In [265]:
# load pretrained model
classifier = Test()
classifier = classifier.to(device)
# map_location lets a checkpoint written on a GPU machine load on a CPU-only one.
classifier.load_state_dict(torch.load(model_name, map_location=device))
# Switch to inference mode before testing, as the training cells do; a freshly
# constructed module is in training mode, which would leave dropout active.
classifier.eval()
pred_labels = evaluator.test_model(classifier, test_loader, idx_to_label, device)
gold_labels = [x['adjudicated_label'] for x in test_instances]
print('-' * 60)
print(classification_report(gold_labels, pred_labels))

------------------------------------------------------------
              precision    recall  f1-score   support

          No       0.31      0.06      0.10       226
         Yes       0.68      0.94      0.79       473

    accuracy                           0.65       699
   macro avg       0.49      0.50      0.44       699
weighted avg       0.56      0.65      0.56       699



## add more info to the annotation file

In [36]:
import json
import os
import csv
from collections import defaultdict

# Input locations: the annotation file (one JSON object per line), the original
# batch CSV, and the root folder holding the copied json/image files.
annotation_filepath = 'data/annotations/annotation_context.json'
original_batch_filepath = 'batch_6540.csv'
data_dir = 'data'

# instance_id -> full CSV row (as a dict keyed by column name)
lookup_dict = defaultdict(dict)
# newline='' is what the csv module requires of its input file so that line
# breaks embedded in quoted fields are parsed correctly.
with open(original_batch_filepath, 'r', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        lookup_dict[row['instance_id']] = row
        
def add_info(lookup_dict, annotation_filepath):
    """Merge each annotation record with data from the original batch rows.

    The annotation file holds one JSON object per line. For every record:

    * keys starting with ``"Input."`` lose that prefix, except those ending in
      ``"url"``, which are dropped; all other keys are copied unchanged;
    * for every ``*url`` column of the matching batch row, the tweet's JSON
      path (``<prefix>_jsonpath``), its ``full_text`` (``<prefix>_tweettext``),
      the URL itself and, if the image file exists, its path
      (``<prefix>_imagepath``) are added;
    * every ``*timestamp`` column of the batch row is copied.

    File locations are built from the module-level ``data_dir``.

    Args:
        lookup_dict: mapping from instance id to the batch row (a dict of
            column name -> value). It is not modified; unknown ids contribute
            no extra information.
        annotation_filepath: path of the JSON-lines annotation file.

    Returns:
        list of enriched instance dicts, in file order.
    """
    instances = []
    with open(annotation_filepath, 'r', encoding='utf-8') as jsonfile:
        for line in jsonfile:
            line = line.strip()
            # Skip blank lines instead of blindly dropping the last line, so a
            # file without a trailing newline does not lose its final record.
            if not line:
                continue

            instance = {}
            temp_instance = json.loads(line)
            # remove "Input." in the keys
            for key, value in temp_instance.items():
                if key.startswith("Input."):
                    if not key.endswith("url"):
                        instance[key.split(".")[-1]] = value
                else:
                    instance[key] = value

            # add image filepath, json filepath, screenshot url, tweet_text, and timestamp
            # .get avoids inserting empty entries into a defaultdict for unknown ids
            original_dict = lookup_dict.get(temp_instance['Input.instance_id'], {})
            for key, value in original_dict.items():
                if key.endswith("url"):
                    prefix = key.split("_")[0]
                    url_basename = value.split("/")[-1]
                    tweet_id = url_basename.split("_")[-1].split(".")[0]
                    # the first two "_"-separated parts of the file name are the sub-folder
                    subfolder = '_'.join(url_basename.split("_")[:2])
                    stem = f"anchor_{tweet_id}" if 'anchor' in key else tweet_id

                    # add json filepath
                    jsonpath = os.path.join(data_dir, 'json_files', subfolder, f"{stem}.json")
                    instance[prefix + "_jsonpath"] = jsonpath

                    # add tweet text
                    with open(jsonpath, 'r', encoding='utf-8') as tweetfile:
                        tweet = json.load(tweetfile)
                    instance[prefix + "_tweettext"] = tweet['full_text']

                    # keep the url itself
                    instance[key] = value

                    # add image filepath if image exists
                    imagepath = os.path.join(data_dir, 'image_files', subfolder, f"{stem}.jpg")
                    if os.path.isfile(imagepath):
                        instance[prefix + "_imagepath"] = imagepath

                if key.endswith("timestamp"):
                    instance[key] = value
            instances.append(instance)
    return instances

# Enrich every annotation record with paths, tweet text and timestamps.
instances = add_info(lookup_dict, annotation_filepath)

# Write the enriched records as JSON lines (one object per line) into the
# current working directory.
new_annot_filename = 'new_annot.json'
with open(new_annot_filename, 'w') as newjson:
    for instance in instances:
        newjson.write(json.dumps(instance))
        newjson.write("\n")

## copy the json file and image file to the loctmp2 folder

In [77]:
import re
import json
import os
from shutil import copyfile

# NOTE(review): machine-specific absolute path; point this at the local copy of
# the original tweet dump before running.
original_folder = '/media/zhaomin/Zhaomin_SSD/project_repo/emnlp2021/saved_tweets_original'
data_dir = 'data'

# instance_id -> full CSV row (as a dict keyed by column name)
original_batch_filepath = 'batch_6540.csv'
lookup_dict = defaultdict(dict)
# newline='' is what the csv module requires of its input file.
with open(original_batch_filepath, 'r', newline='') as csvfile:
    reader = csv.DictReader(csvfile)
    for row in reader:
        lookup_dict[row['instance_id']] = row

annotation_filepath = 'data/annotations/annotation_context.json'
with open(annotation_filepath, 'r') as jsonfile:
    # one JSON object per line; blank lines (e.g. a trailing newline) are skipped
    lines = [line for line in jsonfile.read().split("\n") if line.strip()]

# The folder listing does not change while copying, so read it once.
original_events = os.listdir(original_folder)

for line in lines:
    instance = json.loads(line)
    instance_id = instance['Input.instance_id']

    # create the destination folders (including missing parents) if needed
    json_dst_dir = os.path.join(data_dir, 'json_files', instance_id)
    image_dst_dir = os.path.join(data_dir, 'image_files', instance_id)
    os.makedirs(json_dst_dir, exist_ok=True)
    os.makedirs(image_dst_dir, exist_ok=True)

    # find event path: the first "_"-separated part of the instance id is split
    # at its first run of digits into an event name and a year
    original_dict = lookup_dict[instance_id]
    id_parts = re.split(r'(\d+)', instance_id.split("_")[0])
    event_name = id_parts[0]

    # Reset per instance so a missing event folder cannot silently reuse the
    # path found for the previous instance.
    original_event_path = None
    for original_event in original_events:

        # make sure the event and year are matched
        if original_event.split("_")[0] == event_name:
            if original_event.split("_")[1].split("-")[0] == id_parts[1]:
                original_event_path = os.path.join(original_folder, original_event, f"final_tweet_folder_{original_event}", instance_id.split("_")[-1])
                break
    if original_event_path is None:
        raise FileNotFoundError(f"no event folder in {original_folder} matches instance {instance_id}")

    for key, value in original_dict.items():
        if key.endswith("url"):

            if 'anchor' in key:
                # anchor files are named anchor_<id>.*; both the json and the
                # image are copied unconditionally
                real_instance_id = value.split("/")[-1].split("_")[1]
                src_jsonfilename = f"anchor_{real_instance_id}.json"
                src_imagefilename = f"anchor_{real_instance_id}.jpg"
                src_jsonfilepath = os.path.join(original_event_path, src_jsonfilename)
                src_imagefilepath = os.path.join(original_event_path, src_imagefilename)
                dst_jsonfilepath = os.path.join(json_dst_dir, src_jsonfilename)
                dst_imagefilepath = os.path.join(image_dst_dir, src_imagefilename)
                copyfile(src_jsonfilepath, dst_jsonfilepath)
                copyfile(src_imagefilepath, dst_imagefilepath)

            else:
                tweet_id = value.split("/")[-1].split(".")[0].split("_")[-1]

                # copy json file
                src_jsonfilename = f"{tweet_id}.json"
                src_jsonfilepath = os.path.join(original_event_path, src_jsonfilename)
                dst_jsonfilepath = os.path.join(json_dst_dir, src_jsonfilename)
                copyfile(src_jsonfilepath, dst_jsonfilepath)

                # copy image file if it exists (direct check instead of
                # listing the whole folder for every url)
                src_imagefilename = f"{tweet_id}.jpg"
                src_imagefilepath = os.path.join(original_event_path, src_imagefilename)
                if os.path.isfile(src_imagefilepath):
                    dst_imagefilepath = os.path.join(image_dst_dir, src_imagefilename)
                    copyfile(src_imagefilepath, dst_imagefilepath)
                    
#         break

## save split for replication

In [None]:
import json
from sklearn.model_selection import train_test_split

# Stratified 80/20 split into train+dev / test, then 80/20 of the remainder into
# train / dev, both stratified on the adjudicated label.
# NOTE(review): no random_state is passed, so each run draws a different split;
# the file written below is the only record of the one that was used.
X = instances
y = [x['adjudicated_label'] for x in instances]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify=y)
X_train, X_dev, y_train, y_dev = train_test_split(X_train, y_train, test_size=0.20, stratify=y_train)

# Only the instance ids are stored, grouped by subset.
split = {'train': [x['instance_id'] for x in X_train],
         'dev': [x['instance_id'] for x in X_dev],
         'test': [x['instance_id'] for x in X_test]}

# Persist the split as a single JSON object (the file has no extension).
with open("saved_split", 'w') as splitfile:
    splitfile.write(json.dumps(split))