In [5]:
import os
import json
import re

In [27]:
def clean_text(text):
    """Return ``text`` with URL-like tokens removed and outer whitespace trimmed.

    Every run that starts with ``http`` and continues through one or more
    non-whitespace characters is deleted. Whitespace that surrounded a removed
    token in the middle of the string is left as is; only leading and trailing
    whitespace of the final result is stripped.
    """
    without_links = re.sub(r'http\S+', '', text)
    return without_links.strip()

def load_data(data_filepath, split_filepath):
    """Load annotated instances and partition them into train and test lists.

    Args:
        data_filepath: Path to a file holding one JSON object per line.
        split_filepath: Path to a JSON file with ``'train'`` and ``'test'``
            keys, each mapping to a list of instance ids.

    Returns:
        A ``(train_data, test_data)`` tuple of lists. Each element is a dict
        with ``'texts'`` (the cleaned ``context8`` .. ``context13`` tweet
        texts, in that order), ``'label'`` (``adjudicated_label``) and
        ``'location'`` (``anchor_location``).

    Raises:
        KeyError: If the split file lacks ``'train'``/``'test'`` or a kept
            record lacks one of the fields read below.

    Records with no key starting with ``"Answer.Q1_"`` are skipped. A record
    whose id is listed in both splits is appended to both lists.
    """
    with open(split_filepath, 'r', encoding='utf-8') as file:
        splits = json.load(file)
    # Sets give constant-time membership checks for every record below.
    train_ids = set(splits['train'])
    test_ids = set(splits['test'])

    context_keys = ['context{}_tweettext'.format(i) for i in range(8, 14)]
    train_data, test_data = [], []

    # Read from the path passed in, not from a notebook-level global.
    with open(data_filepath, 'r', encoding='utf-8') as file:
        for line in file:
            if not line.strip():
                # Tolerate blank lines (e.g. a trailing newline at EOF).
                continue
            item = json.loads(line)
            if not any(key.startswith("Answer.Q1_") for key in item):
                continue
            instance = {
                'texts': [clean_text(item[key]) for key in context_keys],
                'label': item['adjudicated_label'],
                'location': item['anchor_location'],
            }
            instance_id = item['instance_id']
            if instance_id in train_ids:
                train_data.append(instance)
            if instance_id in test_ids:
                test_data.append(instance)

    return train_data, test_data


# Input locations: the line-delimited JSON records and the train/test id split.
# NOTE(review): the cell that writes the split saves to 'data_split' (no
# 'data/' prefix) -- confirm the file is moved or copied to this path.
data_filepath, split_filepath = 'data/data.json', 'data/data_split'
train_data, test_data = load_data(
    data_filepath=data_filepath,
    split_filepath=split_filepath,
)

In [28]:
# Preview only the first few instances; echoing the whole list floods the output.
train_data[:3]

[{'texts': ['The water boil advisory that went into effect following the Memorial Day tornado outbreak has been lifted for the remaining customers in Dayton, Montgomery and Greene County, with the exception of Brookville.',
   'See how neighborhoods are helping surrounding communities tonight on 2 NEWS First at 4',
   'Learn about organizations sending hundreds of volunteers to help clean up Trotwood and Dayton this weekend and how you can help, tonight on 2 NEWS First at 4.',
   "Volunteers are needed across the Miami Valley following the Memorial Day tornado outbreak. Here's information on how to help. If you know of any other volunteer opportunities, leave a comment below!",
   'UPDATE: As the search for a missing Harrison Township woman continues, the Sheriff is asking residents in the Shiloh neighborhood to search their properties for information that may help authorities find 71-year-old Catherine Clayburn.',
   'Thanks to you we were able to raise $68K for our friends in need af

In [15]:
# Collect every record that carries at least one "Answer.Q1_*" annotation.
# Reads from `data_filepath` (set in the loading cell above) rather than an
# undefined `filepath` global, so the cell runs on a fresh kernel.
items = []
with open(data_filepath, 'r', encoding='utf-8') as file:
    for line in file:
        if not line.strip():
            # Tolerate blank lines (e.g. a trailing newline at EOF).
            continue
        item = json.loads(line)
        kept_annotations = [item[key] for key in item.keys() if key.startswith("Answer.Q1_")]
        if not kept_annotations:
            continue
        items.append(item)

In [20]:
from sklearn.model_selection import train_test_split

# Instance ids and their adjudicated labels, index-aligned.
X = [x['instance_id'] for x in items]
y = [x['adjudicated_label'] for x in items]
# A fixed random_state makes the 70/30 partition reproducible across kernel
# restarts, so the persisted split file can be regenerated exactly.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

In [25]:
import random

# Shuffle both id lists in place before persisting them.
# NOTE(review): the shuffle is unseeded, so the stored order differs per run.
random.shuffle(X_train)
random.shuffle(X_test)

# NOTE(review): this writes to 'data_split' in the working directory, while
# the loading cell reads 'data/data_split' -- confirm which path is intended.
splits = {'train': X_train, 'test': X_test}
with open('data_split', 'w') as file:
    json.dump(splits, file)

In [29]:
# Show the record left in `item` by the most recent parsing loop, to inspect
# the available fields. NOTE(review): which record this is depends on cell
# execution order.
item

{'anchor_location': 'Baltimore',
 'instance_id': 'christmas2019_1209986132777328652',
 'event': 'christmas2019',
 'adjudicated_label': 'Yes',
 'anchor_timestamp': 'Wed Dec 25 23:55:49 +0000 2019',
 'anchor_jsonpath': 'data/json_files/christmas2019_1209986132777328652/anchor_1209986132777328652.json',
 'anchor_tweettext': 'My mother’s reaction to getting a ticket to see @jeffdunham for Christmas in Baltimore #Christmas #JeffDunham #themom https://t.co/FEwLOL3c20',
 'anchor_url': 'http://www.cse.unt.edu/~blanco/screenshot/christmas2019_1209986132777328652_anchor_1209986132777328652.png',
 'anchor_imagepath': 'data/image_files/christmas2019_1209986132777328652/anchor_1209986132777328652.jpg',
 'context8_jsonpath': 'data/json_files/christmas2019_1209986132777328652/1209183685066534912.json',
 'context8_tweettext': 'I mean, what does @Lj_era8 have to do, save humanity, in order to get some respect from the analysts? They get on my nerves! #Ravens #RavensFlock #LamarJackson',
 'context8_url'