In [1]:
import os
import json

# Where the prediction file lives: folder name, file name, and the joined path.
prediction_folder = "predictions"
prediction_filename = "prediction_all_bert_lstm.json"
# os.path.join picks the separator appropriate for the current platform.
prediction_filepath = os.path.join(
    prediction_folder,
    prediction_filename,
)

def load_prediction(filepath):
    """Load prediction records from a JSON Lines file.

    Every non-blank line of the file is parsed as one JSON document.

    Args:
        filepath: Path of the file to read.

    Returns:
        list: The parsed records, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    instances = []
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default encoding.
    with open(filepath, 'r', encoding='utf-8') as jsonfile:
        # Iterate over the lines instead of discarding the last "\n"-separated
        # chunk: the old approach silently lost the final record whenever the
        # file did not end with a newline.
        for line in jsonfile:
            line = line.strip()
            if not line:
                # Blank lines carry no record; skipping them avoids a decode error.
                continue
            instances.append(json.loads(line))
    return instances

# Parse every prediction record from the configured prediction file.
instances = load_prediction(prediction_filepath)

# Keep only the misclassified records, i.e. those whose adjudicated label
# differs from the predicted label.
errors = list(
    filter(
        lambda record: record['adjudicated_label'] != record['pred_label'],
        instances,
    )
)

## Generate an HTML file that displays 100 sampled errors (screenshots)

In [17]:
import random

def get_html(errors, num_samples=100, html_filename='errors.html'):
    """Write an HTML page showing tweet screenshots for randomly sampled errors.

    For every sampled error three table rows are written:

    1. the screenshots at ``context8_url``, ``context9_url``, ``context10_url``;
    2. the ``anchor_url`` screenshot, centred between two empty cells;
    3. the screenshots at ``context11_url``, ``context12_url``, ``context13_url``.

    Args:
        errors: Sequence of dicts, each providing the seven ``*_url`` keys
            listed above.
        num_samples: Maximum number of errors to sample (without replacement).
            When fewer errors are available, all of them are used.
        html_filename: Path of the HTML file to create or overwrite.

    Returns:
        None. Prints ``Done`` once the file has been written.

    Raises:
        KeyError: If a sampled error lacks one of the URL keys.
        ValueError: If ``num_samples`` is negative.
    """
    from html import escape  # local import keeps this cell self-contained

    # CSS lengths need a unit; a bare "400" is an invalid declaration.
    cell_css = "width:400px; border-bottom: 1px solid black;"
    image_css = """style=\"display: block; margin-left: auto; margin-right: auto; max-width: 450px; max-height: 650px;\""""
    empty_cell = f"<td style=\"{cell_css}\"></td>\n"

    before_keys = ('context8_url', 'context9_url', 'context10_url')
    after_keys = ('context11_url', 'context12_url', 'context13_url')

    def image_cell(url):
        # Build one table cell holding a single screenshot.
        # The URL is escaped so quotes or ampersands cannot break the attribute.
        return (
            f"<td style=\"{cell_css}\">\n"
            f"<img src=\"{escape(str(url), quote=True)}\" {image_css}>\n"
            "</td>\n"
        )

    def table_row(cells):
        # Wrap already-rendered cells in a table row.
        return "<tr>\n" + "".join(cells) + "</tr>\n"

    # random.sample raises if asked for more items than exist, so cap the size.
    sampled_errors = random.sample(errors, min(num_samples, len(errors)))

    with open(html_filename, 'w', encoding='utf-8') as htmlfile:
        htmlfile.write(
            "<!DOCTYPE html>\n<html>\n<head>\n<meta charset=\"utf-8\">\n"
            "<title>Error Example</title>\n</head>\n<body>\n"
        )
        htmlfile.write(
            "<table style=\"margin-left: auto; margin-right: auto; "
            "border: 1px solid black; line-height: 1.0em; width: 1200px\">\n"
        )

        for error in sampled_errors:
            # 3 context tweets before
            htmlfile.write(table_row(image_cell(error[key]) for key in before_keys))

            # anchor tweet, flanked by empty cells so it sits in the middle column
            htmlfile.write(
                table_row([empty_cell, image_cell(error['anchor_url']), empty_cell])
            )

            # 3 context tweets after
            htmlfile.write(table_row(image_cell(error[key]) for key in after_keys))

        # Close the elements opened above so the document is well-formed.
        htmlfile.write("</table>\n</body>\n</html>\n")

    print("Done")
            
# Write the screenshot page for a random sample of the errors; the arguments
# are spelled out so the sample size and output file are visible at the call site.
get_html(errors, num_samples=100, html_filename='errors.html')

Done


## Display only the tweet text (written to a plain-text file)

In [5]:
import random

def get_html(errors, num_samples=100, txt_filename='errors_text'):
    """Write the tweet texts of randomly sampled errors to a plain-text file.

    NOTE: despite its name, this function writes plain text rather than HTML.
    It also reuses the name of the HTML-writing ``get_html`` defined earlier in
    this notebook, so running this cell replaces that definition.

    For every sampled error the file receives a two-line ``#`` banner, a line
    with the anchor location, gold label and predicted label, and then seven
    numbered tweets: three context tweets, the anchor tweet, three more
    context tweets.

    Args:
        errors: Sequence of dicts providing ``anchor_location``,
            ``adjudicated_label``, ``pred_label`` and the seven
            ``*_tweettext`` keys listed in ``tweet_keys`` below.
        num_samples: Maximum number of errors to sample (without replacement).
            When fewer errors are available, all of them are used.
        txt_filename: Path of the text file to create or overwrite.

    Returns:
        None. Prints ``Done`` once the file has been written.

    Raises:
        KeyError: If a sampled error lacks one of the required keys.
        ValueError: If ``num_samples`` is negative.
    """
    # Tweet fields in output order; the anchor is "Tweet 4".
    tweet_keys = (
        'context8_tweettext',
        'context9_tweettext',
        'context10_tweettext',
        'anchor_tweettext',
        'context11_tweettext',
        'context12_tweettext',
        'context13_tweettext',
    )
    banner = "#" * 60 + "\n"

    # random.sample raises if asked for more items than exist, so cap the size.
    sampled_errors = random.sample(errors, min(num_samples, len(errors)))

    # Tweets contain non-ASCII text (e.g. Arabic hashtags); write UTF-8
    # explicitly so the platform default encoding cannot cause a
    # UnicodeEncodeError.
    with open(txt_filename, 'w', encoding='utf-8') as txtfile:
        for error in sampled_errors:
            txtfile.write(banner + banner)
            txtfile.write(f"Location: {error['anchor_location']}, gold_label: {error['adjudicated_label']}, pred_label: {error['pred_label']}\n")

            for tweet_number, key in enumerate(tweet_keys, start=1):
                txtfile.write(f"-------------- Tweet {tweet_number} --------------\n")
                txtfile.write(f"{error[key]}\n")

    print("Done")
            
# Write the tweet texts for a random sample of the errors; the arguments are
# spelled out so the sample size and output file are visible at the call site.
get_html(errors, num_samples=100, txt_filename='errors_text')

Done


Locate the error via tweet text

In [14]:
# Show the first error record to see which fields an instance carries.
errors[0]

{'anchor_location': 'Reno',
 'instance_id': 'thanksgiving2019_1202281007690502146',
 'event': 'thanksgiving2019',
 'Answer.Q1_A13YTGRLTS80MU': 'No',
 'Answer.Q2_A13YTGRLTS80MU': '3',
 'adjudicated_label': 'No',
 'anchor_timestamp': 'Wed Dec 04 17:38:24 +0000 2019',
 'anchor_jsonpath': 'data/json_files/thanksgiving2019_1202281007690502146/anchor_1202281007690502146.json',
 'anchor_tweettext': 'Mother statue after renovation by protesters. Al-Umma Park BGD\n@RichardGrenell\n@LongDefense\n@POTUS\n@SaraCarterDC\n@SecPompeo\n@StateDept\n@USAbilAraby\n@laraseligman\n@JeanineHennis\n@eqanbar\n@Rez739\n@DRUDGE\n@DRUDGE_REPORT\n #Thanksgiving\n#Save_the_Iraqi_People #ثورة_تشرين https://t.co/kNutE4NBNN',
 'anchor_url': 'http://www.cse.unt.edu/~blanco/screenshot/thanksgiving2019_1202281007690502146_anchor_1202281007690502146.png',
 'anchor_imagepath': 'data/image_files/thanksgiving2019_1202281007690502146/anchor_1202281007690502146.jpg',
 'context8_jsonpath': 'data/json_files/thanksgiving2019_120

In [24]:
from IPython.display import display, Image

# Substring to look for in the tweet texts of each error.
query_text = "Whitney Houston"

# Screenshot URL fields in display order: three context tweets, the anchor
# tweet (Tweet 4), then three more context tweets.
screenshot_keys = (
    'context8_url',
    'context9_url',
    'context10_url',
    'anchor_url',
    'context11_url',
    'context12_url',
    'context13_url',
)

for error in errors:
    # Every field whose key ends in "text" is searched.
    texts = [value for key, value in error.items() if key.endswith("text")]

    # An instance is shown once, no matter how many of its texts match.
    if not any(query_text in text for text in texts):
        continue

    print(f"instance_id: {error['instance_id']}")
    print(f"Location: {error['anchor_location']}, gold_label: {error['adjudicated_label']}, pred_label: {error['pred_label']}")
    for tweet_number, url_key in enumerate(screenshot_keys, start=1):
        print(f"-------------- Tweet {tweet_number} --------------\n")
        display(Image(url=error[url_key]))

instance_id: christmas2019_1211435492446740481
Location: Houston, gold_label: Yes, pred_label: No
-------------- Tweet 1 --------------



-------------- Tweet 2 --------------



-------------- Tweet 3 --------------



-------------- Tweet 4 --------------



-------------- Tweet 5 --------------



-------------- Tweet 6 --------------



-------------- Tweet 7 --------------

