In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
from tqdm import tqdm
import os
import pandas as pd
import numpy as np

import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual
from IPython.display import display

import spacy
from spacy import displacy

def display_ent(doc, style="ent", colors=None, options=None, compact=True):
    """Render ``doc`` inline in the notebook with spaCy's displaCy visualiser.

    Args:
        doc: object handed straight to ``displacy.render`` (callers in this notebook
            pass a single doc or a list of docs).
        style: displaCy rendering style, ``"ent"`` by default.
        colors: label -> CSS colour mapping; falls back to a gradient for ``TRIGGER``.
        options: complete displaCy options dict. When supplied (and truthy) it is used
            as-is, so ``colors`` and ``compact`` are then ignored.
        compact: stored under the ``"compact"`` key of the default options.

    Note:
        The later ``from pipeline.utils.display import display_ent`` rebinds this name.
    """
    if not colors:
        colors = {"TRIGGER": "linear-gradient(90deg, #aa9cfc, #fc9ce7)"}
    if not options:
        options = {"ents": None, "colors": colors, "compact": compact}
    displacy.render(doc, style=style, jupyter=True, options=options)

from pipeline.utils.display import display_ent
from pipeline.data.metadata import get_report_data, get_geoview_data
from pipeline.preprocessing.processing import match_triggers, triggers_json_to_df, load_triggers, load_json

classifier.py
helpers.py
processing.py
metadata.py


In [3]:
# Experiment: pull trigger phrases out of the spaCy pipeline as entities (entity ruler)
# instead of running a separate matcher over the text.
# NOTE(review): `create_trigger_ruler` is imported but not used in the visible cells, and the
# model below is built with trigger_matcher=True - confirm which mechanism is actually in use.
from pipeline.preprocessing.processing import load_spacy_model, create_trigger_ruler

# `nlp` is the shared pipeline used by every widget callback further down
nlp = load_spacy_model(output_type='doc', trigger_matcher=True, lemmatizer=False, geological_matcher=False,
                       stopword_removal=False, punctuation_removal=True, lemmatize_triggers=True)

Added punctuation removal pipe
Added trigger phrase matcher pipe
Loading spaCy model with spaCy doc output.


# Load DataFrame

In [4]:
# `capstone_files` is merged below on 'anumber' and 'filename'; `files` is looked up later as
# files[filename][sentence_idx] to fetch the text of a single sentence
capstone_files, files = get_report_data(count_sentences=True, return_files=True)

Loading files as dict: 100%|██████████| 32646/32646 [00:02<00:00, 14598.25it/s]


In [5]:
# full sweep of all documents on triggers to build data manually
# data = {file : [(idx, ent.text) for idx, doc in enumerate(nlp.pipe(
#     sentences, disable=['ner'], n_process=1, cleanup=True, batch_size=10000))
#     for ent in doc.ents if ent.label_ == 'TRIGGER'] for file, sentences in tqdm(
#     files.items(), desc='Extracting TRIGGER entities from documents')}

# from pipeline.preprocessing.processing import data_path
# import json

# with open(data_path / 'sentence_triggers.json', 'w+') as f:
#     json.dump(data, f)

# df = pd.DataFrame([
#     {"filename": filename, "idx": int(match[0]), "trigger": match[1]} 
#     for filename, matches in data.items() for match in matches
# ])

# df.groupby(['filename','idx'])['trigger'].apply(
#     lambda x : ', '.join(x).strip()).reset_index().rename(
#     columns={'trigger':'triggers'})#.to_csv('extracted_triggers_grouped.csv', index_label='triggers_id')


In [6]:
# NOTE(review): get_geoview_data is a project helper; from its use below it presumably returns
# a DataFrame with at least 'anumber' and 'report_type' columns - confirm
geoview = get_geoview_data()

# load triggers grouped by sentence idx within each file - each row stores its triggers as one
# comma separated string
extracted_triggers = pd.read_csv(os.path.join('data','events', 'extracted_triggers_grouped.csv'), index_col=0)

# attach report_type to every capstone file (joined on anumber), then attach the extracted
# triggers (joined on filename); both merges use pandas' default inner join, so rows without a
# match on either side are dropped
source = capstone_files.merge(
    geoview[['anumber','report_type']], on='anumber').merge(
    extracted_triggers, on='filename')

# display source data
source.head()

Unnamed: 0,filename,anumber,sentence_count,report_type,idx,triggers
0,a074282_reidy_annual2006_12863956.json,74282,141,Annual,19,prospect
1,a074282_reidy_annual2006_12863956.json,74282,141,Annual,43,"anomalism, mineralization"
2,a074282_reidy_annual2006_12863956.json,74282,141,Annual,53,"supported, mineralization"
3,a074282_reidy_annual2006_12863956.json,74282,141,Annual,84,extensive
4,a074282_reidy_annual2006_12863956.json,74282,141,Annual,123,presence of


In [10]:
# specify labellers
users = ('daniel','charlie')
dataset = {}

### SPECIFY USER ###
user = 'charlie'

# fail here with a clear message instead of a KeyError on dataset[user] further down
if user not in users:
    raise ValueError(f'Unknown user {user!r}; expected one of {users}.')

# get data from path given user
from pathlib import Path
base_path = Path('.')
labelled_path = base_path / 'data' / 'events' / f'{user}_dataset.csv'

if labelled_path.is_file():
    # resume from the labels this user has already saved
    print(f'File for {user} at {labelled_path.resolve()} already exists. Loading file.')
    dataset[user] = pd.read_csv(labelled_path, index_col=0)
else:
    print(f'Creating new dataset for {user}.')
    # start from the source data and add the labelling columns with their "not yet
    # reviewed" defaults; assign() returns a new frame, so `source` itself is untouched
    dataset[user] = source.assign(
        reviewed=False,
        label=False,
        confidence=None,
        lower_bound=0,
        upper_bound=0,
    )

dataset[user].head()

File for charlie at /home/daniel/capstone/data/events/charlie_dataset.csv already exists. Loading file.


Unnamed: 0_level_0,filename,anumber,sentence_count,report_type,idx,triggers,reviewed,label,confidence,lower_bound,upper_bound
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,a074282_reidy_annual2006_12863956.json,74282,141,Annual,19,prospect,False,False,,0,0
1,a074282_reidy_annual2006_12863956.json,74282,141,Annual,43,"anomalism, mineralization",False,False,,0,0
2,a074282_reidy_annual2006_12863956.json,74282,141,Annual,53,"supported, mineralization",False,False,,0,0
3,a074282_reidy_annual2006_12863956.json,74282,141,Annual,84,extensive,False,False,,0,0
4,a074282_reidy_annual2006_12863956.json,74282,141,Annual,123,presence of,False,False,,0,0


In [11]:
# write the user's dataset to labelled_path; the index is stored in a column named 'index',
# which is the first column and therefore what pd.read_csv(..., index_col=0) reads back
dataset[user].to_csv(labelled_path, index=True, index_label='index')

In [12]:
from sklearn.model_selection import train_test_split

seed = 1

def stratify_source_data(source, n, prop=False, seed=None, stratify_col='report_type'):
    """Split the index and labels of ``source`` into stratified train/test parts.

    Args:
        source: DataFrame providing ``.index``, a ``label`` column and ``stratify_col``.
        n: size of the test part, forwarded to ``train_test_split`` as ``test_size``.
        prop: when True, ``n`` is first divided by ``len(source)``, i.e. a row count is
            turned into a fraction of the data; when False ``n`` is forwarded unchanged.
        seed: forwarded as ``random_state``.
        stratify_col: column of ``source`` whose values the split is stratified on.

    Returns:
        Tuple ``(train_idx, test_idx, train_label, test_label)``.
    """
    test_size = n / len(source) if prop else n
    idx_train, idx_test, y_train, y_test = train_test_split(
        source.index,
        source.label,
        test_size=test_size,
        random_state=seed,
        stratify=source[stratify_col],
    )
    return idx_train, idx_test, y_train, y_test

def get_indices(dataset, n=100, prop=False, seed=None):
    """Return only the test-side index labels produced by ``stratify_source_data``.

    All arguments are forwarded unchanged; the train index and both label splits
    are discarded.
    """
    train_idx, sampled_idx, train_label, test_label = stratify_source_data(
        dataset, n=n, prop=prop, seed=seed)
    return sampled_idx

In [13]:
# get indices from the stratified sampling procedure - the seed is passed through as
# random_state, so keep it fixed to draw the same sample on every run
indices = get_indices(dataset[user], n=100, seed=seed)
indices

Int64Index([532931, 523097, 355152, 276101, 390913, 208675, 395272, 265797,
            575037,  69689, 121726, 128143, 173377, 458102, 450338,  38181,
            191079,  35443, 496615, 225020, 394485, 392208, 480358, 234691,
            511076, 431132, 267625, 224314, 576125, 287377, 101796, 236045,
            484381, 475250, 314849, 570322, 362909,  52756, 357674, 175385,
            171135, 124540, 549955, 150851, 136003, 294112, 133431, 307573,
            376212,  21735, 223265,  68167, 330087, 415349, 135993, 192129,
             72182, 197928, 551341, 569124, 262790,  29297, 517991, 486495,
            257661, 292355,  78745, 437833, 490385, 397661, 157226, 535087,
            491340, 493256,   1446, 251355,  74487, 161471, 177376, 261531,
            157255,  34201, 542703,  24731,  98896, 296854,  28801,  94085,
            201257, 400822, 398959, 474487, 172004, 449215, 235744, 133884,
            123886, 279087, 252308, 534321],
           dtype='int64', name='index')

In [14]:
# restrict the user's dataset to the sampled rows, then keep only those not yet reviewed
sampled = dataset[user].loc[indices]
# build the mask from the sampled frame itself (no cross-frame index alignment needed) and take
# an explicit copy: the widget callbacks below write labels into df in place, so it must be an
# independent frame rather than the result of chained indexing
df = sampled.loc[~sampled['reviewed']].copy()
df

Unnamed: 0_level_0,filename,anumber,sentence_count,report_type,idx,triggers,reviewed,label,confidence,lower_bound,upper_bound
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
532931,a106392_unlock-e51_1063_2015s.json,106392,161,Final Surrender,117,broad,False,False,,0,0
523097,a078348_c125_2004_2008a_15517251.json,78348,247,Annual,195,mineralisation,False,False,,0,0
355152,a079276_p29_1795_2008a_10208681.json,79276,72,Annual,64,mineralisation,False,False,,0,0
276101,a081229_c117_1997_2009a_15776919.json,81229,13502,Annual,1396,broad,False,False,,0,0
390913,a098901_e77_1987_2013_a_16118961.json,98901,360,Annual,249,mineralisation,False,False,,0,0
...,...,...,...,...,...,...,...,...,...,...,...
133884,a088448_r206 2010_zuleika-kunanalling project_...,88448,539,Annual,505,"proposed, prospect, broads",False,False,,0,0
123886,a099477_finalgeologicalreport2013ddh_july2013_...,99477,248,Co-Funded Drilling,233,support,False,False,,0,0
279087,a098683_e28-2096_2013 annual report_9919264.json,98683,439,Annual,278,extensive,False,False,,0,0
252308,a081794_annual technical report e21_124_lakesi...,81794,94,Annual,46,significant,False,False,,0,0


In [15]:
# prepare data to review for user
# module-level state shared with the widget callbacks below (they rebind it via `global`)
i = 0  # position in `indices` of the row currently being labelled
prev = 0  # number of sentences shown before the trigger sentence (0 or negative, copied from prev_input)
fwd = 0  # number of sentences shown after the trigger sentence (copied from fwd_input)
# NOTE(review): df only holds unreviewed rows, so this lookup would raise KeyError if
# indices[0] had already been reviewed - confirm that case cannot occur
event = df.loc[indices[i]]  # first event

def get_index_text(idx):
    """Build the HTML heading that shows which dataset row is being labelled.

    Args:
        idx: index label of the row to show.

    Returns:
        An ``<h3>`` HTML string of the form ``Current Index: <idx> / <len(source)>``.
    """
    # use the argument, not the global cursor `indices[i]`, so the heading always matches
    # the row the caller asked for
    return f"<h3>Current Index: {idx} / {len(source)}</h3>"

def get_trigger_text(idx):
    """Build the HTML bullet list of trigger words stored for row ``idx`` of ``df``.

    The ``triggers`` cell is split on commas and each piece becomes one ``<li>``.
    """
    items = []
    for word in df.loc[idx, 'triggers'].split(','):
        items.append(f'<li>{word}</li>')
    bullet_list = ''.join(items)
    return f"<h3>Trigger Words</h3><ul>{bullet_list}</ul>"

def yes_pressed(b):
    """Click handler: record the current sentence as a near miss event (``b`` is unused)."""
    label_sentence(True)

def no_pressed(b):
    """Click handler: record the current sentence as not a near miss event (``b`` is unused)."""
    label_sentence(False)
    
def revert_pressed(b):
    """Click handler: step back one sentence via ``revert_changes`` (``b`` is unused)."""
    revert_changes()

def label_sentence(label):
    """Store the label for the current sentence and move the UI on to the next one.

    Writes ``reviewed``, ``label`` and ``confidence`` into ``df`` for the row at
    ``indices[i]`` (plus ``lower_bound``/``upper_bound`` when the "Save Text Chunk Bounds"
    checkbox is ticked), advances the global cursor ``i`` and resets the widgets.

    Args:
        label: True for a near miss event, False otherwise.
    """
    # `event` is module-level state read elsewhere in the notebook, so it has to be rebound
    # globally to follow the cursor
    global i, event

    # every sampled sentence is already labelled: ignore further clicks rather than
    # raising IndexError inside the callback
    if i >= len(indices):
        return

    current = indices[i]

    # get confidence from button
    confidence = confidence_toggle.get_interact_value()

    # assign data
    df.loc[current, 'reviewed'] = True
    df.loc[current, 'label'] = label
    df.loc[current, 'confidence'] = confidence

    # save lower and upper bounds of the context window
    if save_chunk_toggle.get_interact_value():
        df.loc[current, 'lower_bound'] = prev_input.value
        df.loc[current, 'upper_bound'] = fwd_input.value

    # increment to next index, skipping sampled rows that are absent from df
    # (df only contains rows that were unreviewed when it was built)
    i += 1
    while i < len(indices) and indices[i] not in df.index:
        i += 1

    # reset the per-sentence controls and hide the context panes
    confidence_toggle.value = 'High'
    prev_input.value = 0
    fwd_input.value = 0

    with prev_output:
        prev_output.clear_output()
        prev_output.layout.display = "none"

    with fwd_output:
        fwd_output.clear_output()
        fwd_output.layout.display = "none"

    if i >= len(indices):
        # ran off the end of the sample: say so instead of indexing past the end
        idx_text.value = "<h3>All sampled sentences have been labelled</h3>"
        triggers_widget.value = ""
        with event_output:
            event_output.clear_output()
        return

    event = df.loc[indices[i]]
    idx_text.value = get_index_text(indices[i])
    triggers_widget.value = get_trigger_text(indices[i])

    # show the next trigger sentence
    with event_output:
        event_output.clear_output()
        display_ent([nlp(files[event.filename][event.idx])])
        
def revert_changes():
    """Step back to the previously labelled sentence and clear its label so it can be redone.

    Moves the global cursor ``i`` back to the closest earlier position whose row exists
    in ``df``, resets that row's labelling columns to their defaults and refreshes the
    widgets. Does nothing when there is no earlier row.
    """
    global i, event

    # find the closest earlier position that is present in df
    target = i - 1
    while target >= 0 and indices[target] not in df.index:
        target -= 1
    if target < 0:
        # already at the start: without this guard i would become -1 and indices[-1]
        # would silently reset the LAST sampled row
        return

    i = target
    idx = indices[i]

    # reset data - the bound columns are named lower_bound / upper_bound
    df.loc[idx, 'reviewed'] = False
    df.loc[idx, 'label'] = False
    df.loc[idx, 'confidence'] = None
    df.loc[idx, 'lower_bound'] = 0
    df.loc[idx, 'upper_bound'] = 0

    # get event
    event = df.loc[idx]

    # update widget values
    idx_text.value = get_index_text(idx)
    confidence_toggle.value = 'High'
    triggers_widget.value = get_trigger_text(idx)
    prev_input.value = 0
    fwd_input.value = 0

    # show the trigger sentence again and hide the context panes
    with event_output:
        event_output.clear_output()
        display_ent([nlp(files[event.filename][event.idx])])

    with prev_output:
        prev_output.clear_output()
        prev_output.layout.display = "none"

    with fwd_output:
        fwd_output.clear_output()
        fwd_output.layout.display = "none"

def save_pressed(b):
    """Click handler for "Save DataFrame": persist the labels made so far.

    ``df`` only holds the sampled rows, while ``labelled_path`` is the file the full
    per-user dataset is loaded from. Writing ``df`` there directly would replace the
    full dataset with the sample, so the labelling columns of ``df`` are merged into a
    copy of ``dataset[user]`` and that full frame is written instead.

    Args:
        b: the clicked button (unused).
    """
    label_columns = ['reviewed', 'label', 'confidence', 'lower_bound', 'upper_bound']

    # work on a copy so the in-memory dataset[user] is left as the other cells expect it
    merged = dataset[user].copy()
    for column in label_columns:
        # e.g. 'confidence' is all-NaN (float) when loaded but holds strings once labelled
        if merged[column].dtype != df[column].dtype:
            merged[column] = merged[column].astype(object)
        merged.loc[df.index, column] = df[column]

    merged.to_csv(labelled_path, index=True, index_label='index')

# headings for the row currently under review (position i of the sampled indices)
idx_text = widgets.HTML(value=get_index_text(indices[i]))
triggers_widget = widgets.HTML(value=get_trigger_text(indices[i]))
# spacer widget displayed between the output panes
line_break = widgets.HTML(value='\n')

# how many sentences BEFORE the trigger sentence to show: 0 down to -10
prev_input = widgets.BoundedIntText(
    value=0,
    min=-10,
    max=0,
    step=1,
    description='Previous',
    disabled=False
)

# how many sentences AFTER the trigger sentence to show: 0 up to 10
fwd_input = widgets.BoundedIntText(
    value=0,
    min=0,
    max=10,
    step=1,
    description='Next',
    disabled=False
)

def update_chunk(b):
    """Click handler for "Update Text Range": show the sentences around the trigger sentence.

    Reads the window sizes from ``prev_input`` / ``fwd_input`` (also stored in the globals
    ``prev`` and ``fwd``), renders the preceding and following sentences of the current
    event into ``prev_output`` / ``fwd_output``, and hides a pane whose window size is 0.

    Args:
        b: the clicked button (unused).
    """
    global prev, fwd

    event = df.loc[indices[i]]
    sentences = files[event.filename]

    prev = prev_input.value
    fwd = fwd_input.value

    with prev_output:
        if prev < 0:
            prev_output.clear_output()
            # clamp at 0: a negative start would make the slice count from the END of the
            # document and show unrelated (or no) sentences
            start = max(0, event.idx + prev)
            prev_chunk = list(nlp.pipe(sentences[start : event.idx]))
            if prev_chunk:  # nothing to render when the trigger is the first sentence
                display_ent(prev_chunk)
            prev_output.layout.display = "block"
        else:
            prev_output.layout.display = 'none'

    with fwd_output:
        if fwd > 0:
            fwd_output.clear_output()
            fwd_chunk = list(nlp.pipe(sentences[event.idx + 1 : event.idx + 1 + fwd]))
            if fwd_chunk:  # nothing to render when the trigger is the last sentence
                display_ent(fwd_chunk)
            fwd_output.layout.display = "block"
        else:
            fwd_output.layout.display = 'none'

def toggle_ner(toggle):
    """Click handler for "Update NER Display": re-render the context sentences.

    With the ``ner_toggle`` checkbox ticked the context sentences are rendered with
    entity highlighting; otherwise their plain text is shown. Only panes whose window
    size is non-zero are touched.

    Args:
        toggle: the clicked button (unused); the mode is read from ``ner_toggle``.
    """
    global prev, fwd

    show_entities = ner_toggle.get_interact_value()

    # look the row up from the cursor: the module-level `event` is not guaranteed to
    # track the sentence currently on screen
    event = df.loc[indices[i]]
    sentences = files[event.filename]

    prev = prev_input.value
    fwd = fwd_input.value

    if prev < 0:
        with prev_output:
            # clamp at 0: a negative start would slice from the END of the document
            start = max(0, event.idx + prev)
            prev_chunk = list(nlp.pipe(sentences[start : event.idx]))
            prev_output.clear_output()
            if show_entities:
                if prev_chunk:
                    display_ent(prev_chunk)
            else:
                # each element of the chunk is one processed sentence
                display(' '.join([sentence.text for sentence in prev_chunk]).strip())

            prev_output.layout.display = "block"

    if fwd > 0:
        with fwd_output:
            fwd_chunk = list(nlp.pipe(sentences[event.idx + 1 : event.idx + 1 + fwd]))
            fwd_output.clear_output()
            if show_entities:
                if fwd_chunk:
                    display_ent(fwd_chunk)
            else:
                display(' '.join([sentence.text for sentence in fwd_chunk]).strip())

            fwd_output.layout.display = "block"
            
# output panes: sentences before the trigger, the trigger sentence itself, sentences after it
prev_output = widgets.Output()
event_output = widgets.Output()
fwd_output = widgets.Output()

# the context panes stay hidden until a non-zero range is requested
prev_output.layout.display = "none"
fwd_output.layout.display = "none"

# labelling buttons
yes = widgets.Button(description='Near Miss Event', button_style='success')
yes.on_click(yes_pressed)

no = widgets.Button(description='Not Near Miss Event', button_style='danger')
no.on_click(no_pressed)

# persistence / undo buttons
save = widgets.Button(description='Save DataFrame', button_style='primary')
save.on_click(save_pressed)

revert = widgets.Button(description='Revert Index', button_style='warning')
revert.on_click(revert_pressed)

# context display buttons
update = widgets.Button(description='Update Text Range', button_style='')
update.on_click(update_chunk)

update_ner = widgets.Button(description='Update NER Display', button_style='')
update_ner.on_click(toggle_ner)

# read by toggle_ner: ticked -> entity-highlighted context, unticked -> plain text
ner_toggle = widgets.Checkbox(
    value=True,
    description='External NER',
    disabled=False
)

# confidence stored with each label; label_sentence and revert_changes reset it to 'High'
confidence_toggle = widgets.ToggleButtons(
    options=['Low', 'Medium', 'High'],
    value='High',
    description='Confidence',
    disabled=False,
    button_style='', # 'success', 'info', 'warning', 'danger' or ''
    tooltips=[
        'An uncertain interpretation worth reviewing',
        'A reasonable interpretation of a near miss',
        'A certain interpretation of a near miss'],
)

# when ticked, label_sentence also stores the Previous/Next values as lower_bound/upper_bound
save_chunk_toggle = widgets.Checkbox(
    value=True,
    description='Save Text Chunk Bounds',
    disabled=False,
    #button_style='info',
    #tooltip='Saves lower and upper bound indices on text chunks'
)

# section headings
settings_text =  widgets.HTML(value="<h3>Text Chunk Settings</h3>")
labeller_text = widgets.HTML(value="<h3>Labeller</h3>")

#text_chunk_box = widgets.VBox([settings_text, save_chunk_toggle])

# group related controls into rows / columns
text_buttons = widgets.HBox([update, save_chunk_toggle])
ner_buttons = widgets.HBox([update_ner, ner_toggle])
label_buttons = widgets.HBox([yes, no])
function_buttons = widgets.HBox([save, revert])
buttons = widgets.VBox([label_buttons, function_buttons])

# render the first trigger sentence into the centre pane
with event_output:
    display_ent(nlp(files[event.filename][event.idx]))
    
# seed the (hidden) context panes with placeholder content
with prev_output:
    display(widgets.HTML(value='\n'))

with fwd_output:
    display(widgets.HTML(value=''))
                            
# show the whole labelling UI; the same line_break widget instance is displayed three times
display(idx_text, triggers_widget, settings_text, prev_input, fwd_input, text_buttons, ner_buttons,
        labeller_text, buttons, line_break, confidence_toggle, prev_output, line_break, event_output, 
        line_break, fwd_output,)

HTML(value='<h3>Current Index: 532931 / 100</h3>')

HTML(value='<h3>Trigger Words</h3><ul><li>broad</li></ul>')

HTML(value='<h3>Text Chunk Settings</h3>')

BoundedIntText(value=0, description='Previous', max=0, min=-10)

BoundedIntText(value=0, description='Next', max=10)

HBox(children=(Button(description='Update Text Range', style=ButtonStyle()), Checkbox(value=True, description=…

HBox(children=(Button(description='Update NER Display', style=ButtonStyle()), Checkbox(value=True, description…

HTML(value='<h3>Labeller</h3>')

VBox(children=(HBox(children=(Button(button_style='success', description='Near Miss Event', style=ButtonStyle(…

HTML(value='\n')

ToggleButtons(description='Confidence', index=2, options=('Low', 'Medium', 'High'), tooltips=('An uncertain in…

Output(layout=Layout(display='none'))

HTML(value='\n')

Output()

HTML(value='\n')

Output(layout=Layout(display='none'))

In [None]:
# save backup: writes the working sample `df` (the sampled rows only, not the full
# dataset[user]) to a separate per-user backup file
df.to_csv(os.path.join('data','events',f'{user}_dataset_backup.csv'), index=True, index_label='index')