In [1]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [4]:
from tqdm import tqdm
import os
import pandas as pd
import numpy as np

import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual
from IPython.display import display

import spacy
from spacy import displacy

def display_ent(doc, style="ent", colors=None, options=None, compact=True):
    """Render ``doc`` inline in the notebook with spaCy's displaCy visualiser.

    Parameters
    ----------
    doc
        Whatever ``displacy.render`` accepts; callers in this notebook pass
        either a single parsed sentence or a list of them.
    style : str
        displaCy style name, forwarded unchanged.
    colors : dict, optional
        Label -> CSS colour mapping. When falsy, a purple/pink gradient for
        the ``TRIGGER`` label is used.
    options : dict, optional
        Full displaCy options dict. When supplied (and truthy) it is used
        as-is, so ``colors`` and ``compact`` are ignored.
    compact : bool
        Only used when ``options`` is not supplied.

    Note: the import cell later imports ``display_ent`` from
    ``pipeline.utils.display``, which rebinds this name.
    """
    if not colors:
        colors = {"TRIGGER": "linear-gradient(90deg, #aa9cfc, #fc9ce7)"}
    if not options:
        options = {"ents": None, "colors": colors, "compact": compact}
    displacy.render(doc, style=style, jupyter=True, options=options)

from pipeline.utils.display import display_ent
from pipeline.data.metadata import get_report_data, get_geoview_data
from pipeline.preprocessing.processing import match_triggers, triggers_json_to_df, load_triggers, load_json

In [5]:
# testing engineering for extracting triggers from entity ruler, rather than on matcher
# NOTE: create_trigger_ruler is imported here but is not called in any cell visible in this notebook.
from pipeline.preprocessing.processing import load_spacy_model, create_trigger_ruler

# Build the shared `nlp` pipeline used by the labelling widgets below.
# Per the loader's printed output, this configuration adds the punctuation-removal,
# geological-entity-matcher and trigger-phrase-matcher pipes and returns spaCy docs.
nlp = load_spacy_model(output_type='doc', trigger_matcher=True, lemmatizer=False, geological_matcher=True,
                       stopword_removal=False, punctuation_removal=True, lemmatize_triggers=True)

Added punctuation removal pipe
Added geological entity matcher pipe
Added trigger phrase matcher pipe
Loading spaCy model with spaCy doc output.


# Load DataFrame

In [6]:
# Load the report table and the per-file sentence store.
# `capstone_files` is merged on 'anumber'/'filename' further down; `files` is indexed
# as files[filename][sentence_idx] by the labelling callbacks.
capstone_files, files = get_report_data(count_sentences=True, return_files=True)

Loading files as dict: 100%|██████████| 32646/32646 [00:21<00:00, 1513.87it/s]


In [5]:
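# One-off full sweep (kept commented out for reference): run the pipeline over every document
# and collect the TRIGGER entities found in each sentence. Its grouped output appears to be the
# extracted_triggers_grouped.csv file that is loaded further down.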
# data = {file : [(idx, ent.text) for idx, doc in enumerate(nlp.pipe(
#     sentences, disable=['ner'], n_process=1, cleanup=True, batch_size=10000))
#     for ent in doc.ents if ent.label_ == 'TRIGGER'] for file, sentences in tqdm(
#     files.items(), desc='Extracting TRIGGER entities from documents')}

# from pipeline.preprocessing.processing import data_path
# import json

# with open(data_path / 'sentence_triggers.json', 'w+') as f:
#     json.dump(data, f)

# df = pd.DataFrame([
#     {"filename": filename, "idx": int(match[0]), "trigger": match[1]} 
#     for filename, matches in data.items() for match in matches
# ])

# df.groupby(['filename','idx'])['trigger'].apply(
#     lambda x : ', '.join(x).strip()).reset_index().rename(
#     columns={'trigger':'triggers'})#.to_csv('extracted_triggers_grouped.csv', index_label='triggers_id')


In [8]:
# specify labellers
users = ('daniel','charlie')
dataset = {}

### SPECIFY USER ###
user = 'charlie'

# Fail early with a clear message instead of a bare KeyError on dataset[user] below.
if user not in users:
    raise ValueError(f'Unknown labeller {user!r}; expected one of {users}.')

# get data from path given user
from pathlib import Path
base_path = Path('.')
labels_dir = base_path / 'data' / 'labels'
labelled_path = labels_dir / f'{user}_dataset.csv'

# get geoview data
geoview = get_geoview_data()

# load triggers grouped by sentence_idx in each file - triggers are comma separated strings
extracted_triggers = pd.read_csv(labels_dir / 'extracted_triggers_grouped.csv', index_col=0)

# merge capstone_file data with anumber data with extracted triggers, joined on filename
source = capstone_files.merge(
    geoview[['anumber','report_type']], on='anumber').merge(
    extracted_triggers, on='filename')

if labelled_path.is_file():
    # Resume a previous labelling session for this user.
    print(f'File for {user} at {labelled_path.resolve()} already exists. Loading file.')
    dataset[user] = pd.read_csv(labelled_path, index_col=0)
else:
    print(f'Creating new dataset for {user}.')
    # Start from the source rows and add the labelling columns with their defaults.
    # `assign` broadcasts the scalars over source's own index, so this works for any
    # index and avoids building one Python dict entry per row.
    dataset[user] = source.assign(
        reviewed=False,
        label=False,
        confidence=None,
        lower_bound=0,
        upper_bound=0)

dataset[user].head()

File for charlie at C:\Users\charl\capstone\data\events\charlie_dataset.csv already exists. Loading file.


Unnamed: 0_level_0,filename,anumber,sentence_count,report_type,idx,triggers,reviewed,label,confidence,lower_bound,upper_bound
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,a070455_partial surrender report e59-907_97711...,70455,79,Partial Surrender,31,"encouragement, prospective",False,False,,0,0
1,a070455_partial surrender report e59-907_97711...,70455,79,Partial Surrender,39,potential,False,False,,0,0
2,a070455_partial surrender report e59-907_97711...,70455,79,Partial Surrender,58,"potential, mineralisation",False,False,,0,0
3,a070455_partial surrender report e59-907_97711...,70455,79,Partial Surrender,59,"prospective, mineralisation",False,False,,0,0
4,a070455_partial surrender report e59-907_97711...,70455,79,Partial Surrender,76,"potential, mineralisation",False,False,,0,0


In [13]:
# Preview the first rows of the active labeller's dataset (same view as the end of the previous cell).
dataset[user].head()

Unnamed: 0_level_0,filename,anumber,sentence_count,report_type,idx,triggers,reviewed,label,confidence,lower_bound,upper_bound
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
0,a070455_partial surrender report e59-907_97711...,70455,79,Partial Surrender,31,"encouragement, prospective",False,False,,0,0
1,a070455_partial surrender report e59-907_97711...,70455,79,Partial Surrender,39,potential,False,False,,0,0
2,a070455_partial surrender report e59-907_97711...,70455,79,Partial Surrender,58,"potential, mineralisation",False,False,,0,0
3,a070455_partial surrender report e59-907_97711...,70455,79,Partial Surrender,59,"prospective, mineralisation",False,False,,0,0
4,a070455_partial surrender report e59-907_97711...,70455,79,Partial Surrender,76,"potential, mineralisation",False,False,,0,0


In [14]:
# Write the active labeller's dataset to `labelled_path`; the setup cell reloads this file
# (instead of rebuilding from `source`) whenever it already exists.
dataset[user].to_csv(labelled_path, index=True, index_label='index')

In [15]:
from sklearn.model_selection import train_test_split

# Fixed random seed passed to the stratified sampler below so the batch is reproducible.
seed = 10

def stratify_source_data(source, n, prop=False, seed=None, stratify_col='report_type'):
    """Split ``source`` into stratified train/test index and label sets.

    Parameters
    ----------
    source
        Frame exposing ``.index``, a ``label`` column and ``stratify_col``.
    n
        Size of the test split, handed to ``train_test_split`` as ``test_size``.
    prop : bool
        When truthy, ``n`` is treated as a row count and divided by
        ``len(source)`` before use; when falsy, ``n`` is passed through
        unchanged.
    seed
        Forwarded as ``random_state``.
    stratify_col : str
        Column whose values are used for stratification.

    Returns
    -------
    tuple
        ``(train_idx, test_idx, train_label, test_label)``.
    """
    # Convert a count into a fraction of the frame only when asked to.
    test_size = n / len(source) if prop else n

    idx_train, idx_test, y_train, y_test = train_test_split(
        source.index,
        source.label,
        test_size=test_size,
        random_state=seed,
        stratify=source[stratify_col],
    )
    return idx_train, idx_test, y_train, y_test

def get_indices(dataset, n=100, prop=False, seed=None):
    """Return only the test-side index from ``stratify_source_data``.

    All arguments are forwarded unchanged; the train index and both label
    splits are discarded.
    """
    split = stratify_source_data(dataset, n=n, prop=prop, seed=seed)
    return split[1]

In [16]:
# Draw a stratified sample of row labels to review. Keep `seed` fixed so the same batch
# is regenerated on every run.
batch_size = 2000
batch_indices = get_indices(dataset[user], n=batch_size, seed=seed)
batch_indices

Int64Index([345790,   7617, 196549, 415468, 496762,  26180, 250306,  16062,
             23146, 234261,
            ...
            261615, 463083,  80028, 257020, 267195, 240767, 554063, 559163,
            253669,  53413],
           dtype='int64', name='index', length=2000)

In [17]:
# Working copy for this session: the sampled batch rows that have not been reviewed yet.
# NOTE(review): the boolean mask is computed on the full dataset while the frame being
# filtered is only the batch subset, so this relies on pandas aligning the mask by index
# label - confirm this keeps working on the pandas version in use.
df = dataset[user].loc[batch_indices].loc[~dataset[user].reviewed]
indices = df.index

# row labels in the batch that haven't been reviewed
indices

Int64Index([345790,   7617, 196549, 415468, 496762,  26180, 250306,  16062,
             23146, 234261,
            ...
            261615, 463083,  80028, 257020, 267195, 240767, 554063, 559163,
            253669,  53413],
           dtype='int64', name='index', length=1936)

In [18]:
# Module-level state shared with the widget callbacks defined below.
i = 0  # position within `indices` of the row currently being labelled; callbacks change it via `global i`
prev = 0  # last value read from the 'Previous' input (sentences shown before the trigger sentence)
fwd = 0  # last value read from the 'Next' input (sentences shown after the trigger sentence)
event = df.loc[indices[i]]  # row for the first sentence; callbacks that assign `event` bind their own local

def get_index_text(idx):
    """Build the HTML header naming the dataset row being labelled.

    Parameters
    ----------
    idx
        Row label (index value) of the current row. Previously this argument
        was ignored and the globals ``indices[i]`` were read instead, which
        only matched because every caller happened to pass ``indices[i]``.

    Returns
    -------
    str
        ``<h3>`` snippet of the form ``Current Index: <idx> / <len(source)>``.
    """
    return f"<h3>Current Index: {idx} / {len(source)}</h3>"

def get_trigger_text(idx):
    """Build the HTML bullet list of trigger words for row ``idx`` of ``df``.

    The ``triggers`` cell is a comma-separated string. Items are stripped of
    surrounding whitespace (the stored separator is ``", "``), empty items
    are dropped, and a missing/non-string cell yields an empty list instead
    of raising on ``.split``.

    Returns
    -------
    str
        ``<h3>Trigger Words</h3>`` followed by a ``<ul>`` of the triggers.
    """
    raw_triggers = df.loc[idx, 'triggers']
    if isinstance(raw_triggers, str):
        words = [w.strip() for w in raw_triggers.split(',')]
    else:
        words = []  # e.g. NaN when no triggers were recorded for the row
    trigger_words = ''.join(f'<li>{w}</li>' for w in words if w)
    return f"<h3>Trigger Words</h3><ul>{trigger_words}</ul>"

def yes_pressed(b):
    """Click handler for the 'Near Miss Event' button: label the current sentence True.

    ``b`` is the clicked button supplied by ipywidgets; it is not used.
    """
    label_sentence(True)

def no_pressed(b):
    """Click handler for the 'Not Near Miss Event' button: label the current sentence False.

    ``b`` is the clicked button supplied by ipywidgets; it is not used.
    """
    label_sentence(False)
    
def revert_pressed(b):
    """Click handler for the 'Revert Index' button: undo the most recent label.

    ``b`` is the clicked button supplied by ipywidgets; it is not used.
    """
    return revert_changes()

def label_sentence(label):
    """Record the reviewer's decision for the current sentence, then show the next one.

    Writes ``reviewed``, ``label`` and ``confidence`` into the working frame
    ``df`` for the row at ``indices[i]`` (plus ``lower_bound`` /
    ``upper_bound`` when the 'Save Text Chunk Bounds' checkbox is ticked),
    advances the module-level cursor ``i`` and refreshes the widgets.

    When the last row of the batch has been labelled, the cursor is left one
    past the end, a completion message is shown, and further clicks are
    ignored. Previously this case raised ``IndexError`` on ``indices[i]``.

    Parameters
    ----------
    label : bool
        True for 'Near Miss Event', False for 'Not Near Miss Event'.
    """
    global i

    # Batch already finished: there is no current row left to label.
    if i >= len(indices):
        return

    current = indices[i]

    # get confidence from button
    confidence = confidence_toggle.get_interact_value()

    # assign data
    df.loc[current, 'reviewed'] = True
    df.loc[current, 'label'] = label
    df.loc[current, 'confidence'] = confidence

    # save lower_idx and upper_idx
    if save_chunk_toggle.get_interact_value():
        df.loc[current, 'lower_bound'] = prev_input.value
        df.loc[current, 'upper_bound'] = fwd_input.value

    # increment to next index
    i += 1

    # reset the controls to their defaults for the next sentence
    confidence_toggle.value = 'High'
    prev_input.value = 0
    fwd_input.value = 0

    # hide the context panes belonging to the sentence just labelled
    with prev_output:
        prev_output.clear_output()
        prev_output.layout.display = "none"

    with fwd_output:
        fwd_output.clear_output()
        fwd_output.layout.display = "none"

    # Ran off the end of the batch: report completion instead of indexing past the end.
    if i >= len(indices):
        idx_text.value = "<h3>Batch complete: every sampled sentence has been reviewed</h3>"
        triggers_widget.value = ""
        with event_output:
            event_output.clear_output()
        return

    upcoming = indices[i]
    event = df.loc[upcoming]
    idx_text.value = get_index_text(upcoming)
    triggers_widget.value = get_trigger_text(upcoming)

    # render the next trigger sentence
    with event_output:
        event_output.clear_output()
        display_ent([nlp(files[event.filename][event.idx])])
        
def revert_changes():
    """Undo the most recent label and step the cursor back to that row.

    Resets ``reviewed``, ``label``, ``confidence``, ``lower_bound`` and
    ``upper_bound`` on the previous row of the batch to their defaults and
    re-displays that sentence.

    Does nothing when the cursor is already at the first row. Previously the
    cursor became -1 there, and ``indices[-1]`` silently reset the labels on
    the LAST row of the batch.
    """
    global i

    # Nothing has been labelled yet, so there is nothing to revert.
    if i <= 0:
        return

    # revert idx back one
    i -= 1
    current = indices[i]

    # reset data
    df.loc[current, 'reviewed'] = False
    df.loc[current, 'label'] = False
    df.loc[current, 'confidence'] = None
    df.loc[current, 'lower_bound'] = 0
    df.loc[current, 'upper_bound'] = 0

    # get event
    event = df.loc[current]

    # update widget values
    idx_text.value = get_index_text(current)
    confidence_toggle.value = 'High'
    triggers_widget.value = get_trigger_text(current)
    prev_input.value = 0
    fwd_input.value = 0

    # re-render the reverted trigger sentence
    with event_output:
        event_output.clear_output()
        display_ent([nlp(files[event.filename][event.idx])])

    # hide any context panes that were open
    with prev_output:
        prev_output.clear_output()
        prev_output.layout.display = "none"

    with fwd_output:
        fwd_output.clear_output()
        fwd_output.layout.display = "none"

def save_pressed(b):
    """Click handler for 'Save DataFrame'.

    Copies the working batch ``df`` back into the active labeller's dataset
    at the batch row labels, then writes the whole dataset to
    ``labelled_path``. ``b`` (the clicked button) is not used.
    """
    user_frame = dataset[user]
    user_frame.loc[indices] = df
    user_frame.to_csv(labelled_path, index=True, index_label='index')

# Header widgets for the current row; the callbacks overwrite their `.value` as the cursor moves.
idx_text = widgets.HTML(value=get_index_text(indices[i]))
triggers_widget = widgets.HTML(value=get_trigger_text(indices[i]))
# Spacer widget; the same instance is passed to display() several times below.
line_break = widgets.HTML(value='\n')

# Number of sentences of context to show BEFORE the trigger sentence.
# The value is zero or negative (range -10..0) and is added to the sentence idx as an offset.
prev_input = widgets.BoundedIntText(
    value=0,
    min=-10,
    max=0,
    step=1,
    description='Previous',
    disabled=False
)

# Number of sentences of context to show AFTER the trigger sentence (range 0..10).
fwd_input = widgets.BoundedIntText(
    value=0,
    min=0,
    max=10,
    step=1,
    description='Next',
    disabled=False
)

def update_chunk(b):
    """Click handler for 'Update Text Range': show context around the current sentence.

    Reads the 'Previous' (<= 0) and 'Next' (>= 0) inputs, stores them in the
    module-level ``prev`` / ``fwd``, and renders that many sentences before
    and after the current trigger sentence into ``prev_output`` /
    ``fwd_output``. A pane is hidden when its input is 0.

    The start of the 'previous' slice is clamped at 0. Previously a trigger
    near the top of a document produced a negative start, which wrapped
    around to the end of the sentence list and showed the wrong (or no)
    context. Nothing happens when the cursor is past the end of the batch.

    ``b`` is the clicked button supplied by ipywidgets; it is not used.
    """
    global prev
    global fwd

    # No current row (batch finished): nothing to show.
    if i >= len(indices):
        return

    event = df.loc[indices[i]]

    prev = prev_input.value
    fwd = fwd_input.value

    with prev_output:
        if prev < 0:
            prev_output.clear_output()
            start = max(0, event.idx + prev)  # never slice from before the first sentence
            prev_chunk = list(nlp.pipe(files[event.filename][start : event.idx]))
            display_ent(prev_chunk)
            prev_output.layout.display = "block"
        else:
            prev_output.layout.display = 'none'

    with fwd_output:
        if fwd > 0:
            fwd_output.clear_output()
            # slicing past the end of the list is safe; it simply yields fewer sentences
            fwd_chunk = list(nlp.pipe(files[event.filename][event.idx + 1 : event.idx + 1 + fwd]))
            display_ent(fwd_chunk)
            fwd_output.layout.display = "block"
        else:
            fwd_output.layout.display = 'none'

def toggle_ner(toggle):
    """Click handler for 'Update NER Display': re-render the open context panes.

    With the 'External NER' checkbox ticked, the context sentences are
    rendered through ``display_ent``; otherwise their plain text is shown.
    Only panes whose input is non-zero are touched.

    The current row is looked up from ``indices[i]`` on every call.
    Previously the module-level ``event`` was used, which is only assigned
    once at start-up, so the context shown always belonged to the first
    sentence of the batch. The start of the 'previous' slice is also clamped
    at 0 so it cannot wrap around to the end of the document.

    ``toggle`` is the clicked button supplied by ipywidgets; its value is
    replaced by the checkbox state.
    """
    global prev
    global fwd

    # No current row (batch finished): nothing to show.
    if i >= len(indices):
        return

    event = df.loc[indices[i]]
    toggle = ner_toggle.get_interact_value()

    prev = prev_input.value
    fwd = fwd_input.value

    if prev < 0:
        with prev_output:
            start = max(0, event.idx + prev)  # never slice from before the first sentence
            prev_chunk = list(nlp.pipe(files[event.filename][start : event.idx]))
            prev_output.clear_output()
            if toggle:
                display_ent(prev_chunk)
            else:
                # each element is one parsed sentence; join their raw text
                display(' '.join(doc.text for doc in prev_chunk).strip())

            prev_output.layout.display = "block"

    if fwd > 0:
        with fwd_output:
            fwd_chunk = list(nlp.pipe(files[event.filename][event.idx + 1 : event.idx + 1 + fwd]))
            fwd_output.clear_output()
            if toggle:
                display_ent(fwd_chunk)
            else:
                display(' '.join(doc.text for doc in fwd_chunk).strip())

            fwd_output.layout.display = "block"
            
# Output areas: context before the trigger sentence, the trigger sentence itself, context after it.
prev_output = widgets.Output()
event_output = widgets.Output()
fwd_output = widgets.Output()

# Context panes start hidden; the callbacks switch them to "block" when context is requested.
prev_output.layout.display = "none"
fwd_output.layout.display = "none"

# Labelling buttons: each click records a label for the current sentence and advances.
yes = widgets.Button(description='Near Miss Event', button_style='success')
yes.on_click(yes_pressed)

no = widgets.Button(description='Not Near Miss Event', button_style='danger')
no.on_click(no_pressed)

# Writes the working batch back into the user's dataset and saves it to CSV.
save = widgets.Button(description='Save DataFrame', button_style='primary')
save.on_click(save_pressed)

# Steps back one sentence and resets its label fields.
revert = widgets.Button(description='Revert Index', button_style='warning')
revert.on_click(revert_pressed)

# Re-renders the context panes from the 'Previous' / 'Next' inputs.
update = widgets.Button(description='Update Text Range', button_style='')
update.on_click(update_chunk)

# Re-renders the open context panes according to the 'External NER' checkbox (defined below).
update_ner = widgets.Button(description='Update NER Display', button_style='')
update_ner.on_click(toggle_ner)

# Checkbox read by toggle_ner: ticked -> entity rendering, unticked -> plain text.
ner_toggle = widgets.Checkbox(
    value=True,
    description='External NER',
    disabled=False
)

# Reviewer confidence stored with each label; the callbacks reset it to 'High' after every sentence.
confidence_toggle = widgets.ToggleButtons(
    options=['Low', 'Medium', 'High'],
    value='High',
    description='Confidence',
    disabled=False,
    button_style='', # 'success', 'info', 'warning', 'danger' or ''
    tooltips=[
        'An uncertain interpretation worth reviewing',
        'A reasonable interpretation of a near miss',
        'A certain interpretation of a near miss'],
)

# When ticked, label_sentence also stores the 'Previous' / 'Next' values as lower_bound / upper_bound.
save_chunk_toggle = widgets.Checkbox(
    value=True,
    description='Save Text Chunk Bounds',
    disabled=False,
    #button_style='info',
    #tooltip='Saves lower and upper bound indices on text chunks'
)

# Section headings.
settings_text =  widgets.HTML(value="<h3>Text Chunk Settings</h3>")
labeller_text = widgets.HTML(value="<h3>Labeller</h3>")

#text_chunk_box = widgets.VBox([settings_text, save_chunk_toggle])

# Layout containers: each HBox is one row of controls; `buttons` stacks the label row over the save/revert row.
text_buttons = widgets.HBox([update, save_chunk_toggle])
ner_buttons = widgets.HBox([update_ner, ner_toggle])
label_buttons = widgets.HBox([yes, no])
function_buttons = widgets.HBox([save, revert])
buttons = widgets.VBox([label_buttons, function_buttons])

# Render the first trigger sentence. Note this passes a single parsed sentence,
# whereas the callbacks pass a one-element list.
with event_output:
    display_ent(nlp(files[event.filename][event.idx]))
    
# Seed the (initially hidden) context panes with placeholder content.
with prev_output:
    display(widgets.HTML(value='\n'))

with fwd_output:
    display(widgets.HTML(value=''))
                            
# Show the whole labelling UI, top to bottom in argument order.
display(idx_text, triggers_widget, settings_text, prev_input, fwd_input, text_buttons, ner_buttons,
        labeller_text, buttons, line_break, confidence_toggle, prev_output, line_break, event_output, 
        line_break, fwd_output,)

HTML(value='<h3>Current Index: 345790 / 580746</h3>')

HTML(value='<h3>Trigger Words</h3><ul><li>optimising</li></ul>')

HTML(value='<h3>Text Chunk Settings</h3>')

BoundedIntText(value=0, description='Previous', max=0, min=-10)

BoundedIntText(value=0, description='Next', max=10)

HBox(children=(Button(description='Update Text Range', style=ButtonStyle()), Checkbox(value=True, description=…

HBox(children=(Button(description='Update NER Display', style=ButtonStyle()), Checkbox(value=True, description…

HTML(value='<h3>Labeller</h3>')

VBox(children=(HBox(children=(Button(button_style='success', description='Near Miss Event', style=ButtonStyle(…

HTML(value='\n')

ToggleButtons(description='Confidence', index=2, options=('Low', 'Medium', 'High'), tooltips=('An uncertain in…

Output(layout=Layout(display='none'))

HTML(value='\n')

Output()

HTML(value='\n')

Output(layout=Layout(display='none'))

In [58]:
# `df` is the in-memory working copy for the current batch; show the rows in it marked as reviewed.
df.loc[df.reviewed]

Unnamed: 0_level_0,filename,anumber,sentence_count,report_type,idx,triggers,reviewed,label,confidence,lower_bound,upper_bound
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
345790,a088286_caz_pil_e47_1617_2010an_15364631.json,88286,105,Annual,93,optimising,True,True,Medium,-1,0
7617,a072262_bhp383165_2006a_14910125.json,72262,29,Annual,6,proposed,True,False,High,0,1
196549,a081133_c198_2007_text_2008a_15349268.json,81133,476,Annual,260,significantly,True,False,High,-1,1
415468,a093792_e52_2284_2011_2012_a_11050090.json,93792,59,Final Surrender,46,anomalous,True,False,High,0,0
496762,a100684_c210_2010_2013s_11560360.json,100684,576,Final Surrender,184,"mineralisation, prospect",True,False,High,0,1
...,...,...,...,...,...,...,...,...,...,...,...
466761,a098135_c12-2007 annual report mt jackson east...,98135,62,Annual,28,mineralisation,True,False,High,0,1
310962,a086838_c73_2008_2010a_14517258.json,86838,474,Annual,47,mineralisation,True,False,High,0,0
368079,a089813_lake_mackay_c135_2008_2011a_14936741.json,89813,173,Annual,104,prospect,True,False,High,-1,1
91054,a076280_cat camp report_feb2007_10604178.json,76280,432,Annual,212,significant,True,False,High,0,1


In [61]:
# dataset[user] is your main dataset; `indices` are the row labels of the batch being labelled.
# Note that `df` is only written back into dataset[user] by the 'Save DataFrame' button
# or by assigning it manually (as the backup cell does).
dataset[user].loc[dataset[user].reviewed]

Unnamed: 0_level_0,filename,anumber,sentence_count,report_type,idx,triggers,reviewed,label,confidence,lower_bound,upper_bound
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
118,a071633_27600 morrissey_9355974.json,71633,92,Final Surrender,51,extensive,True,False,High,0,0
3094,a071977_kurnalpi 2005_19331320.json,71977,171,Annual,138,mineralization,True,True,High,-1,1
3728,a072020_goldenvalley_ann_05_11276734.json,72020,294,Annual,61,prospects,True,False,High,0,0
5897,a072158_c15_2006_2005a_15133097.json,72158,186,Annual,63,mineralisation,True,True,Low,0,0
6304,a072178_eureka annual report 2005-2006_1467518...,72178,68,Annual,50,"underground resource, underground resource",True,True,High,0,1
...,...,...,...,...,...,...,...,...,...,...,...
578922,a109668_a109668_v1_report.json,109668,88,Final Surrender,76,presence of,True,False,High,0,1
580021,a109843_a109843_v1_report.json,109843,75,Partial Surrender,22,"high grade, exploration drilling",True,False,High,0,1
580132,a109904_a109904_v1_report.json,109904,201,Partial Surrender,53,mineralisation,True,True,Low,0,2
580251,a109930_a109930_v1_report.json,109930,118,Final Surrender,23,significant,True,False,High,0,0


In [37]:
# Save a backup: copy the working batch back into the main dataset, then write the whole
# dataset to a separate '<user>_dataset_backup.csv' file next to the main CSV.
dataset[user].loc[indices] = df
dataset[user].to_csv(os.path.join('data','labels',f'{user}_dataset_backup.csv'), index=True, index_label='index')

In [62]:
# Preview the merged source table (before any labelling columns are added).
source.head()

Unnamed: 0,filename,anumber,sentence_count,report_type,idx,triggers
0,a070455_partial surrender report e59-907_97711...,70455,79,Partial Surrender,31,"encouragement, prospective"
1,a070455_partial surrender report e59-907_97711...,70455,79,Partial Surrender,39,potential
2,a070455_partial surrender report e59-907_97711...,70455,79,Partial Surrender,58,"potential, mineralisation"
3,a070455_partial surrender report e59-907_97711...,70455,79,Partial Surrender,59,"prospective, mineralisation"
4,a070455_partial surrender report e59-907_97711...,70455,79,Partial Surrender,76,"potential, mineralisation"
