In [87]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [88]:
from tqdm import tqdm
import os
import pandas as pd
import numpy as np

import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual
from IPython.display import display

import spacy
from spacy import displacy

from pipeline.utils.display import display_ent
from pipeline.data.metadata import get_report_data, get_geoview_data
from pipeline.preprocessing.processing import match_triggers, triggers_json_to_df, load_triggers, load_json

In [89]:
# testing engineering for extracting triggers from entity ruler, rather than on matcher
from pipeline.preprocessing.processing import load_spacy_model, create_trigger_ruler

# Build the spaCy pipeline that is applied to sentence strings further down the notebook.
# Trigger and geological matcher pipes are switched on; lemmatisation of the text,
# stop-word removal and punctuation removal are switched off.
nlp = load_spacy_model(
    output_type='doc',
    trigger_matcher=True,
    lemmatizer=False,
    geological_matcher=True,
    stopword_removal=False,
    punctuation_removal=False,
    lemmatize_triggers=True,
)

Added geological entity matcher pipe
Added trigger phrase matcher pipe
Loading spaCy model with spaCy doc output.


# Load DataFrame

In [90]:
# `capstone_files` is merged with the geoview and trigger tables further down;
# `files` is indexed later as files[filename][sentence_idx] to fetch a single sentence.
capstone_files, files = get_report_data(count_sentences=True, return_files=True)

Loading files as dict: 100%|██████████| 32646/32646 [00:02<00:00, 14672.21it/s]


In [91]:
# full sweep of all documents on triggers to build data manually
# data = {file : [(idx, ent.text) for idx, doc in enumerate(nlp.pipe(
#     sentences, disable=['ner'], n_process=1, cleanup=True, batch_size=10000))
#     for ent in doc.ents if ent.label_ == 'TRIGGER'] for file, sentences in tqdm(
#     files.items(), desc='Extracting TRIGGER entities from documents')}

# from pipeline.preprocessing.processing import data_path
# import json

# with open(data_path / 'sentence_triggers.json', 'w+') as f:
#     json.dump(data, f)

# df = pd.DataFrame([
#     {"filename": filename, "idx": int(match[0]), "trigger": match[1]} 
#     for filename, matches in data.items() for match in matches
# ])

# df.groupby(['filename','idx'])['trigger'].apply(
#     lambda x : ', '.join(x).strip()).reset_index().rename(
#     columns={'trigger':'triggers'})#.to_csv('extracted_triggers_grouped.csv', index_label='triggers_id')


In [92]:
# specify labellers
users = ('daniel','charlie')
dataset = {}

### SPECIFY USER ###
user = 'daniel'

# every input/output of the labelling tool lives under <base_path>/data/events
from pathlib import Path
base_path = Path('.')
events_path = base_path / 'data' / 'events'

# one CSV per labeller; `usr` (not `user`) is the loop name so the selected labeller
# above is never visually confused with the comprehension variable
labelled_path = {usr: events_path / f'{usr}_dataset.csv' for usr in users}

# get geoview data
geoview = get_geoview_data()

# load triggers grouped by sentence_idx in each file - triggers are comma separated strings
extracted_triggers = pd.read_csv(events_path / 'extracted_triggers_grouped.csv', index_col=0)

# merge capstone_file data with anumber data with extracted triggers, joined on filename
source = capstone_files.merge(
    geoview[['anumber','report_type']], on='anumber').merge(
    extracted_triggers, on='filename').sort_values(['anumber','filename','idx'])

for usr in users:
    if labelled_path[usr].is_file():
        print(f'File for {usr} at {labelled_path[usr].resolve()} already exists. Loading file.')
        dataset[usr] = pd.read_csv(labelled_path[usr], index_col=0)
    else:
        print(f'Creating new dataset for {usr}.')
        # Start every labeller from a fresh copy of the source rows with empty annotation
        # columns. assign() keeps the source index as-is, so this no longer depends on the
        # index being exactly 0..n-1 and avoids converting the whole frame to Python dicts.
        dataset[usr] = source.assign(
            reviewed=False,
            label=False,
            confidence=None,
            lower_bound=0,
            upper_bound=0,
        )

source.head()

File for daniel at /home/dawn/capstone/data/events/daniel_dataset.csv already exists. Loading file.
File for charlie at /home/dawn/capstone/data/events/charlie_dataset.csv already exists. Loading file.


Unnamed: 0,filename,anumber,sentence_count,report_type,idx,triggers
0,a080307_mk_east_surr report_2008_e39_1331_1531...,80307,44,Final Surrender,15,prospectivity
1,a080307_mk_east_surr report_2008_e39_1331_1531...,80307,44,Final Surrender,16,delineate
2,a080307_mk_east_surr report_2008_e39_1331_1531...,80307,44,Final Surrender,29,potentially
3,a080307_mk_east_surr report_2008_e39_1331_1531...,80307,44,Final Surrender,30,"delineate, worthwhile"
4,a080307_mk_east_surr report_2008_e39_1331_1531...,80307,44,Final Surrender,32,"delineate, prospective"


In [93]:
# progress report: number of rows each labeller has flagged as reviewed
for usr in users:
    user_rows = dataset[usr]
    reviewed_rows = user_rows.loc[user_rows.reviewed]
    print(f'{usr} has reviewed {len(reviewed_rows)} events.')

daniel has reviewed 251 events.
charlie has reviewed 502 events.


In [122]:
# Re-sort daniel's dataset and write it back with rows renumbered 0..n-1.
# NOTE: reset_index(drop=True) discards the existing row labels, so row labels stored
# before this rewrite (e.g. saved batch indices) may no longer point at the same rows.
daniel_sorted = dataset['daniel'].sort_values(['anumber','filename','idx'])
daniel_sorted = daniel_sorted.reset_index(drop=True)
daniel_sorted.to_csv('data/events/daniel_dataset.csv', index=True, index_label='index')

In [95]:
# inspect the row labels of `source` once ordered by report number, file and sentence index
source.sort_values(['anumber','filename','idx']).index

Unnamed: 0,filename,anumber,sentence_count,report_type,idx,triggers
60673,a070455_partial surrender report e59-907_97711...,70455,79,Partial Surrender,31,"encouragement, prospective"
60674,a070455_partial surrender report e59-907_97711...,70455,79,Partial Surrender,39,potential
60675,a070455_partial surrender report e59-907_97711...,70455,79,Partial Surrender,58,"potential, mineralisation"
60676,a070455_partial surrender report e59-907_97711...,70455,79,Partial Surrender,59,"prospective, mineralisation"
60677,a070455_partial surrender report e59-907_97711...,70455,79,Partial Surrender,76,"potential, mineralisation"
...,...,...,...,...,...,...
116299,a110187_a110187_v1_report.json,110187,74,Partial Surrender,57,mineralisation
304446,a110191_p812 thin section descriptions.json,110191,95,Core Library Drilling,43,could be
304447,a110191_p812 thin section descriptions.json,110191,95,Core Library Drilling,50,mineralization
304448,a110191_p812 thin section descriptions.json,110191,95,Core Library Drilling,63,mineralization


In [9]:
#dataset[user].to_csv(base_path / 'data' / 'events' / f'{user}_dataset.csv', index=True, index_label='index')

In [67]:
from sklearn.model_selection import train_test_split

# random_state handed to the stratified sampler (via get_indices) so a batch can be re-drawn
seed = 10

def stratify_source_data(source, n, prop=False, seed=None, stratify_col='report_type'):
    """Split ``source`` into train/test row labels and labels, stratified on one column.

    Parameters
    ----------
    source : DataFrame with a ``label`` column and the column named by ``stratify_col``.
    n : size of the test split, forwarded to ``train_test_split`` as ``test_size``.
    prop : when True, ``n`` is first divided by ``len(source)`` (i.e. a row count is
        turned into a fraction of the data); when False ``n`` is forwarded unchanged.
    seed : forwarded as ``random_state``.
    stratify_col : name of the column used for stratification.

    Returns
    -------
    tuple ``(train_idx, test_idx, train_label, test_label)``.
    """
    test_size = n / len(source) if prop else n

    idx_train, idx_test, y_train, y_test = train_test_split(
        source.index,
        source.label,
        test_size=test_size,
        random_state=seed,
        stratify=source[stratify_col],
    )
    return idx_train, idx_test, y_train, y_test

def get_indices(dataset, n=100, prop=False, seed=None):
    """Return only the test-split row labels from ``stratify_source_data``.

    All arguments are forwarded unchanged; the train labels and both label
    series of the split are discarded.
    """
    _train_idx, sampled_idx, _train_label, _test_label = stratify_source_data(
        dataset, n=n, prop=prop, seed=seed)
    return sampled_idx

In [11]:
# Draw a stratified batch of row labels from charlie's dataset.
# The seed is passed explicitly so the same batch can be drawn again on a later run.
batch_size = 100
batch_indices = get_indices(dataset['charlie'], n=batch_size, seed=seed)
batch_indices

Int64Index([285412,  26778,  31162, 286664, 408324, 349105,  34920, 250797,
            328911, 260302, 568349,  70275, 543935, 571715, 225574, 231065,
            275411, 129557,  91511, 265049, 400619, 419397, 372212, 421557,
            580662, 447749, 576518, 305827, 366798, 420271, 127401, 245968,
            243974, 475728,  64203, 497250, 273766, 390860, 429558, 236348,
            187472,  10158, 317314, 476414, 480795,  28965,  12979,  90799,
            397851, 235563, 459611,  31018, 374605, 352860, 561565, 433264,
            296896, 124348, 440417,  38657, 460871, 138161, 311304, 227331,
            522088,  43070,  83093, 398852,  35131, 243003, 494495, 362759,
            188415, 141145, 143640, 216147, 108694,  61997, 360421,  67367,
            530495, 147586, 532845, 286687, 317556, 244622, 455905, 382037,
            305466, 383540, 537033, 287889, 522100, 493038,  42884,  48654,
            290547, 428452, 566939,  27115],
           dtype='int64', name='index')

In [71]:
# use the batch stored on disk instead of re-sampling: first column of the CSV as an array
batch_indices = pd.read_csv('seed10_batch100_indices.csv').values[:, 0]
batch_indices

array([  9092,  11186,  11551,  20061,  24695,  26427,  26998,  28457,
        33189,  33414,  33806,  37263,  38096,  40028,  41786,  48461,
        48485,  48754,  50267,  54750,  54967,  56041,  60868,  60963,
        62598,  65214,  67138,  70347,  70470,  73079,  75706,  77245,
        77587,  81061,  81599,  82310,  89038,  89542,  95963,  99048,
       102190, 105701, 108889, 109854, 110395, 111796, 113777, 114116,
       115287, 115976, 118263, 122704, 129023, 134408, 140610, 143150,
       143604, 147669, 147845, 147859, 147863, 149553, 157524, 165772,
       169365, 170846, 172874, 174548, 177526, 178091, 182736, 193060,
       193530, 197335, 203459, 203749, 206038, 210626, 211227, 211289,
       212172, 212723, 213076, 214544, 217511, 218569, 224976, 225551,
       226770, 228276, 228972, 229145, 229342, 236386, 239064, 241362,
       247558, 252743, 256837, 259248, 261065, 269193, 270295, 278521,
       285500, 294844, 295760, 298636, 307993, 309426, 311953, 315813,
      

In [72]:
# Working copy for this session: the user's rows for the sampled batch,
# minus every row that has already been reviewed.
not_reviewed = ~dataset[user].reviewed
df = dataset[user].loc[batch_indices].loc[not_reviewed]
indices = df.index

# row labels in the batch that still need a decision
print(f'sample size: {len(indices)}')
indices

sample size: 210


Int64Index([  9092,  11186,  11551,  20061,  24695,  26427,  26998,  28457,
             33189,  33414,
            ...
            547574, 550630, 554353, 554508, 556744, 564183, 564281, 569614,
            571824, 579969],
           dtype='int64', name='index', length=210)

In [73]:
# Labelling session state shared (as globals) by the widget callbacks below:
#   i    - position in `indices` of the event currently on screen
#   prev - number of context sentences shown before the trigger sentence (<= 0)
#   fwd  - number of context sentences shown after the trigger sentence (>= 0)
i = prev = fwd = 0

# row of `df` for the first event to display
event = df.loc[indices[i]]

def get_index_text(idx, total=None):
    """Build the HTML header naming the row that is currently being labelled.

    Parameters
    ----------
    idx : row label to show. Previously this argument was ignored and the
        globals ``indices[i]`` were read instead.
    total : number shown after the slash; defaults to ``len(source)``, which is
        what the header has always displayed.

    Returns
    -------
    str : an ``<h3>`` snippet for an ``ipywidgets.HTML`` widget.
    """
    if total is None:
        total = len(source)
    return f"<h3>Current Index: {idx} / {total}</h3>"

def get_trigger_text(idx):
    """Build the HTML list of trigger words for row ``idx`` of ``df``.

    The ``triggers`` cell is a comma separated string. The phrases are joined
    with ", " when the file is produced, so each piece is stripped of the
    surrounding whitespace (and empty pieces are dropped) before being wrapped
    in ``<li>`` tags.

    Returns
    -------
    str : an ``<h3>`` heading followed by a ``<ul>`` of trigger words.
    """
    pieces = (piece.strip() for piece in df.loc[idx, 'triggers'].split(','))
    trigger_words = ''.join(f'<li>{piece}</li>' for piece in pieces if piece)
    return f"<h3>Trigger Words</h3><ul>{trigger_words}</ul>"

def yes_pressed(b):
    """Click handler registered on the 'Near Miss Event' button; labels the current event True.

    ``b`` is the argument supplied by ``Button.on_click`` and is not used.
    """
    label_sentence(label=True)

def no_pressed(b):
    """Click handler registered on the 'Not Near Miss Event' button; labels the current event False.

    ``b`` is the argument supplied by ``Button.on_click`` and is not used.
    """
    label_sentence(label=False)
    
def revert_pressed(b):
    """Click handler registered on the 'Revert Index' button; steps back one event.

    ``b`` is the argument supplied by ``Button.on_click`` and is not used.
    """
    revert_changes()

def label_sentence(label):
    """Store the decision for the event on screen and move on to the next one.

    Writes ``reviewed``, ``label`` and ``confidence`` into the current row of
    ``df`` (plus ``lower_bound``/``upper_bound`` when the 'Save Text Chunk
    Bounds' checkbox is ticked), advances the global position ``i``, resets the
    controls and renders the next event. Nothing is written to ``dataset`` or
    to disk here.

    Parameters
    ----------
    label : bool
        Value stored in the ``label`` column for the current event.
    """
    global i

    # Batch already finished: ignore further clicks rather than raising IndexError.
    if i >= len(indices):
        return

    current = indices[i]

    # get confidence from button
    confidence = confidence_toggle.get_interact_value()

    # assign data
    df.loc[current, 'reviewed'] = True
    df.loc[current, 'label'] = label
    df.loc[current, 'confidence'] = confidence

    # save lower_idx and upper_idx
    if save_chunk_toggle.get_interact_value():
        df.loc[current, 'lower_bound'] = prev_input.value
        df.loc[current, 'upper_bound'] = fwd_input.value

    # increment to next index
    i += 1

    # put the controls back to their defaults for the next event
    confidence_toggle.value = 'High'
    prev_input.value = 0
    fwd_input.value = 0

    # context chunks belong to the previous event: clear and hide them
    with prev_output:
        prev_output.clear_output()
        prev_output.layout.display = "none"

    with fwd_output:
        fwd_output.clear_output()
        fwd_output.layout.display = "none"

    # The last event of the batch was just labelled: there is no next row to show.
    # Previously indices[i] raised IndexError here, after the label had been stored.
    if i >= len(indices):
        idx_text.value = "<h3>Batch complete - save the DataFrame</h3>"
        triggers_widget.value = ""
        with event_output:
            event_output.clear_output()
        return

    event = df.loc[indices[i]]
    idx_text.value = get_index_text(indices[i])
    triggers_widget.value = get_trigger_text(indices[i])

    with event_output:
        event_output.clear_output()

        display_ent([nlp(files[event.filename][event.idx])])
        
def revert_changes():
    """Step back to the previously labelled event and wipe its annotation.

    Decrements the global position ``i``, resets ``reviewed``, ``label``,
    ``confidence``, ``lower_bound`` and ``upper_bound`` of that row of ``df``
    to their initial values, resets the controls and renders the event again.
    """
    global i

    # Nothing to undo at the start of the batch. Without this guard ``i`` became -1
    # and indices[-1] silently reset the annotation of the LAST event in the batch.
    if i <= 0:
        return

    # revert idx back one
    i -= 1
    current = indices[i]

    # reset data
    df.loc[current, 'reviewed'] = False
    df.loc[current, 'label'] = False
    df.loc[current, 'confidence'] = None
    df.loc[current, 'lower_bound'] = 0
    df.loc[current, 'upper_bound'] = 0

    # get event
    event = df.loc[current]

    # update widget values
    idx_text.value = get_index_text(current)
    confidence_toggle.value = 'High'
    triggers_widget.value = get_trigger_text(current)
    prev_input.value = 0
    fwd_input.value = 0

    with event_output:
        event_output.clear_output()

        display_ent([nlp(files[event.filename][event.idx])])

    with prev_output:
        prev_output.clear_output()
        prev_output.layout.display = "none"

    with fwd_output:
        fwd_output.clear_output()
        fwd_output.layout.display = "none"

def save_pressed(b):
    """Click handler for the 'Save DataFrame' button.

    Copies the working rows in ``df`` back into the selected user's dataset and
    writes that dataset to the user's CSV path. ``b`` is not used.
    """
    user_dataset = dataset[user]
    user_dataset.loc[indices] = df
    user_dataset.to_csv(labelled_path[user], index=True, index_label='index')

# header widgets: which row is on screen and which trigger words it matched
idx_text = widgets.HTML(value=get_index_text(indices[i]))
triggers_widget = widgets.HTML(value=get_trigger_text(indices[i]))
line_break = widgets.HTML(value='\n')

# how many context sentences to show before the trigger sentence (0 down to -10)
prev_input = widgets.BoundedIntText(
    value=0, min=-10, max=0, step=1, description='Previous', disabled=False)

# how many context sentences to show after the trigger sentence (0 up to 10)
fwd_input = widgets.BoundedIntText(
    value=0, min=0, max=10, step=1, description='Next', disabled=False)

def update_chunk(b):
    """Click handler for 'Update Text Range': show the requested context sentences.

    Reads the 'Previous' and 'Next' inputs into the globals ``prev`` and
    ``fwd`` and renders that many sentences before/after the current trigger
    sentence into ``prev_output``/``fwd_output``. An output is hidden when its
    count is zero. ``b`` is not used.
    """
    global prev
    global fwd

    # no current event once the whole batch has been labelled
    if i >= len(indices):
        return

    event = df.loc[indices[i]]
    sentences = files[event.filename]

    prev = prev_input.value
    fwd = fwd_input.value

    with prev_output:
        if prev < 0:
            prev_output.clear_output()
            # Clamp at the first sentence: a negative start would make the slice count
            # from the END of the document and show the wrong sentences (or none).
            start = max(0, event.idx + prev)
            prev_chunk = list(nlp.pipe(sentences[start : event.idx]))
            display_ent(prev_chunk)
            prev_output.layout.display = "block"
        else:
            prev_output.layout.display = 'none'

    with fwd_output:
        if fwd > 0:
            fwd_output.clear_output()
            # slicing past the end of the list is safe, it simply yields fewer sentences
            fwd_chunk = list(nlp.pipe(sentences[event.idx + 1 : event.idx + 1 + fwd]))
            display_ent(fwd_chunk)
            fwd_output.layout.display = "block"
        else:
            fwd_output.layout.display = 'none'

def toggle_ner(toggle):
    """Click handler for 'Update NER Display': re-render the context chunks.

    Depending on the 'External NER' checkbox, the sentences before/after the
    current trigger sentence are rendered with ``display_ent`` or shown as
    plain joined text. Only chunks with a non-zero 'Previous'/'Next' count are
    touched. The ``toggle`` argument (supplied by ``Button.on_click``) is not
    used; the checkbox is read directly.
    """
    global prev
    global fwd

    # no current event once the whole batch has been labelled
    if i >= len(indices):
        return

    show_entities = ner_toggle.get_interact_value()
    event = df.loc[indices[i]]
    sentences = files[event.filename]
    prev = prev_input.value
    fwd = fwd_input.value

    if prev < 0:
        with prev_output:
            # Clamp at the first sentence: a negative start would make the slice count
            # from the END of the document and show the wrong sentences (or none).
            start = max(0, event.idx + prev)
            prev_chunk = list(nlp.pipe(sentences[start : event.idx]))
            prev_output.clear_output()
            if show_entities:
                display_ent(prev_chunk)
            else:
                # each element of the chunk is one processed sentence
                display(' '.join([doc.text for doc in prev_chunk]).strip())

            prev_output.layout.display = "block"

    if fwd > 0:
        with fwd_output:
            fwd_chunk = list(nlp.pipe(sentences[event.idx + 1 : event.idx + 1 + fwd]))
            fwd_output.clear_output()
            if show_entities:
                display_ent(fwd_chunk)
            else:
                display(' '.join([doc.text for doc in fwd_chunk]).strip())

            fwd_output.layout.display = "block"
            
# output areas: context before, the trigger sentence itself, context after
prev_output = widgets.Output()
event_output = widgets.Output()
fwd_output = widgets.Output()

# context areas stay hidden until a text range is requested
prev_output.layout.display = "none"
fwd_output.layout.display = "none"


def _make_button(description, button_style, handler):
    """Create a Button with the given caption/style and register its click handler."""
    button = widgets.Button(description=description, button_style=button_style)
    button.on_click(handler)
    return button


yes = _make_button('Near Miss Event', 'success', yes_pressed)
no = _make_button('Not Near Miss Event', 'danger', no_pressed)
save = _make_button('Save DataFrame', 'primary', save_pressed)
revert = _make_button('Revert Index', 'warning', revert_pressed)
update = _make_button('Update Text Range', '', update_chunk)
update_ner = _make_button('Update NER Display', '', toggle_ner)

# checkbox read by toggle_ner: context rendered through display_ent (ticked) or as plain text
ner_toggle = widgets.Checkbox(value=True, description='External NER', disabled=False)

# confidence attached to each label; the callbacks set it back to 'High' after every event
confidence_options = ['Low', 'Medium', 'High']
confidence_tooltips = [
    'An uncertain interpretation worth reviewing',
    'A reasonable interpretation of a near miss',
    'A certain interpretation of a near miss',
]
confidence_toggle = widgets.ToggleButtons(
    options=confidence_options,
    value='High',
    description='Confidence',
    disabled=False,
    button_style='',  # 'success', 'info', 'warning', 'danger' or ''
    tooltips=confidence_tooltips,
)

# checkbox read by label_sentence: store the 'Previous'/'Next' values with the label or not
save_chunk_toggle = widgets.Checkbox(
    value=True, description='Save Text Chunk Bounds', disabled=False)

# section headings
settings_text = widgets.HTML(value="<h3>Text Chunk Settings</h3>")
labeller_text = widgets.HTML(value="<h3>Labeller</h3>")

#text_chunk_box = widgets.VBox([settings_text, save_chunk_toggle])

# group related controls into rows
text_buttons = widgets.HBox([update, save_chunk_toggle])
ner_buttons = widgets.HBox([update_ner, ner_toggle])
label_buttons = widgets.HBox([yes, no])
function_buttons = widgets.HBox([save, revert])
buttons = widgets.VBox([label_buttons, function_buttons])

# Render the first event. display_ent receives a list of docs at every other call site
# in this notebook, so the single doc is wrapped in a list here as well.
with event_output:
    display_ent([nlp(files[event.filename][event.idx])])

# placeholders so the (hidden) context areas have content before first use
with prev_output:
    display(widgets.HTML(value='\n'))

with fwd_output:
    display(widgets.HTML(value=''))

display(idx_text, triggers_widget, settings_text, prev_input, fwd_input, text_buttons, ner_buttons,
        labeller_text, buttons, line_break, confidence_toggle, prev_output, line_break, event_output,
        line_break, fwd_output,)

HTML(value='<h3>Current Index: 9092 / 580746</h3>')

HTML(value='<h3>Trigger Words</h3><ul><li>possible</li></ul>')

HTML(value='<h3>Text Chunk Settings</h3>')

BoundedIntText(value=0, description='Previous', max=0, min=-10)

BoundedIntText(value=0, description='Next', max=10)

HBox(children=(Button(description='Update Text Range', style=ButtonStyle()), Checkbox(value=True, description=…

HBox(children=(Button(description='Update NER Display', style=ButtonStyle()), Checkbox(value=True, description…

HTML(value='<h3>Labeller</h3>')

VBox(children=(HBox(children=(Button(button_style='success', description='Near Miss Event', style=ButtonStyle(…

HTML(value='\n')

ToggleButtons(description='Confidence', index=2, options=('Low', 'Medium', 'High'), tooltips=('An uncertain in…

Output(layout=Layout(display='none'))

HTML(value='\n')

Output()

HTML(value='\n')

Output(layout=Layout(display='none'))

In [84]:
# position reached in `indices` during the labelling session
i

47

In [86]:
# rows of the selected user's dataset that are flagged as reviewed
dataset[user].loc[dataset[user].reviewed]

Unnamed: 0_level_0,filename,anumber,sentence_count,report_type,idx,triggers,reviewed,label,confidence,lower_bound,upper_bound
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
3094,a081811_ashburton se area 2008 annual report_1...,81811,30,Annual,8,"substantial, exploration potential, potential",True,True,High,0,0
7477,a074444_e47_1256_2007a_16626090.json,74444,84,Annual,30,significantly,True,False,High,0,0
7797,a085815_rr_e77-1268-i_2009a_report_12635884.json,85815,142,Annual,113,"mineralisation, potential",True,False,High,0,0
9089,a074494_fairstar_spinifexwell_report_doir_1416...,74494,471,Annual,343,"potential, significant",True,False,High,0,0
9092,a074494_fairstar_spinifexwell_report_doir_1416...,74494,471,Annual,359,possible,True,False,High,0,0
...,...,...,...,...,...,...,...,...,...,...,...
571715,a099457_unlock-e69_2235_e46_780_e45_3747_2013s...,99457,834,Final Surrender,473,proposed,True,False,High,0,0
576518,a076778_anrep363_10243009.json,76778,57,Annual,48,prospective,True,False,High,0,0
578869,a106549_jh_2015a.json,106549,1596,Annual,203,mineralisation,True,False,High,0,0
578922,a106549_jh_2015a.json,106549,1596,Annual,350,"prospect, mineralisation",True,False,High,0,0


In [68]:
# show charlie's row for the event currently on screen, followed by its sentence text
charlie_event = dataset['charlie'].loc[indices[i]]
charlie_sentence = files[charlie_event['filename']][charlie_event['idx']]
display(charlie_event, charlie_sentence)

NameError: name 'i' is not defined

In [69]:
# dataset[user] is the main dataset; `indices` are the row labels currently being labelled.
# Note: `df` is only a working copy - its labels reach dataset[user] and the CSV only when
# they are written back explicitly (Save DataFrame button or the manual save cells).
dataset['charlie'].loc[dataset['charlie'].reviewed]

Unnamed: 0_level_0,filename,anumber,sentence_count,report_type,idx,triggers,reviewed,label,confidence,lower_bound,upper_bound
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
3094,a071977_kurnalpi 2005_19331320.json,71977,171,Annual,138,mineralization,True,True,High,-1,1
7797,a072284_gvjv_19-2005_2006a_12505377.json,72284,654,Annual,258,"prospecting, prospect",True,False,High,0,0
9089,a072376_bb_2006a_10327822.json,72376,1823,Annual,344,prospect,True,True,Low,0,1
12990,a072451_c101_2003_2005a_14708906.json,72451,85,Annual,67,mineralisation,True,True,Medium,0,0
22677,a073100_annual mineral exploration all 2005-6_...,73100,2806,Annual,444,mineralised,True,False,High,0,2
...,...,...,...,...,...,...,...,...,...,...,...
554197,a106349_a106349_v1_report.json,106349,126,Annual,25,support,True,False,High,0,0
556419,a106548_car c111_2006_jutsonrocks2015.json,106548,83,Annual,4,prospect,True,False,High,0,0
572708,a108771_e69_2864 surrender report may 2016.json,108771,384,Final Surrender,345,mineralised,True,True,Low,-1,1
578869,a109646_a109646_v1_report.json,109646,93,Annual,38,"prospects, high grade, mineralisation",True,True,Medium,0,1


In [36]:
# manual save: copy the working rows back into the user's dataset and write its CSV
dataset_csv = os.path.join('data','events',f'{user}_dataset.csv')
dataset[user].loc[indices] = df
dataset[user].to_csv(dataset_csv, index=True, index_label='index')

In [34]:
# save backup: same write-back as the manual save, but to a separate *_backup.csv file
backup_csv = os.path.join('data','events',f'{user}_dataset_backup.csv')
dataset[user].loc[indices] = df
dataset[user].to_csv(backup_csv, index=True, index_label='index')