In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# project modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2
# Render matplotlib figures inline in the cell output.
%matplotlib inline

In [2]:
from tqdm import tqdm
from pathlib import Path
import re
import os
import glob
from pathlib import Path
import json
from shutil import copyfile
import itertools
from collections import Counter
import pandas as pd
import numpy as np

import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual
from IPython.display import display

import spacy

from spacy import displacy
from spacy.pipeline import EntityRuler
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc, Span, Token
from spacy.attrs import intify_attrs

from pipeline.utils.display import display_ent
    
# Group number embedded in the per-group CSV file names below.
GROUP = 2

# Directory layout, expressed relative to the current working directory.
base_path = Path('.')
events_path = base_path.joinpath('events')
dictionary_path = base_path.joinpath('dictionary')
patterns_path = dictionary_path.joinpath('patterns')

# Per-group CSV files inside the events directory.
group_events_path = events_path.joinpath(f'group_{GROUP}_events.csv')
labelled_path = events_path.joinpath(f'group_{GROUP}_labelled.csv')

classifier.py
helpers.py
processing.py
metadata.py


In [3]:
from pipeline.data.metadata import get_report_data
from pipeline.preprocessing.processing import match_triggers, triggers_json_to_df
from pipeline.preprocessing.processing import load_triggers, load_json

# Load DataFrame

In [9]:
# Report metadata (with sentence counts) plus the loaded report files.
capstone_files, files = get_report_data(count_sentences=True, return_files=True)
#file_triggers = match_triggers(files, return_json=False, triggers_from_labelling=False, save_json=True)
file_triggers = triggers_json_to_df(load_json('data/sample_triggers.json'))

# create source labelling dataset
source = capstone_files.merge(file_triggers, on='filename')

# specify labellers
users = ('daniel',)

# create a dataset for each labeller based off the source data
# `assign` returns a new frame, so each labeller gets an independent copy of
# `source` with fresh bookkeeping columns (reviewed / label / confidence),
# without round-tripping the whole frame through nested dicts.
dataset = {
    user: source.assign(reviewed=False, label=False, confidence=None)
    for user in tqdm(users, desc='Preparing labelling data')
}

# Labeller whose dataset the following cells work on.
user = users[0]

Loading files as dict: 100%|██████████| 32646/32646 [00:08<00:00, 3633.33it/s]
Preparing labelling data: 100%|██████████| 1/1 [00:01<00:00,  1.05s/it]


In [10]:
# Display the labelling frame prepared for the current labeller.
dataset[user]

Unnamed: 0,filename,anumber,sentence_count,idx,triggers,reviewed,label,confidence
0,a074282_reidy_annual2006_12863956.json,74282,141,43,"mineralization, anomalism",False,False,
1,a074282_reidy_annual2006_12863956.json,74282,141,53,mineralization,False,False,
2,a074282_reidy_annual2006_12863956.json,74282,141,84,extensive,False,False,
3,a074282_reidy_annual2006_12863956.json,74282,141,138,mineralization,False,False,
4,a080852_e27_318_2008a_14530025.json,80852,28,4,patchy,False,False,
...,...,...,...,...,...,...,...,...
377048,a093589_e09_1719 surrenderreport 080312_118423...,93589,54,18,mineralisation,False,False,
377049,a093589_e09_1719 surrenderreport 080312_118423...,93589,54,22,prospective,False,False,
377050,a093589_e09_1719 surrenderreport 080312_118423...,93589,54,34,Mineralisation,False,False,
377051,a093589_e09_1719 surrenderreport 080312_118423...,93589,54,35,mineralised,False,False,


In [4]:
from pipeline.preprocessing.processing import load_triggers

# Load the trigger phrases with `triggers_from_labelling` disabled.
triggers = load_triggers(triggers_from_labelling=False)

# Bare expression so the notebook displays the loaded list.
triggers

['abandoned survey',
 'accelerated regional',
 'area of main workings',
 'assist in identifying',
 'assuring',
 'auspicious',
 'bright',
 'broad',
 'can not be dismissed',
 "can't be dismissed",
 'cannot be dismissed',
 'cannot be discounted',
 'confirmation',
 'confirmed area',
 'confirmed geological',
 'confirmed the existence',
 'confirming the presence',
 'continued exploration',
 'correlation',
 'could be',
 'delineated',
 'detailed feasibility',
 'discover',
 'discovered',
 'does not appear to have been drilled',
 'drivable depths',
 'economic mineralisation',
 'economic mineralization',
 'economically',
 'economically viable',
 'encounted several anomalous',
 'encouragement',
 'encouraging',
 'encouraging intercepts',
 'enriched',
 'enrichement',
 'evidenced by',
 'exploration drilling',
 'exploration potential',
 'extending the discovery',
 'extension to the',
 'extensive',
 'extensive regional mapping',
 'extensive silicification',
 'favourable',
 'favourable amounts',
 'favou

In [5]:
from pipeline.preprocessing.processing import load_spacy_model

# Build the spaCy pipeline via the project helper, asking for spaCy Doc output
# with the trigger matcher, punctuation removal and lemmatizer switched on and
# stopword removal switched off.
nlp = load_spacy_model(output_type='doc', trigger_matcher=True,
                       stopword_removal=False, punctuation_removal=True, lemmatizer=True)

# Run every trigger phrase through the pipeline; the results are compared with
# the original phrases in the next cell.
lemmatized_triggers = list(nlp.pipe(triggers))

Added lemmatizer pipe
Added punctuation removal pipe
Added entity ruler pipe
Added trigger matcher pipe
Loading spaCy model with spaCy doc output.


In [6]:
from pipeline.preprocessing.processing import to_text, to_sentence

# Raise pandas' display row limit to 200 (this is a global option).
pd.set_option('display.max_rows', 200)

# Side-by-side table of each trigger phrase and its processed form.
pd.DataFrame({
    'original': triggers,
    'processed': list(map(to_sentence, lemmatized_triggers)),
})

Unnamed: 0,original,processed
0,abandoned survey,abandon survey
1,accelerated regional,accelerate regional
2,area of main workings,area of main working
3,assist in identifying,assist in identify
4,assuring,assure
5,auspicious,auspicious
6,bright,bright
7,broad,broad
8,can not be dismissed,can not be dismiss
9,can't be dismissed,can not be dismiss


In [7]:
# Run one example sentence through the pipeline and render the result with
# the project's display helper.
sample_text = 'No geochemical anomalism indicative of nickel sulphide mineralization is evident in the compiled data.'
doc = nlp(sample_text)
display_ent(doc)

In [23]:
# geoview_and_files = capstone_files.merge(geoview, on = 'anumber').merge(
#     match_triggers(files, return_json = False, triggers_from_labelling =False))

# ## Function that returns a list of index values sampled from a dataframe containing metadata and
# ## extracted sentence information, stratified by the categories of a specified column

# def stratify(geoview_and_sentences,  ## dataframe with metadata and sentence information
#                                      num_sentences_to_label, ## how many sentences you want to be extracted
#                                      column_to_sample,       ## the column that you want to stratify on
#                                      random_state):          ## for reproducibility
    
#     size_of_split = num_sentences_to_label / len(geoview_and_sentences)
#     y = geoview_and_sentences[column_to_sample]
#     x = geoview_and_sentences
#     X_train, X_test, y_train, y_test = train_test_split( x,
#                                                          y, 
#                                                          test_size = size_of_split, 
#                                                          random_state = random_state,
#                                                          stratify = y)
#     return X_test.index

# index = return_stratified_sentence_index(geoview_and_sentences = geoview_and_sentences,
#                                          num_sentences_to_label = 2000,
#                                          column_to_sample = 'report_type',
#                                          random_state = 420)
# geoview_and_sentences.iloc[index].report_type.value_counts(normalize = True)

# geoview_and_files = capstone_files.merge(geoview, on = 'anumber').merge(
#     match_triggers(files, return_json = False, triggers_from_labelling =False))

Unnamed: 0,filename,anumber,sentence_count,idx,triggers,reviewed,label,confidence
0,a074282_reidy_annual2006_12863956.json,74282,141,43,anomalism,False,False,
1,a074282_reidy_annual2006_12863956.json,74282,141,53,,False,False,
2,a074282_reidy_annual2006_12863956.json,74282,141,55,weak,False,False,
3,a074282_reidy_annual2006_12863956.json,74282,141,71,likely,False,False,
4,a074282_reidy_annual2006_12863956.json,74282,141,84,extensive,False,False,
...,...,...,...,...,...,...,...,...
431378,a093589_e09_1719 surrenderreport 080312_118423...,93589,54,14,anomalism,False,False,
431379,a093589_e09_1719 surrenderreport 080312_118423...,93589,54,17,"mineralisation, potential",False,False,
431380,a093589_e09_1719 surrenderreport 080312_118423...,93589,54,18,"economic significance, mineralisation",False,False,
431381,a093589_e09_1719 surrenderreport 080312_118423...,93589,54,22,"prospective for, prospective",False,False,


In [11]:
# prepare data to review for user
# Take an explicit copy: the labelling callbacks assign into `df`, and writing
# into a filtered selection of dataset[user] would raise pandas'
# SettingWithCopyWarning and leave it unclear which frame is being modified.
df = dataset[user].loc[~dataset[user].reviewed].copy()
n = len(df)           # number of rows still to review
idx = df.index[0]     # label of the row currently shown

event = df.loc[idx]

def get_index_text(idx):
    """Return the HTML heading that shows `idx` against the module-level total `n`."""
    heading = f"<h3>Current Index: {idx} / {n}</h3>"
    return heading

def get_trigger_text(idx):
    """Render the trigger words of row `idx` of `df` as an HTML bullet list.

    The 'triggers' cell is treated as a comma-separated string. Each word is
    stripped of surrounding whitespace (so "a, b" no longer produces a " b"
    bullet) and empty pieces are dropped. A cell that is not a string, such as
    NaN for a row without triggers, yields an empty list instead of raising
    AttributeError on `.split`.

    Parameters
    ----------
    idx : label of the row in the module-level `df` to describe.

    Returns
    -------
    str : an <h3> heading followed by a <ul> with one <li> per trigger word.
    """
    raw_triggers = df.loc[idx, 'triggers']
    if isinstance(raw_triggers, str):
        words = [w.strip() for w in raw_triggers.split(',') if w.strip()]
    else:
        words = []
    trigger_words = ''.join(f'<li>{w}</li>' for w in words)
    return f"<h3>Trigger Words:</h3><ul>{trigger_words}</ul>"


def yes_pressed(b):
    """Click handler that labels the current sentence True; `b` is not used."""
    label_sentence(True)

def no_pressed(b):
    """Click handler that labels the current sentence False; `b` is not used."""
    label_sentence(False)

def label_sentence(label):
    """Store `label` for the current row and move the UI on to the next row.

    Fixes relative to the previous version:
    * the row being written is the current `idx` (the old code referenced an
      undefined name and raised NameError on every click);
    * the row is marked as reviewed (it used to be written as False);
    * the trigger list is refreshed from the new row label, which is what
      `get_trigger_text` expects, rather than from the trigger string;
    * the next row is found by position in `df.index`, so gaps in the index
      labels do not matter, and running past the last row is handled.

    Parameters
    ----------
    label : bool, the decision for the sentence currently displayed.

    Side effects
    ------------
    Updates `df` in place, rebinds the module-level `idx`, resets both range
    inputs to 0, redraws the sentence and hides both context panels.
    """
    global idx

    # Record the decision for the sentence that is currently on screen.
    df.loc[idx, 'reviewed'] = True
    df.loc[idx, 'label'] = label

    # Advance by position rather than by `idx + 1`: `df` is a filtered frame,
    # so its index labels are not guaranteed to be contiguous.
    position = df.index.get_loc(idx)
    if position + 1 >= len(df):
        # Nothing left to show; keep `idx` on the last row.
        idx_text.value = "<h3>All sentences reviewed</h3>"
        return

    idx = df.index[position + 1]
    event = df.loc[idx]

    idx_text.value = get_index_text(idx)
    triggers_widget.value = get_trigger_text(idx)
    prev_input.value = 0
    fwd_input.value = 0

    with event_output:
        event_output.clear_output()
        display_ent([nlp(files[event.filename][event.idx])])

    # The context panels belong to the previous sentence: empty and hide them.
    for context_output in (prev_output, fwd_output):
        with context_output:
            context_output.clear_output()
            context_output.layout.display = "none"

def save_pressed(b):
    """Click handler that writes `df` to `labelled_path` as CSV.

    The parent directory is created first, so saving also works when the
    events directory does not exist yet. The frame's index is not written
    (`index=False`). The button argument `b` is not used.
    """
    labelled_path.parent.mkdir(parents=True, exist_ok=True)
    df.to_csv(labelled_path, index=False)

# Header widgets: progress heading, trigger list and a spacer reused in the layout.
idx_text = widgets.HTML(value=get_index_text(idx))
triggers_widget = widgets.HTML(value=get_trigger_text(idx))
line_break = widgets.HTML(value='\n')

# How many sentences before the current one to show (-10..0).
prev_input = widgets.BoundedIntText(
    description='Lower Range', value=0, min=-10, max=0, step=1, disabled=False,
)

# How many sentences after the current one to show (0..10).
fwd_input = widgets.BoundedIntText(
    description='Upper Range', value=0, min=0, max=10, step=1, disabled=False,
)

def update_chunk(b):
    """Show the sentences around the current one according to the range inputs.

    `prev_input` (0 or negative) selects how many preceding sentences to show
    and `fwd_input` (0 or positive) how many following ones. Each context panel
    is redrawn and made visible when its range is non-zero, and emptied and
    hidden when its range is 0 so stale context is not left on screen.

    The start of the preceding slice is clamped at 0. Previously a start such
    as 4 - 10 = -6 was used directly as a slice bound, which Python interprets
    as "6 from the end", so sentences near the top of a report got an empty
    context instead of the sentences before them.

    The button argument `b` is not used.
    """
    event = df.loc[idx]
    sentences = files[event.filename]

    prev = prev_input.value
    fwd = fwd_input.value

    if prev < 0:
        start = max(0, event.idx + prev)
        with prev_output:
            prev_output.clear_output()
            prev_chunk = list(nlp.pipe(sentences[start : event.idx]))
            display_ent(prev_chunk)
            prev_output.layout.display = "block"
    else:
        with prev_output:
            prev_output.clear_output()
            prev_output.layout.display = "none"

    if fwd > 0:
        with fwd_output:
            fwd_output.clear_output()
            fwd_chunk = list(nlp.pipe(sentences[event.idx + 1 : event.idx + 1 + fwd]))
            display_ent(fwd_chunk)
            fwd_output.layout.display = "block"
    else:
        with fwd_output:
            fwd_output.clear_output()
            fwd_output.layout.display = "none"
        
#         # get text chunks on each side of sentence
#         prev_chunk = list(nlp.pipe(files[event.filename][event.idx + prev : event.idx]))
#         sentence = [nlp(files[event.filename][event.idx])]
#         fwd_chunk = list(nlp.pipe(files[event.filename][event.idx + fwd + 1]))
#         for name,val in zip(('prev_chunk','sentence','fwd_chunk'),
#                      (prev_chunk, sentence, fwd_chunk)):
#             assert(type(val) == list), f'{name} : {val}'
#         # display them with spacy separately for clarity
#         display_ent(prev_chunk + ['\n']  + sentence + ['\n'] + fwd_chunk)

# Output areas for the preceding context, the current sentence and the
# following context.
prev_output = widgets.Output()
event_output = widgets.Output()
fwd_output = widgets.Output()

# Both context panels start hidden; update_chunk switches them to "block".
prev_output.layout.display = "none"
fwd_output.layout.display = "none"

# Labelling buttons, each wired to its click handler.
yes = widgets.Button(description='Near Miss Event', button_style='success')
yes.on_click(yes_pressed)

no = widgets.Button(description='Not Near Miss Event', button_style='danger')
no.on_click(no_pressed)

# Writes the labelled frame to disk.
save = widgets.Button(description='Save DataFrame', button_style='primary')
save.on_click(save_pressed)

# Redraws the context panels from the two range inputs.
update = widgets.Button(description='Update Text Range', button_style='primary')
update.on_click(update_chunk)

# Yes/No side by side, with the save button underneath.
label_buttons = widgets.HBox([yes, no])
buttons = widgets.VBox([label_buttons, save])

# Initial render of the first sentence to review.
with event_output:
    display_ent(nlp(files[event.filename][event.idx]))
    
# Placeholder content for the (hidden) context panels.
with prev_output:
    display(widgets.HTML(value='\n'))

with fwd_output:
    display(widgets.HTML(value=''))
                            
# Final layout, top to bottom; `line_break` is reused as a spacer.
display(idx_text, triggers_widget, prev_input, fwd_input, update,
        line_break, prev_output, line_break, event_output, line_break, fwd_output,
        line_break, buttons)

HTML(value='<h3>Current Index: 0 / 377053</h3>')

HTML(value='<h3>Trigger Words:</h3><ul><li>mineralization</li><li> anomalism</li></ul>')

BoundedIntText(value=0, description='Lower Range', max=0, min=-10)

BoundedIntText(value=0, description='Upper Range', max=10)

Button(button_style='primary', description='Update Text Range', style=ButtonStyle())

HTML(value='\n')

Output(layout=Layout(display='none'))

HTML(value='\n')

Output()

HTML(value='\n')

Output(layout=Layout(display='none'))

HTML(value='\n')

VBox(children=(HBox(children=(Button(button_style='success', description='Near Miss Event', style=ButtonStyle(…