In [46]:
import re
import os
import glob
from pathlib import Path
import json
from shutil import copyfile
import itertools
from collections import Counter

# analytics
import pandas as pd
import numpy as np

# Spacy Imports
import spacy

from spacy import displacy
from spacy.pipeline import EntityRuler
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc, Span, Token
from spacy.attrs import intify_attrs

def display_ent(doc):
    """Render the entities of ``doc`` with displaCy's ``ent`` style.

    The call is made with ``jupyter=True``; this function has no return
    statement, so it always returns ``None`` and is used purely for the
    rendering side effect.

    Args:
        doc: the object handed to ``displacy.render`` (a processed spaCy doc
            in this notebook).
    """
    render_options = {"style": "ent", "jupyter": True}
    displacy.render(doc, **render_options)

In [47]:
# Project root, expressed relative to the current working directory
base_path = Path('..')

# Folder the event CSV files are read from and written to
events_path = base_path.joinpath('events')

# Dictionary folder and its two sub-folders (trigger words, NER patterns)
dictionary_path = base_path.joinpath('dictionary')
triggers_path = dictionary_path.joinpath('triggers')
patterns_path = dictionary_path.joinpath('patterns')

In [48]:
# load patterns for simple geological NER during labelling process
patterns = []
# Path.iterdir() yields entries in arbitrary order and also returns
# sub-directories (e.g. a .ipynb_checkpoints folder). Sort so the pattern
# order is reproducible between runs/machines, and only open regular files
# so open()/json.load never receives a directory.
for filename in sorted(patterns_path.iterdir()):
    if not filename.is_file():
        continue
    with open(filename, encoding="utf8") as f:
        # each file is expected to hold a JSON list of pattern dicts,
        # which is concatenated onto the running list
        patterns += json.load(f)

print(f'loaded {len(patterns)} patterns')

spacy_model = "en_core_web_lg"
nlp = spacy.load(spacy_model)
print(f'loaded {spacy_model} as nlp')

# add loaded patterns to spacy model pipeline
# overwrite_ents=True: entities matched by the patterns replace overlapping
# entities that are already set on the doc
# NOTE(review): constructing EntityRuler directly and passing the object to
# nlp.add_pipe is the spaCy v2 call style - confirm the installed version
# before upgrading (v3 uses nlp.add_pipe("entity_ruler", config=...)).
ruler = EntityRuler(nlp, overwrite_ents=True)
ruler.add_patterns(patterns)

nlp.add_pipe(ruler)

loaded 31120 patterns
loaded en_core_web_lg as nlp


In [49]:
# specify your group number here
# (2 matches the outputs recorded in this notebook; change it to your own group)
event_group = 2

# if in process of labelling data, load previously reviewed event data
# NOTE: a Path is not a resource to open, so it is assigned directly rather
# than used in a `with` block (Path context-manager support was a no-op since
# Python 3.9 and was removed in Python 3.13).
file_path = events_path.joinpath(f'group_{event_group}_labelled.csv').resolve()
unlabelled_path = events_path.joinpath(f'group_{event_group}_events.csv').resolve()

if file_path.is_file():  # a labelling session is already in progress
    df = pd.read_csv(file_path)
    print(f'Loaded pre-labelled data from {file_path}')
else:  # no labelling-in-progress csv yet: start from the unlabelled events
    print(f'Previously labelled data does not exist - loading group_{event_group}_events.csv')
    try:
        df = pd.read_csv(unlabelled_path)
    except FileNotFoundError as e:
        # Stop here with a clear message instead of printing and carrying on:
        # otherwise `df` would be undefined (or left over from an earlier run)
        # and the following cells would fail or label the wrong data.
        raise FileNotFoundError(
            f'{unlabelled_path} does not exist. '
            f'Please extract events and save to {events_path.resolve()}'
        ) from e
    # freshly extracted events have not been reviewed by anyone yet
    df['reviewed'] = False

Loaded pre-labelled data from /home/daniel/capstone/events/group_2_labelled.csv


In [50]:
# inspect the first event record (shown as one field per line)
df.iloc[0]

Unnamed: 0                                                                   0
event_id                             a081752_anrep2008eraheedy2103_15107355_16
filename                           a081752_anrep2008eraheedy2103_15107355.json
sentence_idx                                                                16
sentence_text                mineral occurrences and exploration potential ...
n_trigger_words                                                              1
trigger_words_in_sentence                                        ['potential']
trigger_words_in_event                                           ['potential']
event_text                   bibliography bunting ja 1986, geology of the e...
ORE_DEPOSIT                                                                 []
ROCK                                                               ['granite']
MINERAL                                                                     []
STRAT                                               

In [51]:
# clear cell display if desired
from IPython.display import clear_output
clear_cell_display = True

# accepted answers for each command (compared after strip + lower-casing)
true_responses = ("yes", "y", "ye", "yeah", "true", "t")
false_responses = ("n", "no", "nope", "nah", "false", "f")
unknown_responses = ('u', 'unknown', 'none')
quit_responses = ("q", "quit", "stop", 'exit')
valid_responses = true_responses + false_responses + unknown_responses + quit_responses

# walk through every event that has not been reviewed yet
for idx, row in df.loc[~df.reviewed].iterrows():
    # print event information
    print(f'File Index {idx} - File Name : {row.filename}\n')
    print(row.event_text)
    print('')
    print("The trigger word(s) detected in this text chunck were: {}".format(row.trigger_words_in_sentence))
    print('')
    print("The sentence that the trigger word occured in was: {}.".format(row.sentence_text))
    # display_ent renders into the cell itself and returns None, so it is
    # called directly; wrapping it in print() only emitted a stray "None"
    display_ent(nlp(row.event_text))

    # take user input; an unrecognised answer shows the valid commands and
    # asks again instead of aborting the whole labelling session
    while True:
        raw_response = input("\nIs this a near miss event?(True/False)\n")
        response = raw_response.strip().lower()
        if response in valid_responses:
            break
        print('Input did not match list of valid commands.')
        print(f'Input received was {raw_response}')
        print('Try: "true" (or "T"), "false" (or "F"), "unknown" (or "U"), or "quit" (or "Q").')

    # allow text input for quit
    if response in quit_responses:
        break

    if response in unknown_responses:
        # if near miss label is uncertain, leave as unreviewed so as to come back later.
        # Nothing is written to df for this row: it keeps reviewed == False and
        # receives neither a label nor a trigger phrase.
        print('Not specifying a label for this sentence.')
    else:
        # the answer is one of the true/false responses at this point
        label = response in true_responses
        if label:
            # near miss: ask which phrase gave it away
            trigger_phrase = input("""
        \n Was there a phrase/word that indicated that this was a near miss event? Please type phrase that can be added to trigger phrases, leave blank if none\n""")
        else:
            # not a near miss: mark accordingly with no trigger phrase
            trigger_phrase = ''

        print(f'label = {label}')
        df.loc[idx, 'Near Miss Event'] = label
        df.loc[idx, 'reviewed'] = True
        df.loc[idx, 'Key trigger phrase'] = trigger_phrase
    print('')

    # clear cell output if true
    if clear_cell_display:
        clear_output()
    

File Index 201 - File Name : a084558_e80_3371_hc2009a_11315862.json

the altered matrix has been cut by a series of anastomosing quartz veins that have been cut by late carbonate veining. classification: weathered hydrothermal alteration assemblage (secondary potash feldspar carbonate sericite secondary quartz) cut by quartz and carbonate veins possibly occurring within volcanic flow top. sample 902127b a possible cavity within a volcanic top host comprises secondary potash feldspar infilled by carbonate that has been progressively replaced by limonite (lm). cleavage traces have been preserved in the weathered carbonate phase.

The trigger word(s) detected in this text chunck were: ['possible']

The sentence that the trigger word occured in was: sample 902127b a possible cavity within a volcanic top host comprises secondary potash feldspar infilled by carbonate that has been progressively replaced by limonite (lm)..


None

Is this a near miss event?(True/False)
stop


In [43]:
# view extended section
idx = 156 # file index number
event_row = df.loc[idx]
centre_idx = event_row.sentence_idx
# .loc slices by index label and includes both end points, so this selects the
# rows labelled centre_idx-10 .. centre_idx+10 of the source file
source_rows = pd.read_json(f'../data/subset/{event_row.filename}')
source_rows.loc[int(centre_idx-10):int(centre_idx+10)].values

array([['As part of the RJV, Matador will also earn a 50% interest in all of the gold rights to the pre-existing Focus Coolgardie tenements comprising the Dreadnought Tindals, Norris Lord Bob, Nepean and the Mount projects (discussed by Besserer et al., 2005).'],
       ['Upon completion of the purchase of the Herald Leviathan interest (completed in February, 2006), Matador will be deemed to have earned a 10% interest in the CGP and the RJV.'],
       ['Matador will then be required to spend $8.0 million in exploration and/or development expenditures over three years to earn a further 40% in the CGP and the RJV (Besserer et al., 2005).'],
       ['In the southern portion of the Kalgoorlie Terrane of the Menzies-Norseman Greenstone Belt gold mineralization at the Mount project is hosted by quartz lodes and reefs in mafic to ultramafic rocks with the lodes and reefs developed in and parallel to a fold hinge axis.'],
       ['The quartz lodes and reefs are spatially associated with a stro

In [11]:
# show the last ten rows whose `reviewed` flag is set
df[df['reviewed']].tail(10)

Unnamed: 0,event_id,filename,sentence_idx,sentence_text,n_trigger_words,trigger_words_in_sentence,trigger_words_in_event,event_text,ORE_DEPOSIT,ROCK,MINERAL,STRAT,LOCATION,TIMESCALE,event_label,reviewed,Near Miss Event,Key trigger phrase
29,a084512_rbb_2009_e5101206_12945465_131,a084512_rbb_2009_e5101206_12945465.json,131,furthermore previous reconnaissance drilling h...,1,['favourable'],['favourable'],no author or date given) report a 055117 work ...,['base metal'],['sediments'],[],[],"['glengarry basin', 'perseverance', 'glengarry...",[],0,True,True,
30,a084512_rbb_2009_e5101206_12945465_139,a084512_rbb_2009_e5101206_12945465.json,139,recent research by gswa suggested the basin ha...,2,"['potential', 'mineralisation']","['potential', 'mineralisation']",the lease covered part of the proterozoic yerr...,['base metal'],"['turbidite', 'sedimentary sequence', 'limesto...",['sulphides'],[],"['yerrida basin', 'glengarry basin']",['proterozoic'],0,True,True,
31,a084512_rbb_2009_e5101206_12945465_146,a084512_rbb_2009_e5101206_12945465.json,146,recent research by gswa suggested the basin ha...,2,"['potential', 'mineralisation']","['potential', 'mineralisation']",the lease covered part of the proterozoic yerr...,['base metal'],"['turbidite', 'sedimentary sequence', 'limesto...",['sulphides'],[],"['yerrida basin', 'glengarry basin']",['proterozoic'],0,True,True,
32,a084512_rbb_2009_e5101206_12945465_153,a084512_rbb_2009_e5101206_12945465.json,153,recent research by gswa suggested the basin ha...,2,"['potential', 'mineralisation']","['potential', 'mineralisation']",the lease covered part of the proterozoic yerr...,['base metal'],"['turbidite', 'sedimentary sequence', 'limesto...",['sulphides'],[],"['yerrida basin', 'glengarry basin']",['proterozoic'],0,True,True,
33,a084512_rbb_2009_e5101206_12945465_160,a084512_rbb_2009_e5101206_12945465.json,160,rocks exposed in the area comprise a turbidite...,2,"['potential', 'mineralisation']","['potential', 'mineralisation']",report a 057563 e51 765 was part of the wiluna...,"['base metal', 'base metal']","['turbidite', 'sedimentary sequence', 'limesto...","['sulphides', 'diamonds']",[],"['wiluna', 'wiluna', 'yerrida basin', 'glengar...",['proterozoic'],0,True,True,
34,a084512_rbb_2009_e5101206_12945465_167,a084512_rbb_2009_e5101206_12945465.json,167,recent research by gswa suggested the basin ha...,2,"['potential', 'mineralisation']","['potential', 'mineralisation']",the lease covered part of the proterozoic yerr...,['base metal'],"['turbidite', 'sedimentary sequence', 'limesto...",['sulphides'],[],"['yerrida basin', 'glengarry basin']",['proterozoic'],0,True,True,
35,a084512_rbb_2009_e5101206_12945465_170,a084512_rbb_2009_e5101206_12945465.json,170,preliminary evaluation of the lease suggested ...,1,['favourable'],"['favourable', 'mineralisation']",work completed in 1997 to 1998 comprised liter...,['pge'],"['sediment', 'carbonate rocks']","['hydrothermal', 'oxide']",[],[],[],0,True,True,immediate vicinity
36,a084512_rbb_2009_e5101206_12945465_176,a084512_rbb_2009_e5101206_12945465.json,176,recent research by gswa suggested the basin ha...,2,"['potential', 'mineralisation']","['potential', 'mineralisation']",the lease covered part of the proterozoic yerr...,['base metal'],"['turbidite', 'sedimentary sequence', 'limesto...",['sulphides'],[],"['yerrida basin', 'glengarry basin']",['proterozoic'],0,True,True,
37,a084512_rbb_2009_e5101206_12945465_183,a084512_rbb_2009_e5101206_12945465.json,183,recent research by gswa suggested the basin ha...,2,"['potential', 'mineralisation']","['potential', 'mineralisation']",the lease covered part of the proterozoic yerr...,['base metal'],"['turbidite', 'sedimentary sequence', 'limesto...","['sulphides', 'diamonds']",[],"['yerrida basin', 'glengarry basin']",['proterozoic'],0,True,True,
38,a084512_rbb_2009_e5101206_12945465_191,a084512_rbb_2009_e5101206_12945465.json,191,recent research by gswa suggested the basin ha...,2,"['potential', 'mineralisation']","['potential', 'mineralisation']",the lease covered part of the proterozoic yerr...,['base metal'],"['turbidite', 'sedimentary sequence', 'limesto...",['sulphides'],[],"['yerrida basin', 'glengarry basin']",['proterozoic'],0,True,True,


In [45]:
# export csv when done
save_path = events_path.joinpath(f'group_{event_group}_labelled.csv').resolve()
# index=False: the row index is not written to the file. With the default
# (index=True) it is read back by pd.read_csv as an extra "Unnamed: 0" column,
# and another such column is added on every save/load cycle.
df.to_csv(save_path, index=False)
print(f'Saved group {event_group} labelled data to {save_path}')

Saved group 2 labelled data to /home/daniel/capstone/events/group_2_labelled.csv
