In [1]:
import re
import os
import glob
from pathlib import Path
import json
from shutil import copyfile
import itertools
from collections import Counter

# analytics
import pandas as pd
import numpy as np

# Spacy Imports
import spacy

from spacy import displacy
from spacy.pipeline import EntityRuler
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc, Span, Token
from spacy.attrs import intify_attrs

def display_ent(doc):
    displacy.render(doc, style="ent", jupyter=True)

In [2]:
# Project Base
base_path = Path('..')

# Event Path
events_path = base_path / 'events'

# dictionary path
dictionary_path = base_path / 'dictionary'
triggers_path = dictionary_path / 'triggers'
patterns_path = dictionary_path / 'patterns'

In [3]:
# load patterns for simple geological NER during labelling process
patterns = []
for filename in patterns_path.iterdir():
    with open(filename, encoding="utf8") as f:
        patterns += json.load(f)
        
print(f'loaded {len(patterns)} patterns')

spacy_model = "en_core_web_lg"
nlp = spacy.load(spacy_model)
print(f'loaded {spacy_model} as nlp')

# add loaded patterns to spacy model pipeline
ruler = EntityRuler(nlp, overwrite_ents=True)
ruler.add_patterns(patterns)

nlp.add_pipe(ruler)

loaded 31120 patterns
loaded en_core_web_lg as nlp


In [11]:
# specify your group number here
event_group = 0

# if in process of labelling data, load previously reviewed event datta
with events_path.joinpath(f'group_{event_group}_labelled.csv').resolve() as file_path:
    if os.path.isfile(file_path):  # check if file exists
        df = pd.read_csv(file_path)
        print(f'Loaded pre-labelled data from {file_path}')
    else:  # else if labelling-in-progress csv does not exist try loading the unlabelled version
        try:
            print(f'Previously labelled data does not exist - loading group_{event_group}_events.csv')
            df = pd.read_csv(events_path.joinpath(f'group_{event_group}_events.csv').resolve())
            df['reviewed'] = False
        except Exception as e:  # error handling
            print(f'File probably does not exist. Please extract events and save to {events_path.resolve()}')
            print(e)

Loaded pre-labelled data from /Users/rossgreen/Documents/UWA/CITS5553 - Capstone/capstone/events/group_0_labelled.csv


In [5]:
df.iloc[0].T

event_id                                    a080918_e9_1443_annual_09_13904956_0
filename                                 a080918_e9_1443_annual_09_13904956.json
sentence_idx                                                                   0
sentence_text                  following the completion of the hole and loggi...
n_trigger_words_in_sentence                                                    1
trigger_words_in_sentence                                              potential
n_trigger_words_in_event                                                       1
trigger_words_in_event                                                 potential
event_text                     following the completion of the hole and loggi...
STRAT                                                      dirk hartog formation
event_label                                                                    0
ROCK                                                                         NaN
LOCATION                    

In [12]:
# clear cell display if desired
from IPython.display import clear_output
clear_cell_display = True

for idx, row in df.loc[~df.reviewed].iterrows():
    # print event information
    print(f'File Index {idx} - File Name : {row.filename}\n')
    print(row.event_text)
    print('')
    print("The trigger word(s) detected in this text chunck were: {}".format(row.trigger_words_in_sentence))
    print('')
    print("The sentence that the trigger word occured in was: {}.".format(row.sentence_text))
    print(display_ent(nlp(row.event_text)))
    
    
    # take user input
    label = input("\nIs this a near miss event?(True/False)\n")
    reviewed = True
    
    # if user response is true, then mark as near miss and query for reasoning
    if label.lower() in ["yes", "y","ye", "yeah", "true", "t"]:
        label = True
        trigger_phrase = input("""
        \n Was there a phrase/word that indicated that this was a near miss event? Please type phrase that can be added to trigger phrases, leave blank if none\n""")
    
    # if user response is false, then mark accordingly witth no trigger phrase
    elif label.lower() in ["n",  "no", "nope", "nah", "false", "f"]:
        label = False
        trigger_phrase = ''
    
    # if near miss label is uncertain, leave as unreviewed so as to come back later.
    elif label.lower() in ['u','unknown','none']:
        print('Not specifying a label for this sentence.')
        reviewed = False
        
    # allow text input for quit
    elif label.lower() in ["q","quit","stop",'exit']:
        break
        
    # invalid input
    else:
        print('Input did not match list of valid commands.')
        print(f'Input received was {label}')
        print('Try: "true" (or "T"), "false" (or "F"), "unknown" (or "U"), or "quit" (or "Q").')
        break
        
    print(f'label = {label}')
    df.loc[idx,'Near Miss Event'] = label
    df.loc[idx, 'reviewed'] = reviewed
    df.loc[idx, 'Key trigger phrase'] = trigger_phrase
    print('')
    
    # clear cell output if true
    if clear_cell_display:
        clear_output()
    

File Index 284 - File Name : a078899_e27310atr_10484068.json

d2 major domain bounding thrusts, e.g., the boorara, mt monger and victory dam faults and or shears; and 4. d3 late cross faults and fracture patterns principally either nw or ne trending. a regolith interpretation has been produced, outlining the following broad regolith units: 1. outcrop, either fresh or lower saprolite.

The trigger word(s) detected in this text chunck were: broad

The sentence that the trigger word occured in was: a regolith interpretation has been produced, outlining the following broad regolith units: 1..


None



Is this a near miss event?(True/False)
 q


In [10]:
trigger_words = []
for filename in triggers_path.iterdir():
    with open(filename, 'r') as f:
        trigger_words += [w.strip() for w in f.readlines()]
    
print(f'loaded {len(trigger_words)} trigger words\n')
print(trigger_words)

loaded 25 trigger words

['potential', 'drill', 'drilling', 'mineralisation', 'contain', 'promise', 'find', 'possible', 'enrich', 'enrichment', 'subeconomic', 'encouraging', 'extensive', 'patchy', 'minor', 'further drilling', 'low grade', 'medium grade', 'follow up', 'weak intercepts', 'minor intercepts', 'open at depth', 'along strike', 'exploration licence', 'prospective']


In [43]:
# view extended section
idx = 156 # file index number 
pd.read_json(f'../data/subset/{df.loc[idx].filename}').loc[
    int(df.loc[idx].sentence_idx-10):int(df.loc[idx].sentence_idx+10)].values

array([['As part of the RJV, Matador will also earn a 50% interest in all of the gold rights to the pre-existing Focus Coolgardie tenements comprising the Dreadnought Tindals, Norris Lord Bob, Nepean and the Mount projects (discussed by Besserer et al., 2005).'],
       ['Upon completion of the purchase of the Herald Leviathan interest (completed in February, 2006), Matador will be deemed to have earned a 10% interest in the CGP and the RJV.'],
       ['Matador will then be required to spend $8.0 million in exploration and/or development expenditures over three years to earn a further 40% in the CGP and the RJV (Besserer et al., 2005).'],
       ['In the southern portion of the Kalgoorlie Terrane of the Menzies-Norseman Greenstone Belt gold mineralization at the Mount project is hosted by quartz lodes and reefs in mafic to ultramafic rocks with the lodes and reefs developed in and parallel to a fold hinge axis.'],
       ['The quartz lodes and reefs are spatially associated with a stro

In [14]:
# view reviewed events
df.loc[df.reviewed].tail(10)

Unnamed: 0.1,Unnamed: 0,event_id,filename,sentence_idx,sentence_text,n_trigger_words_in_sentence,trigger_words_in_sentence,n_trigger_words_in_event,trigger_words_in_event,event_text,STRAT,event_label,ROCK,LOCATION,MINERAL,ORE_DEPOSIT,TIMESCALE,reviewed,Near Miss Event,Key trigger phrase
274,274,a078820_escape creek combined annual report to...,a078820_escape creek combined annual report to...,1,the are patchy areas of cainozoic sand cover.,1,patchy,1,patchy,the escape creek project area is dominated by ...,pentecost sandstone,0,sand,,,,"proterozoic, cainozoic",True,True,
275,275,a078820_escape creek combined annual report to...,a078820_escape creek combined annual report to...,8,the potential presence of kimberlite in the es...,1,potential,1,potential,the western australian kimberley region is par...,,0,"kimberlitic rocks, kimberlite, lamproites, lam...","ellendale, kimberley, argyle","diamonds, diamond",,,True,True,
276,276,a078820_escape creek combined annual report to...,a078820_escape creek combined annual report to...,43,the data were not considered sufficiently enco...,1,encouraging,1,encouraging,the first systematic exploration for diamonds ...,,0,"lamproites, gravel","kimberley, southwest",diamonds,,,True,False,
277,277,a078820_escape creek combined annual report to...,a078820_escape creek combined annual report to...,48,"in the 1970s and 1980s, a number of groups, wh...",1,broad,1,broad,from 1968 to 1971 stellar minerals nl worked i...,mount abbott lamproite,0,lamproite,"western australia, kimberley, argyle diamond mine","garnets, diamonds, ilmenites, pyrope, diamond",,,True,False,
278,278,a078820_escape creek combined annual report to...,a078820_escape creek combined annual report to...,53,many kimberlite or lamproite bodies were disco...,1,potential,1,potential,"in certain areas, aeromagnetic surveys were em...",,0,"lamproites, kimberlites, lamproite, kimberlite",ellendale,,,,True,False,
279,279,a078899_e27310atr_10484068_22,a078899_e27310atr_10484068.json,22,the whole package has been folded into a broad...,1,broad,1,broad,the lower most sequence comprises a calc alkal...,,0,"komatiite, tuff, tuffs, andesite, basalt, silt...","bulong, balagundi, kalgoorlie terrane",,,archaean,True,False,
280,280,a078899_e27310atr_10484068_33,a078899_e27310atr_10484068.json,33,extensive lateritised peridotite sequences wer...,1,extensive,1,extensive,"companies including ada exploration pty, weste...",,0,"gossans, gabbro, peridotite",south west,copper,,,True,False,
281,281,a078899_e27310atr_10484068_36,a078899_e27310atr_10484068.json,36,5 exploration during the reporting period expl...,1,mineralisation,1,mineralisation,australian selection intersected anomalous cop...,,0,"regolith, gabbro","balagundi, south west","copper, nickel, gold, zinc",vhms,,True,True,
282,282,a078899_e27310atr_10484068_50,a078899_e27310atr_10484068.json,50,5.2 solid geology study lithos x mineral explo...,1,mineralisation,1,mineralisation,the samples were dispatched to acme laboratory...,,0,"saprolite, sediments, lateritic duricrust, lat...",balagundi,"nickel, gold",vms,,True,False,
283,283,a078899_e27310atr_10484068_62,a078899_e27310atr_10484068.json,62,regional work indicates this subdomain has a h...,1,enriched,1,enriched,the bulk of the tenements in the project area ...,,0,"basalt, volcaniclastic sediments, gabbroic roc...",balagundi,,base metal,,True,True,


In [13]:
# export csv when done
save_path = events_path.joinpath(f'group_{event_group}_labelled.csv').resolve()
df.to_csv(save_path)
print(f'Saved group {event_group} labelled data to {save_path}')

Saved group 0 labelled data to /Users/rossgreen/Documents/UWA/CITS5553 - Capstone/capstone/events/group_0_labelled.csv
