In [6]:
import re
import os
import glob
from pathlib import Path
import json
from shutil import copyfile
import itertools
from collections import Counter

# analytics
import pandas as pd
import numpy as np

# Spacy Imports
import spacy

from spacy import displacy
from spacy.pipeline import EntityRuler
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc, Span, Token
from spacy.attrs import intify_attrs

def display_ent(doc):
    """Render the entities of ``doc`` inline in the notebook with displaCy.

    Args:
        doc: the object handed to ``displacy.render`` (callers in this
            notebook pass the result of ``nlp(text)``).

    Returns:
        None. The visualisation is emitted as notebook output, so the call
        should not be wrapped in ``print``.
    """
    render_options = {"style": "ent", "jupyter": True}
    displacy.render(doc, **render_options)

In [7]:
# Project root: the parent of the current working directory
base_path = Path('..')

# Directory the event CSV files are read from and written to
events_path = base_path.joinpath('events')

# Dictionary resources; the entity patterns are loaded from patterns_path
# (triggers_path is presumably the trigger-word lists -- not used in this notebook)
dictionary_path = base_path.joinpath('dictionary')
triggers_path = dictionary_path.joinpath('triggers')
patterns_path = dictionary_path.joinpath('patterns')

In [8]:
# load patterns for simple geological NER during labelling process
# Only regular files are read: iterdir() also yields sub-directories (for
# example Jupyter's .ipynb_checkpoints), which open() cannot read. The files
# are sorted so the patterns are added in the same order on every run.
patterns = []
for filename in sorted(p for p in patterns_path.iterdir() if p.is_file()):
    with open(filename, encoding="utf8") as f:
        # each file is expected to hold a JSON list of pattern entries
        patterns += json.load(f)

print(f'loaded {len(patterns)} patterns')

spacy_model = "en_core_web_lg"
nlp = spacy.load(spacy_model)
print(f'loaded {spacy_model} as nlp')

# add loaded patterns to spacy model pipeline
# (spaCy v2 style: the ruler object is built first, then appended to the pipeline)
ruler = EntityRuler(nlp, overwrite_ents=True)
ruler.add_patterns(patterns)

nlp.add_pipe(ruler)

loaded 31120 patterns
loaded en_core_web_lg as nlp


In [9]:
# specify your group number here
event_group = 6

# if in process of labelling data, load previously reviewed event data
# The path is bound with a plain assignment: using a Path object as a context
# manager is deprecated since Python 3.11 and removed in Python 3.13.
file_path = events_path.joinpath(f'group_{event_group}_labelled.csv').resolve()

if os.path.isfile(file_path):  # check if file exists
    df = pd.read_csv(file_path)
    print(f'Loaded pre-labelled data from {file_path}')
else:  # else if labelling-in-progress csv does not exist try loading the unlabelled version
    try:
        print(f'Previously labelled data does not exist - loading group_{event_group}_events.csv')
        df = pd.read_csv(events_path.joinpath(f'group_{event_group}_events.csv').resolve())
        # fresh data: nothing has been reviewed yet
        df['reviewed'] = False
    except Exception as e:  # error handling
        # NOTE: df stays undefined on this path, so later cells will fail until
        # the events file has been extracted.
        print(f'File probably does not exist. Please extract events and save to {events_path.resolve()}')
        print(e)

Previously labelled data does not exist - loading group_6_events.csv


In [10]:
# peek at the first event record, one field per line
# (a single row is a Series, whose transpose is itself)
df.iloc[0]

event_id                        a072171_2005 everlasting annual rpt_12941703_1
filename                     a072171_2005 everlasting annual rpt_12941703.json
sentence_idx                                                                 1
sentence_text                a detailed report titled potential of the nors...
n_trigger_words                                                              1
trigger_words_in_sentence                                        ['potential']
trigger_words_in_event                         ['potential', 'mineralisation']
event_text                   the norseman project has significant explorati...
ORE_DEPOSIT                                                                 []
ROCK                                                                        []
MINERAL                                                                     []
STRAT                                                                       []
LOCATION                                  ['norseman

In [None]:
# clear cell display if desired
from IPython.display import clear_output
clear_cell_display = True

# Accepted answers (compared after stripping whitespace and lower-casing)
true_responses = ("yes", "y", "ye", "yeah", "true", "t")
false_responses = ("n", "no", "nope", "nah", "false", "f")
unknown_responses = ('u', 'unknown', 'none')
quit_responses = ("q", "quit", "stop", 'exit')

# Walk through every event that has not been reviewed yet and record the
# reviewer's answer directly on df.
for idx, row in df.loc[~df.reviewed].iterrows():
    # print event information
    print(f'File Index {idx} - File Name : {row.filename}\n')
    print(row.event_text)
    print('')
    print("The trigger word(s) detected in this text chunck were: {}".format(row.trigger_words_in_sentence))
    print('')
    print("The sentence that the trigger word occured in was: {}.".format(row.sentence_text))
    # display_ent renders inline and returns None, so it is called directly
    # (wrapping it in print() only emitted a stray "None")
    display_ent(nlp(row.event_text))

    # take user input
    label = input("\nIs this a near miss event?(True/False)\n")
    response = label.strip().lower()  # tolerate stray spaces around the answer
    reviewed = True

    # if user response is true, then mark as near miss and query for reasoning
    if response in true_responses:
        label = True
        trigger_phrase = input("""
        \n Was there a phrase/word that indicated that this was a near miss event? Please type phrase that can be added to trigger phrases, leave blank if none\n""")

    # if user response is false, then mark accordingly with no trigger phrase
    elif response in false_responses:
        label = False
        trigger_phrase = ''

    # if near miss label is uncertain, leave as unreviewed so as to come back later.
    elif response in unknown_responses:
        print('Not specifying a label for this sentence.')
        # Reset both values so that neither the raw answer text nor the
        # previous row's trigger phrase can be recorded for this row.
        label = None
        trigger_phrase = None
        reviewed = False

    # allow text input for quit
    elif response in quit_responses:
        break

    # invalid input
    else:
        print('Input did not match list of valid commands.')
        print(f'Input received was {label}')
        print('Try: "true" (or "T"), "false" (or "F"), "unknown" (or "U"), or "quit" (or "Q").')
        break

    print(f'label = {label}')
    # only reviewed rows receive a label / trigger phrase; unknown rows are
    # left untouched apart from staying flagged as unreviewed
    if reviewed:
        df.loc[idx, 'Near Miss Event'] = label
        df.loc[idx, 'Key trigger phrase'] = trigger_phrase
    df.loc[idx, 'reviewed'] = reviewed
    print('')

    # clear cell output if true
    if clear_cell_display:
        clear_output()
    

File Index 2 - File Name : a072171_2005 everlasting annual rpt_12941703.json

introduction australian gold investments pty ltd (agi), a sydney based private company purchased the everlasting tenements from kinross gold australia pty ltd (kga) on august 1st 2005. the everlasting project covers an area of 436 hectares (table 1) is located about 8 kilometres northeast of norseman, and about 25 kilometres northeast of mt. the project has two prospects gladstone and daisy that lie a kilometre from each other and are host to two major mineralised shear zones that extend southward onto the neighbouring property held by croesus mining nl (crs). croesus have recently developed and mined by open cut methods part of the daisy and gladstone mineralisation. mineable reserves (source crs annual reports) were; daisy 286kt @ 5.73g t au (53koz) mined 2003 4 gladstone 167kt @ 4.17g t au (22koz) mined 2004 5 the gladstone north and daisy prospects were (until recently) thought to be hosted in rocks of th

None


In [13]:
# show only the rows flagged as reviewed
df[df['reviewed']]

Unnamed: 0,event_id,filename,sentence_idx,sentence_text,n_trigger_words,trigger_words_in_sentence,trigger_words_in_event,event_text,ORE_DEPOSIT,ROCK,MINERAL,STRAT,LOCATION,TIMESCALE,event_label,reviewed,Near Miss Event,Key trigger phrase
0,a072171_2005 everlasting annual rpt_12941703_1,a072171_2005 everlasting annual rpt_12941703.json,1,a detailed report titled potential of the nors...,1,['potential'],"['potential', 'mineralisation']",the norseman project has significant explorati...,[],[],[],[],"['norseman', 'norseman', 'norseman']",[],0,True,False,
1,a072171_2005 everlasting annual rpt_12941703_6,a072171_2005 everlasting annual rpt_12941703.json,6,exploration potential and recommendations.,1,['potential'],"['mineralisation', 'potential']",better definition of soil anomalies is require...,[],[],"['gold', 'gold']",[],['australia'],[],0,True,True,survey has been recommended


In [None]:
# # to redo a label, mark the affected row indices as un-reviewed, then re-run the labelling cell
# for i in (4,5):
#     df.loc[i,'reviewed'] = False
    
# df.loc[df.reviewed]

In [26]:
# export csv when done
save_path = events_path.joinpath(f'group_{event_group}_labelled.csv').resolve()
# index=False: this file is read back with a plain pd.read_csv, so a written
# index would return as an extra "Unnamed: 0" column on every save/reload cycle.
df.to_csv(save_path, index=False)
print(f'Saved group {event_group} labelled data to {save_path}')

Saved group 2 labelled data to /home/daniel/capstone/events/group_2_labelled.csv
