In [4]:
import re
import os
import glob
from pathlib import Path
import json
from shutil import copyfile
import itertools
from collections import Counter

# analytics
import pandas as pd
import numpy as np

# Spacy Imports
import spacy

from spacy import displacy
from spacy.pipeline import EntityRuler
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc, Span, Token
from spacy.attrs import intify_attrs

def display_ent(doc):
    """Render the entities of ``doc`` inline in the notebook with displaCy.

    ``doc`` is handed straight to ``displacy.render`` using the ``"ent"``
    visualisation style in Jupyter mode. Nothing is returned; the
    visualisation is produced as a side effect of the call.
    """
    render_options = {"style": "ent", "jupyter": True}
    displacy.render(doc, **render_options)

In [5]:
# Project root, expressed relative to the current working directory
base_path = Path('..')

# Directory holding the event CSV files
events_path = base_path.joinpath('events')

# Dictionary directory and its trigger / pattern sub-directories
dictionary_path = base_path.joinpath('dictionary')
triggers_path, patterns_path = (
    dictionary_path.joinpath(subdir) for subdir in ('triggers', 'patterns')
)

In [28]:
# load patterns for simple geological NER during labelling process
#
# The directory listing is sorted so patterns are always loaded in the same
# order, and sub-directories / hidden entries (e.g. Jupyter's
# .ipynb_checkpoints) are skipped because they cannot be opened and parsed as
# JSON pattern files.
pattern_files = sorted(
    entry for entry in patterns_path.iterdir()
    if entry.is_file() and not entry.name.startswith('.')
)

patterns = []
for filename in pattern_files:
    with open(filename, encoding="utf8") as f:
        patterns.extend(json.load(f))

print(f'loaded {len(patterns)} patterns')

spacy_model = "en_core_web_lg"
nlp = spacy.load(spacy_model)
print(f'loaded {spacy_model} as nlp')

# add loaded patterns to spacy model pipeline
# overwrite_ents=True lets ruler matches replace entities already set on the doc.
# NOTE(review): constructing EntityRuler directly and passing the object to
# add_pipe looks like the spaCy v2 API - confirm the installed version.
ruler = EntityRuler(nlp, overwrite_ents=True)
ruler.add_patterns(patterns)

nlp.add_pipe(ruler)

loaded 31120 patterns
loaded en_core_web_lg as nlp


In [29]:
# specify your group number here
event_group = 2

# Two candidate inputs: a labelling-in-progress CSV (preferred, so earlier work
# is resumed) and the raw, unlabelled events CSV for the same group.
labelled_path = events_path.joinpath(f'group_{event_group}_labelled.csv').resolve()
unlabelled_path = events_path.joinpath(f'group_{event_group}_events.csv').resolve()

if labelled_path.is_file():
    # if in process of labelling data, load previously reviewed event data
    df = pd.read_csv(labelled_path)
    print(f'Loaded pre-labelled data from {labelled_path}')
elif unlabelled_path.is_file():
    # no labelling-in-progress csv yet: start from the unlabelled version and
    # mark every row as not yet reviewed
    print(f'Previously labelled data does not exist - loading group_{event_group}_events.csv')
    df = pd.read_csv(unlabelled_path)
    df['reviewed'] = False
else:
    # Fail loudly instead of printing and carrying on: otherwise `df` would be
    # undefined (or left over from an earlier run) when the next cells execute.
    raise FileNotFoundError(
        f'Neither {labelled_path.name} nor {unlabelled_path.name} exists. '
        f'Please extract events and save to {events_path.resolve()}'
    )

Loaded pre-labelled data from /home/daniel/capstone/events/group_2_labelled.csv


In [30]:
# clear cell display if desired
from IPython.display import clear_output
clear_cell_display = True

# Accepted responses (compared after stripping whitespace and lower-casing).
TRUE_RESPONSES = {"yes", "y", "ye", "yeah", "true", "t"}
FALSE_RESPONSES = {"n", "no", "nope", "nah", "false", "f"}
UNKNOWN_RESPONSES = {"u", "unknown", "none"}
QUIT_RESPONSES = {"q", "quit", "stop", "exit"}
VALID_RESPONSES = TRUE_RESPONSES | FALSE_RESPONSES | UNKNOWN_RESPONSES | QUIT_RESPONSES

TRIGGER_PHRASE_PROMPT = (
    "\n        \n Was there a phrase/word that indicated that this was a near miss event? "
    "Please type phrase that can be added to trigger phrases, leave blank if none\n"
)

# Walk through every event that has not been reviewed yet and ask for a label.
for idx, row in df.loc[~df.reviewed].iterrows():
    # print event information
    print(f'File Index {idx} - File Name : {row.filename}\n')
    print(row.event_text)
    print('')
    print("The trigger word(s) detected in this text chunck were: {}".format(row.trigger_words))
    # display_ent renders into the notebook and returns None, so it is called
    # directly; printing its return value would emit a stray "None".
    display_ent(nlp(row.event_text))

    # take user input, re-asking on a typo so one bad keystroke does not end
    # the labelling session
    while True:
        raw_response = input("\nIs this a near miss event?(True/False)\n")
        response = raw_response.strip().lower()
        if response in VALID_RESPONSES:
            break
        print('Input did not match list of valid commands.')
        print(f'Input received was {raw_response}')
        print('Try: "true" (or "T"), "false" (or "F"), "unknown" (or "U"), or "quit" (or "Q").')

    # allow text input for quit
    if response in QUIT_RESPONSES:
        break

    if response in UNKNOWN_RESPONSES:
        # if near miss label is uncertain, leave as unreviewed so as to come
        # back later; no label or trigger phrase is written for this row
        print('Not specifying a label for this sentence.')
        df.loc[idx, 'reviewed'] = False
    else:
        label = response in TRUE_RESPONSES
        # only a positive label asks for the phrase that gave it away;
        # a negative label is stored with no trigger phrase
        trigger_phrase = input(TRIGGER_PHRASE_PROMPT) if label else ''

        print(f'label = {label}')
        df.loc[idx, 'Near Miss Event'] = label
        df.loc[idx, 'reviewed'] = True
        df.loc[idx, 'Key trigger phrase'] = trigger_phrase
    print('')

    # clear cell output if true
    if clear_cell_display:
        clear_output()
    

File Index 101 - File Name : a072786_carlow castle 2005a c453-1996_14242581.json

further exploration of legends carlow castle project area is therefore recommended. exploration by legend between 1995 and 2004 has been reported in detail in previous annual reports to doir. initially, legends work focused on areas of historic mining but later moved to identification of new targets and the discovery of a blind copper gold resource at carlow south. the carlow castle project is comprised of exploration licence 47 562 and prospecting licences 47 944 and 47 945, and is located 1,500km north of perth in the western part of the pilbara region of western australia. it is 30km from karratha, a regional centre with a major airport and is 10 km southwest of the township of roebourne. the reporting period is from 1 january to 31 december annually. the carlow castle prospect covers gold copper cobalt mineralisation hosted by westsouth westerly trending archaean greenstones of the roebourne group. se

None

Is this a near miss event?(True/False)
q


In [25]:
# show only the rows whose `reviewed` flag is set
df[df['reviewed']]

Unnamed: 0.1,Unnamed: 0,event_id,filename,sentence_idx,sentence_text,n_trigger_words,trigger_words,event_text,ORE_DEPOSIT,ROCK,MINERAL,STRAT,LOCATION,TIMESCALE,event_label,reviewed,Near Miss Event,Key trigger phrase
0,0,a081752_anrep2008eraheedy2103_15107355_16,a081752_anrep2008eraheedy2103_15107355.json,16,mineral occurrences and exploration potential ...,1,['potential'],resource search pty ltd. september 2008 2 the ...,['uranium'],"['calcrete', 'granite', 'sediments', 'silts', ...",[],[],"['nabberu basin', 'western australia', 'wester...","['paleoproterozoic', 'cainozoic']",0,True,True,"weakly anomalous, studies to confirm the valid..."
1,1,a075210_buck_a_ el12_1_2007_11292066_235,a075210_buck_a_ el12_1_2007_11292066.json,235,further drilling in coming years will further ...,1,['further drilling'],of days worked at $ $ construction materials: ...,[],"['coal', 'coal']",[],[],"['ewington', 'premier', 'ewington']",[],0,True,True,prospective areas
2,2,a075210_buck_a_ el12_1_2007_11292066_246,a075210_buck_a_ el12_1_2007_11292066.json,246,the tenement was applied for on the 12 1 2005 ...,1,['possible'],60 appendix 3: coal resources within el12 1 66...,[],"['coal', 'coal', 'ash', 'coal', 'coal', 'coal']","['diamond', 'sulphur']",[],"['muja', 'collie', 'ewington', 'collie']",[],0,True,adfg,prospective areas
3,3,a080379_e80_2574_08atr_12876104_4,a080379_e80_2574_08atr_12876104.json,4,the east kimberley halls creek orogen is widel...,2,"['potential', 'mineralisation']",the planned ground magnetics survey should be ...,['pge'],[],"['gold', 'sulphide', 'sulphide']",[],"['kimberley', 'halls creek orogen', 'australia']",[],0,True,True,
4,4,a080379_e80_2574_08atr_12876104_5,a080379_e80_2574_08atr_12876104.json,5,hoatson also recognized broad similarities bet...,1,['broad'],the planned ground magnetics survey should be ...,"['pge', 'pge']",[],"['gold', 'sulphide', 'sulphide']",[],"['kimberley', 'halls creek orogen', 'australia']",[],0,True,True,"economic interest, extensively mineralized, ex..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
96,96,a072786_carlow castle 2005a c453-1996_14242581...,a072786_carlow castle 2005a c453-1996_14242581...,223,the discovery of a blind high grade copper gol...,3,"['low grade', 'mineralisation', 'encouraging']","based on the results of this program, openpit ...","['lodes', 'lode', 'lode', 'lode', 'lode']",['serpentinites'],"['gold', 'copper', 'gold', 'gold', 'copper', '...",[],[],[],0,True,True,
97,97,a072786_carlow castle 2005a c453-1996_14242581...,a072786_carlow castle 2005a c453-1996_14242581...,227,such a mineral resource could be the basis for...,1,['broad'],near sing well a few small open pits were sunk...,"['lode', 'lode', 'lode', 'lode']","['serpentinites', 'chert']","['gold', 'copper', 'copper', 'gold', 'copper',...",[],[],[],0,True,False,
98,98,a072786_carlow castle 2005a c453-1996_14242581...,a072786_carlow castle 2005a c453-1996_14242581...,228,the work by legend has outlined several other ...,1,['mineralisation'],exploration in the carlow castle area has conf...,"['lode', 'lode', 'lode', 'lode']","['chert', 'gabbro']","['gold', 'copper', 'copper', 'gold', 'copper',...",['andover complex'],[],[],0,True,True,extension to the mineralisation
99,99,a072786_carlow castle 2005a c453-1996_14242581...,a072786_carlow castle 2005a c453-1996_14242581...,233,"in particular, the prospects se of carlow cast...",1,['possible'],such a mineral resource could be the basis for...,"['lode', 'lode']","['chert', 'gabbro']","['copper', 'gold', 'copper', 'gold', 'copper',...",['andover complex'],"['south west', 'whundo']",[],0,True,True,


In [None]:
# # un-review specific rows (by index) so the labelling loop presents them again
# for i in (4,5):
#     df.loc[i,'reviewed'] = False
    
# df.loc[df.reviewed]

In [26]:
# export csv when done
save_path = events_path.joinpath(f'group_{event_group}_labelled.csv').resolve()
# index=False: the loading cell reads this file back without an index column,
# so writing the index would add one more "Unnamed: 0" column on every
# save / reload cycle.
df.to_csv(save_path, index=False)
print(f'Saved group {event_group} labelled data to {save_path}')

Saved group 2 labelled data to /home/daniel/capstone/events/group_2_labelled.csv
