In [5]:
import re
import os
import glob
from pathlib import Path
import json
from shutil import copyfile
import itertools
from collections import Counter

# analytics
import pandas as pd
import numpy as np

# Spacy Imports
import spacy

from spacy import displacy
from spacy.pipeline import EntityRuler
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc, Span, Token
from spacy.attrs import intify_attrs

def display_ent(doc):
    """Render the named entities of ``doc`` inline in the notebook via displaCy.

    Args:
        doc: the parsed spaCy document whose entities should be highlighted.

    Returns:
        None. The visualisation is emitted as cell output, not returned.
    """
    render_options = {"style": "ent", "jupyter": True}
    displacy.render(doc, **render_options)

# Name of the spaCy pipeline package to load (must be installed in the environment).
spacy_model = "en_core_web_sm"
#spacy_model = "en_core_web_lg"  # alternative model; uncomment to use it instead

# Load the pipeline once; `nlp` is reused by the labelling cell to parse event text.
nlp = spacy.load(spacy_model)

In [6]:
# Project root, expressed relative to the current working directory
base_path = Path('..')

# Folder under the project root that holds the event CSV files
events_path = base_path.joinpath('events')

In [10]:
# specify your group number here
event_group = 6

# Candidate input files: the labelling-in-progress CSV takes priority over the
# raw (unlabelled) events CSV.
# NOTE: the paths are plain variables rather than `with path as ...` targets.
# Using a Path as a context manager was deprecated in Python 3.11 and removed in
# 3.13, and it never managed any resource.
labelled_path = events_path.joinpath(f'group_{event_group}_labelled.csv').resolve()
unlabelled_path = events_path.joinpath(f'group_{event_group}_events.csv').resolve()

if labelled_path.is_file():
    # labelling is already in progress: resume from the previously reviewed event data
    df = pd.read_csv(labelled_path)
else:
    # no labelling-in-progress csv yet: start from the unlabelled version
    print(f'Previously labelled data does not exist - loading group_{event_group}_events.csv')
    try:
        df = pd.read_csv(unlabelled_path)
    except FileNotFoundError as e:
        # Give the user the actionable hint, then re-raise so the cell fails here
        # instead of continuing with an undefined (or stale) `df`.
        print(f'File probably does not exist. Please extract events and save to {events_path.resolve()}')
        print(e)
        raise
    # every event starts out unreviewed
    df['reviewed'] = False

df

Previously labelled data does not exist - loading group_6_events.csv


Unnamed: 0,event_id,filename,sentence_idx,sentence_text,n_trigger_words,trigger_words,event_text,ORE_DEPOSIT,ROCK,MINERAL,STRAT,LOCATION,TIMESCALE,event_label,reviewed
0,a072171_2005 everlasting annual rpt_12941703_1,a072171_2005 everlasting annual rpt_12941703.json,1,a detailed report titled potential of the nors...,1,['potential'],the norseman project has significant explorati...,[],[],[],[],"['norseman', 'norseman', 'norseman']",[],0,False
1,a072171_2005 everlasting annual rpt_12941703_3,a072171_2005 everlasting annual rpt_12941703.json,3,better definition of soil anomalies is require...,1,['mineralisation'],a detailed report titled potential of the nors...,[],[],[],[],"['norseman', 'norseman']",[],0,False
2,a072171_2005 everlasting annual rpt_12941703_6,a072171_2005 everlasting annual rpt_12941703.json,6,exploration potential and recommendations.,1,['potential'],regional geological setting . exploration comp...,[],[],[],[],[],[],0,False
3,a072171_2005 everlasting annual rpt_12941703_11,a072171_2005 everlasting annual rpt_12941703.json,11,croesus have recently developed and mined by o...,1,['mineralisation'],the everlasting project covers an area of 436 ...,[],"['banded iron formation', 'bif']",[],"['penneshaw formation', 'noganyer formation']","['norseman', 'gladstone', 'gladstone', 'gladst...",[],0,False
4,a072171_2005 everlasting annual rpt_12941703_59,a072171_2005 everlasting annual rpt_12941703.json,59,the area around the gladstone and daisy teneme...,1,['mineralisation'],archaean lithology the dominant archaean litho...,[],"['basalt', 'dolerite', 'gabbro', 'metasediment...","['gold', 'zircon']","['penneshaw formation', 'penneshaw formation']","['gladstone', 'gladstone', 'long']","['archaean', 'archaean', 'archaean']",0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
696,a086030_20-100-go-ann-0008_ks_annual_report_20...,a086030_20-100-go-ann-0008_ks_annual_report_20...,24,westralian iron pty ltd 20 100 go ann 0008 com...,1,['extensive'],"archaean granitoid rocks: ago, ags in the nort...",[],"['granitoid', 'granitoid', 'gneiss', 'granite'...","['quartz', 'feldspar', 'biotite', 'iron']",[],"['south west', 'koolanooka']","['archaean', 'tertiary', 'tertiary', 'archaean']",0,False
697,a086030_20-100-go-ann-0008_ks_annual_report_20...,a086030_20-100-go-ann-0008_ks_annual_report_20...,30,the synform broadly consists of a middle sedim...,1,['extensive'],weathering has transformed the near surface bi...,['limonite'],"['bif', 'laterite', 'granitoid', 'granitoid', ...","['hematite', 'goethite']",[],['koolanooka'],['archaean'],0,False
698,a086030_20-100-go-ann-0008_ks_annual_report_20...,a086030_20-100-go-ann-0008_ks_annual_report_20...,44,"very extensive magnetite rich, steeply dipping...",1,['extensive'],the koolanooka hills comprises of a nnw trendi...,[],"['banded iron formation', 'bif', 'bif', 'bifs']","['magnetite', 'hematite']",[],"['koolanooka', 'mid west', 'western australia'...",[],0,False
699,a086030_20-100-go-ann-0008_ks_annual_report_20...,a086030_20-100-go-ann-0008_ks_annual_report_20...,91,broad acre farming including wheat cropping an...,1,['broad'],"the closest population centres are morawa, loc...",['iron ore'],[],[],[],"['morawa', 'perenjori', 'koolanooka']",[],0,False


In [11]:
# clear cell display if desired
from IPython.display import clear_output
clear_cell_display = True

# Answers that count as "yes". The user's input is stripped and lower-cased before
# the comparison so that e.g. "yes", "YES" or " y " are not silently recorded as False.
yes_answers = {"yes", "y", "ye", "yeah", "true"}

# Walk through every event that has not been reviewed yet and record the user's
# verdict directly in `df` (columns 'Near Miss Event', 'reviewed', 'Key trigger phrase').
for idx, row in df.loc[~df.reviewed].iterrows():
    # print event information
    print(f'File Index {idx} - File Name : {row.filename}\n')
    print(row.event_text)
    print('')
    print("The trigger word(s) detected in this text chunck were: {}".format(row.trigger_words))
    # display_ent renders inline and returns None, so it is called directly
    # rather than wrapped in print() (which would emit a stray "None").
    display_ent(nlp(row.event_text))

    # take user input
    answer = input("\nIs this a near miss event?(True/False) ").strip().lower()

    # exit: typing "stop" ends the session and leaves the current row unreviewed
    if answer == "stop":
        break

    label = answer in yes_answers

    # Reset for every row: a "no" answer must neither fail on an undefined name
    # nor inherit the phrase typed for a previous event.
    trigger_phrase = ''
    if label:
        trigger_phrase = input("""\n Was there a phrase/word that indicated that this was a near miss event?
                              please type phrase that can be added to trigger phrases, leave blank if none""")

    print(f'label = {label}')
    df.loc[idx, 'Near Miss Event'] = label
    df.loc[idx, 'reviewed'] = True
    df.loc[idx, 'Key trigger phrase'] = trigger_phrase
    print('')

    # clear cell output if true
    if clear_cell_display:
        clear_output()

File Index 0 - File Name : a072171_2005 everlasting annual rpt_12941703.json

the norseman project has significant exploration upside. a detailed report titled potential of the norseman project 2002 identifies most significant targets. the powerpoint, norseman exploration eoy 2004 also provides an illustrative overview.

The trigger word(s) detected in this text chunck were: ['potential']


None

Is this a near miss event?(True/False) stop


In [12]:
df

Unnamed: 0,event_id,filename,sentence_idx,sentence_text,n_trigger_words,trigger_words,event_text,ORE_DEPOSIT,ROCK,MINERAL,STRAT,LOCATION,TIMESCALE,event_label,reviewed
0,a072171_2005 everlasting annual rpt_12941703_1,a072171_2005 everlasting annual rpt_12941703.json,1,a detailed report titled potential of the nors...,1,['potential'],the norseman project has significant explorati...,[],[],[],[],"['norseman', 'norseman', 'norseman']",[],0,False
1,a072171_2005 everlasting annual rpt_12941703_3,a072171_2005 everlasting annual rpt_12941703.json,3,better definition of soil anomalies is require...,1,['mineralisation'],a detailed report titled potential of the nors...,[],[],[],[],"['norseman', 'norseman']",[],0,False
2,a072171_2005 everlasting annual rpt_12941703_6,a072171_2005 everlasting annual rpt_12941703.json,6,exploration potential and recommendations.,1,['potential'],regional geological setting . exploration comp...,[],[],[],[],[],[],0,False
3,a072171_2005 everlasting annual rpt_12941703_11,a072171_2005 everlasting annual rpt_12941703.json,11,croesus have recently developed and mined by o...,1,['mineralisation'],the everlasting project covers an area of 436 ...,[],"['banded iron formation', 'bif']",[],"['penneshaw formation', 'noganyer formation']","['norseman', 'gladstone', 'gladstone', 'gladst...",[],0,False
4,a072171_2005 everlasting annual rpt_12941703_59,a072171_2005 everlasting annual rpt_12941703.json,59,the area around the gladstone and daisy teneme...,1,['mineralisation'],archaean lithology the dominant archaean litho...,[],"['basalt', 'dolerite', 'gabbro', 'metasediment...","['gold', 'zircon']","['penneshaw formation', 'penneshaw formation']","['gladstone', 'gladstone', 'long']","['archaean', 'archaean', 'archaean']",0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
696,a086030_20-100-go-ann-0008_ks_annual_report_20...,a086030_20-100-go-ann-0008_ks_annual_report_20...,24,westralian iron pty ltd 20 100 go ann 0008 com...,1,['extensive'],"archaean granitoid rocks: ago, ags in the nort...",[],"['granitoid', 'granitoid', 'gneiss', 'granite'...","['quartz', 'feldspar', 'biotite', 'iron']",[],"['south west', 'koolanooka']","['archaean', 'tertiary', 'tertiary', 'archaean']",0,False
697,a086030_20-100-go-ann-0008_ks_annual_report_20...,a086030_20-100-go-ann-0008_ks_annual_report_20...,30,the synform broadly consists of a middle sedim...,1,['extensive'],weathering has transformed the near surface bi...,['limonite'],"['bif', 'laterite', 'granitoid', 'granitoid', ...","['hematite', 'goethite']",[],['koolanooka'],['archaean'],0,False
698,a086030_20-100-go-ann-0008_ks_annual_report_20...,a086030_20-100-go-ann-0008_ks_annual_report_20...,44,"very extensive magnetite rich, steeply dipping...",1,['extensive'],the koolanooka hills comprises of a nnw trendi...,[],"['banded iron formation', 'bif', 'bif', 'bifs']","['magnetite', 'hematite']",[],"['koolanooka', 'mid west', 'western australia'...",[],0,False
699,a086030_20-100-go-ann-0008_ks_annual_report_20...,a086030_20-100-go-ann-0008_ks_annual_report_20...,91,broad acre farming including wheat cropping an...,1,['broad'],"the closest population centres are morawa, loc...",['iron ore'],[],[],[],"['morawa', 'perenjori', 'koolanooka']",[],0,False


In [None]:
# export csv when done
# Save into events_path: the loading cell looks for group_<n>_labelled.csv there,
# so writing to the current working directory would make resuming impossible
# unless the two locations happen to coincide.
labelled_output_path = events_path / f'group_{event_group}_labelled.csv'

# index=False: writing the index would add one more unnamed column to the file
# on every save / reload cycle.
df.to_csv(labelled_output_path, index=False)
print(f'Saved {labelled_output_path}')