In [1]:
import re
import os
import glob
from pathlib import Path
import json
from shutil import copyfile
import itertools
from collections import Counter

# analytics
import pandas as pd
import numpy as np

# Spacy Imports
import spacy

from spacy import displacy
from spacy.pipeline import EntityRuler
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc, Span, Token
from spacy.attrs import intify_attrs

def display_ent(doc):
    """Render the entity annotations of ``doc`` inline in the notebook.

    Thin wrapper around ``displacy.render`` using the ``"ent"`` style with
    ``jupyter=True``. The wrapper itself returns nothing; the rendering is
    a display side effect.
    """
    render_options = {"style": "ent", "jupyter": True}
    displacy.render(doc, **render_options)

# Name of the spaCy pipeline package passed to spacy.load below.
spacy_model = "en_core_web_sm"
# Alternative pipeline name, currently disabled:
#spacy_model = "en_core_web_lg"

# Load the pipeline once at module level; later cells call `nlp(...)` on sentence text.
nlp = spacy.load(spacy_model)

In [2]:
# Root of the project, one directory above the notebook's working directory
base_path = Path('..')

# Directory holding the extracted event CSV files
events_path = base_path.joinpath('events')

In [5]:
# specify your group number here
event_group = 2

# Candidate input files: the in-progress labelled CSV takes priority,
# otherwise fall back to the raw (unlabelled) event extract.
labelled_path = events_path.joinpath(f'group_{event_group}_labelled.csv').resolve()
unlabelled_path = events_path.joinpath(f'group_{event_group}_events.csv').resolve()

# NOTE: a plain assignment is used instead of `with <Path> as ...`; Path objects
# are not resource managers and their context-manager support was removed in
# Python 3.13.
if labelled_path.is_file():
    # resume labelling: load previously reviewed event data
    df = pd.read_csv(labelled_path)
else:
    # labelling-in-progress csv does not exist - try loading the unlabelled version
    try:
        print(f'Previously labelled data does not exist - loading group_{event_group}_events.csv')
        df = pd.read_csv(unlabelled_path)
        # fresh data: nothing has been reviewed yet
        df['reviewed'] = False
    except Exception as e:  # report the problem; `df` stays undefined in this case
        print(f'File probably does not exist. Please extract events and save to {events_path.resolve()}')
        print(e)

Previously labelled data does not exist - loading group_2_events.csv


In [6]:
# clear cell display if desired
from IPython.display import clear_output
clear_cell_display = True

# Step through every event that has not been reviewed yet and ask for a label.
# Typing 'stop' at the prompt ends the session without touching the current row.
for idx, row in df.loc[~df.reviewed].iterrows():
    # print event information
    print(f'File Index {idx} - File Name : {row.filename}\n')
    print(row.sentence_text)
    print('')
    # display_ent renders as a side effect and returns None, so it must not be
    # wrapped in print() (that emitted a stray "None" under every event).
    display_ent(nlp(row.sentence_text))

    # take user input (stored exactly as typed, i.e. as a string)
    label = input("\nIs this a near miss event? ")

    # exit
    if label == 'stop':
        break

    # record the answer and mark the row as done so it is skipped next time
    print(f'label = {label}')
    df.loc[idx, 'event_label'] = label
    df.loc[idx, 'reviewed'] = True
    print('')

    # clear cell output if true
    if clear_cell_display:
        clear_output()
    

File Index 0 - File Name : a081752_anrep2008eraheedy2103_15107355.json

project name: granite peak tenement numbers: e69 2103 tenement operator: atomic resources limited tenement holder: resource search pty ltd report type: annual report period: 1st september 2007 30th august 2008 author: atomic resources date of report: 30 october 2008 1:250 000 map sheet: nabberu ( sg51 5) 1:100,000 map sheet : nabberu ( 3046) target commodity: gold, copper, uranium prospects drilled: nil.



None

Is this a near miss event? stop


In [7]:
# Bare last expression: display the DataFrame to inspect the labels recorded so far.
df

Unnamed: 0,event_id,filename,sentence_idx,sentence_text,n_trigger_words,trigger_words,event_text,ORE_DEPOSIT,ROCK,MINERAL,STRAT,LOCATION,TIMESCALE,event_label,reviewed
0,a081752_anrep2008eraheedy2103_15107355_1,a081752_anrep2008eraheedy2103_15107355.json,1,project name: granite peak tenement numbers: e...,1,['drill'],annual report for the year ending 1st septembe...,['uranium'],['granite'],"['gold', 'copper']",[],[],[],0,False
1,a081752_anrep2008eraheedy2103_15107355_2,a081752_anrep2008eraheedy2103_15107355.json,2,the project comprises one exploration licence ...,1,['exploration licence'],annual report for the year ending 1st septembe...,['uranium'],['granite'],"['gold', 'copper']",[],['wiluna'],['paleoproterozoic'],0,False
2,a081752_anrep2008eraheedy2103_15107355_16,a081752_anrep2008eraheedy2103_15107355.json,16,mineral occurrences and exploration potential ...,1,['potential'],"bibliography bunting ja 1986, geology of the e...",[],['granite'],[],[],"['nabberu basin', 'western australia', 'wester...",[],0,False
3,a081752_anrep2008eraheedy2103_15107355_20,a081752_anrep2008eraheedy2103_15107355.json,20,results: follow up targets identified fro fiel...,2,"['drill', 'follow up']",geology: located within the paleoproterozoic e...,[],"['sediments', 'silts', 'sands', 'calcrete']",[],[],[],"['paleoproterozoic', 'cainozoic']",0,False
4,a075210_buck_a_ el12_1_2007_11292066_45,a075210_buck_a_ el12_1_2007_11292066.json,45,"92.670 0.200 139441 sandstone (75%): light, bu...",1,['minor'],top of e25 seam 75.480 0.960 139439 coal undif...,[],"['coal', 'sandstone', 'sandstone', 'coal', 'sa...","['pyrite', 'pyrite']",[],"['grey', 'grey', 'grey']",[],0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2273,a080097_microsoft word - 081110_e52_1711_parti...,a080097_microsoft word - 081110_e52_1711_parti...,38,anaconda nickel ltd conducted a shallow rc and...,1,['potential'],this prospect is currently inactive due to nat...,[],"['laterite', 'laterites']",['nickel'],[],[],[],0,False
2274,a080097_microsoft word - 081110_e52_1711_parti...,a080097_microsoft word - 081110_e52_1711_parti...,39,drilling and metallurgical test results indica...,1,['contain'],the mineralisation may be co genetic with that...,[],"['laterite', 'laterites']","['nickel', 'nickel', 'sulphides', 'gold']",[],[],[],0,False
2275,a080097_microsoft word - 081110_e52_1711_parti...,a080097_microsoft word - 081110_e52_1711_parti...,43,the focus of the work was to identify areas of...,2,"['potential', 'mineralisation']",exploration exploration work completed over th...,[],['lag'],"['gold', 'gold']",[],[],[],0,False
2276,a080097_microsoft word - 081110_e52_1711_parti...,a080097_microsoft word - 081110_e52_1711_parti...,46,neither sampling media identified any anomalie...,1,['follow up'],the geochemical response within the surrendere...,[],[],"['gold', 'gold']",[],['bangemall basin'],[],0,False


In [None]:
# export csv when done
# Write next to the source event files: the loading cell looks for
# group_<n>_labelled.csv inside events_path, so saving anywhere else means the
# progress is never picked up again on the next session.
# index=False keeps the row index out of the file so reloading does not add an
# extra "Unnamed: 0" column on every save/load cycle.
df.to_csv(events_path.joinpath(f'group_{event_group}_labelled.csv'), index=False)
print(f'Saved group_{event_group}_labelled.csv to {events_path.resolve()}')