In [1]:
%load_ext autoreload
%autoreload 2

from pipeline.preprocessing.processing import match_triggers, triggers_json_to_df, load_triggers
from pipeline.preprocessing.processing import load_files, load_json
from pipeline.data.metadata import get_report_data, get_geoview_data
import spacy

import pandas as pd
from tqdm import tqdm

classifier.py
helpers.py
processing.py
metadata.py


In [11]:
def extract_text_chunks(filenames, pad=2, skip_on_trigger=False):
    # if a dictionary is loaded where keys are filenames and values are pre-loaded files, we dont load from disk
    if type(filenames) == dict:
        files = filenames
    else:
        # load report files from disk to extract events on triggers
        files = load_files(filenames, data_path='data/wamex_xml', output='dict')
    
    if skip_on_trigger:
        pass
    
    return {
        file : [' '.join((pad*[''] + sentences + pad*[''])[idx : (1 + idx + (2*pad))]).strip()
                for idx in range(len(sentences))] 
        for file, sentences in tqdm(files.items(), desc='Extracting text chunks')
     }

def build_event_text(event, files=None, pad=0, labelled_ranges=True):
    if files == None:
        _, files = get_report_data(count_sentences=False, return_files=True)

    # return the customised lower and upper bounds on the labelled event (only near misses)
    if event.label and labelled_ranges:
        event_text = files[event.filename][
            event.idx + event.lower_bound + pad: event.idx + 1 + event.upper_bound + pad]
        return ' '.join(event_text).strip()

    # else return the default window - if pad = 0 then only return the sentence
    else:
        if pad == 0:  # return individual sentences
            return files[event.filename][event.idx]
        else:  # sliding window method
            file = (pad*[''] + files[event.filename] + pad*[''])  # pad file with extra sentences given pad
            event_text = file[event.idx : event.idx + 1 + (2*pad)]  # get window of width pad
            return ' '.join(event_text).strip()

def merge_datasets(datasets: dict, users = None, confidence='High'):
    # type checking confidence_threshold parameter input
    assert type(confidence) == str, 'confidence parameter must be a string'
    assert confidence.lower() in ('low','medium','high'), 'confidence parameter must be in ("low","medium","high")'
    
    users = users or ('daniel','charlie')  # specify data labellers
    
    # subset all datasets on reviewed, assign user column to user name, then concatenate and return
    df = pd.concat([dataset[user].loc[dataset[user].reviewed].assign(user=user) for user in users])
    
    # split into duplicates and non-duplicates
    duplicates = df.loc[df.index.duplicated(keep=False)]
    df = df.loc[~df.index.duplicated(keep=False)]
    
    if confidence.lower() == 'high':
        duplicates.loc[duplicates.confidence != 'High', 'label'] = False
    elif confidence.lower() == 'medium':
        duplicates.loc[duplicates.confidence == 'Low', 'label'] = False
    
    merged = duplicates[['filename','anumber','sentence_count','report_type','idx','triggers']].merge(
        duplicates.groupby('index').agg({'label': 'all', 'lower_bound': 'min', 'upper_bound': 'max'}),
        left_index=True, right_index=True).drop_duplicates()
    
    return pd.concat([df, merged]).drop(columns=['confidence','user', 'reviewed']).sort_index()

def load_group_all_labelled(geoview = None, capstone_files=None):
    import yaml
    if type(geoview) != pd.DataFrame:
        from pipeline.data.metadata import get_geoview_data
        geoview = get_geoview_data()
        
    if type(capstone_files) != pd.DataFrame:
        from pipeline.data.metadata import get_report_data
        capstone_files, _ = get_report_data(count_sentences=True, return_files=False)
        
    old_events = pd.read_csv('events/group_all_labelled.csv')

    # fix string and lists
    for group in (2,3,4,6):
        old_events.loc[old_events.group == group, 'trigger_words_in_sentence'] = old_events.loc[
            old_events.group == group, 'trigger_words_in_sentence'].map(yaml.safe_load).apply(
        lambda triggers : ', '.join(triggers).strip())
   
    # edit old columns and delete unnecessary ones
    old_events.drop(columns=['group','n_trigger_words_in_sentence','sentence_text','n_trigger_words_in_event',
        'trigger_words_in_event', 'n_trigger_words','event_label','reviewed','Key trigger phrase',
        'STRAT','ROCK','LOCATION','MINERAL','ORE_DEPOSIT','TIMESCALE'], inplace=True)
    old_events.rename(columns={'trigger_words_in_sentence':'sentence_triggers','Near Miss Event': 'label'}, inplace=True)
    
    # merge with necessary metadata
    old_events = old_events.merge(capstone_files, on='filename').merge(geoview[['anumber','report_type']], on='anumber')
    
    # add in upper and lower bound on text chunks given n_sentences_extract =2
    old_events['lower_idx'] = old_events.apply(
        lambda row : (row.sentence_idx-2) if ((row.sentence_idx-2) > 0) else 0, axis=1) 
    old_events['upper_idx'] = old_events.apply(
        lambda row : (row.sentence_idx-2) if ((row.sentence_idx+2) <= row.sentence_count) else row.sentence_count, axis=1)
    
    old_events['event_id'] = old_events['event_id'] + '_old'
    
    return old_events[['event_id','filename','anumber','report_type','sentence_count',
        'sentence_idx','sentence_triggers','event_text', 'label','lower_idx','upper_idx']]

def build_event_data(datasets: dict, pad=0, labelled_ranges=True, confidence='High', group_all_labelled=False,
                    nlp=None, named_entities=None, return_entities=True, batch_size=50, n_process=16,
                    drop=['lower_idx','upper_idx'], geoview=None, capstone_files=None, files=None):
    
    # load files if files are not provided
    if (type(capstone_files) != pd.DataFrame) | (files == None):
        capstone_files, files = get_report_data(count_sentences=False, return_files=True)
        
    # merge datasets provided by individual labellers
    df = merge_datasets(datasets, confidence='High')
    
    # apply the build event text function to build text chunk from labelled sentences
    df.insert(6,'event_text', df.apply(lambda row : build_event_text(
        row, pad=2, labelled_ranges=labelled_ranges, files=files), axis=1))

    # insert the event_id natural key which is f'{filename}_{idx}'
    df.insert(0, 'event_id', df.apply(lambda row : '_'.join(
        [row.filename.rsplit('.', 1)[0], str(row.idx)]), axis=1))
    
    # return the text chunk start and end positions, or lower_bound/upper_bound
    if labelled_ranges:
        df['lower_idx'] = df['idx'] + df['lower_bound']
        df['upper_idx'] = df['idx'] + df['upper_bound']
    else:
        df['lower_idx'] = df['idx'] - pad
        df['upper_idx'] = df['idx'] + pad
    
    df.rename(columns = {'idx':'sentence_idx','triggers':'sentence_triggers'}, inplace=True)
    df.drop(columns=['lower_bound','upper_bound'], inplace=True)
    
    # load old event labels from group labelling early in semester
    if group_all_labelled:
        old_events = load_group_all_labelled(geoview=geoview, capstone_files=capstone_files)
        df = df.append(old_events, ignore_index=True)
        
    # run named entity recognition with spacy on text chunk
    if return_entities:
        nlp = nlp or load_spacy_model(output_type='doc', trigger_matcher=True, lemmatizer=False,
            geological_matcher=True, stopword_removal=False, punctuation_removal=False, lemmatize_triggers=True)

        named_entities = named_entities or ['DATE','LOCATION','TRIGGER','STRAT', 'ROCK', 
                                            'LOCATION', 'MINERAL', 'ORE_DEPOSIT', 'TIMESCALE']

        # create a list of tuples for each entity in each event id
        event_entities = [(event_id, ent.text, ent.label_) for event_id, doc in zip(df.event_id.values,
            nlp.pipe(df.event_text.values, batch_size=batch_size, n_process=n_process))
            for ent in doc.ents if ent.label_ in named_entities]

        # join entity labels together as a string and then merge onto original dataframe
        df = df.merge(pd.DataFrame(data=event_entities, columns=['event_id','entity','label']).groupby(
            ['event_id','label']).apply(lambda x : ', '.join(x.entity)).unstack(level='label').reset_index(),
                 on='event_id',how='left').fillna('')

    return df


In [12]:
# specify labellers
users = ('daniel','charlie')
dataset = {user : pd.read_csv(f'data/events/{user}_dataset.csv', index_col=0) for user in users}

for user in users:
    print(f'{len(dataset[user].loc[dataset[user].reviewed])} events labelled by {user}.')

251 events labelled by daniel.
502 events labelled by charlie.


In [13]:
from pipeline.preprocessing.processing import load_spacy_model

capstone_files, files = get_report_data(count_sentences=True, return_files=True)
geoview = get_geoview_data()

nlp = load_spacy_model(output_type='doc', trigger_matcher=True, lemmatizer=False, geological_matcher=True,
                       stopword_removal=False, punctuation_removal=False, lemmatize_triggers=True, verbose=False)

events = {}
events['medium'] = build_event_data(dataset, files=files, nlp=nlp, capstone_files=capstone_files, geoview=geoview,
                          return_entities=True, group_all_labelled=True, confidence='medium')
events['high'] = build_event_data(dataset, files=files, nlp=nlp, capstone_files=capstone_files, geoview=geoview,
                          return_entities=True, confidence='high')

Loading files as dict: 100%|██████████| 32646/32646 [00:02<00:00, 15266.19it/s]


In [15]:
events['medium'].to_csv('data/events/events_medium_conf.csv')
events['high'].to_csv('data/events/events_high_conf.csv')

In [16]:
events['medium']

Unnamed: 0,event_id,filename,anumber,sentence_count,report_type,sentence_idx,sentence_triggers,event_text,label,lower_idx,upper_idx,DATE,LOCATION,MINERAL,ORE_DEPOSIT,ROCK,STRAT,TIMESCALE,TRIGGER
0,a071633_27600 morrissey_9355974_51,a071633_27600 morrissey_9355974.json,71633,92,Final Surrender,51,extensive,The target horizon considered prospective for ...,False,51,51,,Broken Hill,"lead, zinc, silicate",,"schist, phyllite, quartzite, quartzite, paragn...","Morrissey Metamorphics, Morrissey Metamorphics...",Proterozoic,"prospective, mineralisation, extensive"
1,a071816_apollo 2005 annual tech report_1127563...,a071816_apollo 2005 annual tech report_1127563...,71816,264,Annual,186,"minor gold, mineralisation",The width of the anomaly is at least 150m and ...,False,186,186,,,"gold, Arsenic, gold, gold",,"metabasalt, bedrock, dolerite",,,"mineralisation, prospect, anomalous"
2,a071875_700-100-go-rep-0002_13675165_172,a071875_700-100-go-rep-0002_13675165.json,71875,204,Annual,172,"significance, prospects, follow up work",The Minyari lease areas are known to be covere...,False,172,172,,,"diamond, Diamond",,"gravel, sand",,,"prospects, significance, prospects, mineralisa..."
3,a071950_c591_1994_2005a_16001655_730,a071950_c591_1994_2005a_16001655.json,71950,797,Annual,730,high grade,The deposit was estimated by ResEval using Inv...,False,730,730,,Horseshoe,,,,,,"mineralisation, high grade"
4,a071977_kurnalpi 2005_19331320_138,a071977_kurnalpi 2005_19331320.json,71977,171,Annual,138,mineralization,The basaltic doleritic units provide an excell...,True,137,139,,Kurnalpi,"gold, gold, quartz, gold, quartz",,"basaltic, basaltic",,,"mineralization, mineralisation, prospects, min..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2351,a075860_daltons e45-2186 & 2187 annual tech re...,a075860_daltons e45-2186 & 2187 annual tech re...,75860,116,Annual,55,mineralisation,"the wadi prospect, 100 metres south of the kin...",False,53,53,"late november 2002, the third year, late may 2...","south west, haoma, long","nickel, copper, platinum, diamond","pge, pge, pge",bedrock,,,"prospect, prospect, mineralisation"
2352,a075860_daltons e45-2186 & 2187 annual tech re...,a075860_daltons e45-2186 & 2187 annual tech re...,75860,116,Annual,67,mineralisation,diamond drilling was focussed on testing down ...,True,65,65,"annual, september 2005",,"diamond, diamond, nickel, sulphide, sulphides",precious metal,serpentinite,,,"mineralisation, significant, confirming the pr..."
2353,a075860_daltons e45-2186 & 2187 annual tech re...,a075860_daltons e45-2186 & 2187 annual tech re...,75860,116,Annual,83,"favourable, mineralisation",no significant assays were received from this ...,False,81,81,4 october 2006,,"nickel, sulphide",,,,,"significant, prospect, favourable, mineralisat..."
2354,a075860_daltons e45-2186 & 2187 annual tech re...,a075860_daltons e45-2186 & 2187 annual tech re...,75860,116,Annual,110,potential,the data has been lodged with doir airborne ge...,False,108,108,"2001, 2003, annual, the 12 months ending 5 mar...","east pilbara, western australia",,,,,,exploration potential


In [17]:
events['high']

Unnamed: 0,event_id,filename,anumber,sentence_count,report_type,sentence_idx,sentence_triggers,event_text,label,lower_idx,upper_idx,DATE,LOCATION,MINERAL,ORE_DEPOSIT,ROCK,STRAT,TIMESCALE,TRIGGER
0,a071633_27600 morrissey_9355974_51,a071633_27600 morrissey_9355974.json,71633,92,Final Surrender,51,extensive,The target horizon considered prospective for ...,False,51,51,,Broken Hill,"lead, zinc, silicate",,"schist, phyllite, quartzite, quartzite, paragn...","Morrissey Metamorphics, Morrissey Metamorphics...",Proterozoic,"prospective, mineralisation, extensive"
1,a071816_apollo 2005 annual tech report_1127563...,a071816_apollo 2005 annual tech report_1127563...,71816,264,Annual,186,"minor gold, mineralisation",The width of the anomaly is at least 150m and ...,False,186,186,,,"gold, Arsenic, gold, gold",,"metabasalt, bedrock, dolerite",,,"mineralisation, prospect, anomalous"
2,a071875_700-100-go-rep-0002_13675165_172,a071875_700-100-go-rep-0002_13675165.json,71875,204,Annual,172,"significance, prospects, follow up work",The Minyari lease areas are known to be covere...,False,172,172,,,"diamond, Diamond",,"gravel, sand",,,"prospects, significance, prospects, mineralisa..."
3,a071950_c591_1994_2005a_16001655_730,a071950_c591_1994_2005a_16001655.json,71950,797,Annual,730,high grade,The deposit was estimated by ResEval using Inv...,False,730,730,,Horseshoe,,,,,,"mineralisation, high grade"
4,a071977_kurnalpi 2005_19331320_138,a071977_kurnalpi 2005_19331320.json,71977,171,Annual,138,mineralization,The basaltic doleritic units provide an excell...,True,137,139,,Kurnalpi,"gold, gold, quartz, gold, quartz",,"basaltic, basaltic",,,"mineralization, mineralisation, prospects, min..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
744,a109843_a109843_v1_report_22,a109843_a109843_v1_report.json,109843,75,Partial Surrender,22,"high grade, exploration drilling",The Atlas mineral sands deposit has a Measured...,False,22,23,"year ended 30 June 2015, 2013, between October...",,"ilmenite, leucoxene, rutile, zircon, ilmenite,...",,mineral sands,,,exploration drilling
745,a109904_a109904_v1_report_53,a109904_a109904_v1_report.json,109904,201,Partial Surrender,53,mineralisation,Sandfire Resources De Grussa Cu Mine) (is beli...,True,53,55,1987,Capricorn Orogen,,,,,Archaean,
746,a109930_a109930_v1_report_23,a109930_a109930_v1_report.json,109930,118,Final Surrender,23,significant,All data was reviewed for multiple commodity p...,False,23,23,"the reporting year, the period 29, May 2015 to...","Kimberley, Western Australia, Halls Creek",,,,,,"potential, significant"
747,a109995_a109995_v1_report_84,a109995_a109995_v1_report.json,109995,88,Partial Surrender,84,prospecting,All tenements were explored as part of the Gas...,False,84,84,years 1 to 6,Gascoyne,,base metals,,,,"prospectivity, prospective, prospective"
