In [1]:
# Enable IPython's autoreload extension so edits to imported project modules
# are picked up live without restarting the kernel.
%load_ext autoreload
%autoreload 2

# capstone repo pipeline for labelled training data --> built event text chunks
# NOTE(review): the explicit imports below are disabled; they are kept as a
# record of where the names used in later cells presumably originate.
# from pipeline.preprocessing.processing import match_triggers, triggers_json_to_df, load_triggers
# from pipeline.preprocessing.processing import load_files, load_json, load_spacy_model
# from pipeline.utils.helpers import to_list
# from pipeline.data.metadata import get_report_data, get_geoview_data

# import numpy as np
# import pandas as pd
# import geopandas as gpd

# from sklearn.preprocessing import MultiLabelBinarizer

# import spacy

# from tqdm import tqdm

# from build_events import extract_text_chunks, build_event_text, merge_datasets, load_group_all_labelled, build_event_data

# Star import: this is the only active import in the cell, yet later cells use
# `pd`, `gpd`, `MultiLabelBinarizer`, `load_spacy_model`, `get_report_data`,
# `to_list` and `build_event_data`, so those names must be reaching the
# notebook namespace through `build_events`.
# TODO(review): confirm, then replace with explicit imports so the notebook's
# dependencies are visible and do not silently change with `build_events`.
from build_events import *

classifier.py
helpers.py
processing.py
metadata.py


In [3]:
# --- spaCy model -----------------------------------------------------------
# Every option is passed explicitly so the configuration is readable at a glance.
nlp = load_spacy_model(
    output_type='doc',
    trigger_matcher=True,
    lemmatizer=False,
    geological_matcher=True,
    stopword_removal=False,
    punctuation_removal=False,
    lemmatize_triggers=True,
    verbose=False,
)

# --- report files and geoview metadata -------------------------------------
capstone_files, files = get_report_data(count_sentences=True, return_files=True)

# Only the listed metadata columns are read; `report_year` is parsed as a date.
metadata = pd.read_csv(
    'data/geoview/capstone_metadata.zip',
    compression='zip',
    parse_dates=['report_year'],
    usecols=['anumber','title','report_type','project','keywords','commodity','report_year'],
)

geoview = gpd.read_file('zip://data/geoview/capstone_shapefiles.shp.zip')

# --- labelled datasets ------------------------------------------------------
# One CSV per labeller; the `idx` column is renamed to `sentence_idx`.
users = ('daniel','charlie')
dataset = {}
for user in users:
    dataset[user] = pd.read_csv(f'data/events/{user}_dataset.csv', index_col=0).rename(
        columns={'idx': 'sentence_idx'})

# Confidence levels processed by the cells that follow.
confs = ('medium','high',)

# Report how many rows each labeller has flagged as reviewed.
for user in users:
    print(f'{len(dataset[user].loc[dataset[user].reviewed])} events labelled by {user}.')

Loading files as dict: 100%|██████████| 32646/32646 [00:02<00:00, 13803.16it/s]


251 events labelled by daniel.
1008 events labelled by charlie.


In [4]:
# Build one events table per confidence level.
# Original author's note: group-labelled events will not be loaded here.
# NOTE(review): `group_all_labelled=True` is passed below -- confirm the note
# above still describes the actual behaviour.
events = {}
for conf in confs:
    events[conf] = build_event_data(
        dataset,
        confidence=conf,
        files=files,
        nlp=nlp,
        capstone_files=capstone_files,
        geoview=geoview,
        return_entities=True,
        group_all_labelled=True,
    )

# Build one merged frame per confidence level. The chain starts from `geoview`
# so the result keeps the GeoDataFrame type (needed for the plotly map), then
# joins the geoview metadata, the capstone json <-> anumber mapping, and the
# per-file sum of event labels.
df = {}
for conf in confs:
    label_totals = events[conf].groupby('filename')['label'].sum().reset_index()
    df[conf] = (
        geoview
        .merge(metadata, on='anumber')
        .merge(capstone_files, on='anumber')
        .merge(label_totals, on='filename')
    )

# One-hot encode the commodities of every merged frame.
commodities = {}
for conf in confs:
    merged = df[conf]
    # Expand the ';'-separated commodity string into a list, with a placeholder
    # when no commodity is given (adds a column to the merged frame in place).
    merged['commodity_list'] = merged['commodity'].apply(
        lambda raw: to_list(raw, sep=';', default='NO TARGET COMMODITY'))
    mlb = MultiLabelBinarizer().fit(merged['commodity_list'])
    commodities[conf] = pd.DataFrame(
        mlb.transform(merged['commodity_list']),
        columns=mlb.classes_,
        index=merged.index,
    )


Extracting events: 2862it [01:19, 35.91it/s] 
Extracting events: 1255it [00:38, 33.02it/s]


In [5]:
# Preview the first rows of the events table for each confidence level.
display(*(events[conf].head() for conf in confs))

Unnamed: 0,event_id,filename,anumber,sentence_count,report_type,sentence_idx,sentence_triggers,event_text,label,lower_idx,upper_idx,DATE,LOCATION,MINERAL,ORE_DEPOSIT,ROCK,STRAT,TIMESCALE,TRIGGER
0,a071633_27600 morrissey_9355974_51,a071633_27600 morrissey_9355974.json,71633,92,Final Surrender,51,extensive,The target horizon considered prospective for ...,False,51,51,,Broken Hill,"lead, zinc, silicate",,"schist, phyllite, quartzite, quartzite, paragn...","Morrissey Metamorphics, Morrissey Metamorphics...",Proterozoic,"prospective, mineralisation, extensive"
1,a071816_apollo 2005 annual tech report_1127563...,a071816_apollo 2005 annual tech report_1127563...,71816,264,Annual,186,"minor gold, mineralisation",The width of the anomaly is at least 150m and ...,False,186,186,,,"gold, Arsenic, gold, gold",,"metabasalt, bedrock, dolerite",,,"mineralisation, prospect, anomalous"
2,a071875_700-100-go-rep-0002_13675165_172,a071875_700-100-go-rep-0002_13675165.json,71875,204,Annual,172,"significance, prospects, follow up work",The Minyari lease areas are known to be covere...,False,172,172,,,"diamond, Diamond",,"gravel, sand",,,"prospects, significance, prospects, mineralisa..."
3,a071950_c591_1994_2005a_16001655_730,a071950_c591_1994_2005a_16001655.json,71950,797,Annual,730,high grade,The deposit was estimated by ResEval using Inv...,False,730,730,,Horseshoe,,,,,,"mineralisation, high grade"
4,a071966_c28_2004_2005a_16260784_94,a071966_c28_2004_2005a_16260784.json,71966,535,Annual,94,mineralization,The mineralisation appears to thin and dissipa...,False,94,94,,Hamersley,"hematite, Iron",,BIF,,,"mineralisation, mineralisation, mineralization"


Unnamed: 0,event_id,filename,anumber,sentence_count,report_type,sentence_idx,sentence_triggers,event_text,label,lower_idx,upper_idx,DATE,LOCATION,MINERAL,ORE_DEPOSIT,ROCK,STRAT,TIMESCALE,TRIGGER
0,a071633_27600 morrissey_9355974_51,a071633_27600 morrissey_9355974.json,71633,92,Final Surrender,51,extensive,The target horizon considered prospective for ...,False,51,51,,Broken Hill,"lead, zinc, silicate",,"schist, phyllite, quartzite, quartzite, paragn...","Morrissey Metamorphics, Morrissey Metamorphics...",Proterozoic,"prospective, mineralisation, extensive"
1,a071816_apollo 2005 annual tech report_1127563...,a071816_apollo 2005 annual tech report_1127563...,71816,264,Annual,186,"minor gold, mineralisation",The width of the anomaly is at least 150m and ...,False,186,186,,,"gold, Arsenic, gold, gold",,"metabasalt, bedrock, dolerite",,,"mineralisation, prospect, anomalous"
2,a071875_700-100-go-rep-0002_13675165_172,a071875_700-100-go-rep-0002_13675165.json,71875,204,Annual,172,"significance, prospects, follow up work",The Minyari lease areas are known to be covere...,False,172,172,,,"diamond, Diamond",,"gravel, sand",,,"prospects, significance, prospects, mineralisa..."
3,a071950_c591_1994_2005a_16001655_730,a071950_c591_1994_2005a_16001655.json,71950,797,Annual,730,high grade,The deposit was estimated by ResEval using Inv...,False,730,730,,Horseshoe,,,,,,"mineralisation, high grade"
4,a071966_c28_2004_2005a_16260784_94,a071966_c28_2004_2005a_16260784.json,71966,535,Annual,94,mineralization,The mineralisation appears to thin and dissipa...,False,94,94,,Hamersley,"hematite, Iron",,BIF,,,"mineralisation, mineralisation, mineralization"


In [6]:
# Write, for each confidence level, the events table and the matching
# commodity one-hot matrix to CSV under data/events/.
for conf in confs:
    events_path = f'data/events/events_{conf}-conf.csv'
    commodities_path = f'data/events/commodities_{conf}-conf.csv'
    events[conf].to_csv(events_path)
    commodities[conf].to_csv(commodities_path)