In [1]:
%load_ext autoreload
%autoreload 2

from build_events import *

In [2]:
# --- spaCy pipeline ---
# Every pipeline option is passed explicitly as a keyword flag to the project helper.
nlp = load_spacy_model(
    output_type='doc',
    trigger_matcher=True,
    lemmatizer=False,
    geological_matcher=True,
    stopword_removal=False,
    punctuation_removal=False,
    lemmatize_triggers=True,
    verbose=False,
)

# --- report files and geoview metadata ---
# The helper returns a pair, unpacked here as (capstone_files, files).
capstone_files, files = get_report_data(count_sentences=True, return_files=True)

# Only the listed columns are read from the zipped CSV; report_year is parsed as a date.
metadata = pd.read_csv(
    'data/geoview/capstone_metadata.zip',
    compression='zip',
    usecols=['anumber','title','report_type','project','keywords','commodity','report_year'],
    parse_dates=['report_year'],
)

# --- labelling configuration ---
users = ('daniel','charlie')          # labellers whose datasets are loaded
confs = ('low','medium','high',)      # confidence levels processed in later cells
group_all_labelled = True             # forwarded to build_event_data and used to pick export filenames

# One labelled dataset per user, with the 'idx' column renamed to 'sentence_idx'.
dataset = dict(
    (user, pd.read_csv(f'data/labels/{user}_dataset.csv', index_col=0).rename(columns={'idx': 'sentence_idx'}))
    for user in users
)

# Report how many rows each labeller has marked as reviewed.
for user in dataset:
    print(f'{len(dataset[user].loc[dataset[user].reviewed])} events labelled by {user}.')

Loading files as dict: 100%|██████████| 32646/32646 [00:09<00:00, 3305.93it/s]


251 events labelled by daniel.
1258 events labelled by charlie.


In [5]:
# Display only the rows that charlie has marked as reviewed.
dataset['charlie'][dataset['charlie']['reviewed']]

Unnamed: 0_level_0,filename,anumber,sentence_count,report_type,sentence_idx,triggers,reviewed,label,confidence,lower_bound,upper_bound
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
118,a071633_27600 morrissey_9355974.json,71633,92,Final Surrender,51,extensive,True,False,High,0,0
2938,a071966_c28_2004_2005a_16260784.json,71966,535,Annual,94,mineralization,True,False,High,0,0
3094,a071977_kurnalpi 2005_19331320.json,71977,171,Annual,138,mineralization,True,True,High,-1,1
3728,a072020_goldenvalley_ann_05_11276734.json,72020,294,Annual,61,prospects,True,False,High,0,0
4354,a072063_nor_ki_06_2_12113137.json,72063,273,Annual,215,proposed,True,False,High,0,0
...,...,...,...,...,...,...,...,...,...,...,...
580132,a109904_a109904_v1_report.json,109904,201,Partial Surrender,53,mineralisation,True,False,High,0,1
580175,a109904_marymia project historic exploration s...,109904,103,Partial Surrender,29,mineralisation,True,False,High,0,1
580251,a109930_a109930_v1_report.json,109930,118,Final Surrender,23,significant,True,False,High,0,0
580555,a110056_a110056_v2_report.json,110056,63,Partial Surrender,43,low grade,True,False,High,0,1


In [4]:
# Display only the rows that daniel has marked as reviewed.
dataset['daniel'][dataset['daniel']['reviewed']]

Unnamed: 0_level_0,filename,anumber,sentence_count,report_type,sentence_idx,triggers,reviewed,label,confidence,lower_bound,upper_bound
index,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1021,a071816_apollo 2005 annual tech report_1127563...,71816,264,Annual,186,"minor gold, mineralisation",True,False,High,0,0
1592,a071875_700-100-go-rep-0002_13675165.json,71875,204,Annual,172,"significance, prospects, follow up work",True,False,High,0,0
2719,a071950_c591_1994_2005a_16001655.json,71950,797,Annual,730,high grade,True,False,High,0,0
4773,a072089_kun_ann_05_17468294.json,72089,944,Annual,804,mineralisation,True,False,High,0,0
7901,a072285_c67_2003_2006a_12640575.json,72285,738,Annual,78,potentially,True,False,High,0,0
...,...,...,...,...,...,...,...,...,...,...,...
571119,a108408_a108408_v1_report.json,108408,224,Final Surrender,222,mineralisation,True,False,High,0,0
576625,a109309_a109309_v1_report.json,109309,166,Final Surrender,62,broad,True,False,High,0,0
578154,a109483_toke_e52_3110_final_surrender_report.json,109483,56,Final Surrender,43,potential,True,False,High,0,0
579079,a109668_a109668_v1_report.json,109668,88,Final Surrender,76,presence of,True,False,High,0,0


In [3]:
# Extract events once per confidence level; the calls differ only in the `confidence` argument.
# NOTE(review): this cell was previously commented "will not load group labelled", yet
# `group_all_labelled` is forwarded to build_event_data — confirm which behaviour is intended.
events = {conf: build_event_data(dataset, confidence=conf, files=files, nlp=nlp, capstone_files=capstone_files,
    geoview=metadata, return_entities=True, group_all_labelled=group_all_labelled) for conf in confs}

# Report-level frame: geoview metadata joined to the capstone file listing on 'anumber'.
# This join does not depend on the confidence level, so it is computed once and reused.
# `metadata` is loaded with pd.read_csv, so the result is a plain pandas DataFrame.
report_files = metadata.merge(capstone_files, on='anumber')

# Per confidence level, attach the per-file sum of the event 'label' column (joined on 'filename').
df = {conf: report_files.merge(
    events[conf].groupby('filename')['label'].sum().reset_index(), on='filename') for conf in confs}

# store one hot encodings of the commodity lists for each confidence level's dataframe
commodities = {}
for conf in confs:
    # expand the ';'-separated commodity string into a list (default used when no commodity is given)
    df[conf]['commodity_list'] = df[conf]['commodity'].apply(
        lambda x: to_list(x, sep=';', default='NO TARGET COMMODITY'))
    mlb = MultiLabelBinarizer()
    # fit and encode in one step; mlb.classes_ is populated by the fit and names the columns
    commodity_matrix = mlb.fit_transform(df[conf]['commodity_list'])
    commodities[conf] = pd.DataFrame(commodity_matrix, columns=mlb.classes_, index=df[conf].index)


Extracting low confidence events: 2885it [01:15, 38.13it/s] 
Extracting medium confidence events: 2885it [01:13, 39.05it/s] 
Extracting high confidence events: 1462it [00:41, 35.19it/s] 


In [4]:
# check duplicates: show every row whose event_id carries more than one distinct label.
# Comparing distinct (event_id, label) pairs also catches events where each conflicting
# label occurs several times (e.g. True, True, False, False), which a plain
# "pair is not duplicated" filter would miss.
for conf in confs:
    event_frame = events[conf]
    # one row per distinct (event_id, label) pair
    label_pairs = event_frame[['event_id','label']].drop_duplicates()
    # an event_id that still repeats among the distinct pairs has conflicting labels
    conflicting_ids = label_pairs.loc[label_pairs['event_id'].duplicated(keep=False), 'event_id']
    display(event_frame.loc[
        event_frame['event_id'].isin(conflicting_ids)
    ].sort_values('event_id')[['event_id','filename','sentence_idx','label']])

Unnamed: 0,event_id,filename,sentence_idx,label


Unnamed: 0,event_id,filename,sentence_idx,label


Unnamed: 0,event_id,filename,sentence_idx,label


In [5]:
# Write the events and commodity encodings for each confidence level to CSV.
# Files produced with group_all_labelled enabled get an '-extra' suffix so they do not
# overwrite the files from a run without it; both cases share one code path.
output_suffix = '-extra' if group_all_labelled else ''
for conf in confs:
    events[conf].to_csv(f'data/events/events_{conf}-conf{output_suffix}.csv')
    commodities[conf].to_csv(f'data/events/commodities_{conf}-conf{output_suffix}.csv')

In [1]:
import pandas as pd
# Reload the saved high-confidence events CSV for inspection, using its first column as the index.
# NOTE(review): this path has no '-extra' suffix, while the export cell writes '-extra'
# files when group_all_labelled is True — confirm this is the file you intend to inspect.
pd.read_csv(
    'data/events/events_high-conf.csv',
    index_col=0,
)

Unnamed: 0,event_id,filename,anumber,sentence_count,report_type,sentence_idx,sentence_triggers,event_text,label,lower_idx,upper_idx,DATE,LOCATION,MINERAL,ORE_DEPOSIT,ROCK,STRAT,TIMESCALE,TRIGGER
0,a071816_apollo 2005 annual tech report_1127563...,a071816_apollo 2005 annual tech report_1127563...,71816,264,Annual,186,"minor gold, mineralisation",The width of the anomaly is at least 150m and ...,False,186,186,,,"gold, Arsenic, gold, gold",,"metabasalt, bedrock, dolerite",,,"mineralisation, prospect, anomalous"
1,a071875_700-100-go-rep-0002_13675165_172,a071875_700-100-go-rep-0002_13675165.json,71875,204,Annual,172,"significance, prospects, follow up work",The Minyari lease areas are known to be covere...,False,172,172,,,"diamond, Diamond",,"gravel, sand",,,"prospects, significance, prospects, mineralisa..."
2,a071950_c591_1994_2005a_16001655_730,a071950_c591_1994_2005a_16001655.json,71950,797,Annual,730,high grade,The deposit was estimated by ResEval using Inv...,False,730,730,,Horseshoe,,,,,,"mineralisation, high grade"
3,a072089_kun_ann_05_17468294_804,a072089_kun_ann_05_17468294.json,72089,944,Annual,804,mineralisation,The model includes 16 diamond holes drilled be...,False,804,804,"November 2004, January 2005, July 2004, April ...",Hornet,diamond,,,,,"mineralisation, mineralisation"
4,a072285_c67_2003_2006a_12640575_78,a072285_c67_2003_2006a_12640575.json,72285,738,Annual,78,potentially,Emperor and Shogun resource estimates are via ...,False,78,78,the year,,,,,,,potentially
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1457,a104213_lake hopkins brine potash jorc resourc...,a104213_lake hopkins brine potash jorc resourc...,104213,256,Annual,99,possible,Drill hole spacing is roughly at 2-3 km and no...,False,99,100,,,salt,,mixed,,,possible
1458,a104213_lake hopkins brine potash jorc resourc...,a104213_lake hopkins brine potash jorc resourc...,104213,256,Annual,254,possible,It is generally accepted that geological uncer...,False,254,254,,,,,sediments,,,possible
1459,a104633_c203_2011_2014a_65,a104633_c203_2011_2014a.json,104633,136,Annual,65,prospect,Structures of similar age and orientation to t...,False,65,66,,"Kanowna Belle, Gordon, Mulgarrie, southwest",salt,base metal,"sediments, mud, sand, sand, sediments, saprolite",,,"mineralization, prospect"
1460,a106061_e40_195 final surrender report vrifica...,a106061_e40_195 final surrender report vrifica...,106061,12,Final Surrender,4,prospect,Drill sample Stream sediment Soil C62_2006_A_2...,False,4,5,2008 2008 2010 2010 2010 2012 2012 2012,Leonora,"Diamond, sulphide","VMS, VMS, VMS","sediment, intrusive rocks, sediment, ash, tuff...",,,"prospect, mineralisation"
