In [27]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [28]:
import re
import os
import glob
from pathlib import Path
import json
from shutil import copyfile
import itertools
from collections import Counter

# analytics
import pandas as pd
import numpy as np

# Visualisation
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [29]:
# Spacy Imports
import spacy

from spacy import displacy
from spacy.pipeline import EntityRuler
from spacy.matcher import PhraseMatcher
from spacy.tokens import Doc, Span, Token
from spacy.attrs import intify_attrs

def display_ent(doc):
    displacy.render(doc, style="ent", jupyter=True)

spacy_model = "en_core_web_lg"

nlp = spacy.load(spacy_model)

In [30]:
# User Source Code

import utilsCharlie

In [31]:
# Project Base
base_path = Path('..')

# Data Paths
data_path = base_path / 'data'
all_reports_path = data_path / 'wamex_xml'
subset_reports_path = data_path / 'subset'
test_reports_path = data_path / 'testset'

# Event Path
events_path = base_path / 'events'

# Dictionary Paths
dictionary_path = base_path / 'dictionary'
triggers_path = dictionary_path / 'triggers'
patterns_path = dictionary_path / 'patterns'

# NLP

## Load Match Pattern Pipelines

In [32]:
patterns = []
for filename in patterns_path.iterdir():
    with open(filename, encoding="utf8") as f:
        patterns += json.load(f)
        
print(f'loaded {len(patterns)} patterns')

loaded 31120 patterns


In [33]:
patterns[:5]

[{'label': 'ROCK',
  'pattern': [{'LOWER': 'acapulcoite'}, {'LOWER': 'meteorite'}]},
 {'label': 'ROCK', 'pattern': [{'LOWER': 'aceite'}]},
 {'label': 'ROCK',
  'pattern': [{'LOWER': 'acid'}, {'LOWER': 'volcanic'}, {'LOWER': 'rock'}]},
 {'label': 'ROCK', 'pattern': [{'LOWER': 'adakite'}]},
 {'label': 'ROCK', 'pattern': [{'LOWER': 'adamellite'}]}]

## Add patterns to nlp

In [34]:
ruler = EntityRuler(nlp, overwrite_ents=True)
ruler.add_patterns(patterns)

nlp.add_pipe(ruler)

# Load Trigger Phrases/Words into Spacy PhraseMatcher


In [35]:
## Original Trigger Phrases/Words
triggerPhrase = dictionary_path / 'trigger phrases'
near_miss_phrases =[]

for filename in triggerPhrase.iterdir():
    with open(filename, 'r') as f:
        for line in f:
            if len(line) > 1:
                near_miss_phrases.append(line[:-2].split())
                
## New Trigger Phrases/Words taken from labelling process

new_phrases = []
for group in (0,1,2,3,4,6):
    events = pd.read_csv('../events/group_{}_labelled.csv'.format(group))
    events = events.loc[events['Key trigger phrase'].notna(), ]
    events_triggers = set(events['Key trigger phrase'].tolist())
    new_phrases += events_triggers
    new_phrases = list(set(new_phrases))

for phrase in new_phrases:
    near_miss_phrases.append(phrase.split())

In [36]:
near_miss_phrases

[['weak', 'intercepts'],
 ['minor', 'intercepts'],
 ['weak', 'minor', 'intercepts'],
 ['weak', 'level', 'mineralisation'],
 ['minor', 'level', 'mineralisation'],
 ['mineralisation'],
 ['low', 'level', 'mineralisation'],
 ['weak', 'level', 'mineralization'],
 ['minor', 'level', 'mineralization'],
 ['low', 'level', 'mineralization'],
 ['minor', 'gold'],
 ['minor', 'nickel'],
 ['further', 'drilling', 'required'],
 ['follow', 'up', 'work', 'required'],
 ['further', 'drilling'],
 ['follow', 'up', 'work'],
 ['enrichement'],
 ['enriched'],
 ['subeconomic'],
 ['patchy'],
 ['open', 'at', 'depth'],
 ['open', 'at', 'depth', 'strike'],
 ['open', 'along', 'strike'],
 ['open', 'along'],
 ['extensive'],
 ['promising'],
 ['encouraging'],
 ['auspicious'],
 ['bright'],
 ['gifted'],
 ['talented'],
 ['up-and-coming'],
 ['assuring'],
 ['reassuring'],
 ['rising'],
 ['favourable'],
 ['broad'],
 ['low', 'grade'],
 ['low-grade'],
 ['lowgrade'],
 ['medium', 'grade'],
 ['medium-grade'],
 ['mediumgrade'],
 ['pote

## Geology ent's to extract

In [37]:
geology_ents = ['ORE_DEPOSIT', 'ROCK', 'MINERAL', 'STRAT', 'LOCATION', 'TIMESCALE']

# Event Extraction

In [38]:
from utilsCharlie import create_event_df

eventdf = create_event_df(
    nlp = nlp, 
    directory = subset_reports_path, 
    trigger_phrases = near_miss_phrases, 
    geology_ents = geology_ents,
    n_sentences_extract = 2,
)

100%|██████████| 9/9 [00:46<00:00,  5.21s/it]

found 46 events from a total of 659 sentences





In [39]:
eventdf.head()

Unnamed: 0,event_id,filename,sentence_idx,sentence_text,n_trigger_words_in_sentence,trigger_words_in_sentence,n_trigger_words_in_event,trigger_words_in_event,event_text,LOCATION,MINERAL,ROCK,event_label,ORE_DEPOSIT,TIMESCALE,STRAT
0,a075860_daltons e45-2186 & 2187 annual tech re...,a075860_daltons e45-2186 & 2187 annual tech re...,1,falconbridge (australia) pty ltd in a farm in ...,2,"accelerated regional exploration, mineralisation",2,"accelerated regional exploration, mineralisation",giralia resources nl e 45 2186 and 2187 dalton...,"haoma, australia","diamond, sulphide, nickel",gossan,0,,,
1,a075860_daltons e45-2186 & 2187 annual tech re...,a075860_daltons e45-2186 & 2187 annual tech re...,9,"in february 2006, agreement was reached with f...",2,"accelerated regional exploration, mineralisation",2,"accelerated regional exploration, mineralisation",in november 2002 giralia resources nl entered ...,"haoma, australia","gold, sulphide, nickel",,0,"base metals, pge",,
2,a075860_daltons e45-2186 & 2187 annual tech re...,a075860_daltons e45-2186 & 2187 annual tech re...,43,"in all, eleven of the thirteen holes were said...",1,low grade,2,"low grade, prospectivity",some thirteen diamond drill holes were drilled...,,"diamond, sulphides, nickel, violarite, sulphid...","serpentinite, sediment",0,,,
3,a075860_daltons e45-2186 & 2187 annual tech re...,a075860_daltons e45-2186 & 2187 annual tech re...,46,these gossans were subsequently shown to be hi...,1,anomalous,3,"anomalous, prospectivity, mineralisation","concurrent with kingsway's work, pacminex, on ...",,"zinc, chromite, diamond, nickel, sulphur, viol...",gossans,0,pge,,
4,a075860_daltons e45-2186 & 2187 annual tech re...,a075860_daltons e45-2186 & 2187 annual tech re...,55,"hole rbdn002 from 6 18 metres, 12 metres @ 0.3...",1,mineralisation,1,mineralisation,following the completion of aboriginal heritag...,south west,"copper, platinum, nickel",bedrock,0,pge,,


## Save Events to csv file

In [40]:
## Make Sure you are saving to your allocated group number

event_path = events_path / 'group_test_events.csv'
eventdf.to_csv(event_path, index=False)

# Visualisations

## Frequency of number of trigger words found in each event

In [41]:
sns.countplot(eventdf.n_trigger_words);

AttributeError: 'DataFrame' object has no attribute 'n_trigger_words'

## Count of each trigger word found

In [None]:
from utils import get_feature_counts_df

In [None]:
feature = 'trigger_words_in_sentence'
get_feature_counts_df(eventdf, feature)

In [None]:
feature = 'ORE_DEPOSIT'
get_feature_counts_df(eventdf, feature)

In [None]:
feature = 'STRAT'
get_feature_counts_df(eventdf, feature)

In [None]:
feature = 'TIMESCALE'
get_feature_counts_df(eventdf, feature)

In [None]:
feature = 'LOCATION'
get_feature_counts_df(eventdf, feature)

# Ent Test

In [None]:
example = nlp(eventdf.iloc[2]['event_text'])
display_ent(example)

In [None]:
for ent in example.ents:
    print(ent.text, ent.start_char, ent.end_char, ent.label_)

In [None]:
minerals = [ent.text for ent in example.ents if ent.label_ == 'MINERAL']
minerals