In [5]:
import os
import glob
from pathlib import Path

from functools import reduce
from itertools import chain

import json
import csv
import re

import random
from datetime import datetime

import numpy as np
import pandas as pd

import spacy
from spacy import displacy
from spacy.matcher import Matcher
from spacy.tokens import Span

# Ask spaCy to run on the GPU when one is available; the call's return value
# is displayed as this cell's output.
spacy.prefer_gpu()

True

In [2]:
def flatten(l):
    """Flatten one level of nesting.

    Args:
        l: An iterable whose elements are themselves iterable.

    Returns:
        list: Every item of every sub-iterable, in encounter order.
    """
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

# https://github.com/explosion/spaCy/issues/3558#issuecomment-487953653
def trim_entity_spans(data: list) -> list:
    """Removes leading and trailing white spaces from entity spans.

    Each ``(start, end, label)`` span is shrunk until both of its edges sit
    on a non-whitespace character of ``text``. Spans that are empty or
    contain nothing but whitespace are dropped, because shrinking them would
    otherwise produce an invalid span with ``start >= end``.

    Args:
        data (list): The data to be cleaned in spaCy JSON format, i.e. an
            iterable of ``(text, {'entities': [(start, end, label), ...]})``
            entries.

    Returns:
        list: The cleaned data as ``[text, {'entities': [[start, end, label],
            ...]}]`` entries. Only the ``'entities'`` key of each annotation
            dict is carried over.
    """
    invalid_span_tokens = re.compile(r'\s')

    cleaned_data = []
    for text, annotations in data:
        valid_entities = []
        for start, end, label in annotations['entities']:
            valid_start = start
            # Clamp so a span running past the text cannot raise IndexError below
            valid_end = min(end, len(text))
            # Skip leading whitespace, but never walk past the end of the span
            while valid_start < valid_end and invalid_span_tokens.match(text[valid_start]):
                valid_start += 1
            # Skip trailing whitespace, but never walk back past the span start
            while valid_end > valid_start and invalid_span_tokens.match(text[valid_end - 1]):
                valid_end -= 1
            # Nothing left means the span was empty or whitespace-only: drop it
            if valid_start < valid_end:
                valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'entities': valid_entities}])

    return cleaned_data

In [10]:
# Load the entity-pattern vocabulary (this is NOT spaCy's nlp.vocab).
# As shown in the cell output, the frame has a `label` column (entity type,
# e.g. ROCK, STRAT) and a `pattern` column (a list of token-attribute dicts
# such as {'LOWER': ...}).
vocab_typed = Path('../../dictionary/patterns/patterns_1.json').resolve()  # original VOCABULARY_TYPED.json has encoding issues
vocab = pd.read_json(vocab_typed)#, encoding = 'utf8')

# Bare expression: display the loaded vocabulary as the cell output
vocab

Unnamed: 0,label,pattern
0,ROCK,"[{'LOWER': 'acapulcoite'}, {'LOWER': 'meteorit..."
1,ROCK,[{'LOWER': 'aceite'}]
2,ROCK,"[{'LOWER': 'acid'}, {'LOWER': 'volcanic'}, {'L..."
3,ROCK,[{'LOWER': 'adakite'}]
4,ROCK,[{'LOWER': 'adamellite'}]
...,...,...
31115,STRAT,"[{'LOWER': 'zeepaard'}, {'LOWER': 'formations'}]"
31116,STRAT,"[{'LOWER': 'ziggy'}, {'LOWER': 'monzogranites'}]"
31117,STRAT,"[{'LOWER': 'zimmerman'}, {'LOWER': 'sandstones'}]"
31118,STRAT,"[{'LOWER': 'zimmermann'}, {'LOWER': 'sandstone..."


In [12]:
# Load the large English spaCy model (spacy.load expects it to be installed already)
spacy_model = "en_core_web_lg"
print(f'Loading {spacy_model} spacy model...')  # announce before the slow load, not after it
nlp = spacy.load(spacy_model)

# Instantiate a matcher from the shared vocab of the loaded nlp model
matcher = Matcher(nlp.vocab)

# Register the token patterns of every entity type with the matcher, keyed by
# its label ('ROCK', 'MINERAL', etc). `vocab` is the pattern DataFrame loaded
# in the earlier cell.
# NOTE: `matcher.add(key, None, *patterns)` is the spaCy v2 call signature.
for entity_label in ('ROCK', 'MINERAL', 'ORE_DEPOSIT', 'LOCATION', 'TIMESCALE', 'STRAT'):
    label_patterns = vocab.loc[vocab.label == entity_label].pattern.to_list()
    matcher.add(entity_label, None, *label_patterns)

# Load in reports
# We don't want to get all 14000 reports at once - we batch load them in 100s
i = 0  # batch index
batch_size = 100  # number of report files per batch
# glob returns paths in arbitrary order; sorting makes batch `i` select the
# same files on every run and every machine
file_paths = sorted(glob.glob('data/wamex_xml/*.json'))[i * batch_size:(i + 1) * batch_size]  # file paths
# file name = base name up to its first '.', taken via pathlib so that it does
# not depend on '/' being the path separator
file_names = [Path(file_path).name.split('.', 1)[0] for file_path in file_paths]  # file names

# load data to memory to a reports dictionary
# keys are file names, values are loaded json files
reports = {}
for name, path in zip(file_names, file_paths):
    # explicit UTF-8 so decoding does not depend on the platform default encoding
    with open(path, 'r', encoding='utf-8') as file:  # ensures file is closed after load
        reports[name] = json.load(file)

# Ask whether to reuse the cached training data or to rebuild it from the
# loaded reports. Any answer other than an explicit "no" loads the cache.
load_check = input("Would you like to load pre-built training data? (Yes/No)\n")

# strip() so that stray whitespace around the answer (e.g. "no ") is not
# silently treated as "yes"
if load_check.strip().lower() in ('n', 'no', 'false', 'f'):
    print('Building training data...')
    training_data = []

    # Loop through each report sentence as a doc object parsed by nlp model
    for doc in nlp.pipe(flatten(reports.values())):  # nlp.pipe is more efficient than regular for loop

        matches = matcher(doc)  # (match_id, start, end) triples found in the document

        # for each match found by the matcher, return a span with a label = doc.vocab.strings[match_id]
        spans = [Span(doc, start, end, label=doc.vocab.strings[match_id]) for match_id, start, end in matches]

        # create list of entities with their start index, end index, and named entity label
        # we filter duplicate/overlapping spans with spacy.util.filter_spans()
        entities = [(span.start_char, span.end_char, span.label_) for span in spacy.util.filter_spans(spans)]

        # format matches in spacy-compatible training data format
        training_entry = (doc.text, {'entities': entities})

        # append data to list store
        training_data.append(training_entry)

    # write to json so later runs can load the cached data instead of rebuilding
    with open("data/ner_sample_train.json", "w", newline="", encoding="utf-8") as f:
        json.dump(training_data, f)
else:
    # NOTE: entries loaded from JSON are lists, whereas freshly built entries
    # are tuples; both unpack as (text, annotations)
    with open("data/ner_sample_train.json", "r", encoding="utf-8") as training_file:
        training_data = json.load(training_file)

print(f'Length of loaded training data: {len(training_data)}')

Loading en_core_web_lg spacy model...
Would you like to load pre-built training data? (Yes/No)
y
Length of loaded training data: 7956


In [13]:
# Ask whether to load the saved NER model or to train a new one.
# Any answer other than an explicit "no" loads the model from disk.
load_check = input("Would you like to load the pre-trained test model? (Yes/No)\n")

nlp = spacy.blank('en')  # create a blank spacy model in english

ner = nlp.create_pipe('ner')  # create a pipe associated with the nlp model, called 'ner' (spaCy v2 API)
nlp.add_pipe(ner)  # add pipe to pipeline

# Register every entity label that can occur in the training data.
# 'STRAT' is included because the matcher that builds the training data emits
# STRAT entities as well; NOTE(review): in spaCy v2 an annotation whose label
# was never added is expected to fail inside nlp.update - confirm.
for entity_label in ('ROCK', 'MINERAL', 'ORE_DEPOSIT', 'LOCATION', 'TIMESCALE', 'STRAT'):
    ner.add_label(entity_label)

# if we dont load the model, train it and optionally save.
# strip() so that stray whitespace around the answer is not misread
if load_check.strip().lower() in ('n', 'no', 'false', 'f'):
    save_model = input("Would you like to save the model? This will overwrite other model files. (Yes/No)\n")

    # test demo for nlp training loop
    nlp.begin_training()

    # Loop for 10 iterations
    iterations = 10
    print(f'Running training loop for {iterations} iterations')

    for itn in range(iterations):
        print(f'Run {itn} at time {datetime.now()}')
        print('')
        # Shuffle the training data (in place) so batches differ between iterations
        random.shuffle(training_data)
        losses = {}  # accumulated by nlp.update over this iteration
        save_interval = 0  # number of batches processed in this iteration
        # Batch the examples and iterate over them
        for batch in spacy.util.minibatch(training_data, size=10):
            texts = [text for text, entities in batch]
            annotations = [entities for text, entities in batch]

            # Update the model
            nlp.update(texts, annotations, losses=losses)
            save_interval += 1
            if save_interval % 50 == 0:  # report the running losses every 50 batches
                print(losses)

    # save model; if user input is yes/true
    if save_model.strip().lower() in ('y', 'yes', 'true', 't'):
        nlp.to_disk("models/test_ner_model")

else:  # load model from disk
    nlp = nlp.from_disk("models/test_ner_model")

Would you like to load the pre-trained test model? (Yes/No)
y


In [14]:
# Example "near-miss" sentences
texts = [
    'The Narndee and Mulyeron Hill areas are interpreted as having greater potential for nickel sulphide mineralisation.',
    'The area was selected for potential nickel-copper-PGE mineralisation associated with feeders to the Narndee and Windimurra Intrusions'
]

# Run each sentence through the current `nlp` pipeline and render the
# entities it finds with displaCy
for doc in nlp.pipe(texts):
    displacy.render(doc, style="ent")

In [19]:
# load some other test data: files 200-209 of the report list, one DataFrame
# per file, concatenated into a single frame.
# glob returns paths in arbitrary order, so sort first to make the slice pick
# the same files on every run.
# NOTE(review): this cell reads '../../data/wamex_xml/' while the training cell
# reads 'data/wamex_xml/' - confirm which relative path is intended.
test = pd.concat([pd.read_json(file) for file in sorted(glob.glob('../../data/wamex_xml/*.json'))[200:210]])

In [20]:
# Create a Doc object for each test case and render its entities with displaCy.
# flatten(test.values) iterates over every cell of the `test` DataFrame;
# NOTE(review): presumably each cell holds one report sentence string - confirm.
for doc in nlp.pipe(flatten(test.values)):
    displacy.render(doc, style="ent")



In [21]:
# load back the original model as a comparison
# (this rebinds `nlp`, replacing the custom NER pipeline used in the cells above)
nlp = spacy.load("en_core_web_lg")