In [1]:
# PhraseMatcher.py
# import necessary modules

from __future__ import unicode_literals, print_function

import plac
import random
import warnings
from pathlib import Path
from spacy.util import minibatch, compounding

from spacy import displacy
from collections import Counter
from spacy.matcher import PhraseMatcher #import PhraseMatcher class
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint

from time import sleep
from progressbar import progressbar

import pandas as pd
import numpy as np
import xlrd
import spacy
import en_core_web_sm

# Load the small English pipeline 'en_core_web_sm' into a Language object.
nlp = spacy.load('en_core_web_sm')
# Raise the per-text character limit; presumably so the whole concatenated
# corpus can be processed by a single nlp() call -- TODO confirm the size needed.
nlp.max_length = 7000000

In [2]:
# Read the terminology spreadsheet into a DataFrame; its 'key' column is the
# source of the phrases that get matched against the corpus.
terms_corpus = pd.read_excel('astronomy.xls')

In [3]:
# The list containing the phrases to be matched.
# Each 'key' cell is presumably of the form "<prefix>: <term>" (TODO confirm
# against the spreadsheet); only the part after the first colon is kept.
terminology_list = []
for term in terms_corpus['key'].dropna():  # empty cells are NaN floats, not strings
    # partition() leaves `sep` empty when the cell has no colon. In that case
    # the whole cell is the term, instead of slicing from find(':') + 2 == 1
    # and silently dropping the first character.
    prefix, sep, phrase = str(term).partition(':')
    # strip() copes with "key:term", "key: term" and stray trailing blanks alike.
    phrase = (phrase if sep else prefix).strip()
    if phrase:  # an empty phrase cannot be used as a match pattern
        terminology_list.append(phrase)

In [4]:
import glob

# Concatenate every matching corpus file into one file. glob() makes no promise
# about ordering, so the paths are sorted to get the same result.txt (and hence
# the same sentence order) on every run.
# NOTE(review): "Astromony" has to match the spelling of the files on disk -- confirm.
read_files = sorted(glob.glob("corpus/Astromony_2*.txt"))

with open("corpus/result.txt", "wb") as outfile:
    for f in read_files:
        with open(f, "rb") as infile:
            outfile.write(infile.read())

# Read the merged corpus back as one string; it is converted to a Doc object
# in a later cell. The context manager guarantees the handle is closed once
# the text has been read.
# NOTE(review): decoded with the platform default encoding -- confirm the
# corpus encoding and pass encoding=... explicitly if it is known.
with open('corpus/result.txt') as file:
    text = file.read()

In [5]:
# Accumulator filled by collect_sents(): one displaCy-style record per match.
matched_sents = []


def collect_sents(matcher, doc, i, matches):
    """Record the sentence around the i-th match in displaCy's manual format.

    Takes the (matcher, doc, i, matches) arguments of a matcher on-match
    callback. Appends one dict to the module-level ``matched_sents`` list,
    holding the sentence text and a single "AstroTerm" entity whose
    character offsets are relative to the start of that sentence.

    Args:
        matcher: the matcher that fired (unused).
        doc: the document being matched; sliced by token index.
        i: index into ``matches`` of the match to record.
        matches: sequence of (match_id, start, end) tuples.

    Returns:
        None. The result is appended to ``matched_sents``.
    """
    _, token_start, token_end = matches[i]
    hit = doc[token_start:token_end]
    sentence = hit.sent
    # Span offsets are document-relative; shift them so they index into the
    # sentence text that is stored next to them.
    offset = sentence.start_char
    entity = {
        "start": hit.start_char - offset,
        "end": hit.end_char - offset,
        "label": "AstroTerm",
    }
    matched_sents.append({"text": sentence.text, "ents": [entity]})

# Default PhraseMatcher on the pipeline's vocab. NOTE(review): `matcher` is
# reassigned with attr='LOWER' in the following cell before any pattern is
# added, so this instance is never used for matching.
matcher = PhraseMatcher(nlp.vocab)

In [6]:
# Run the loaded pipeline over the whole corpus string to obtain a Doc object.
doc = nlp(text)
# create the PhraseMatcher object; attr='LOWER' makes it compare lowercased
# token text, i.e. phrase matching ignores case
matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
#print([(X.text, X.label_) for X in doc.ents])

In [7]:
#IOB_tagger = [(X, X.ent_iob_, X.ent_type_) for X in doc]


In [8]:
# Turn every phrase into a Doc with nlp.make_doc (used here to speed things up
# compared with calling nlp on each phrase).
patterns = list(map(nlp.make_doc, terminology_list))
# Register all patterns under a single key; collect_sents is passed as the
# callback, so it is invoked for every match the matcher finds.
matcher.add("TerminologyList", collect_sents, *patterns)

In [9]:
# Calling the matcher on the Doc returns (match_id, start, end) tuples for the
# matched phrases. Each match also triggers collect_sents, which appends a
# record to matched_sents.
# Empty the accumulator in place first, so that re-running this cell does not
# pile duplicate records on top of the ones from the previous run.
del matched_sents[:]
matches = matcher(doc)

In [10]:
#displacy.render(matched_sents, style="ent", manual=True)

In [11]:
# preprocess training data
LABEL = 'AstroTerm'


def _merge_sentence_entities(sentence_matches, label):
    """Build one training example per distinct sentence, with all its entities.

    ``sentence_matches`` holds one record per match, so a sentence containing
    several terms shows up several times, each time with a single entity.
    Used as-is, every copy would tell the NER that the *other* terms in that
    sentence are not entities. This helper groups the records by sentence
    text instead, de-duplicates the spans and, where matches overlap, keeps
    the longest one, because an entity annotation cannot contain overlapping
    spans.

    Args:
        sentence_matches: iterable of {"text": str, "ents": [{"start", "end", ...}]}.
        label: entity label applied to every kept span.

    Returns:
        List of (text, {"entities": [(start, end, label), ...]}) tuples, in
        order of first appearance, with entities sorted by start offset.
    """
    spans_by_text = {}
    for match in sentence_matches:
        spans = spans_by_text.setdefault(match['text'], set())
        for ent in match['ents']:
            spans.add((ent['start'], ent['end']))

    examples = []
    for sent_text, spans in spans_by_text.items():
        kept = []
        # Longest span first (start - end is the negated length), ties broken
        # by position, so "black hole" wins over a nested "hole".
        for start, end in sorted(spans, key=lambda s: (s[0] - s[1], s[0])):
            if all(end <= k_start or start >= k_end for k_start, k_end, _ in kept):
                kept.append((start, end, label))
        kept.sort()
        examples.append((sent_text, {'entities': kept}))
    return examples


TRAIN_DATA = _merge_sentence_entities(matched_sents, LABEL)

In [12]:
# Importing requirements
from spacy.util import minibatch, compounding
import random


#nlp = spacy.blank("en")  # create blank Language class


# Make sure the pipeline has an NER component (appending a fresh one if it
# does not), then grab a handle to it.
if "ner" not in nlp.pipe_names:
    nlp.add_pipe(nlp.create_pipe("ner"), last=True)
ner = nlp.get_pipe("ner")

# Register the new label with the entity recognizer.
ner.add_label(LABEL)

# Components that should be trained ...
pipe_exceptions = ["ner", "trf_wordpiecer", "trf_tok2vec"]

# ... and every other component in the pipeline, which should stay unaffected.
other_pipes = []
for pipe_name in nlp.pipe_names:
    if pipe_name not in pipe_exceptions:
        other_pipes.append(pipe_name)

# Register every label that occurs in the training annotations
# (index 2 of each entity tuple is its label).
for _, annotations in TRAIN_DATA:
    for entity in annotations.get('entities'):
        ner.add_label(entity[2])

In [13]:
# Resume training. The pipeline was loaded from a pretrained model, so
# resume_training() is the call that hands back an optimizer for it.
# nlp.begin_training() is deliberately NOT called here: it initialises the
# pipeline's models for training from scratch and would undo the pretrained
# state this cell is meant to build on (its returned optimizer was unused).
optimizer = nlp.resume_training()
# Snapshot of the NER transition names; not used by the visible cells.
move_names = list(ner.move_names)

n_iter = 100
# Train with the other pipeline components disabled so only the NER is updated.
with nlp.disable_pipes(*other_pipes):
    # Batch-size schedule from spaCy's compounding(1.0, 4.0, 1.001); shared
    # across all iterations, so it keeps growing from one pass to the next.
    sizes = compounding(1.0, 4.0, 1.001)
    # Training for n_iter iterations
    for itn in progressbar(range(n_iter)):
        # shuffle examples before each pass
        random.shuffle(TRAIN_DATA)
        # batch up the examples using spaCy's minibatch
        batches = minibatch(TRAIN_DATA, size=sizes)
        # dictionary to store the losses of this pass
        losses = {}
        for batch in batches:
            texts, annotations = zip(*batch)
            # one optimisation step on this batch
            nlp.update(texts, annotations, sgd=optimizer, drop=0.35, losses=losses)

100% (100 of 100) |######################| Elapsed Time: 1:00:59 Time:  1:00:59


In [34]:
# `output_dir` is not defined by any visible cell, so on a fresh kernel this
# cell would stop with a NameError. Fall back to the folder name shown in the
# recorded output ("output_dir"), but only when no earlier cell has set it.
try:
    output_dir
except NameError:
    output_dir = "output_dir"

# Save the trained pipeline; setting output_dir to None skips saving.
if output_dir is not None:
    output_dir = Path(output_dir)
    # parents=True allows a nested target path; exist_ok=True replaces the
    # separate exists() check.
    output_dir.mkdir(parents=True, exist_ok=True)
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)


Saved model to output_dir


In [42]:
# Hand-written sentences used to spot-check the saved model.
TEST = [
    "The Astronomy is hard science",
    "Telescope is the main tool in astronomy",
]

In [43]:
# test the saved model
print("Loading from", output_dir)
nlp2 = spacy.load(output_dir)
# The loop uses its own names (test_text / test_doc) so that the corpus-level
# `text` and `doc` are not overwritten; otherwise re-running an earlier cell
# after this one would silently work on the last test sentence.
for test_text in TEST:
    test_doc = nlp2(test_text)
    print("Entities", [(ent.text, ent.label_) for ent in test_doc.ents])
    # ent_iob is the integer IOB code (per spaCy's Token docs: 1 = I, 2 = O, 3 = B)
    print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in test_doc])

Loading from output_dir
Entities [('Astronomy', 'AstroTerm')]
Tokens [('The', '', 2), ('Astronomy', 'AstroTerm', 3), ('is', '', 2), ('hard', '', 2), ('science', '', 2)]
Entities [('astronomy', 'AstroTerm')]
Tokens [('Telescope', '', 2), ('is', '', 2), ('the', '', 2), ('main', '', 2), ('tool', '', 2), ('in', '', 2), ('astronomy', 'AstroTerm', 3)]
