In [52]:
# PhraseMatcher.py
# import necessary modules

from __future__ import unicode_literals, print_function

import plac
import random
import warnings
from pathlib import Path
from spacy.util import minibatch, compounding

from spacy import displacy
from collections import Counter
from spacy.matcher import PhraseMatcher #import PhraseMatcher class
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint

from time import sleep
from progressbar import progressbar

import pandas as pd
import numpy as np
import xlrd
import spacy
import en_core_web_sm
import json 

# Language class with the English model 'en_core_web_sm' is loaded
nlp = spacy.load('en_core_web_sm')
nlp.max_length = 7000000

In [53]:
terms_corpus = pd.read_excel('astronomy.xls')

In [54]:
# the list containing the pharses to be matched
terminology_list = []
for term in terms_corpus['key']:
    terminology_list.append(term[term.find(':')+2:])

In [55]:
import glob

read_files = glob.glob("corpus/Astromony_2*.txt")

with open("corpus/result.txt", "wb") as outfile:
    for f in read_files:
        with open(f, "rb") as infile:
            outfile.write(infile.read())



# the input text string is converted to a Document object



file  = open('corpus/result.txt')
text = file.read()

In [81]:
matcher = PhraseMatcher(nlp.vocab)

matched_sents = []  # Collect data of matched sentences to be visualized

def collect_sents(matcher, doc, i, matches):
    match_id, start, end = matches[i]
    span = doc[start:end]  # Matched span
    sent = span.sent  # Sentence containing matched span
    # Append mock entity for match in displaCy style to matched_sents
    # get the match span by ofsetting the start and end of the span with the
    # start and end of the sentence in the doc
    match_ents = [{
        "start": span.start_char - sent.start_char,
        "end": span.end_char - sent.start_char,
        "label": "AstroTerm",
    }]
    matched_sents.append({"text": sent.text, "ents": match_ents})



In [57]:
doc = nlp(text)
# create the PhraseMatcher object
matcher = PhraseMatcher(nlp.vocab, attr='LOWER')
#print([(X.text, X.label_) for X in doc.ents])

In [85]:
doc1 = nlp('Astronomy is the science of space, light, and planets')


Astronomy is the science of space, light, and planets

In [59]:
# convert the phrases into document object using nlp.make_doc to #speed up.
patterns = [nlp.make_doc(text) for text in terminology_list]
# add the patterns to the matcher object without any callbacks
matcher.add("TerminologyList", collect_sents, *patterns)

In [60]:
#call the matcher object the document object and it will return #match_id, start and stop indexes of the matched words
matches = matcher(doc)

In [61]:
#displacy.render(matched_sents, style="ent", manual=True)

In [62]:
# preprocess training data
TRAIN_DATA = []
LABEL = 'AstroTerm'
for dic in matched_sents:
    inner_list = []
    dict_item = {}
    inner_list.append((dic['ents'][0]['start'],dic['ents'][0]['end'],LABEL))
    dict_item['entities'] = inner_list
    TRAIN_DATA.append((dic['text'],dict_item))

In [63]:
# Importing requirements
from spacy.util import minibatch, compounding
import random


# #nlp = spacy.blank("en")  # create blank Language class

# # Add the new label to ner
# ner.add_label(LABEL)

# # get names of other pipes to disable them during training
# other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

# for _, annotations in TRAIN_DATA:
#     for ent in annotations.get('entities'):
#         ner.add_label(ent[2])

In [75]:

def train_spacy(data, n_iter = 10, load = True):
    
    
    TRAIN_DATA = data
    # load space model
    if load:
        nlp = spacy.load("en_core_web_sm")
    else:
        nlp = spacy.blank('en')
        
   
        
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe("ner")
    
    # add labels
    for _, annotations in TRAIN_DATA:
         for ent in annotations.get('entities'):
            ner.add_label(ent[2])
    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

    # Resume training
    #optimizer = nlp.resume_training()
    #move_names = list(ner.move_names)

    nlp.begin_training()

    # Begin training by disabling other pipeline components
    with nlp.disable_pipes(*other_pipes):
        # show warnings for misaligned entity spans once
        optimizer = nlp.begin_training()

        sizes = compounding(1.0, 5.0, 1.001)
        # Training for n_iter iterations     
        for itn in progressbar(range(n_iter)):
            sleep(0.02)
        # shuffle examples before training
            random.shuffle(TRAIN_DATA)
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=sizes)
            # dictionary to store losses
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                # Calling update() over the iteration
                nlp.update(texts, annotations, sgd=optimizer, drop=0.25, losses=losses)
                #print("Losses", losses)
    return nlp

In [76]:
nlp2 = train_spacy(TRAIN_DATA)

 50% (5 of 10) |############             | Elapsed Time: 0:02:36 ETA:   0:02:15

KeyboardInterrupt: 

In [15]:
if output_dir is not None:
    output_dir = Path(output_dir)
    if not output_dir.exists():
        output_dir.mkdir()
    nlp.to_disk(output_dir)
    print("Saved model to", output_dir)

In [131]:
# test the model
file  = open('test_corpus/text01.txt')
Test = file.read()

In [67]:
# # test the saved model
# print("Loading from", output_dir)
# nlp2 = spacy.load(output_dir)

In [72]:
doc = nlp2('Physicists began using supercomputers to obtain solutions to this famously hard problem back in the 1960s. In 2000, with no solutions in sight, Kip Thorne, 2018 Nobel Laureate and one of the designers of LIGO, famously bet that there would be an observation of gravitational waves before a numerical solution was reached.')
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

Entities [('Nobel', 'AstroTerm')]
Tokens [('Physicists', '', 2), ('began', '', 2), ('using', '', 2), ('supercomputers', '', 2), ('to', '', 2), ('obtain', '', 2), ('solutions', '', 2), ('to', '', 2), ('this', '', 2), ('famously', '', 2), ('hard', '', 2), ('problem', '', 2), ('back', '', 2), ('in', '', 2), ('the', '', 2), ('1960s', '', 2), ('.', '', 2), ('In', '', 2), ('2000', '', 2), (',', '', 2), ('with', '', 2), ('no', '', 2), ('solutions', '', 2), ('in', '', 2), ('sight', '', 2), (',', '', 2), ('Kip', '', 2), ('Thorne', '', 2), (',', '', 2), ('2018', '', 2), ('Nobel', 'AstroTerm', 3), ('Laureate', '', 2), ('and', '', 2), ('one', '', 2), ('of', '', 2), ('the', '', 2), ('designers', '', 2), ('of', '', 2), ('LIGO', '', 2), (',', '', 2), ('famously', '', 2), ('bet', '', 2), ('that', '', 2), ('there', '', 2), ('would', '', 2), ('be', '', 2), ('an', '', 2), ('observation', '', 2), ('of', '', 2), ('gravitational', '', 2), ('waves', '', 2), ('before', '', 2), ('a', '', 2), ('numerical', '', 

In [50]:
train_file = open('train.json',) 
data = json.load(train_file)

In [144]:
data[:4]

[{'text': "As usual, we've been deprived of clear sky viewing opportunities this winter and spring—all the more reason to start planning for the regional dark sky star parties.",
  'ents': [{'start': 143, 'end': 151, 'label': 'T'}]},
 {'text': "As usual, we've been deprived of clear sky viewing opportunities this winter and spring—all the more reason to start planning for the regional dark sky star parties.",
  'ents': [{'start': 152, 'end': 156, 'label': 'T'}]},
 {'text': 'Also, you need to register early for this star party because attendance is limited: www.tmspa.com.',
  'ents': [{'start': 42, 'end': 46, 'label': 'T'}]},
 {'text': 'Also, you need to register early for this star party because attendance is limited: www.tmspa.com.',
  'ents': [{'start': 42, 'end': 52, 'label': 'T'}]}]

In [171]:
# define a fuction for key 
def key_func(k): 
    return k[1] 
  
# sort INFO data by 'company' key. 
#INFO = sorted(INFO, key=key_func) 
  
for key, value in groupby(TRAIN_DATA, key = itemgetter(0)): 
    #print(key) 
    if len(list(value)) >1:
        #print(key)
        print(list(value)) 

[]
[]


In [114]:
TEST_DATA = [('solar system .', {'entities': [(0, 12, 'AstroTerm')]}),
 ("If Orion hasn't risen yet, you can look high in the east for the little cluster of blue stars called the Pleiades.",
  {'entities': [(3, 8, 'AstroTerm'),(105, 113, 'AstroTerm')]}),
 ('(Above:\xa0Taurus contains interesting targets for naked eye, binoculars, and large and small telescopes.',
  {'entities': [(8, 14, 'AstroTerm'),(91, 101, 'AstroTerm')]}),
 ('★ Kuiper Belt', {'entities': [(2, 13, 'AstroTerm')]}),             
 ("An apparent gap between Saturn's A and B rings.",
  {'entities': [(24, 30, 'AstroTerm')]}), 
 ('A description of gravity formulated by Albert Einstein, which explains that gravity affects the geometry of space and the flow of time.',
  {'entities': [(39, 54, 'AstroTerm'),(76, 84, 'AstroTerm')]}),
 ('A common type of reflector telescope designed by Sir Isaac Newton.\n',
  {'entities': [(27, 36, 'AstroTerm'), (53, 65, 'AstroTerm')]}),
 ('A telescope whose design incorporates innovative features, such as adaptive optics.',
  {'entities': [(2, 11, 'AstroTerm')]}),
 ('A complete circle (360º)      has 2o radians (6.283r), so one radian is about 57.296º.\nRADIANT\n',
  {'entities': [(62, 68, 'AstroTerm')]}),
 ("If the Moon orbited in the     same plane as the Earth around the Sun, the Sun would be eclipsed every month,     but the Moon's orbit is in a slightly inclined plane, making eclipses of the     Sun rare.\n",
  {'entities': [(122, 126, 'AstroTerm'),(129, 134, 'AstroTerm'), (175, 183, 'AstroTerm')]}),     
 ('\uf63c\uf6dc\uf63a focus \uf6dc\uf63c\uf63b, \uf6dc\uf63c\uf63d, \uf6dc\uf63c\uf640 – focusing \uf6dc\uf63d\uf63c, \uf6dc\uf63e\uf63e folded refractor',
  {'entities': [(51, 60, 'AstroTerm')]}),
 ("A mount's top, or head, can be either alt-azimuth (turning side to side, up and down) or equatorial (turning parallel to the celestial coordinate system).",
  {'entities': [(42, 49, 'AstroTerm')]}),
 ('A complex radio source at the centre of the Milky Way      Galaxy.',
  {'entities': [(44, 53, 'AstroTerm'), (59, 65, 'AstroTerm')]}),
 ('The longest diameter of an ellipse.',
  {'entities': [(27, 34, 'AstroTerm')]}),
 ('Sidereal time enables the hour angle of an object      to be found from its right ascension (hour angle sidereal      time - right ascension).\n',
  {'entities': [(0, 13, 'AstroTerm')]}),
 ("Earth's orbit; those that orbit or spin clockwise have retrograde motion.",
  {'entities': [(8,13, 'AstroTerm'), (35, 39, 'AstroTerm')]}),
 ('The Newtonian reflector, designed by Isaac Newton, has a small second mirror mounted diagonally near the front of the tube to divert the light sideways and out to your eye.',
  {'entities': [(37, 49, 'AstroTerm')]}),
 ('population l star.', {'entities': [(13, 17, 'AstroTerm')]}),
 ('The deflection of light from a remote source caused by the presence of an intervening mass.',
  {'entities': [(86, 90, 'AstroTerm')]}),
 ('A natural satellite that orbits a planet.',
  {'entities': [(10, 19, 'AstroTerm')]})]

In [118]:
nlp3 = train_spacy(TEST_DATA, 100)

  gold = GoldParse(doc, **gold)
100% (100 of 100) |######################| Elapsed Time: 0:00:34 Time:  0:00:34


In [119]:
doc = nlp3("If the Moon orbited in the same plane as the Earth around the Sun, the Sun would be eclipsed every month, but the Moon's orbit is in a slightly inclined plane, making eclipses of the Sun rare.")
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

Entities [('Moon', 'AstroTerm'), ('orbit', 'AstroTerm'), ('eclipses', 'AstroTerm')]
Tokens [('If', '', 2), ('the', '', 2), ('Moon', '', 2), ('orbited', '', 2), ('in', '', 2), ('the', '', 2), ('same', '', 2), ('plane', '', 2), ('as', '', 2), ('the', '', 2), ('Earth', '', 2), ('around', '', 2), ('the', '', 2), ('Sun', '', 2), (',', '', 2), ('the', '', 2), ('Sun', '', 2), ('would', '', 2), ('be', '', 2), ('eclipsed', '', 2), ('every', '', 2), ('month', '', 2), (',', '', 2), ('but', '', 2), ('the', '', 2), ('Moon', 'AstroTerm', 3), ("'s", '', 2), ('orbit', 'AstroTerm', 3), ('is', '', 2), ('in', '', 2), ('a', '', 2), ('slightly', '', 2), ('inclined', '', 2), ('plane', '', 2), (',', '', 2), ('making', '', 2), ('eclipses', 'AstroTerm', 3), ('of', '', 2), ('the', '', 2), ('Sun', '', 2), ('rare', '', 2), ('.', '', 2)]
