In [1]:
# PhraseMatcher.py
# import necessary modules

from __future__ import unicode_literals, print_function


from pathlib import Path
from spacy.util import minibatch, compounding
from spacy.pipeline import EntityRuler
from spacy.lang.en import English
from spacy import displacy
from collections import Counter
from spacy.matcher import PhraseMatcher #import PhraseMatcher class
from nltk.chunk import conlltags2tree, tree2conlltags
from pprint import pprint
from time import sleep
from progressbar import progressbar
from spacy.tokens import Span
from spacy.util import minibatch, compounding
from collections import Counter

import nltk
import random
import plac
import warnings
import pandas as pd
import numpy as np
import xlrd
import spacy
import en_core_web_sm
import json 
import re


# Fetch NLTK's 'punkt' data, used by nltk.tokenize.sent_tokenize further down
# (the download is skipped when the package is already up to date).
nltk.download('punkt')

# Load the English model 'en_core_web_sm'.
# Note: `nlp` is rebound later when the saved model is loaded from output_dir.
nlp = spacy.load('en_core_web_sm')
# Raise the maximum text length this pipeline accepts to 7,000,000.
nlp.max_length = 7000000

[nltk_data] Downloading package punkt to /Users/abdelhak/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [20]:

def train_spacy(data, n_iter = 10, load = False):
    """Train a spaCy NER component that recognises the 'AstroTerm' label.

    Args:
        data: iterable of (text, annotations) pairs, e.g. the tuples returned
            by extract_entities. It is copied before shuffling, so the
            caller's list keeps its order.
        n_iter: number of passes over the training examples.
        load: when True, start from the pretrained "en_core_web_sm" pipeline
            and continue training it; when False, start from a blank
            English pipeline.

    Returns:
        The trained spaCy Language object.
    """
    # Shuffle a private copy; random.shuffle works in place and would
    # otherwise reorder the list owned by the caller.
    train_examples = list(data)

    # load spaCy model
    if load:
        nlp = spacy.load("en_core_web_sm")
    else:
        nlp = spacy.blank('en')

    # Reuse the pipeline's NER component if present, otherwise append one.
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe("ner")

    # The only label this model is trained to predict.
    ner.add_label('AstroTerm')

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']

    # Train with every component except NER disabled.
    with nlp.disable_pipes(*other_pipes):
        # Initialise the optimizer exactly once. begin_training() resets the
        # model weights, so a loaded pretrained pipeline must be resumed
        # instead of re-initialised.
        if load:
            optimizer = nlp.resume_training()
        else:
            optimizer = nlp.begin_training()

        # Batch size grows from 1 towards 5 as training progresses.
        sizes = compounding(1.0, 5.0, 1.001)

        for _ in progressbar(range(n_iter)):
            # shuffle examples before each pass
            random.shuffle(train_examples)
            # batch up the examples using spaCy's minibatch
            batches = minibatch(train_examples, size=sizes)
            # dictionary to store losses
            losses = {}
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(texts, annotations, sgd=optimizer, drop=0.25, losses=losses)
    return nlp

def extract_entities(doc):
    """Convert a processed document into a (text, annotations) training pair.

    Only entities labelled "AstroTerm" are kept. Each one is recorded as a
    (start_char, end_char, label) tuple under the 'entities' key.
    """
    astro_spans = [
        (span.start_char, span.end_char, span.label_)
        for span in doc.ents
        if span.label_ == "AstroTerm"
    ]
    return (doc.text, {'entities': astro_spans})


### Read the list of terminology 

In [2]:
# Load the terminology spreadsheet; its 'key' column supplies the terms extracted in the next cell.
terms_corpus = pd.read_excel('astronomy.xls')

In [3]:
# the list containing the phrases to be matched
# Keep the text that follows the first ':' and its one-character separator
# (presumably entries look like "<prefix>: <term>" -- verify against the sheet).
# Entries without a colon are kept whole: str.find() returns -1 for them, which
# previously sliced off their first character. Empty cells are skipped.
terminology_list = []
for term in terms_corpus['key'].dropna():
    _, colon, remainder = term.partition(':')
    terminology_list.append(remainder[1:] if colon else term)

### Text for training:

In [4]:
# import glob
# read_files = glob.glob("corpus/Astromony_*.txt")
# with open("corpus/result.txt", "wb") as outfile:
#     for f in read_files:
#         with open(f, "rb") as infile:
#             outfile.write(infile.read())

# Read the whole training corpus into memory as a single string.
# The context manager closes the file handle as soon as the read finishes.
with open('corpus/result.txt') as file:
    text = file.read()

### Entity ruler

In [25]:
# Separate English pipeline that hosts the rule-based (dictionary) matcher.
nlp_rule_based = English()
# Entity ruler bound to that pipeline; its patterns are added in a later cell.
ruler = EntityRuler(nlp_rule_based)

In [26]:
# create patterns
# One EntityRuler pattern per term, all labelled "AstroTerm":
#   * single-word terms -> the word itself as a string pattern
#   * multi-word terms  -> one {"lower": word} token pattern per word
patterns = []

for term in terminology_list:
    words = term.split()
    if not words:
        # A blank term would yield an empty token pattern; skip it.
        continue
    if len(words) == 1:
        pattern = words[0]
    else:
        # "lower" is compared against the lowercased token text, so the
        # pattern value must be lowercase too; otherwise a term containing
        # capital letters could never match.
        pattern = [{"lower": word.lower()} for word in words]
    patterns.append({"label": "AstroTerm", "pattern": pattern})

In [27]:
# Register the term patterns with the ruler, then append the ruler to the
# rule-based pipeline.
# NOTE(review): re-running this cell adds the same ruler again -- presumably
# an error or duplicate component; re-create nlp_rule_based first. Confirm.
ruler.add_patterns(patterns)
nlp_rule_based.add_pipe(ruler)

In [17]:
# Auto-annotate the corpus: split it into sentences, tag every sentence with
# the rule-based pipeline and keep the resulting (text, annotations) pairs.
# Language.pipe processes the sentences as a stream instead of one call each,
# and distinct names avoid reusing one variable for both str and Doc.
sentences = nltk.tokenize.sent_tokenize(text)
train_data = [
    extract_entities(sentence_doc)
    for sentence_doc in nlp_rule_based.pipe(sentences)
]

### Neural Network approach:

In [21]:
# train the model
#nlp = train_spacy(train_data, 100)

  **kwargs
100% (100 of 100) |#####################| Elapsed Time: 21:09:17 Time: 21:09:17


In [22]:
# save the model
# Directory name used for the saved model; the next cell loads from it.
output_dir = "output_dir"
# NOTE: the save step below is commented out, so loading relies on a model
# that an earlier run already wrote to output_dir.
# if output_dir is not None:
#     output_dir = Path(output_dir)
#     if not output_dir.exists():
#         output_dir.mkdir()
#     nlp.to_disk(output_dir)
#     print("Saved model to", output_dir)

Saved model to output_dir


In [None]:
# Load the saved model from output_dir; this rebinds `nlp`, replacing the
# 'en_core_web_sm' pipeline loaded in the first cell.
print("Loading from", output_dir)
nlp = spacy.load(output_dir)

In [45]:
# Prompt for the test text; both models below are run on this string.
text_test = input("Enter your testing text: ")

Enter your testing text: In this rendition of the human timeline, we don’t abandon heavy industry. We learn to manufacture what we need to maintain our lives in the cold vacuum of space, just in time to give Earth a break.  The race to build an industrial foundation in space has already begun, too: Musk promises Mars Base Alpha by 2028; Bezos’ own Blue Origin is working on a “sustained human presence on the Moon;” and NASA’s Lunar Gateway, a permanent orbital station, is set to go into operation by the end of the decade.  In 40 years, launch costs have fallen from $85,000 per kilogram to less than $1,000/kg, and NASA hopes to get this under $100/kg in the next few years. This trajectory makes space-mining advocate and Skycorp CEO Dennis Wingo more certain than ever that we are on the cusp of a new era of space mining. He reiterates to Astronomy that “industrial activity on the Moon is how we can make things better here on Earth.”  Instead of returning raw materials from the Moon to Ear

In [39]:
# test text
#file = open('test_corpus/text09.txt')
#text_test = file.read()

In [43]:
# Run the neural network model on the test text and tally how often each
# (entity text, label) pair occurs.
doc = nlp(text_test)
entities = [(span.text, span.label_) for span in doc.ents]
counter = Counter()
counter.update(entities)
counter

Counter({('astronomical object', 'AstroTerm'): 1,
         ('imaginary line', 'AstroTerm'): 1,
         ('gravitationally', 'AstroTerm'): 1,
         ('planet', 'AstroTerm'): 2,
         ('perpendicular', 'AstroTerm'): 1,
         ('celestial equator', 'AstroTerm'): 1,
         ('equinox', 'AstroTerm'): 2,
         ('apparent', 'AstroTerm'): 1,
         ('geocentric', 'AstroTerm'): 1,
         ('solstice', 'AstroTerm'): 1,
         ('velocity', 'AstroTerm'): 3,
         ('kinetic energy', 'AstroTerm'): 1,
         ('mass', 'AstroTerm'): 4,
         ('orbit', 'AstroTerm'): 1,
         ('radius', 'AstroTerm'): 1,
         ('star', 'AstroTerm'): 2})

In [44]:
# Run the rule-based pipeline on the same text and count each
# (entity text, label) pair it finds.
doc = nlp_rule_based(text_test)
entities = list(map(lambda span: (span.text, span.label_), doc.ents))
counter = Counter(entities)
counter

Counter({('astronomical object', 'AstroTerm'): 1,
         ('imaginary line', 'AstroTerm'): 1,
         ('gravitationally', 'AstroTerm'): 1,
         ('planet', 'AstroTerm'): 2,
         ('perpendicular', 'AstroTerm'): 1,
         ('celestial equator', 'AstroTerm'): 1,
         ('equinox', 'AstroTerm'): 2,
         ('apparent', 'AstroTerm'): 1,
         ('geocentric', 'AstroTerm'): 1,
         ('solstice', 'AstroTerm'): 1,
         ('velocity', 'AstroTerm'): 3,
         ('kinetic energy', 'AstroTerm'): 1,
         ('mass', 'AstroTerm'): 4,
         ('orbit', 'AstroTerm'): 1,
         ('radius', 'AstroTerm'): 1,
         ('star', 'AstroTerm'): 2})

In [None]:
#displacy.render(matched_sents, style="ent", manual=True)