In [21]:
import spacy, re
from skweak import heuristics, gazetteers, generative, utils
import tarfile
import skweak

In [2]:
# We retrieve the texts: every ".txt" member of the archive is read and decoded as UTF-8
texts = []
# The context manager guarantees the archive handle is closed, even if a member
# fails to be read or decoded half-way through the loop
with tarfile.open("data/reuters_small.tar.gz") as archive_file:
    for archive_member in archive_file.getnames():
        if archive_member.endswith(".txt"):
            text = archive_file.extractfile(archive_member).read().decode("utf8")
            texts.append(text)
print(len(texts))

195


In [18]:
# LF 1: heuristic to detect occurrences of MONEY entities
def money_detector(doc):
    """Yield ``(start, end, "MONEY")`` for each number preceded by a currency token.

    The first token of ``doc`` is skipped (it has no left neighbour). For every
    other token whose text starts with a digit and whose previous token has
    ``is_currency`` set, a two-token span is produced that covers the currency
    token and the number: ``start`` is the index of the currency token and
    ``end`` is one past the index of the number.
    """
    for token in doc[1:]:
        # The candidate amount must start with a digit ...
        if not token.text[0].isdigit():
            continue
        # ... and be immediately preceded by a currency token
        if not token.nbor(-1).is_currency:
            continue
        span_start = token.i - 1
        span_end = token.i + 1
        yield span_start, span_end, "MONEY"
lf1 = heuristics.FunctionAnnotator("money", money_detector)

# LF 2: detection of years with a regex
# (raw string, so that `\d` is the regex digit class and not an invalid Python
# string escape; a token matches when its whole text is 19xx or 20xx)
lf2 = heuristics.TokenConstraintAnnotator(
    "years", lambda tok: re.match(r"(19|20)\d{2}$", tok.text), "DATE")

# LF 3: gazetteer lookup based on the tries extracted from the Crunchbase JSON file
tries = gazetteers.extract_json_data("data/crunchbase_companies.json.gz")
lf3 = gazetteers.GazetteerAnnotator("gazetteer", tries)

# Parse all texts with spaCy (the "ner" and "lemmatizer" components are disabled)
nlp = spacy.load("en_core_web_sm", disable=["ner", "lemmatizer"])
docs = list(nlp.pipe(texts))

# Apply the labelling functions to every document, in the order lf1 -> lf2 -> lf3
new_docs = [lf3(lf2(lf1(doc))) for doc in docs]

# Build the HMM aggregation model over the three labels and fit it on the annotated documents
hmm = generative.HMM("hmm", ["DATE", "MONEY", "COMPANY"])
hmm.fit(new_docs)

# With the model fitted, aggregate the labelling functions on the first document
doc = hmm(new_docs[0])

# Render the aggregated "hmm" layer (in Jupyter)
utils.display_entities(doc, "hmm")

Extracting data from data/crunchbase_companies.json.gz
Populating trie for class COMPANY (number: 539174)
Starting iteration 1
Finished E-step with 195 documents
Starting iteration 2


         1      -18260.5264             +nan


Finished E-step with 195 documents
Starting iteration 3


         2      -17418.8139        +841.7125


Finished E-step with 195 documents
Starting iteration 4


         3      -17402.2269         +16.5870


Finished E-step with 195 documents


         4      -17393.2608          +8.9661


In [22]:
# Aggregation model: an HMM named "hmm" over the three entity labels
entity_labels = ["COMPANY", "DATE", "MONEY"]
model = skweak.aggregation.HMM("hmm", entity_labels)

# Fit the model on the annotated documents and aggregate them in a single call
# (note: this rebinds `docs` to the result)
docs = model.fit_and_aggregate(new_docs)

Starting iteration 1
Finished E-step with 195 documents
Starting iteration 2


         1      -18260.5264             +nan


Finished E-step with 195 documents
Starting iteration 3


         2      -17418.8139        +841.7125


Finished E-step with 195 documents
Starting iteration 4


         3      -17402.2269         +16.5870


Finished E-step with 195 documents


         4      -17393.2608          +8.9661


In [23]:
# Copy the aggregated "hmm" spans into doc.ents on the very list that is
# serialised below (`docs`, the output of fit_and_aggregate), so the entities
# are set on exactly the documents that get written to disk.
for doc in docs:
    doc.ents = doc.spans["hmm"]
utils.docbin_writer(docs, "data/reuters_small.spacy")

Write to data/reuters_small.spacy...done


In [26]:
# Apply the `hmm` aggregator to the document at index 5 of `docs`
# NOTE(review): this calls `hmm`, not the `model` object whose fit_and_aggregate
# produced `docs` — confirm that re-running the older aggregator is intended
doc = hmm(docs[5])

# we can then visualise the final result (in Jupyter)
utils.display_entities(doc, "hmm")

In [29]:
# Use the aggregated "hmm" span layer as the entities (doc.ents) of each document
for doc in docs:
    doc.ents = doc.spans["hmm"]
# Serialise the documents to the .spacy file that the training command reads
skweak.utils.docbin_writer(docs, "data/reuters_small.spacy")

Write to data/reuters_small.spacy...done


In [None]:
# Generate an NER config on stdout (`init config -`) and pipe it straight into
# `spacy train -`, which reads the config from stdin; results go to data/reuters_small.
# NOTE(review): --paths.train and --paths.dev point at the same file, so the dev
# scores reported during training are computed on the training data itself.
# NOTE(review): presumably requires the en_core_web_md package to be installed
# for --initialize.vectors — confirm.
!spacy init config - --lang en --pipeline ner --optimize accuracy | \
spacy train - --paths.train data/reuters_small.spacy  --paths.dev data/reuters_small.spacy \
--initialize.vectors en_core_web_md --output data/reuters_small