In [1]:
import tarfile
import spacy
import re
import skweak

In [2]:
# Read every ".txt" member of the Reuters sample archive into memory,
# decoding each one as UTF-8. `texts` ends up with one string per article.
texts = []
# The context manager guarantees the archive handle is closed, even if a
# member fails to extract or decode part-way through.
with tarfile.open("./data/reuters_small.tar.gz") as archive_file:
    for archive_member in archive_file.getnames():
        if archive_member.endswith(".txt"):
            text = archive_file.extractfile(archive_member).read().decode("utf8")
            texts.append(text)

In [3]:
# Sanity check: show the first extracted article (title line, then the body).
texts[0]

'Best buy offers used iPhones at lower price\nATLANTA (Reuters) - Retailer Best Buy Co, seeking new ways to appeal to cost-conscious shoppers, said on Tuesday it is selling refurbished versions of Apple Inc\'s iPhone 3G at its stores that are priced about $50 less than new iPhones. The electronics chain said the used iPhones, which were returned within 30 days of purchase, are priced at $149 for the model with 8 gigabytes of storage, while the 16-gigabyte version is $249. A two-year service contract with AT&T Inc is required. New iPhone 3Gs currently sell for $199 and $299 at Best Buy Mobile stores. "This is focusing on customers\' needs, trying to provide as wide a range of products and networks for our consumers," said Scott Moore, vice president of marketing for Best Buy Mobile. Buyers of first-generation iPhones can also upgrade to the faster refurbished 3G models at Best Buy, he said. Moore said AT&T, the exclusive wireless provider for the iPhone, offers refurbished iPhones onlin

In [4]:
# ModelAnnotator: use a pretrained spaCy model as one labelling source.
# The base pipeline is the small English model with its own "ner" and
# "lemmatizer" components disabled.
nlp = spacy.load("en_core_web_sm", disable=["ner", "lemmatizer"])
# Parse only the first article for this demonstration.
doc = nlp(texts[0])
# The annotator is registered under the source name "spacy_md" and is given
# the model name "en_core_web_md".
# NOTE(review): both spaCy models presumably need to be installed — confirm.
annotator = skweak.spacy.ModelAnnotator("spacy_md", "en_core_web_md")
doc = annotator(doc)
# Render the spans recorded under the "spacy_md" source.
skweak.utils.display_entities(doc, "spacy_md")

In [5]:
#FunctionAnnotator
def company_detector_fun(doc):
    """Label noun chunks that end in a company legal-form suffix.

    For each chunk in ``doc.noun_chunks``, the final token is lower-cased and
    stripped of trailing periods; if the result is a known suffix (e.g. "inc",
    "ltd", "co") the whole chunk is reported.

    Args:
        doc: A parsed document exposing ``noun_chunks``; each chunk must
            support ``chunk[-1].lower_``, ``chunk.start`` and ``chunk.end``.

    Yields:
        ``(start, end, "COMPANY")`` tuples, one per matching chunk, in the
        order the chunks are produced.
    """
    legal_suffixes = {'corp', 'inc', 'ltd', 'llc', 'sa', 'ag', "co"}
    for chunk in doc.noun_chunks:
        # Strip trailing dots so that "Inc." and "Inc" are treated alike.
        final_word = chunk[-1].lower_.rstrip(".")
        if final_word not in legal_suffixes:
            continue
        yield chunk.start, chunk.end, "COMPANY"

# Wrap the heuristic in a FunctionAnnotator registered under the source name
# "company_detector", apply it to the demo document, and display the result.
company_detector = skweak.heuristics.FunctionAnnotator("company_detector", company_detector_fun)
doc = company_detector(doc)
skweak.utils.display_entities(doc, "company_detector")

In [12]:
# GazetteerAnnotator: label spans by looking them up in a list of known names.
# Load the Crunchbase company list; the log output below shows a trie being
# populated for the COMPANY class.
tries = skweak.gazetteers.extract_json_data("./data/crunchbase_companies.json.gz")
# Register the annotator under the source name "gazetteer".
gazetteer = skweak.gazetteers.GazetteerAnnotator("gazetteer", tries)

Extracting data from ./data/crunchbase_companies.json.gz
Populating trie for class COMPANY (number: 539174)


In [13]:
# Apply the gazetteer to the demo document and show the spans it found.
doc = gazetteer(doc)
skweak.utils.display_entities(doc, "gazetteer")

In [8]:
#TokenConstraintAnnotator

In [9]:
#SpanConstraintAnnotator

In [10]:
#SnipsAnnotator

In [14]:
# Run every article through the base pipeline and then through each of the
# three labelling sources, collecting the annotated documents in `new_docs`.
new_docs = []
for text in texts:
    doc = nlp(text)
    # Same order as a nested call: gazetteer, then the suffix heuristic,
    # then the spaCy model annotator.
    doc = gazetteer(doc)
    doc = company_detector(doc)
    doc = annotator(doc)
    new_docs.append(doc)
# Both counts should match: one annotated document per input text.
print(len(texts), len(new_docs), sep="\n")

195
195


In [15]:
# Hidden Markov Model: aggregation model registered under the name "hmm",
# configured with a single output label, COMPANY.
# NOTE(review): the "spacy_md" source presumably emits standard spaCy labels
# rather than COMPANY — confirm how the HMM treats labels outside this list.
hmm = skweak.generative.HMM("hmm", ["COMPANY"])

In [16]:
# Fit the HMM on the weakly-labelled documents; the log below prints one line
# per iteration with a score and its change from the previous iteration.
hmm.fit(new_docs)

Starting iteration 1
Finished E-step with 195 documents
Starting iteration 2


         1      -16018.9627             +nan


Finished E-step with 195 documents
Starting iteration 3


         2      -15563.8583        +455.1045


Finished E-step with 195 documents
Starting iteration 4


         3      -15483.0793         +80.7789


Finished E-step with 195 documents


         4      -15428.3442         +54.7351


In [20]:
# Fitting alone does not add an "hmm" layer to the documents: it only appears
# after hmm(doc) has been called on a document, so this lookup fails here.
# The error is caught and printed so that the demonstration is kept but the
# notebook still survives Restart & Run All.
try:
    skweak.utils.display_entities(new_docs[0], "hmm")
except RuntimeError as err:
    print(f"RuntimeError: {err}")

RuntimeError: Annotation source "hmm" cannot be found

In [21]:
# Run the fitted HMM on the first labelled document; the display call in the
# next cell then succeeds for the "hmm" source.
new_doc = hmm(new_docs[0])

In [22]:
# Show the aggregated labels stored under the "hmm" source.
skweak.utils.display_entities(new_doc, "hmm")

In [24]:
# Aggregate the weak labels of every document with the fitted HMM and promote
# the result to `doc.ents`, which is what gets used as NER training data.
#
# Iterate over `new_docs` (the documents that already carry the gazetteer,
# heuristic and model annotations) rather than re-parsing: the previous loop
# iterated over `texts` but called nlp() on a stale `doc` variable, so the
# same single document was appended once per text.
hmm_docs = []
for labelled_doc in new_docs:
    aggregated_doc = hmm(labelled_doc)
    # Copy the aggregated "hmm" spans into the document's entity slot.
    aggregated_doc.ents = aggregated_doc.spans["hmm"]
    hmm_docs.append(aggregated_doc)

In [25]:
# Sanity check: number of aggregated documents collected above.
len(hmm_docs)

195

In [34]:
# Write the HMM-labelled documents to a .spacy file, the path later passed to
# `spacy train` as training data.
# NOTE(review): presumably the training_data/ directory must already exist — confirm.
skweak.utils.docbin_writer(hmm_docs, "training_data/reuters_small.spacy")

Write to training_data/reuters_small.spacy...done


In [None]:
# Generate an accuracy-optimised English NER config and pipe it straight into
# `spacy train`, initialising vectors from en_core_web_md and writing the
# trained pipeline to training_data/reuters_small.
# NOTE(review): --paths.train and --paths.dev point at the same file, so the
# reported dev scores are measured on the training data itself.
!spacy init config - --lang en --pipeline ner --optimize accuracy | \
spacy train - --paths.train training_data/reuters_small.spacy  --paths.dev training_data/reuters_small.spacy \
--initialize.vectors en_core_web_md --output training_data/reuters_small