In [1]:
import os

import skweak
from skweak import base

## Preprocessing

In [15]:
import spacy
# The "en_core_web_md" model must already be installed in the environment.
nlp = spacy.load("en_core_web_md")   # We load an English-language model

# Parse a sample sentence; it contains "not tall" and two currency amounts
# ("$20", "£40"), which the labelling functions defined later look for.
doc = nlp("A not tall giraffe bought $20 or £40 worth of turtlenecks.")

## Storing documents

In [3]:
# !mkdir minimal_example_data

In [16]:
# Single location for the serialised documents, shared by the writer and reader.
DOCBIN_PATH = "minimal_example_data/minimal_example_data.docbin"

# Create the target directory up front so this cell also works on a fresh
# checkout; the notebook otherwise relies on a manual `mkdir` that is
# commented out. `exist_ok=True` keeps the cell safe to re-run.
os.makedirs(os.path.dirname(DOCBIN_PATH), exist_ok=True)

docs = [doc, nlp("And this is another test. With two sentences.")]
skweak.utils.docbin_writer(docs, DOCBIN_PATH)

# docbin_reader is doing lazy evaluation, so we need to use list(...)
# to retrieve all documents at once
docs_copy = list(skweak.utils.docbin_reader(DOCBIN_PATH))

Write to minimal_example_data/minimal_example_data.docbin...done


## Labelling functions

In [17]:
class MoneyDetector(base.SpanAnnotator):
    """Labelling function marking "<currency symbol><number>" pairs as MONEY."""

    def __init__(self):
        # The annotator is registered under the name "money_detector".
        super().__init__("money_detector")

    def find_spans(self, doc):
        """Yield ``(start, end, "MONEY")`` for every two-token money amount.

        A match is a token whose text begins with a digit and whose left
        neighbour is flagged as a currency symbol. The span covers both
        tokens: ``start`` is the neighbour's index, ``end`` is exclusive.
        """
        # The first token is skipped because it has no left neighbour.
        for token in doc[1:]:
            if not token.text[0].isdigit():
                continue
            previous = token.nbor(-1)
            if previous.is_currency:
                yield token.i - 1, token.i + 1, "MONEY"


money_detector = MoneyDetector()

In [18]:
def tall_detector_fn(doc):
    """Yield a ``(start, end, "TALL")`` triple for each token spelling "tall".

    The token text is lower-cased before comparison, so any casing matches.
    ``start`` is the token's index ``i`` and ``end`` is ``i + 1`` (exclusive).
    """
    matching_tokens = (token for token in doc if token.text.lower() == "tall")
    for token in matching_tokens:
        start = token.i
        yield start, start + 1, "TALL"

# Wrap the plain generator function in an annotator named "tall_detector".
tall_detector = skweak.heuristics.FunctionAnnotator("tall_detector", tall_detector_fn)

## Aggregate labelling functions

In [19]:
# Voter named "maj_voter", restricted to the two labels produced by the
# labelling functions above ("TALL" and "MONEY").
majority_voter = skweak.voting.SequentialMajorityVoter("maj_voter", labels=["TALL", "MONEY"])

### Manually apply and aggregate labelling functions

In [20]:
# Run the two labelling functions on the first document: tall_detector first
# (inner call), then money_detector on its result.
# NOTE(review): the next cell reads docs[0] rather than this return value, so
# the annotators presumably add their spans to the document in place — confirm.
mannually_apply_label_fns = money_detector(tall_detector(docs[0]))


In [21]:
# Apply the voter to the first document, which the previous cell passed
# through both labelling functions.
voted_doc = majority_voter(docs[0])

# we can then visualise the final result (in Jupyter)
skweak.utils.display_entities(voted_doc, "maj_voter")

## Pipe level labelling function aggregation

In [22]:
# Assemble one annotator that runs the labelling functions and then the voter.
# The voter is added last, after the functions whose labels it votes on.
combined = skweak.base.CombinedAnnotator()
for annotator in (money_detector, tall_detector, majority_voter):
    combined.add_annotator(annotator)
# Trailing expression so the cell still displays the annotator's repr.
combined

<skweak.base.CombinedAnnotator at 0x7fbcdc163760>

In [23]:
# pipe() is wrapped in list(...) so that every document is processed now.
# This rebinds `docs`: the cells below read the annotated documents from it.
docs = list(combined.pipe(docs))

In [24]:
# Show the "maj_voter" spans of the first document.
skweak.utils.display_entities(docs[0], "maj_voter")

In [60]:
def not_tall_detector_fn(doc):
    """Yield ``(start, end, "NOT_TALL")`` for each "not tall" token pair.

    Both words are compared case-insensitively, so "not tall", "Not tall"
    and "NOT TALL" all match. The span covers both tokens: ``start`` is the
    index of "not" and ``end`` is exclusive.
    """
    # The first token is skipped because it has no left neighbour.
    for tok in doc[1:]:
        # Lower-case the neighbour as well, consistent with the "tall" check;
        # otherwise a sentence-initial "Not tall" would be missed.
        if tok.text.lower() == "tall" and tok.nbor(-1).text.lower() == "not":
            yield tok.i - 1, tok.i + 1, "NOT_TALL"

# Wrap the generator function in an annotator named "not_tall_detector".
not_tall_detector = skweak.heuristics.FunctionAnnotator("not_tall_detector", not_tall_detector_fn)

In [61]:
# Apply the new detector to the first document and inspect its span groups.
not_tall_detector(docs[0]).spans

{'tall_detector': [tall], 'money_detector': [$20, £40], 'maj_voter': [tall, $20, £40], 'not_annotator': [], 'not_tall_detector': [not tall]}

In [62]:
# Visualise only the spans produced by "not_tall_detector" on the first document.
skweak.utils.display_entities(not_tall_detector(docs[0]), "not_tall_detector")