In [1]:
import skweak
from skweak import base

## Preprocessing

In [None]:
# python -m spacy download en_core_web_md to install spacy model

In [3]:
import spacy
nlp = spacy.load("en_core_web_md")   # We load an English-language model you can install it

doc = nlp("A not tall giraffe bought $20 or £40 worth of turtlenecks.")

## Storing documents

In [5]:
# !mkdir minimal_example_data

In [6]:
docs = [doc, nlp("And this is another test. With two sentences.")]
skweak.utils.docbin_writer(docs, "minimal_example_data/minimal_example_data.docbin")

# docbin_reader is doing lazy evaluation, so we need to use list(...)
# to retrieve all documents at once
docs_copy = list(skweak.utils.docbin_reader("minimal_example_data/minimal_example_data.docbin"))

Write to minimal_example_data/minimal_example_data.docbin...done


## Labelling functions

In [7]:
class MoneyDetector(base.SpanAnnotator):
    def __init__(self):
        super(MoneyDetector, self).__init__("money_detector")

    def find_spans(self, doc):
        for tok in doc[1:]:
            if tok.text[0].isdigit() and tok.nbor(-1).is_currency:
                yield tok.i-1, tok.i+1, "MONEY"

money_detector = MoneyDetector()

In [8]:
def tall_detector_fn(doc):
    for tok in doc:
        if tok.text.lower() == "tall":
            yield tok.i, tok.i+1, "TALL"

tall_detector = skweak.heuristics.FunctionAnnotator("tall_detector", tall_detector_fn)

## Agreggate labelling functions

In [9]:
majority_voter = skweak.voting.SequentialMajorityVoter("maj_voter", labels=["TALL", "MONEY"])

### Manually apply and aggregate labelling functions

In [10]:
mannually_apply_label_fns = money_detector(tall_detector(docs[0]))


In [11]:
voted_doc = majority_voter(docs[0])

# we can then visualise the final result (in Jupyter)
skweak.utils.display_entities(voted_doc, "maj_voter")

## Pipe level labelling function aggregation

In [12]:
combined = skweak.base.CombinedAnnotator()
combined.add_annotator(money_detector)
combined.add_annotator(tall_detector)
combined.add_annotator(majority_voter)

<skweak.base.CombinedAnnotator at 0x2aee417c0>

In [13]:
docs = list(combined.pipe(docs))

In [14]:
skweak.utils.display_entities(docs[0], "maj_voter")

In [16]:
import spacy
from spacy import displacy

doc = nlp("A not tall giraffe bought $20 or £40 worth of turtlenecks.")
displacy.render(doc, style="dep")

In [93]:
docs[0][1]

not

In [92]:
docs[0][1].dep_

'neg'

In [22]:
negation_tokens = [tok for tok in doc if tok.dep_ == 'neg']
negation_head_tokens = [token.head for token in negation_tokens]

for token in negation_head_tokens:
    print(token.text, token.dep_, token.head.text, token.head.pos_, [child for child in token.children])

tall amod giraffe NOUN [not]


In [88]:
def not_tall_detector_fn(doc):
    for tok in doc[1:]:
        if tok.text.lower() == "tall" and tok.nbor(-1).text == "not":
            yield tok.i-1, tok.i+1, "NOT_TALL"

not_tall_detector = skweak.heuristics.FunctionAnnotator("not_tall_detector", not_tall_detector_fn)

In [89]:
skweak.utils.display_entities(not_tall_detector(docs[0]), "tall_or_not_tall_detector")

In [24]:
def tall_detector_fn(doc):
    for tok in doc:
        if tok.text.lower() == "tall":
            yield tok.i, tok.i+1, "TALL"

In [83]:
def tall_or_not_tall_detector_fn(doc):
    negation_tokens = [tok for tok in doc if tok.dep_ == 'neg']
    negation_head_tokens = [token.head for token in negation_tokens]
    negated_tall_indices = []
    
    for token in negation_head_tokens:
        if token.text.lower() == "tall":
            yield token.i-1, token.i+1, "NOT_TALL"
            negated_tall_indices.append(token.i)
            
    for tok in doc:
        if tok.text.lower() == "tall":
            if tok.i not in negated_tall_indices:
                yield tok.i, tok.i+1, "TALL"
    
    
tall_or_not_tall_detector = skweak.heuristics.FunctionAnnotator("tall_or_not_tall_detector", tall_or_not_tall_detector_fn)

In [84]:
doc = nlp("A tall tale: a not tall giraffe bought $20 or £40 worth of turtlenecks.")

In [86]:
skweak.utils.display_entities(tall_or_not_tall_detector(doc), "tall_or_not_tall_detector")

In [100]:
category = nlp.add_pipe("textcat")
category.add_label("POSITIVE")
category.add_label("NEGATIVE")

ValueError: [E007] 'textcat' already exists in pipeline. Existing names: ['tok2vec', 'tagger', 'parser', 'senter', 'attribute_ruler', 'lemmatizer', 'ner', 'textcat']

In [101]:
nlp.textcat

AttributeError: 'English' object has no attribute 'textcat'