In [1]:
import warnings

import spacy
from spacy import displacy

warnings.filterwarnings("ignore", category=UserWarning)

In [2]:
# Keywords are written in base (singular) form because the rule-based matcher
# compares them against token lemmas.
ALL_KEYWORDS = [
    "revenue", "earning", "customer", "user", "sale",
    "income", "download", "install", "traffic", "profit",
]

In [3]:
# Sample text used by the demo cells; the literal is split at clause boundaries.
default_text = (
    "Apple today announced financial results "
    "for its fiscal 2022 second quarter ended March 26, 2022. "
    "The company posted a March quarter revenue record of $97.3 billion, "
    "up 9 percent year over year, "
    "and quarterly earnings per diluted share of $1.52."
)

# Text actually fed to the model; rebind this name to analyse something else.
input_text = default_text
input_text

'Apple today announced financial results for its fiscal 2022 second quarter ended March 26, 2022. The company posted a March quarter revenue record of $97.3 billion, up 9 percent year over year, and quarterly earnings per diluted share of $1.52.'

In [4]:
# Candidate spaCy pipelines:
#   models[0] -> lighter, faster, less accurate
#   models[1] -> heavier, slower, more accurate
models = ["en_core_web_sm", "en_core_web_trf"]

# Pick the pipeline to use and load it once; later cells share this `nlp` object.
model = models[0]
nlp = spacy.load(model)

In [5]:
# Labels exposed by the pipeline's "ner" (named-entity recognition) component.
ALL_NER_TYPES = [*nlp.get_pipe("ner").labels]

In [6]:
def predict_with_awesome_ml_model(contents, ner_types=None, keywords=None):
    """Render entity highlights for one or more texts with displaCy.

    Each text is run through the module-level ``nlp`` pipeline and rendered
    with ``displacy.render(style="ent")``, with the ``ents`` option listing
    the labels to show.

    Args:
        contents: A single string, or an iterable of strings, to analyse.
        ner_types: Entity labels to pass to displaCy's ``ents`` option.
            ``None`` or an empty collection results in an empty label list
            (plus ``GROWTH`` when ``keywords`` is given). The argument is
            copied and never modified.
        keywords: Lemmas to tag with the custom ``GROWTH`` label through an
            ``entity_ruler`` pipeline component. ``None`` or an empty
            collection disables rule-based matching.

    Side effects:
        Any existing ``entity_ruler`` component is removed from the shared
        ``nlp`` pipeline; a fresh one is added only when ``keywords`` is
        non-empty, so one call's keywords never affect the next call.
    """
    if isinstance(contents, str):
        contents = [contents]

    growth_ent = "GROWTH"

    # Work on a copy: the GROWTH label appended below must not leak into the
    # caller's list, and this also lets callers pass tuples or other iterables.
    displacy_options = {"ents": list(ner_types) if ner_types else []}

    # Drop a ruler left behind by a previous call even when no keywords are
    # requested now; otherwise stale GROWTH matches would still be produced.
    if "entity_ruler" in nlp.pipe_names:
        nlp.remove_pipe("entity_ruler")

    if keywords:
        ruler = nlp.add_pipe("entity_ruler")
        # One single-token pattern per keyword, matched on the token lemma.
        ruler.add_patterns(
            [{"label": growth_ent, "pattern": [{"LEMMA": kw}]} for kw in keywords]
        )

        displacy_options["colors"] = {
            growth_ent: "linear-gradient(45deg, yellow, red)"
        }
        if growth_ent not in displacy_options["ents"]:
            displacy_options["ents"].append(growth_ent)

    for content in contents:
        doc = nlp(content)
        displacy.render(doc, style="ent", options=displacy_options)

In [7]:
# Rule-based matching only: highlight the growth keywords.
predict_with_awesome_ml_model(input_text, keywords=ALL_KEYWORDS)

In [8]:
# Machine-learning NER only: highlight every label of the "ner" component.
predict_with_awesome_ml_model(contents=input_text, ner_types=ALL_NER_TYPES)

In [9]:
%%time

# Rule-based keywords and the model's NER labels rendered together.
growth_keywords = ["revenue", "earning", "customer"]
quantifiable_types = [
    "ORG",
    "CARDINAL",
    "ORDINAL",
    "PERCENT",
    "QUANTITY",
    "MONEY",
]

predict_with_awesome_ml_model(
    contents=input_text,
    ner_types=quantifiable_types,
    keywords=growth_keywords,
)

CPU times: user 17.1 ms, sys: 962 µs, total: 18 ms
Wall time: 18.3 ms
