In [1]:
import mlrun



In [2]:
!python3 -m spacy download en_core_web_sm

/conda/bin/python3: No module named spacy


In [2]:
# Build commands for the serving function: replace the stock mlrun with the
# feature-store branch, install storey from its development branch, and
# install spaCy together with the small English model loaded by ApplyNLP.
# NOTE(review): `-c` presumably records the command in the function's build
# config without running it locally — confirm against the nuclio-jupyter docs.
%nuclio cmd -c python -m pip uninstall mlrun -y
%nuclio cmd -c python -m pip install git+https://github.com/yaronha/mlrun.git@feature-store --upgrade
%nuclio cmd -c python -m pip install git+https://github.com/mlrun/storey@development
%nuclio cmd -c python -m pip install spacy
%nuclio cmd -c python -m spacy download en_core_web_sm

In [106]:
from storey import MapClass, Event
from storey.dtypes import _termination_obj
from mlrun import get_object
from typing import Union, List
import json
import spacy

class BaseClass:
    """Common base for the graph step classes; it only stores its constructor arguments."""

    def __init__(self, context, state=None, name=None):
        """Keep ``context``, ``state`` and ``name`` as same-named instance attributes."""
        self.context, self.state, self.name = context, state, name
        
class ApplyNLP(BaseClass):
    """Graph step that parses a batch of paragraph records and runs spaCy on each one."""

    def __init__(self, spacy_dict="en_core_web_sm", context=None, state=None, name=None):
        """Initialise the base attributes and load the spaCy pipeline.

        Args:
            spacy_dict: Model name handed to ``spacy.load``.
            context: Stored on the instance by ``BaseClass``.
            state: Stored on the instance by ``BaseClass``.
            name: Stored on the instance by ``BaseClass``.
        """
        # Without this call the instance has no context/state/name attributes.
        super().__init__(context, state=state, name=name)
        self.nlp = spacy.load(spacy_dict)

    def do(self, paragraphs: Union[bytes, str, List[dict]]):
        """Run the spaCy pipeline over every paragraph in the event body.

        Args:
            paragraphs: A JSON array of records, given as raw ``bytes``, as a
                ``str``, or as an already-parsed list of dicts. Each record
                must have ``url``, ``paragraph_id`` and ``paragraph`` keys.

        Returns:
            ``_termination_obj`` unchanged when that is the input; otherwise a
            list of dicts with ``url``, ``paragraph_id`` and ``tokens`` (the
            value returned by ``self.nlp`` for the paragraph text).
        """
        if paragraphs == _termination_obj:
            return _termination_obj

        # Only decode/parse when needed, so callers may pass the raw request
        # body or data that has already been deserialised.
        if isinstance(paragraphs, (bytes, bytearray)):
            paragraphs = paragraphs.decode()
        if isinstance(paragraphs, str):
            paragraphs = json.loads(paragraphs)

        return [
            {
                "url": paragraph["url"],
                "paragraph_id": paragraph["paragraph_id"],
                "tokens": self.nlp(paragraph["paragraph"]),
            }
            for paragraph in paragraphs
        ]

class ExtractEntities(BaseClass):
    """Graph step that flattens tokenized paragraphs into one record per entity."""

    def __init__(self, context=None, state=None, name=None):
        """Initialise the ``context``, ``state`` and ``name`` attributes via ``BaseClass``."""
        # Previously a bare `pass`, which left the base attributes undefined.
        super().__init__(context, state=state, name=name)

    def do(self, tokens):
        """Emit one record for every entity found in every paragraph.

        Args:
            tokens: Iterable of dicts with ``url``, ``paragraph_id`` and
                ``tokens`` keys, where ``tokens`` exposes an ``ents`` attribute.

        Returns:
            ``_termination_obj`` unchanged when that is the input; otherwise a
            flat list of dicts with ``url``, ``paragraph_id`` and ``entity``.
        """
        if tokens == _termination_obj:
            return _termination_obj

        return [
            {
                "url": token["url"],
                "paragraph_id": token["paragraph_id"],
                # Kept as `entity.ents` (not `entity`): EnrichEntities reads
                # this value through `[0]`, so the shape must stay indexable.
                "entity": entity.ents,
            }
            for token in tokens
            for entity in token["tokens"].ents
        ]


class EnrichEntities(BaseClass):
    """Graph step that turns entity records into plain, serialisable dicts."""

    def __init__(self, context=None, state=None, name=None):
        """Initialise the ``context``, ``state`` and ``name`` attributes via ``BaseClass``."""
        # Previously a bare `pass`, which left the base attributes undefined.
        super().__init__(context, state=state, name=name)

    def do(self, entities):
        """Describe each entity by its text, character offsets and label.

        Args:
            entities: Iterable of dicts with ``url``, ``paragraph_id`` and
                ``entity`` keys; ``entity`` must be indexable, and its first
                element must expose ``text``, ``start_char``, ``end_char`` and
                ``label_``.

        Returns:
            ``_termination_obj`` unchanged when that is the input; otherwise a
            list of dicts with ``url``, ``paragraph_id``, ``entity_text``,
            ``entity_start_char``, ``entity_end_char`` and ``entity_label``.
        """
        if entities == _termination_obj:
            return _termination_obj

        enriched_entities = []
        for entity in entities:
            # Look the first element up once instead of once per field.
            span = entity["entity"][0]
            enriched_entities.append(
                {
                    "url": entity["url"],
                    "paragraph_id": entity["paragraph_id"],
                    "entity_text": span.text,
                    "entity_start_char": span.start_char,
                    "entity_end_char": span.end_char,
                    "entity_label": span.label_,
                }
            )
        return enriched_entities

In [107]:
# nuclio: end-code

In [108]:
# Create a "serving"-kind function object named "myfunc".
fn = mlrun.code_to_function("myfunc", kind="serving")

# Declare a synchronous "flow" topology that starts at the first step and
# reports the output of the last one.
graph = fn.set_topology(
    "flow",
    start_at="process_paragraph",
    engine="sync",
    exist_ok=True,
    result_state="enrich_entities",
)
fn.verbose = True

# Register the three steps in order; each one follows the previously added step.
for step_name, step_class in (
    ("process_paragraph", "ApplyNLP"),
    ("extract_entities", "ExtractEntities"),
    ("enrich_entities", "EnrichEntities"),
):
    graph.add_step(name=step_name, class_name=step_class, after="$prev")

# Mock server used by the following cell to exercise the graph.
server = fn.to_mock_server()

In [109]:
# Sample request body: a JSON array (as bytes) holding two paragraph records,
# each with the `url`, `paragraph_id` and `paragraph` keys that ApplyNLP.do reads.
test_event = b'[{\"url\": \"s3://igz-downloads/1.json\", \"paragraph_id\": 0, \"paragraph\": \"Born and raised in Queens, New York City, Trump attended Fordham University for two years and received a bachelors degree in economics from the Wharton School of the University of Pennsylvania. He became president of his father Fred Trumps real estate business in 1971, renamed it The Trump Organization, and expanded its operations to building or renovating skyscrapers, hotels, casinos, and golf courses. Trump later started various side ventures, mostly by licensing his name. Trump and his businesses have been involved in more than 4,000 state and federal legal actions, including six bankruptcies. He owned the Miss Universe brand of beauty pageants from 1996 to 2015, and produced and hosted the reality television series The Apprentice from 2004 to 2015.\"}, {\"url\": \"s3://igz-downloads/1.json\", \"paragraph_id\": 1, \"paragraph\": \"Trumps political positions have been described as populist, protectionist, isolationist, and nationalist. He entered the 2016 presidential race as a Republican and was elected in a surprise electoral college victory over Democratic nominee Hillary Clinton while losing the popular vote.[a] He became the oldest first-term U.S. president[b] and the first without prior military or government service. His election and policies have sparked numerous protests. Trump has made many false or misleading statements during his campaign and presidency. The statements have been documented by fact-checkers, and the media have widely described the phenomenon as unprecedented in American politics. Many of his comments and actions have been characterized as racially charged or racist.\"}]'
# Send the event to the mock server's root path; the call's result is the cell output.
server.test('/', body=test_event)

> 2020-12-13 11:37:31,478 [info] state process_paragraph got event b'[{"url": "s3://igz-downloads/1.json", "paragraph_id": 0, "paragraph": "Born and raised in Queens, New York City, Trump attended Fordham University for two years and received a bachelors degree in economics from the Wharton School of the University of Pennsylvania. He became president of his father Fred Trumps real estate business in 1971, renamed it The Trump Organization, and expanded its operations to building or renovating skyscrapers, hotels, casinos, and golf courses. Trump later started various side ventures, mostly by licensing his name. Trump and his businesses have been involved in more than 4,000 state and federal legal actions, including six bankruptcies. He owned the Miss Universe brand of beauty pageants from 1996 to 2015, and produced and hosted the reality television series The Apprentice from 2004 to 2015."}, {"url": "s3://igz-downloads/1.json", "paragraph_id": 1, "paragraph": "Trumps political positio

[{'url': 's3://igz-downloads/1.json',
  'paragraph_id': 0,
  'entity_text': 'Queens',
  'entity_start_char': 19,
  'entity_end_char': 25,
  'entity_label': 'GPE'},
 {'url': 's3://igz-downloads/1.json',
  'paragraph_id': 0,
  'entity_text': 'New York City',
  'entity_start_char': 27,
  'entity_end_char': 40,
  'entity_label': 'GPE'},
 {'url': 's3://igz-downloads/1.json',
  'paragraph_id': 0,
  'entity_text': 'Trump',
  'entity_start_char': 42,
  'entity_end_char': 47,
  'entity_label': 'ORG'},
 {'url': 's3://igz-downloads/1.json',
  'paragraph_id': 0,
  'entity_text': 'Fordham University',
  'entity_start_char': 57,
  'entity_end_char': 75,
  'entity_label': 'ORG'},
 {'url': 's3://igz-downloads/1.json',
  'paragraph_id': 0,
  'entity_text': 'two years',
  'entity_start_char': 80,
  'entity_end_char': 89,
  'entity_label': 'DATE'},
 {'url': 's3://igz-downloads/1.json',
  'paragraph_id': 0,
  'entity_text': 'the Wharton School of the University of Pennsylvania',
  'entity_start_char': 140