## Testing spacy

In [3]:
import spacy

# Load the small English spaCy pipeline with the "tagger" and "parser"
# components disabled; the cells below only read `.ents` from its output.
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser"])

# Sample passage reused as the entity-recognition input by the later cells.
text = 'That is the will of Parliament and the nation. The British Empire and the French Republic, linked together in their cause and in their need, will defend to the death their native soil, aiding each other like good comrades to the utmost of their strength. Even though large tracts of Europe and many old and famous States have fallen or may fall into the grip of the Gestapo and all the odious apparatus of Nazi rule, we shall not flag or fail. We shall go on to the end, we shall fight in France, we shall fight on the seas and oceans, we shall fight with growing confidence and growing strength in the air, we shall defend our Island, whatever the cost may be, we shall fight on the beaches, we shall fight on the landing grounds, we shall fight in the fields and in the streets, we shall fight in the hills; we shall never surrender, and even if, which I do not for a moment believe, this Island or a large part of it were subjugated and starving, then our Empire beyond the seas, armed and guarded by the British Fleet, would carry on the struggle, until, in God’s good time, the New World, with all its power and might, steps forth to the rescue and the liberation of the old.'

# Run the pipeline over the sample passage and list every recognised entity
# as: surface text, start offset, end offset, label.
t = nlp(text)
for ent in t.ents:
    span_info = (ent.text, ent.start_char, ent.end_char, ent.label_)
    print(*span_info)

Parliament 20 30 ORG
The British Empire 47 65 GPE
the French Republic 70 89 GPE
Europe 283 289 LOC
States 314 320 GPE
Gestapo 366 373 PRODUCT
Nazi 406 410 NORP
France 489 495 GPE
Island 628 634 GPE
Empire 959 965 GPE
the British Fleet 1004 1021 ORG
the New World 1079 1092 ORG


## Saving model and doing test

In [7]:
import os

# Directory that receives the serialized pipeline.
nlp_model_path = 'spacy-nlp-model'
nlp.to_disk(nlp_model_path)

# Baseline: entities produced by the in-memory pipeline, for comparison with
# the pipeline reloaded from disk in the next section.
in_memory_doc = nlp(text)
for e in in_memory_doc.ents:
    print(e.text, e.label_)

Parliament ORG
The British Empire GPE
the French Republic GPE
Europe LOC
States GPE
Gestapo PRODUCT
Nazi NORP
France GPE
Island GPE
Empire GPE
the British Fleet ORG
the New World ORG


## Load model and get same results

In [8]:
# Rebuild the pipeline from the directory written by `nlp.to_disk` and print
# its entities for the same passage; the list should match the baseline above.
mod = spacy.load(nlp_model_path)
reloaded_doc = mod(text)
for e in reloaded_doc.ents:
    print(e.text, e.label_)

Parliament ORG
The British Empire GPE
the French Republic GPE
Europe LOC
States GPE
Gestapo PRODUCT
Nazi NORP
France GPE
Island GPE
Empire GPE
the British Fleet ORG
the New World ORG


## Install mlflow

In [0]:
!pip install mlflow 

Collecting mlflow
[?25l  Downloading https://files.pythonhosted.org/packages/9e/a7/40679fdb5ac44ad922902b560818682038be169f88c23ad719b9d1f82090/mlflow-1.8.0-py3-none-any.whl (10.4MB)
[K     |████████████████████████████████| 10.4MB 2.9MB/s 
[?25hCollecting databricks-cli>=0.8.7
[?25l  Downloading https://files.pythonhosted.org/packages/49/d1/fe0ba3d5c2b4b76ec035aa243bbc2fd0d60607a391f192ebe1656e17a4e2/databricks-cli-0.10.0.tar.gz (45kB)
[K     |████████████████████████████████| 51kB 7.9MB/s 
Collecting docker>=4.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/58/74/379a9d30b1620def158c40b88c43e01c1936a287ebb97afab0699c601c57/docker-4.2.0-py2.py3-none-any.whl (143kB)
[K     |████████████████████████████████| 153kB 48.4MB/s 
[?25hCollecting sqlalchemy<=1.3.13
[?25l  Downloading https://files.pythonhosted.org/packages/af/47/35edeb0f86c0b44934c05d961c893e223ef27e79e1f53b5e6f14820ff553/SQLAlchemy-1.3.13.tar.gz (6.0MB)
[K     |████████████████████████████████| 6.0MB 

## Save SpacyWrapper model, load and test

In [0]:
import mlflow
import pip

# Map a logical artifact name to the spaCy pipeline directory saved earlier.
# The mapping is passed to `mlflow.pyfunc.save_model`, which copies the directory
# into the new MLflow Model's folder; `SpacyWrapper.load_context` looks the
# pipeline up again under the same "nlp_model" key.
artifacts = {"nlp_model": nlp_model_path}

# Define the model class
import mlflow.pyfunc
class SpacyWrapper(mlflow.pyfunc.PythonModel):
    """MLflow pyfunc model that runs a saved spaCy pipeline and reports its entities."""

    def load_context(self, context):
        """Load the spaCy pipeline registered under the ``nlp_model`` artifact key.

        Args:
            context: MLflow model context; ``context.artifacts["nlp_model"]`` is
                the location handed to ``spacy.load``.
        """
        # Imported inside the method, so spaCy is resolved when the model is loaded.
        import spacy

        model_location = context.artifacts["nlp_model"]
        self.nlp = spacy.load(model_location)

    def _entities_for(self, text):
        """Return ``(text, start_char, end_char, label_)`` for each entity found in ``text``."""
        found = []
        for ent in self.nlp(text).ents:
            found.append((ent.text, ent.start_char, ent.end_char, ent.label_))
        return found

    def predict(self, context, model_input):
        """Extract entities from every value of ``model_input.text``.

        Args:
            context: MLflow model context (unused here).
            model_input: object exposing a ``text`` attribute that supports
                ``.apply`` (the notebook passes a DataFrame with a ``text`` column).

        Returns:
            The result of applying entity extraction and then ``json.dumps`` to
            each value, i.e. one JSON-encoded list of
            ``[text, start_char, end_char, label]`` entries per input value.

        NOTE(review): if a ``TypeError`` is raised during processing, the plain
        string "DataFrame must contain strings" is returned instead of the
        per-row result, so callers have to check the result type. Kept as-is to
        preserve the existing contract; consider raising instead.
        """
        import json

        try:
            per_row_entities = model_input.text.apply(self._entities_for)
            return per_row_entities.apply(json.dumps)
        except TypeError:
            return "DataFrame must contain strings"

# Build the Conda environment stored with the MLflow Model. Every pin below is
# read from the running kernel (pip, MLflow, spaCy, cloudpickle and the Python
# interpreter itself), so the recorded environment matches the one that
# actually saved the model.
import sys

import cloudpickle

# major.minor.micro of the interpreter executing this notebook. Derived rather
# than hard-coded so the Python pin cannot drift away from the package pins,
# which are all taken from this same kernel.
python_version = '{}.{}.{}'.format(*sys.version_info[:3])

conda_env = {
    'channels': ['defaults', 'pytorch'],
    'dependencies': [
        f'python={python_version}',
        {
            'pip': [
                f'pip=={pip.__version__}',
                f'mlflow=={mlflow.__version__}',
                f'spacy=={spacy.__version__}',
                f'cloudpickle=={cloudpickle.__version__}'
            ]
        }
    ],
    'name': 'mlflow-env-spacy'
}

import shutil

# Destination directory for the saved MLflow Model.
mlflow_pyfunc_model_path = "spacy_mlflow_pyfunc"

# Remove any folder left by a previous run so this cell can be re-run.
# Done with shutil instead of a `!rm -rf` shell escape so the cleanup is plain
# Python and does not depend on a POSIX shell; ignore_errors=True keeps the
# "nothing to delete" case silent, as `rm -rf` did.
shutil.rmtree(mlflow_pyfunc_model_path, ignore_errors=True)

# Persist the wrapper together with the spaCy pipeline artifact and the
# environment specification.
mlflow.pyfunc.save_model(
    path=mlflow_pyfunc_model_path,
    python_model=SpacyWrapper(),
    artifacts=artifacts,
    conda_env=conda_env,
)

import pandas as pd

# Read the saved model back through the generic `python_function` flavor.
loaded_model = mlflow.pyfunc.load_model(mlflow_pyfunc_model_path)

# Smoke test: score a small frame whose `text` column holds the inputs.
sample_texts = [
    'What a beautiful day',
    'That is the will of Parliament and the nation. The British Empire and the French Republic, linked together in their cause and in their need',
]
sample_frame = pd.DataFrame(data={'text': sample_texts})
test_predictions = loaded_model.predict(sample_frame)
print(test_predictions)


0                 [["a beautiful day", 5, 20, "DATE"]]
1    [["Parliament", 20, 30, "ORG"], ["The British ...
Name: text, dtype: object
