<div style="font-size:250%; font-weight:bold">Train NER with SpaCy</div>

This notebook shows how to train a new named-entity recognition (NER) model from scratch with the spaCy library on Amazon SageMaker.

In [None]:
!pip install --upgrade s3fs

In [1]:
# Render matplotlib figures inline, using the 'retina' (high-resolution) figure format.
%matplotlib inline
%config InlineBackend.figure_format = 'retina'
# Load the autoreload extension; mode 2 reloads all modules before executing a cell,
# so edits to local modules take effect without restarting the kernel.
%load_ext autoreload
%autoreload 2

import os
# NOTE(review): s3fs is not referenced directly in the visible cells; presumably it is
# imported so that s3:// paths can be handled inside split() -- confirm.
import s3fs
from gtner_blog.util import split

In [2]:
# A few standard SageMaker stanzas

import sagemaker

# Strictly speaking, spaCy does not depend on MXNet, TensorFlow, or PyTorch, so we simply
# pick MXNet as the base container, for a couple more reasons:
# - TensorFlow script mode does not support requirement_file=..., and it ignores source_dir/requirements.txt
# - PyTorch may hang when the script produces lots of stdout.
from sagemaker.mxnet import MXNet

# Execution role of this notebook, a SageMaker session, and the region name of that
# session's underlying boto session.
role: str = sagemaker.get_execution_role()
sess = sagemaker.Session()
region: str = sess.boto_session.region_name

# Prepare data channels

Split the whole corpus into training and test sets in a 3:1 ratio, then upload both splits to S3.

In [3]:
# S3 settings -- edit these two values to match your own setup.
bucket = 'gtner-blog'                # Change me as necessary
gt_jobname = 'test-gtner-blog-004'   # Change me as necessary

# Input corpus (.iob file), located from the bucket and job name above.
iob_file = f's3://{bucket}/gt/{gt_jobname}/manifests/output/output.iob'

# S3 prefixes that receive the two splits; reused later as the training channels.
train = f's3://{bucket}/spacy-data/train'
test = f's3://{bucket}/spacy-data/test'

# Join the object URIs with an explicit '/': os.path.join() uses the separator of the
# local OS, which is not guaranteed to be the '/' that S3 URIs require.
split(iob_file, f'{train}/data.iob', f'{test}/data.iob')

display(iob_file, train, test)

's3://gtner-blog/gt/test-gtner-blog-004/manifests/output/output.iob'

's3://gtner-blog/spacy-data/train'

's3://gtner-blog/spacy-data/test'

# Start training

In [4]:
# NOTE: spacy.cli.train lists further hyperparameters. Be aware that spacy-train.py does
# not allow setting these: {"lang", "pipeline", "output_path", "train_path", "dev_path"}.
# NOTE(review): train_instance_count / train_instance_type look like SageMaker Python SDK
# v1 argument names (renamed in v2) -- confirm the SDK version before upgrading.
hyperparameters = {'n_iter': 10}
estimator = MXNet(
    entry_point='spacy-train.py',
    source_dir='./spacy-scripts',
    role=role,
    train_instance_count=1,
    train_instance_type='ml.m5.large',
    framework_version='1.6.0',
    py_version='py3',
    debugger_hook_config=False,
    hyperparameters=hyperparameters,
)

In [5]:
# Map each channel name to the S3 prefix holding its split, then launch the training job.
channels = {
    'train': sagemaker.session.s3_input(train),
    'test': sagemaker.session.s3_input(test),
}
estimator.fit(channels)

2020-02-20 10:26:03 Starting - Starting the training job...
2020-02-20 10:26:05 Starting - Launching requested ML instances......
2020-02-20 10:27:04 Starting - Preparing the instances for training...
2020-02-20 10:27:49 Downloading - Downloading input data...
2020-02-20 10:28:34 Training - Training image download completed. Training in progress...[34m2020-02-20 10:28:35,390 sagemaker-containers INFO     Imported framework sagemaker_mxnet_container.training[0m
[34m2020-02-20 10:28:35,393 sagemaker-containers INFO     No GPUs detected (normal if no gpus installed)[0m
[34m2020-02-20 10:28:35,407 sagemaker_mxnet_container.training INFO     MXNet training environment: {'SM_HOSTS': '["algo-1"]', 'SM_NETWORK_INTERFACE_NAME': 'eth0', 'SM_HPS': '{"n_iter":10}', 'SM_USER_ENTRY_POINT': 'spacy-train.py', 'SM_FRAMEWORK_PARAMS': '{}', 'SM_RESOURCE_CONFIG': '{"current_host":"algo-1","hosts":["algo-1"],"network_interface_name":"eth0"}', 'SM_INPUT_DATA_CONFIG': '{"test":{"RecordWrapperType":"None