# BERT NER
[Model files available here. They are quite large](https://drive.google.com/open?id=11CPrF1rlZ-5eCv0m-UlFiAbCy3Z-yG54)

### Setting up workspace

In [4]:
import os
import pathlib

# *********************************************************
# If you actually want to train, switch to GPU runtime now.
# *********************************************************
# work_dir = INSERT DESIRED DIRECTORY
# BE SURE TO MOUNT DRIVE TO HAVE PERMANENT STORAGE
work_dir = pathlib.Path('/content/drive/My Drive/AISC-MLOps/BERT-NER')
if not os.path.exists(work_dir):
  os.mkdir(work_dir)
os.chdir(work_dir)
!ls

bert-ner-model			new_name       raw-train.txt  transformers
cached_dev_BertTokenizer_128	preprocess.py  run_ner.py     utils_ner.py
cached_train_BertTokenizer_128	__pycache__    runs
dev.txt				raw-dev.txt    test.txt
labels.txt			raw-test.txt   train.txt


In [3]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


## Downloading training sets 

In [5]:
%cd '/content/drive/My Drive/AISC-MLOps/BERT-NER'
urls = {
    'train':'https://raw.githubusercontent.com/davidsbatista/NER-datasets/master/CONLL2003/train.txt',
    'dev':'https://raw.githubusercontent.com/davidsbatista/NER-datasets/master/CONLL2003/valid.txt',
    'test':'https://raw.githubusercontent.com/davidsbatista/NER-datasets/master/CONLL2003/test.txt'
}

files = {
    'train':'raw-train.txt',
    'dev':'raw-dev.txt',
    'test':'raw-test.txt'
}

for k, v in files.items():
    url = urls[k]
    !wget $url -O $v -nc

/content
File ‘raw-train.txt’ already there; not retrieving.
File ‘raw-dev.txt’ already there; not retrieving.
File ‘raw-test.txt’ already there; not retrieving.


## Download HuggingFace utility files

In [4]:
run_ner_url = 'https://raw.githubusercontent.com/huggingface/transformers/master/examples/ner/run_ner.py'
ner_utils_url = 'https://raw.githubusercontent.com/huggingface/transformers/master/examples/ner/utils_ner.py'
preprocess_url = 'https://raw.githubusercontent.com/stefan-it/fine-tuned-berts-seq/master/scripts/preprocess.py'

!wget $run_ner_url -nc
!wget $ner_utils_url -nc
!wget $preprocess_url -nc

File ‘run_ner.py’ already there; not retrieving.

File ‘utils_ner.py’ already there; not retrieving.

File ‘preprocess.py’ already there; not retrieving.



## Must install transformers from source

In [8]:
if not os.path.exists('transformers'):
  !git clone https://github.com/huggingface/transformers
os.chdir("transformers")
# ***********************************************************
# This pip install might take a few minutes. I'm not sure why
# ***********************************************************
!pip install .
os.chdir("..")
!pip install -r ./transformers/examples/requirements.txt

Processing /content/drive/My Drive/AISC-MLOps/BERT-NER/transformers
Collecting tokenizers==0.7.0
[?25l  Downloading https://files.pythonhosted.org/packages/14/e5/a26eb4716523808bb0a799fcfdceb6ebf77a18169d9591b2f46a9adb87d9/tokenizers-0.7.0-cp36-cp36m-manylinux1_x86_64.whl (3.8MB)
[K     |████████████████████████████████| 3.8MB 4.9MB/s 
Collecting sentencepiece
[?25l  Downloading https://files.pythonhosted.org/packages/98/2c/8df20f3ac6c22ac224fff307ebc102818206c53fc454ecd37d8ac2060df5/sentencepiece-0.1.86-cp36-cp36m-manylinux1_x86_64.whl (1.0MB)
[K     |████████████████████████████████| 1.0MB 43.7MB/s 
[?25hCollecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/99/50/93509f906a40bffd7d175f97fd75ea328ad9bd91f48f59c4bd084c94a25e/sacremoses-0.0.41.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 48.1MB/s 
Building wheels for collected packages: transformers, sacremoses
  Building wheel for transformers (setup.py) ... [?25l[?25hdone
  Created

## Preprocess train/dev/test files
It might actually be unnecessary if using conll2003 files. 
And for some reason it doesn't process raw-dev.txt

In [8]:

%cd '/content/drive/My Drive/AISC-MLOps/BERT-NER'
!python3 preprocess.py raw-train.txt bert-base-cased 128 > train.txt
!python3 preprocess.py raw-dev.txt bert-base-cased 128 > dev.txt
!python3 preprocess.py raw-test.txt bert-base-cased 128 > test.txt


/content
2020-05-04 00:18:46.298856: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2020-05-04 00:19:00.866354: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
2020-05-04 00:19:09.460371: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1


## Check that they are there


In [9]:

for key, f in files.items():
  print(f"{key+'.txt'} in dir? {os.path.exists(key+'.txt')}")

train.txt in dir? True
dev.txt in dir? True
test.txt in dir? True


## Create labels file

In [0]:
labels_file = 'labels.txt'
if not os.path.exists(labels_file):
  !cat train.txt dev.txt test.txt | cut -d " " -f 4 | grep -v "^$"| sort | uniq > $labels_file

## Setup training environment variables

In [0]:
output_dir = 'bert-ner-model'
if not os.path.exists(output_dir):
  !mkdir $output_dir

os.environ['MAX_LENGTH'] = '128'
os.environ['BERT_MODEL'] = 'bert-base-cased'

os.environ['OUTPUT_DIR'] = output_dir
os.environ['BATCH_SIZE'] = '32'
os.environ['NUM_EPOCHS'] = '3'
os.environ['SAVE_STEPS'] = '750'
os.environ['SEED'] = '1'
os.environ['LABELS'] = 'labels.txt'

## Training, evaluating, and predicting

In [10]:
# train on train.txt
# eval on dev.txt
# predict on test.txt

!python3 run_ner.py --data_dir ./ \
--labels $LABELS \
--model_name_or_path $BERT_MODEL \
--output_dir $OUTPUT_DIR \
--max_seq_length  $MAX_LENGTH \
--num_train_epochs $NUM_EPOCHS \
--per_gpu_train_batch_size $BATCH_SIZE \
--save_steps $SAVE_STEPS \
--seed $SEED \
--do_train \
--do_eval \
--do_predict

2020-05-04 00:30:42.328799: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
05/04/2020 00:30:44 - INFO - transformers.training_args -   PyTorch: setting up devices
05/04/2020 00:30:44 - INFO - __main__ -   Training/evaluation parameters TrainingArguments(output_dir='bert-ner-model', overwrite_output_dir=False, do_train=True, do_eval=True, do_predict=True, evaluate_during_training=False, per_gpu_train_batch_size=32, per_gpu_eval_batch_size=8, gradient_accumulation_steps=1, learning_rate=5e-05, weight_decay=0.0, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, warmup_steps=0, logging_dir=None, logging_first_step=False, logging_steps=500, save_steps=750, save_total_limit=None, no_cuda=False, seed=1, fp16=False, fp16_opt_level='O1', local_rank=-1)
05/04/2020 00:30:45 - INFO - filelock -   Lock 139660659419960 acquired on /root/.cache/torch/transformers/b945b69218e98b3e2c95acf911789741307dec43c69

## Load model back in

In [0]:
model_dir = 'bert-ner-model'
os.chdir(work_dir / model_dir)

from transformers import BertForTokenClassification, BertTokenizer

model = BertForTokenClassification.from_pretrained('.')
tokenizer = BertTokenizer.from_pretrained('.')

In [0]:
model

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [12]:
import json

with open('config.json', 'r') as f:
  config = json.load(f)

def do_ner(model, tokenizer, sentence):
  input_ids = tokenizer.encode(sentence, 
                               add_special_tokens=True,
                               max_length=512,
                               return_tensors='pt')
  return model(input_ids), input_ids

config['id2label']

{'0': 'B-LOC',
 '1': 'B-MISC',
 '2': 'B-ORG',
 '3': 'B-PER',
 '4': 'I-LOC',
 '5': 'I-MISC',
 '6': 'I-ORG',
 '7': 'I-PER',
 '8': 'O'}

In [20]:
import torch

sentence = "donald Trump says the United States will no longer pay the WHO"

output, input_ids = do_ner(model, tokenizer, sentence)

out_tensor = output[0]

id2label = config['id2label']

def ids_2_labels(id_tensor):
  return [id2label[str(id_.item())] for id_ in id_tensor]

def input_ids_to_string(tokenizer, input_ids):
  return tokenizer.convert_ids_to_tokens(input_ids.squeeze())
  
output = list()
for input_, output_ in zip(input_ids, out_tensor):
  label_ids = torch.argmax(output_, 1)

  for i in list(zip(input_ids_to_string(tokenizer, input_ids), ids_2_labels(label_ids))):
    output.append(i)

print(output)

[('[CLS]', 'O'), ('don', 'O'), ('##ald', 'O'), ('Trump', 'B-PER'), ('says', 'O'), ('the', 'O'), ('United', 'B-LOC'), ('States', 'I-LOC'), ('will', 'O'), ('no', 'O'), ('longer', 'O'), ('pay', 'O'), ('the', 'O'), ('WHO', 'B-ORG'), ('[SEP]', 'O')]


# MLflow

## Install MLflow

In [16]:
!pip install mlflow

Collecting mlflow
[?25l  Downloading https://files.pythonhosted.org/packages/9e/a7/40679fdb5ac44ad922902b560818682038be169f88c23ad719b9d1f82090/mlflow-1.8.0-py3-none-any.whl (10.4MB)
[K     |████████████████████████████████| 10.4MB 4.9MB/s 
Collecting sqlalchemy<=1.3.13
[?25l  Downloading https://files.pythonhosted.org/packages/af/47/35edeb0f86c0b44934c05d961c893e223ef27e79e1f53b5e6f14820ff553/SQLAlchemy-1.3.13.tar.gz (6.0MB)
[K     |████████████████████████████████| 6.0MB 51.5MB/s 
Collecting gitpython>=2.1.0
[?25l  Downloading https://files.pythonhosted.org/packages/19/1a/0df85d2bddbca33665d2148173d3281b290ac054b5f50163ea735740ac7b/GitPython-3.1.1-py3-none-any.whl (450kB)
[K     |████████████████████████████████| 460kB 48.2MB/s 
[?25hCollecting gunicorn; platform_system != "Windows"
[?25l  Downloading https://files.pythonhosted.org/packages/69/ca/926f7cd3a2014b16870086b2d0fdc84a9e49473c68a8dff8b57f7c156f43/gunicorn-20.0.4-py2.py3-none-any.whl (77kB)
[K     |█████████████████

## Save BertWrapper model, load and test

In [0]:
import os
base_dir = '/content/drive/My Drive/AISC-MLOps/BERT-NER/'
nlp_model_path = model_dir
path =  os.path.join(base_dir)
os.chdir(path)

In [40]:
import mlflow
import pip


# Create an `artifacts` dictionary that assigns a unique name to the saved XGBoost model file.
# This dictionary will be passed to `mlflow.pyfunc.save_model`, which will copy the model file
# into the new MLflow Model's directory.
artifacts = {
    "nlp_model":  nlp_model_path
}

# Define the model class
import mlflow.pyfunc
class BertWrapper(mlflow.pyfunc.PythonModel):

    def do_ner(model, tokenizer, sentence):
      input_ids = tokenizer.encode(sentence, add_special_tokens=True, max_length=512, return_tensors='pt')
      return model(input_ids), input_ids
    
    def load_context(self, context):
        import os
        import json
        from transformers import BertForTokenClassification, BertTokenizer
        model_dir = context.artifacts["nlp_model"]
        config_file = os.path.join(model_dir, 'config.json')
        with open(config_file, 'r') as f:
          self.config = json.load(f)
        self.model = BertForTokenClassification.from_pretrained(model_dir)
        self.tokenizer = BertTokenizer.from_pretrained(model_dir)

    def predict(self, context, model_input):
      import json

      def get_entities(text):
        output, input_ids = do_ner(self.model, self.tokenizer, text)
        out_tensor = output[0]
        id2label = self.config['id2label']
        def ids_2_labels(id_tensor):
          return [id2label[str(id_.item())] for id_ in id_tensor]
        def input_ids_to_string(tokenizer, input_ids):
          return tokenizer.convert_ids_to_tokens(input_ids.squeeze())
        output = list()
        for input_, output_ in zip(input_ids, out_tensor):
          label_ids = torch.argmax(output_, 1)
          for i in list(zip(input_ids_to_string(tokenizer, input_ids), ids_2_labels(label_ids))):
            output.append(i)
        return output

      try:
        ents = model_input.text.apply(get_entities)

        return ents.apply(lambda s: json.dumps(s))
      except TypeError:
        return "DataFrame must contain strings"

# Create a Conda environment for the new MLflow Model that contains the XGBoost library
# as a dependency, as well as the required CloudPickle library
import cloudpickle
# Let's create our own conda environment
conda_env = {
    'channels': ['defaults', 'pytorch'],
    'dependencies': [
      f'python=3.6.9',
      {
          'pip':[
            f'pip=={pip.__version__}',
            f'mlflow=={mlflow.__version__}',
            f'cloudpickle=={cloudpickle.__version__}',
            f'torch=={torch.__version__}',
            f'transformers',
          ]
      }
    ],
    'name': 'mlflow-env-bert'
}

# Save the MLflow Model
mlflow_pyfunc_model_path = "bert_mlflow_pyfunc"
# remove pre-existing folder
!rm -rf $mlflow_pyfunc_model_path

mlflow.pyfunc.save_model(
        path=mlflow_pyfunc_model_path, python_model=BertWrapper(), artifacts=artifacts,
        conda_env=conda_env)

# Load the model in `python_function` format
loaded_model = mlflow.pyfunc.load_model(mlflow_pyfunc_model_path)

# Evaluate the model
import pandas as pd
test_predictions = loaded_model.predict(pd.DataFrame(data={'text':['What a beautiful day', 'That is the will of Parliament and the nation. The British Empire and the French Republic, linked together in their cause and in their need']}))
print(test_predictions)


NameError: ignored

NameError: ignored