# BERT NER

### Setting up workspace

In [47]:
import os
import pathlib

# *********************************************************
# If you actually want to train, switch to GPU runtime now.
# *********************************************************

# work_dir = INSERT DESIRED DIRECTORY
# BE SURE TO MOUNT DRIVE TO HAVE PERMANENT STORAGE
work_dir = pathlib.Path('/content/drive/My Drive/AISC-MLOps/BERT-NER')
if not os.path.exists(work_dir):
  os.mkdir(work_dir)
os.chdir(work_dir)
!ls

bert-ner-model			labels.txt     raw-test.txt   train.txt
cached_dev_BertTokenizer_128	new_name       raw-train.txt  transformers
cached_test_BertTokenizer_128	preprocess.py  run_ner.py     utils_ner.py
cached_train_BertTokenizer_128	__pycache__    runs
dev.txt				raw-dev.txt    test.txt


## Downloading the train/dev/test sets

In [0]:
urls = {
    'train':'https://raw.githubusercontent.com/davidsbatista/NER-datasets/master/CONLL2003/train.txt',
    'dev':'https://raw.githubusercontent.com/davidsbatista/NER-datasets/master/CONLL2003/valid.txt',
    'test':'https://raw.githubusercontent.com/davidsbatista/NER-datasets/master/CONLL2003/test.txt'
}

files = {
    'train':'raw-train.txt',
    'dev':'raw-dev.txt',
    'test':'raw-test.txt'
}

for k, v in files.items():
    url = urls[k]
    !wget $url -O $v -nc

File ‘raw-train.txt’ already there; not retrieving.
--2020-04-30 03:21:03--  https://raw.githubusercontent.com/davidsbatista/NER-datasets/master/CONLL2003/valid.txt
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 827441 (808K) [text/plain]
Saving to: ‘raw-dev.txt’


2020-04-30 03:21:03 (23.3 MB/s) - ‘raw-dev.txt’ saved [827441/827441]

File ‘raw-test.txt’ already there; not retrieving.


## Download HuggingFace utility files

In [0]:
run_ner_url = 'https://raw.githubusercontent.com/huggingface/transformers/master/examples/ner/run_ner.py'
ner_utils_url = 'https://raw.githubusercontent.com/huggingface/transformers/master/examples/ner/utils_ner.py'
preprocess_url = 'https://raw.githubusercontent.com/stefan-it/fine-tuned-berts-seq/master/scripts/preprocess.py'

!wget $run_ner_url -nc
!wget $ner_utils_url -nc
!wget $preprocess_url -nc

File ‘run_ner.py’ already there; not retrieving.

File ‘utils_ner.py’ already there; not retrieving.

File ‘preprocess.py’ already there; not retrieving.



## Install transformers from source (required)

In [48]:
if not os.path.exists('transformers'):
  !git clone https://github.com/huggingface/transformers
os.chdir("transformers")
# ***********************************************************
# This pip install might take a few minutes. I'm not sure why
# ***********************************************************
!pip install .
os.chdir("..")
!pip install -r ./transformers/examples/requirements.txt

Processing /content/drive/My Drive/AISC-MLOps/BERT-NER/transformers
Building wheels for collected packages: transformers
  Building wheel for transformers (setup.py) ... [?25l[?25hdone
  Created wheel for transformers: filename=transformers-2.8.0-cp36-none-any.whl size=595210 sha256=3c42939a1aa64036316c997eac5436c0e6be21bd820e5e723565e2422c7c7874
  Stored in directory: /tmp/pip-ephem-wheel-cache-s_kvdj4x/wheels/96/f8/18/359404934db1fff4a184a9e45de5af138347609f49d4f7bee1
Successfully built transformers
Installing collected packages: transformers
  Found existing installation: transformers 2.8.0
    Uninstalling transformers-2.8.0:
      Successfully uninstalled transformers-2.8.0
Successfully installed transformers-2.8.0




## Preprocess train/dev/test files
This step may be unnecessary when using the CoNLL-2003 files.
Known issue: `raw-dev.txt` has been observed not to be processed by this step; the cause has not been identified.

In [0]:
for key, raw_file in files.items():
  new_name = key+'.txt'
  if os.path.exists(new_name):
    continue
  print(new_name, raw_file)
  !python preprocess.py $raw_file bert-base-cased 128 > new_name

## Check that the preprocessed files exist


In [0]:
# Report, for each split, whether its `<split>.txt` file is in the working directory.
for split in files:
  processed_name = split + '.txt'
  is_present = os.path.exists(processed_name)
  print(f"{processed_name} in dir? {is_present}")

train.txt in dir? True
dev.txt in dir? True
test.txt in dir? True


## Create labels file

In [0]:
labels_file = 'labels.txt'
if not os.path.exists(labels_file):
  !cat train.txt dev.txt test.txt | cut -d " " -f 4 | grep -v "^$"| sort | uniq > $labels_file

## Setup training environment variables

In [0]:
output_dir = 'bert-ner-model'
# exist_ok=True makes directory creation safe to re-run and avoids the
# check-then-create race of testing for existence first.
os.makedirs(output_dir, exist_ok=True)

# Settings are exported as environment variables so that the shell command in
# the training cell can expand them ($MAX_LENGTH, $BERT_MODEL, ...).
os.environ.update({
  'MAX_LENGTH': '128',
  'BERT_MODEL': 'bert-base-cased',
  'OUTPUT_DIR': output_dir,
  'BATCH_SIZE': '32',
  'NUM_EPOCHS': '3',
  'SAVE_STEPS': '750',
  'SEED': '1',
  'LABELS': 'labels.txt',
})

## Training, evaluating, and predicting

In [0]:
# Single run_ner.py invocation covering all three phases. The shell expands
# $LABELS, $BERT_MODEL, $OUTPUT_DIR, $MAX_LENGTH, $NUM_EPOCHS, $BATCH_SIZE,
# $SAVE_STEPS and $SEED from the environment variables set in the previous cell.
# Data files are looked up in the current directory (--data_dir ./).
# Per the original author's notes (presumably matching the flags below):
#   --do_train   : train on train.txt
#   --do_eval    : eval on dev.txt
#   --do_predict : predict on test.txt


!python3 run_ner.py --data_dir ./ \
--labels $LABELS \
--model_name_or_path $BERT_MODEL \
--output_dir $OUTPUT_DIR \
--max_seq_length  $MAX_LENGTH \
--num_train_epochs $NUM_EPOCHS \
--per_gpu_train_batch_size $BATCH_SIZE \
--save_steps $SAVE_STEPS \
--seed $SEED \
--do_train \
--do_eval \
--do_predict

2020-04-30 03:31:35.577918: I tensorflow/stream_executor/platform/default/dso_loader.cc:44] Successfully opened dynamic library libcudart.so.10.1
04/30/2020 03:31:37 - INFO - transformers.training_args -   PyTorch: setting up devices
04/30/2020 03:31:37 - INFO - __main__ -   Training/evaluation parameters TrainingArguments(output_dir='bert-ner-model', overwrite_output_dir=False, do_train=True, do_eval=True, do_predict=True, evaluate_during_training=False, per_gpu_train_batch_size=32, per_gpu_eval_batch_size=8, gradient_accumulation_steps=1, learning_rate=5e-05, weight_decay=0.0, adam_epsilon=1e-08, max_grad_norm=1.0, num_train_epochs=3.0, max_steps=-1, warmup_steps=0, logging_dir=None, logging_first_step=False, logging_steps=500, save_steps=750, save_total_limit=None, no_cuda=False, seed=1, fp16=False, fp16_opt_level='O1', local_rank=-1)
04/30/2020 03:31:38 - INFO - filelock -   Lock 139921799461912 acquired on /root/.cache/torch/transformers/b945b69218e98b3e2c95acf911789741307dec43c69

## Load model back in

In [0]:
model_dir = 'bert-ner-model'
# Step into the saved-model directory: the '.' paths below, and later
# relative file reads such as config.json, all resolve against it.
checkpoint_path = work_dir / model_dir
os.chdir(checkpoint_path)

from transformers import BertForTokenClassification, BertTokenizer

# Restore the fine-tuned weights and the matching tokenizer from the current directory.
model = BertForTokenClassification.from_pretrained('.')
tokenizer = BertTokenizer.from_pretrained('.')

In [50]:
# Bare expression: the notebook renders the loaded model's module tree as the cell output.
model

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwis

In [59]:
import json

# Parse the config.json found in the current working directory into a dict.
with open('config.json', 'r') as config_file:
  config = json.loads(config_file.read())

def do_ner(model, tokenizer, sentence):
  """Encode `sentence` and run `model` on it for inference.

  Args:
    model: callable invoked as ``model(input_ids)``.
    tokenizer: object whose ``encode`` turns text into a tensor of token ids.
    sentence: the text to tag.

  Returns:
    A ``(model_output, input_ids)`` tuple: whatever ``model`` returned, plus
    the encoded ids so callers can map predictions back to tokens.
  """
  # Imported locally because this cell runs before the notebook's top-level
  # `import torch`.
  import torch

  input_ids = tokenizer.encode(sentence,
                               add_special_tokens=True,
                               max_length=512,
                               return_tensors='pt')
  # Inference only: disabling autograd avoids building a gradient graph that
  # is never used.
  with torch.no_grad():
    model_output = model(input_ids)
  return model_output, input_ids

# Display the `id2label` entry of the loaded config as the cell output.
config['id2label']

{'_num_labels': 9,
 'architectures': ['BertForTokenClassification'],
 'attention_probs_dropout_prob': 0.1,
 'hidden_act': 'gelu',
 'hidden_dropout_prob': 0.1,
 'hidden_size': 768,
 'id2label': {'0': 'B-LOC',
  '1': 'B-MISC',
  '2': 'B-ORG',
  '3': 'B-PER',
  '4': 'I-LOC',
  '5': 'I-MISC',
  '6': 'I-ORG',
  '7': 'I-PER',
  '8': 'O'},
 'initializer_range': 0.02,
 'intermediate_size': 3072,
 'label2id': {'B-LOC': 0,
  'B-MISC': 1,
  'B-ORG': 2,
  'B-PER': 3,
  'I-LOC': 4,
  'I-MISC': 5,
  'I-ORG': 6,
  'I-PER': 7,
  'O': 8},
 'layer_norm_eps': 1e-12,
 'max_position_embeddings': 512,
 'model_type': 'bert',
 'num_attention_heads': 12,
 'num_hidden_layers': 12,
 'pad_token_id': 0,
 'type_vocab_size': 2,
 'vocab_size': 28996}

In [70]:
import torch

# Example sentence to tag.
sentence = "Donald Trump says the United States will no longer pay the WHO"

# do_ner returns the raw model output together with the encoded token ids.
output, input_ids = do_ner(model, tokenizer, sentence)

# First element of the model output; presumably the per-token label scores
# (it is argmax-ed into label ids further down) - TODO confirm.
out_tensor = output[0]

# Label lookup taken from the saved config; its keys are strings ('0', '1', ...).
id2label = config['id2label']

def ids_2_labels(id_tensor):
  """Translate each id in `id_tensor` to its label via the global `id2label`.

  `id2label` is keyed by strings, so every id is converted with
  ``str(id_.item())`` before the lookup.
  """
  labels = []
  for id_ in id_tensor:
    lookup_key = str(id_.item())
    labels.append(id2label[lookup_key])
  return labels

def input_ids_to_string(tokenizer, input_ids):
  """Return the tokens for `input_ids` after squeezing out size-1 dimensions."""
  squeezed_ids = input_ids.squeeze()
  tokens = tokenizer.convert_ids_to_tokens(squeezed_ids)
  return tokens
  

# Walk the batch one item at a time, pairing each item's token ids with the
# model output for that same item.
for input_, output_ in zip(input_ids, out_tensor):
  # Pick the highest-scoring entry along dimension 1 for every position.
  # (The original referenced an undefined name `sample` here.)
  label_ids = torch.argmax(output_, 1)

  # Use this item's own ids (`input_`), not the whole `input_ids` batch.
  tokens = input_ids_to_string(tokenizer, input_)
  for token_and_label in zip(tokens, ids_2_labels(label_ids)):
    print(token_and_label)

('[CLS]', 'O')
('Donald', 'B-PER')
('Trump', 'I-PER')
('says', 'O')
('the', 'O')
('United', 'B-LOC')
('States', 'I-LOC')
('will', 'O')
('no', 'O')
('longer', 'O')
('pay', 'O')
('the', 'O')
('WHO', 'B-ORG')
('[SEP]', 'O')
