In [1]:
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds

from official.nlp import optimization as nlp_opt
from official.nlp.bert import tokenization as bert_token

from berts.berts import BertClassificationModel
from berts.utils import get_bert_inputs

 The versions of TensorFlow you are currently using is 2.3.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons


In [2]:
# Load the pre-processed (already tokenized) NER corpus; the original data is from Kaggle.
# na_filter=False turns off pandas' missing-value detection, so every cell is kept as read.
df = pd.read_csv(
    'data/ner/preprocess/ner_case_normal_tokenized.csv',
    na_filter=False,
)
df.head()

Unnamed: 0,sid,token,pos,tag
0,1,Thousands,NNS,O
1,1,of,IN,O
2,1,demons,NNS,O
3,1,##tra,NNS,O
4,1,##tors,NNS,O


In [3]:
# One output class per distinct tag, plus one extra class used for padding and <CLS>/<SEP>.
classes = 1 + df['tag'].nunique()

In [4]:
# The cased checkpoint is used because cased works better than uncased for NER.
bert_url = "https://tfhub.dev/tensorflow/bert_en_cased_L-12_H-768_A-12/2"
# NOTE(review): return_sequences=True presumably yields one prediction per token
# rather than one per sentence -- confirm against berts.berts.
model, bert_layer = BertClassificationModel(
    bert_url,
    classes,
    return_sequences=True,
)
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_words_seq (InputLayer)    [(None, None)]       0                                            
__________________________________________________________________________________________________
input_attention_mask (InputLaye [(None, None)]       0                                            
__________________________________________________________________________________________________
input_segment_mask (InputLayer) [(None, None)]       0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 108310273   input_words_seq[0][0]            
                                                                 input_attention_mask[0

In [5]:
# Gather the tokens and tags of every sentence (rows sharing the same `sid`) into lists,
# one row per sentence, ordered by `sid`.
# One grouped pass per column replaces a separate `get_group` lookup for every
# sentence and column, which is needlessly slow on tens of thousands of groups.
sentence_group = df.groupby('sid')
sentences = pd.DataFrame(data={
    'sentence': sentence_group['token'].apply(list).tolist(),
    'tag': sentence_group['tag'].apply(list).tolist(),
})
sentences.head()

Unnamed: 0,sentence,tag
0,"[Thousands, of, demons, ##tra, ##tors, have, m...","[O, O, O, O, O, O, O, O, B-geo, O, O, O, O, O,..."
1,"[Families, of, soldiers, killed, in, the, conf...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
2,"[They, marched, from, the, Houses, of, Parliam...","[O, O, O, O, O, O, O, O, O, O, O, B-geo, I-geo..."
3,"[Police, put, the, number, of, march, ##ers, a...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
4,"[The, protest, comes, on, the, eve, of, the, a...","[O, O, O, O, O, O, O, O, O, O, O, B-geo, O, O,..."


In [6]:
# Build the tokenizer from the assets shipped with the hub layer so that the
# vocabulary and the casing rule are the same as the pre-trained BERT's.
to_lower_case = bert_layer.resolved_object.do_lower_case.numpy()
vocab_file = bert_layer.resolved_object.vocab_file.asset_path.numpy()
bert_tokenizer = bert_token.FullTokenizer(vocab_file=vocab_file, do_lower_case=to_lower_case)
print('vocabulary size:', len(bert_tokenizer.vocab))

vocabulary size: 28996


In [7]:
# Tag ids are assigned from 1 upwards in order of first appearance; 0 is reserved for padding.
tag2id = {tag: tag_id for tag_id, tag in enumerate(df['tag'].unique(), start=1)}
print(tag2id)

{'O': 1, 'B-geo': 2, 'B-gpe': 3, 'B-per': 4, 'I-geo': 5, 'B-org': 6, 'I-org': 7, 'B-tim': 8, 'B-art': 9, 'I-art': 10, 'I-per': 11, 'I-gpe': 12, 'I-tim': 13, 'B-nat': 14, 'B-eve': 15, 'I-eve': 16, 'I-nat': 17}


In [8]:
# Reverse lookup as a list indexed by tag id; slot 0 (padding) has no tag and stays None.
id2tag = [None] * classes
for tag_name, tag_id in tag2id.items():
    id2tag[tag_id] = tag_name
print(id2tag)

[None, 'O', 'B-geo', 'B-gpe', 'B-per', 'I-geo', 'B-org', 'I-org', 'B-tim', 'B-art', 'I-art', 'I-per', 'I-gpe', 'I-tim', 'B-nat', 'B-eve', 'I-eve', 'I-nat']


In [9]:
# Encode each sentence's tag strings as their integer ids.
tag_labels = sentences['tag'].map(lambda tags: [tag2id[tag] for tag in tags])

In [10]:
# Turn the already-tokenized sentences into the three model inputs.
input_words, input_mask, input_seg = get_bert_inputs(
    bert_tokenizer, sentences['sentence'], tokenized=True)

# Pad the ragged per-sentence tag ids into a dense tensor, then add one 0 label at
# each end of the sequence axis for the <CLS> and <SEP> positions.
labels = tf.ragged.constant(tag_labels).to_tensor()
zero_pad = tf.zeros_like(labels[:, :1])
labels = tf.concat([zero_pad, labels, zero_pad], axis=-1)

# The first 20% of the sentences are held out for validation; the rest is for training.
val_size = int(input_words.shape[0] * 0.2)

valid_input_words = input_words[:val_size]
valid_input_mask = input_mask[:val_size]
valid_input_seg = input_seg[:val_size]
valid_labels = labels[:val_size]

train_input_words = input_words[val_size:]
train_input_mask = input_mask[val_size:]
train_input_seg = input_seg[val_size:]
train_labels = labels[val_size:]

print('training data shapes:', train_input_words.shape, train_input_mask.shape, train_input_seg.shape, train_labels.shape)
print('validation data shapes:', valid_input_words.shape, valid_input_mask.shape, valid_input_seg.shape, valid_labels.shape)

training data shapes: (38368, 143) (38368, 143) (38368, 143) (38368, 143)
validation data shapes: (9591, 143) (9591, 143) (9591, 143) (9591, 143)


In [11]:
batch_size = 20
epochs = 3
train_data_size = len(train_labels)
# Keras runs ceil(train_data_size / batch_size) steps per epoch because the final
# partial batch is still a step. Round up so the learning-rate schedule spans the
# whole training run instead of ending a few steps early.
steps_per_epoch = (train_data_size + batch_size - 1) // batch_size
num_train_steps = steps_per_epoch * epochs
# Warm up for roughly the first 10% of the training steps.
warmup_steps = int(epochs * train_data_size * 0.1 / batch_size)

# creates an optimizer with learning rate schedule
optimizer = nlp_opt.create_optimizer(2e-5, num_train_steps=num_train_steps, num_warmup_steps=warmup_steps)

In [12]:
def tag_loss(y_true, y_pred):
    """Mean sparse categorical cross-entropy over the positions with a non-zero label.

    Positions whose label is 0 are masked out and do not contribute to the mean.

    Args:
        y_true: Tag ids; 0 marks positions to ignore.
        y_pred: Per-class probabilities for each position (not logits).

    Returns:
        Scalar tensor: the summed loss of the unmasked positions divided by their
        count, or 0 when every position is masked.
    """
    loss = tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred, from_logits=False)
    mask = tf.cast(tf.math.not_equal(y_true, 0), loss.dtype)
    # divide_no_nan returns 0 instead of NaN when no position has a non-zero label.
    return tf.math.divide_no_nan(tf.math.reduce_sum(loss * mask), tf.math.reduce_sum(mask))

In [13]:
def tag_accuracy(y_true, y_pred):
    """Fraction of positions with a non-zero label whose predicted tag is correct.

    Positions whose label is 0 are masked out and do not count as hits or misses.

    Args:
        y_true: Tag ids; 0 marks positions to ignore.
        y_pred: Per-class scores for each position.

    Returns:
        Scalar tensor: the number of correct unmasked positions divided by the
        number of unmasked positions, or 0 when every position is masked.
    """
    acc = tf.keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
    mask = tf.cast(tf.math.not_equal(y_true, 0), acc.dtype)
    # divide_no_nan returns 0 instead of NaN when no position has a non-zero label.
    return tf.math.divide_no_nan(tf.math.reduce_sum(acc * mask), tf.math.reduce_sum(mask))

In [14]:
# `metrics` is documented as a list (or dict) of metrics; a bare callable is not
# accepted by every Keras version, so wrap it in a list.
model.compile(loss=tag_loss, optimizer=optimizer, metrics=[tag_accuracy])

In [15]:
# Train on the training split and evaluate on the held-out split after every epoch.
history = model.fit(
    x=[train_input_words, train_input_mask, train_input_seg],
    y=train_labels,
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(
        [valid_input_words, valid_input_mask, valid_input_seg],
        valid_labels,
    ),
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [16]:
# Tag a single validation sentence and print it next to its reference tags.
valid_id = 1234 # 0 ~ 9590
pred = model.predict([
    features[valid_id:valid_id+1]
    for features in (valid_input_words, valid_input_mask, valid_input_seg)
])
pred_tagid = tf.math.argmax(pred, axis=-1)

# Number of positions set in the mask minus the two special positions (<CLS>, <SEP>).
sentence_len = tf.math.reduce_sum(valid_input_mask[valid_id]) - 2
# Position 0 is <CLS>, so the sentence occupies positions 1 .. sentence_len.
pred_tag = [id2tag[tag_id] for tag_id in pred_tagid[0, 1:sentence_len+1]]
print('Sentence (tokenized):', sentences['sentence'][valid_id])
print('Prediction: ', pred_tag)
print('Ground true:', [id2tag[tag_id] for tag_id in valid_labels[valid_id, 1:sentence_len+1]])

Sentence (tokenized): ['The', 'United', 'States', 'says', 'Iran', 'is', 'trying', 'to', 'covert', '##ly', 'develop', 'nuclear', 'weapons', '.']
Prediction:  ['O', 'B-geo', 'I-geo', 'O', 'B-geo', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']
Ground true: ['O', 'B-geo', 'I-geo', 'O', 'B-org', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O']


In [17]:
# Tag a single validation sentence and print it next to its reference tags.
valid_id = 2234 # 0 ~ 9590
pred = model.predict([
    features[valid_id:valid_id+1]
    for features in (valid_input_words, valid_input_mask, valid_input_seg)
])
pred_tagid = tf.math.argmax(pred, axis=-1)

# Number of positions set in the mask minus the two special positions (<CLS>, <SEP>).
sentence_len = tf.math.reduce_sum(valid_input_mask[valid_id]) - 2
# Position 0 is <CLS>, so the sentence occupies positions 1 .. sentence_len.
pred_tag = [id2tag[tag_id] for tag_id in pred_tagid[0, 1:sentence_len+1]]
print('Sentence (tokenized):', sentences['sentence'][valid_id])
print('Prediction: ', pred_tag)
print('Ground true:', [id2tag[tag_id] for tag_id in valid_labels[valid_id, 1:sentence_len+1]])

Sentence (tokenized): ['NASA', 'officials', 'now', 'say', 'the', 'shuttle', 'could', 'launch', 'as', 'early', 'as', 'February', '24', '.']
Prediction:  ['B-org', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-tim', 'I-tim', 'O']
Ground true: ['B-org', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-tim', 'I-tim', 'O']


In [18]:
# Persist the fine-tuned weights under the given path prefix.
model.save_weights(filepath='models/ner/weights/naive/weights')