In [1]:
import pandas as pd
import tensorflow as tf
import tensorflow_datasets as tfds

from official.nlp import optimization as nlp_opt
from official.nlp.bert import tokenization as bert_token

from berts.berts import BertClassificationModel
from berts.utils import get_bert_inputs

In [2]:
# Load the tokenized corpus. na_filter=False turns off pandas' NaN detection, so every
# cell (including strings that look like missing-value markers) stays a plain string.
df = pd.read_csv('data/ner/preprocess/ner_normal_tokenized.csv', na_filter=False)

# Drop rows whose tag is one of these punctuation tags. Each tag is listed exactly once
# and matched with a single vectorised membership test.
punctuation_tags = [':', ',', '``']
df = df[~df['pos'].isin(punctuation_tags)]

In [3]:
# One output class per distinct POS tag, plus one extra class (id 0) that is shared by
# padding and the <CLS>/<SEP> positions.
num_pos_tags = df['pos'].nunique()
classes = num_pos_tags + 1

In [4]:
# TF-Hub handle of the pre-trained English uncased BERT encoder used as the backbone.
bert_url = "https://tfhub.dev/tensorflow/bert_en_uncased_L-12_H-768_A-12/2"
# Build the model with `classes` outputs and keep a handle on the hub layer (it is needed
# later to load the matching vocabulary).
# NOTE(review): return_sequences=True presumably yields one prediction per token rather
# than one per sentence — confirm against berts.berts.BertClassificationModel.
model, bert_layer = BertClassificationModel(bert_url, classes, return_sequences=True)
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_words_seq (InputLayer)    [(None, None)]       0                                            
__________________________________________________________________________________________________
input_attention_mask (InputLaye [(None, None)]       0                                            
__________________________________________________________________________________________________
input_segment_mask (InputLayer) [(None, None)]       0                                            
__________________________________________________________________________________________________
keras_layer (KerasLayer)        [(None, 768), (None, 109482241   input_words_seq[0][0]            
                                                                 input_attention_mask[0

In [5]:
# Collapse the token-level table into one row per sentence id: each cell holds the list
# of tokens / POS tags of that sentence, in their original row order.
# A single agg(list) pass replaces calling get_group() once per sentence for each column,
# which re-sliced the frame for every group. Groups come out sorted by 'sid' (groupby
# default), and the result gets a fresh 0..n-1 index.
sentence_group = df.groupby('sid')
sentences = (
    sentence_group[['token', 'pos']]
    .agg(list)
    .rename(columns={'token': 'sentence'})
    .reset_index(drop=True)
)
sentences.head()

Unnamed: 0,sentence,pos
0,"[thousands, of, demonstrators, have, marched, ...","[NNS, IN, NNS, VBP, VBN, IN, NNP, TO, VB, DT, ..."
1,"[families, of, soldiers, killed, in, the, conf...","[NNS, IN, NNS, VBN, IN, DT, NN, VBD, DT, NNS, ..."
2,"[they, marched, from, the, houses, of, parliam...","[PRP, VBD, IN, DT, NNS, IN, NN, TO, DT, NN, IN..."
3,"[police, put, the, number, of, marche, ##rs, a...","[NNS, VBD, DT, NN, IN, NNS, NNS, IN, CD, CD, I..."
4,"[the, protest, comes, on, the, eve, of, the, a...","[DT, NN, VBZ, IN, DT, NN, IN, DT, JJ, NN, IN, ..."


In [6]:
# The tokenizer must use the same vocabulary and casing as the pre-trained BERT,
# so both settings are read from the loaded hub layer.
bert_assets = bert_layer.resolved_object
vocab_file = bert_assets.vocab_file.asset_path.numpy()
to_lower_case = bert_assets.do_lower_case.numpy()

bert_tokenizer = bert_token.FullTokenizer(vocab_file, to_lower_case)
print('vocabulary size:', len(bert_tokenizer.vocab))

vocabulary size: 30522


In [7]:
# Number the POS tags from 1 in order of first appearance; id 0 is kept free for padding.
pos2id = {tag: tag_id for tag_id, tag in enumerate(df['pos'].unique(), start=1)}
print(pos2id)

{'NNS': 1, 'IN': 2, 'VBP': 3, 'VBN': 4, 'NNP': 5, 'TO': 6, 'VB': 7, 'DT': 8, 'NN': 9, 'CC': 10, 'JJ': 11, 'VBD': 12, 'WP': 13, 'CD': 14, 'PRP': 15, 'VBZ': 16, 'POS': 17, 'VBG': 18, 'RB': 19, 'WRB': 20, 'PRP$': 21, 'MD': 22, 'WDT': 23, 'JJR': 24, 'JJS': 25, 'WP$': 26, 'RP': 27, 'PDT': 28, 'NNPS': 29, 'EX': 30, 'RBS': 31, 'RBR': 32, 'UH': 33, 'LRB': 34, 'RRB': 35, 'FW': 36}


In [8]:
# Reverse lookup table: position i holds the tag whose id is i.
# Slot 0 (the padding id) has no tag and stays None.
id2pos = [None] * classes
for tag, tag_id in pos2id.items():
    id2pos[tag_id] = tag
print(id2pos)

[None, 'NNS', 'IN', 'VBP', 'VBN', 'NNP', 'TO', 'VB', 'DT', 'NN', 'CC', 'JJ', 'VBD', 'WP', 'CD', 'PRP', 'VBZ', 'POS', 'VBG', 'RB', 'WRB', 'PRP$', 'MD', 'WDT', 'JJR', 'JJS', 'WP$', 'RP', 'PDT', 'NNPS', 'EX', 'RBS', 'RBR', 'UH', 'LRB', 'RRB', 'FW']


In [9]:
# Translate every sentence's list of POS tags into the matching list of integer ids.
pos_labels = sentences['pos'].map(lambda tags: list(map(pos2id.__getitem__, tags)))

In [10]:
# Model inputs for the already-tokenized sentences.
input_words, input_mask, input_seg = get_bert_inputs(
    bert_tokenizer, sentences['sentence'], tokenized=True)

# Pad the per-sentence label lists with 0 into one dense tensor, then add a 0 column on
# each side so the labels line up with <CLS> <sentence> <SEP>.
labels = tf.ragged.constant(pos_labels).to_tensor()
zero_pad = tf.zeros_like(labels[:, :1])
labels = tf.concat([zero_pad, labels, zero_pad], axis=-1)

# The first 20% of the rows are held out for validation, the remainder is used for training.
val_size = int(input_words.shape[0] * 0.2)
valid_part = slice(None, val_size)
train_part = slice(val_size, None)

train_input_words, train_input_mask, train_input_seg = (
    features[train_part] for features in (input_words, input_mask, input_seg))
train_labels = labels[train_part]

valid_input_words, valid_input_mask, valid_input_seg = (
    features[valid_part] for features in (input_words, input_mask, input_seg))
valid_labels = labels[valid_part]

print('training data shapes:',
      train_input_words.shape, train_input_mask.shape, train_input_seg.shape, train_labels.shape)
print('validation data shapes:',
      valid_input_words.shape, valid_input_mask.shape, valid_input_seg.shape, valid_labels.shape)

training data shapes: (38367, 110) (38367, 110) (38367, 110) (38367, 110)
validation data shapes: (9591, 110) (9591, 110) (9591, 110) (9591, 110)


In [11]:
batch_size = 24
epochs = 3
train_data_size = len(train_labels)

# model.fit() with batch_size also runs the last, partially filled batch, so the number
# of optimizer steps per epoch is the ceiling of size / batch_size. Flooring it made the
# learning-rate schedule shorter than the actual training run.
steps_per_epoch = -(-train_data_size // batch_size)  # ceiling division
num_train_steps = steps_per_epoch * epochs
# Warm up during the first 10% of all training steps.
warmup_steps = int(num_train_steps * 0.1)

# creates an optimizer with learning rate schedule
optimizer = nlp_opt.create_optimizer(2e-5, num_train_steps=num_train_steps, num_warmup_steps=warmup_steps)

In [12]:
def pos_loss(y_true, y_pred):
    """Mean sparse categorical cross-entropy over the positions whose label is not 0.

    Args:
        y_true: integer label ids; positions equal to 0 are excluded from the loss.
        y_pred: per-class probabilities (not logits) along the last axis.

    Returns:
        Scalar tensor: the loss summed over the non-zero-label positions divided by the
        number of such positions, or 0 when there are none.
    """
    loss = tf.keras.losses.sparse_categorical_crossentropy(y_true, y_pred, from_logits=False)
    # 1.0 where the label is a real tag, 0.0 where it is 0.
    mask = tf.cast(tf.math.not_equal(y_true, 0), loss.dtype)
    # divide_no_nan yields 0 instead of NaN if the mask is all zeros.
    return tf.math.divide_no_nan(tf.math.reduce_sum(loss * mask), tf.math.reduce_sum(mask))

In [13]:
def pos_accuracy(y_true, y_pred):
    """Fraction of correctly predicted positions among those whose label is not 0.

    Args:
        y_true: label ids; positions equal to 0 are excluded from the metric.
        y_pred: per-class scores along the last axis; the arg-max is the predicted id.

    Returns:
        Scalar tensor: number of correct non-zero-label positions divided by the number
        of non-zero-label positions, or 0 when there are none.
    """
    acc = tf.keras.metrics.sparse_categorical_accuracy(y_true, y_pred)
    # 1.0 where the label is a real tag, 0.0 where it is 0.
    mask = tf.cast(tf.math.not_equal(y_true, 0), acc.dtype)
    # divide_no_nan yields 0 instead of NaN if the mask is all zeros.
    return tf.math.divide_no_nan(tf.math.reduce_sum(acc * mask), tf.math.reduce_sum(mask))

In [14]:
# Train with the masked loss and report the masked accuracy; both skip positions whose
# label is 0. `metrics` is passed as a list, the form documented for Model.compile.
model.compile(loss=pos_loss, optimizer=optimizer, metrics=[pos_accuracy])

In [15]:
# Bundle the three model inputs in the order used when fitting and validating.
train_inputs = [train_input_words, train_input_mask, train_input_seg]
valid_inputs = [valid_input_words, valid_input_mask, valid_input_seg]

history = model.fit(
    train_inputs,
    train_labels,
    validation_data=(valid_inputs, valid_labels),
    batch_size=batch_size,
    epochs=epochs,
)

Epoch 1/3
Epoch 2/3
Epoch 3/3


In [16]:
valid_id = 1000  # index of the validation sentence to inspect (0 ~ 9590)

# A length-1 slice keeps the batch dimension expected by model.predict.
one_row = slice(valid_id, valid_id + 1)
pred = model.predict([valid_input_words[one_row],
                      valid_input_mask[one_row],
                      valid_input_seg[one_row]])
pred_posid = tf.math.argmax(pred, axis=-1)

# NOTE(review): assumes the attention mask is 1 on <CLS>, the sentence tokens and <SEP>,
# so its sum minus 2 is the sentence length — confirm against get_bert_inputs.
sentence_len = tf.math.reduce_sum(valid_input_mask[valid_id]) - 2
token_span = slice(1, sentence_len + 1)  # skip position 0 (<CLS>)

pred_pos = [id2pos[tag_id] for tag_id in pred_posid[0, token_span]]
true_pos = [id2pos[tag_id] for tag_id in valid_labels[valid_id, token_span]]
print('prediction: ', pred_pos)
print('ground true:', true_pos)

prediction:  ['DT', 'JJ', 'JJ', 'NN', 'VBZ', 'VBN', 'NNP', 'POS', 'POS', 'NN', 'IN', 'NNS', 'NNS']
ground true: ['DT', 'JJ', 'JJ', 'NN', 'VBZ', 'VBN', 'NNP', 'POS', 'POS', 'NN', 'IN', 'NNS', 'NNS']
