<a href="https://colab.research.google.com/github/xhr0804/BotBuilder-Samples/blob/master/NER_BiLSTM_L2_E128_H128__F1_072.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!wget https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.train
!wget https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.testa

--2020-12-05 13:34:13--  https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.train
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3281528 (3.1M) [text/plain]
Saving to: ‘eng.train.14’


2020-12-05 13:34:13 (50.9 MB/s) - ‘eng.train.14’ saved [3281528/3281528]

--2020-12-05 13:34:13--  https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.testa
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 827012 (808K) [text/plain]
Saving to: ‘eng.testa.14’


2020-12-05 13:34:13 (19.3 MB/s) - ‘eng.testa.14’ saved 

In [2]:
import tensorflow as tf
from tensorflow import keras

import numpy as np
import time
import matplotlib.pyplot as plt

print(tf.__version__)
# print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

2.3.0


In [3]:
def countWord(file):
  word_count = {}
  tag_count = {}
  for line in open(file):
    if '-DOCSTART-' in line or not len(line.strip('\n')):
      continue
    splitted = line.split()
    if splitted[0] in word_count:
      word_count[splitted[0]] += 1
    else:
      word_count[splitted[0]] = 1
    if splitted[-1] in tag_count:
      tag_count[splitted[-1]] += 1
    else:
      tag_count[splitted[-1]] = 1
  return word_count, tag_count

In [4]:
word_count, tag_count = countWord('eng.train')
sorted_word_count = sorted(word_count.items(), key=lambda x: x[1], reverse=True)
word2idx = {kv[0]: index+2 for index, kv in enumerate(sorted_word_count)}
word2idx["<PAD>"] = 0
word2idx["<UNK>"] = 1
idx2word = {idx:word for word, idx in word2idx.items()}
sorted_tag_count = sorted(tag_count.items(), key=lambda x: x[1], reverse=True)
tag2idx = {kv[0]: index for index, kv in enumerate(sorted_tag_count)}
idx2tag = {idx:tag for tag, idx in tag2idx.items()}

In [5]:
def parse(file):
  left, right = [], []
  for line in open(file):
    line = line.strip()
    if '-DOCSTART-' in line:
      this_left, this_right = [], []
    elif not len(line):
      if len(this_left) > 0:
        left.append(this_left)
        right.append(this_right)
      this_left, this_right = [], []
    else:
      splitted = line.split()
      this_left.append(splitted[0])
      this_right.append(splitted[-1])
  return left, right

In [6]:
left_train, right_train = parse('eng.train')
left_test, right_test = parse('eng.testa')

In [7]:
def convertWord2Idx(words, map, threshold):
  ret = []
  for word in words:
    if word in map and map[word] < threshold:
      ret.append(map[word])
    else:
      ret.append(map["<UNK>"])
  return np.array(ret, dtype=np.int32)

In [8]:
x_train = np.array([convertWord2Idx(words, word2idx, 10000) for words in left_train])
y_train = np.array([convertWord2Idx(words, tag2idx, 10000) for words in right_train])
x_test = np.array([convertWord2Idx(words, word2idx, 10000) for words in left_test])
y_test = np.array([convertWord2Idx(words, tag2idx, 10000) for words in right_test])

In [9]:
x_train = keras.preprocessing.sequence.pad_sequences(x_train,
                                                     padding='post',
                                                     value = word2idx["<PAD>"],
                                                     maxlen = 64)

In [10]:
y_train = keras.preprocessing.sequence.pad_sequences(y_train,
                                                     padding='post',
                                                     value = 0,
                                                     maxlen = 64)

In [11]:
model = keras.Sequential([
  keras.layers.Embedding(10000, 128, mask_zero=True),
  keras.layers.Bidirectional(keras.layers.LSTM(64, return_sequences=True)),
  keras.layers.Dropout(0.2),
  keras.layers.Dense(128, activation='relu'),
  keras.layers.Dense(8)
])

In [12]:
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, None, 128)         1280000   
_________________________________________________________________
bidirectional (Bidirectional (None, None, 128)         98816     
_________________________________________________________________
dropout (Dropout)            (None, None, 128)         0         
_________________________________________________________________
dense (Dense)                (None, None, 128)         16512     
_________________________________________________________________
dense_1 (Dense)              (None, None, 8)           1032      
Total params: 1,396,360
Trainable params: 1,396,360
Non-trainable params: 0
_________________________________________________________________


In [13]:
earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

In [14]:
model.compile(optimizer='adam', 
              loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), #if the model output doesn't have softmax, should set from_logits=True
              metrics=['accuracy'])

In [15]:
model.fit(
    x = x_train[:12000], # batchsize * sentence_len(word index)
    y = y_train[:12000], # batchsize * sentence_len(tag index)
    batch_size = 32,
    epochs = 40,
    validation_data = (x_train[12000:], y_train[12000:]),
    callbacks = [earlystopping]
)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40


<tensorflow.python.keras.callbacks.History at 0x7f48b31d7cf8>

In [16]:
def decode(pred_tag_index, idx2tag, sentence):
  ret = [sentence[i] + "(" + idx2tag[pred_tag_index[i]] + ")" for i in range(len(sentence))]
  print(" ".join(ret))

In [17]:
x_test = keras.preprocessing.sequence.pad_sequences(x_test,
                                                     padding='post',
                                                     value = word2idx["<PAD>"],
                                                     maxlen = 64)

In [18]:
y_test = keras.preprocessing.sequence.pad_sequences(y_test,
                                                     padding='post',
                                                     value = 0,
                                                     maxlen = 64)

In [19]:
x_pred = model.predict(x_test)

In [20]:
pred = tf.argmax(x_pred, axis=2)
pred = pred.numpy()

In [21]:
from sklearn.metrics import classification_report

In [22]:
pred_label = []
for i in range(len(right_test)):
  pred_label += [idx2tag[pred[i][j]] for j in range(min(len(right_test[i]), 64))]
real_label = []
for i in range(len(right_test)):
  real_label += [idx2tag[y_test[i][j]] for j in range(min(len(right_test[i]), 64))]

In [23]:
print(classification_report(pred_label, real_label))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

      B-MISC       0.00      0.00      0.00         0
       I-LOC       0.82      0.93      0.87      1839
      I-MISC       0.77      0.84      0.80      1171
       I-ORG       0.76      0.83      0.79      1931
       I-PER       0.84      0.91      0.88      2859
           O       0.99      0.97      0.98     43375

    accuracy                           0.96     51175
   macro avg       0.70      0.75      0.72     51175
weighted avg       0.96      0.96      0.96     51175

