<a href="https://colab.research.google.com/github/xhr0804/BotBuilder-Samples/blob/master/tensorflow2/NER_lstm_L1_E16_H16__F1_068.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!wget https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.train
!wget https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.testa

--2020-12-05 12:47:05--  https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.train
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 3281528 (3.1M) [text/plain]
Saving to: ‘eng.train.10’


2020-12-05 12:47:05 (21.5 MB/s) - ‘eng.train.10’ saved [3281528/3281528]

--2020-12-05 12:47:05--  https://raw.githubusercontent.com/synalp/NER/master/corpus/CoNLL-2003/eng.testa
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.0.133, 151.101.64.133, 151.101.128.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.0.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 827012 (808K) [text/plain]
Saving to: ‘eng.testa.10’


2020-12-05 12:47:05 (18.3 MB/s) - ‘eng.testa.10’ saved 

In [2]:
import tensorflow as tf
from tensorflow import keras

import numpy as np
import time
import matplotlib.pyplot as plt

print(tf.__version__)
# print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

2.3.0


In [3]:
def countWord(file):
  word_count = {}
  tag_count = {}
  for line in open(file):
    if '-DOCSTART-' in line or not len(line.strip('\n')):
      continue
    splitted = line.split()
    if splitted[0] in word_count:
      word_count[splitted[0]] += 1
    else:
      word_count[splitted[0]] = 1
    if splitted[-1] in tag_count:
      tag_count[splitted[-1]] += 1
    else:
      tag_count[splitted[-1]] = 1
  return word_count, tag_count

In [4]:
word_count, tag_count = countWord('eng.train')
sorted_word_count = sorted(word_count.items(), key=lambda x: x[1], reverse=True)
word2idx = {kv[0]: index+2 for index, kv in enumerate(sorted_word_count)}
word2idx["<PAD>"] = 0
word2idx["<UNK>"] = 1
idx2word = {idx:word for word, idx in word2idx.items()}
sorted_tag_count = sorted(tag_count.items(), key=lambda x: x[1], reverse=True)
tag2idx = {kv[0]: index for index, kv in enumerate(sorted_tag_count)}
idx2tag = {idx:tag for tag, idx in tag2idx.items()}

In [5]:
def parse(file):
  left, right = [], []
  for line in open(file):
    line = line.strip()
    if '-DOCSTART-' in line:
      this_left, this_right = [], []
    elif not len(line):
      if len(this_left) > 0:
        left.append(this_left)
        right.append(this_right)
      this_left, this_right = [], []
    else:
      splitted = line.split()
      this_left.append(splitted[0])
      this_right.append(splitted[-1])
  return left, right

In [6]:
left_train, right_train = parse('eng.train')
left_test, right_test = parse('eng.testa')

In [7]:
def convertWord2Idx(words, map, threshold):
  ret = []
  for word in words:
    if word in map and map[word] < threshold:
      ret.append(map[word])
    else:
      ret.append(map["<UNK>"])
  return np.array(ret, dtype=np.int32)

In [8]:
x_train = np.array([convertWord2Idx(words, word2idx, 10000) for words in left_train])
y_train = np.array([convertWord2Idx(words, tag2idx, 10000) for words in right_train])
x_test = np.array([convertWord2Idx(words, word2idx, 10000) for words in left_test])
y_test = np.array([convertWord2Idx(words, tag2idx, 10000) for words in right_test])

In [9]:
x_train = keras.preprocessing.sequence.pad_sequences(x_train,
                                                     padding='post',
                                                     value = word2idx["<PAD>"],
                                                     maxlen = 64)

In [10]:
y_train = keras.preprocessing.sequence.pad_sequences(y_train,
                                                     padding='post',
                                                     value = 0,
                                                     maxlen = 64)

In [24]:
model = keras.Sequential([
  keras.layers.Embedding(10000, 16, mask_zero=True),
  keras.layers.LSTM(16, return_sequences=True),
  keras.layers.Dense(8)
])

In [25]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 16)          160000    
_________________________________________________________________
lstm_1 (LSTM)                (None, None, 16)          2112      
_________________________________________________________________
dense_1 (Dense)              (None, None, 8)           136       
Total params: 162,248
Trainable params: 162,248
Non-trainable params: 0
_________________________________________________________________


In [13]:
earlystopping = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=5)

In [26]:
model.compile(optimizer='adam', 
              loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True), #if the model output doesn't have softmax, should set from_logits=True
              metrics=['accuracy'])

In [27]:
model.fit(
    x = x_train[:12000], # batchsize * sentence_len(word index)
    y = y_train[:12000], # batchsize * sentence_len(tag index)
    batch_size = 32,
    epochs = 40,
    validation_data = (x_train[12000:], y_train[12000:]),
    callbacks = [earlystopping]
)

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40


<tensorflow.python.keras.callbacks.History at 0x7f72a035b6a0>

In [16]:
def decode(pred_tag_index, idx2tag, sentence):
  ret = [sentence[i] + "(" + idx2tag[pred_tag_index[i]] + ")" for i in range(len(sentence))]
  print(" ".join(ret))

In [17]:
x_test = keras.preprocessing.sequence.pad_sequences(x_test,
                                                     padding='post',
                                                     value = word2idx["<PAD>"],
                                                     maxlen = 64)

In [18]:
y_test = keras.preprocessing.sequence.pad_sequences(y_test,
                                                     padding='post',
                                                     value = 0,
                                                     maxlen = 64)

In [28]:
x_pred = model.predict(x_test)

In [29]:
pred = tf.argmax(x_pred, axis=2)
pred = pred.numpy()

In [21]:
from sklearn.metrics import classification_report

In [30]:
pred_label = []
for i in range(len(right_test)):
  pred_label += [idx2tag[pred[i][j]] for j in range(min(len(right_test[i]), 64))]
real_label = []
for i in range(len(right_test)):
  real_label += [idx2tag[y_test[i][j]] for j in range(min(len(right_test[i]), 64))]

In [31]:
print(classification_report(pred_label, real_label))

  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

      B-MISC       0.00      0.00      0.00         0
       I-LOC       0.77      0.79      0.78      2029
      I-MISC       0.71      0.83      0.77      1091
       I-ORG       0.66      0.78      0.72      1785
       I-PER       0.79      0.87      0.83      2824
           O       0.98      0.97      0.98     43446

    accuracy                           0.94     51175
   macro avg       0.65      0.71      0.68     51175
weighted avg       0.95      0.94      0.95     51175

