In [2]:
import matplotlib.pyplot as plt
from io import BytesIO
import pandas as pd
import seaborn as sn
import numpy as np
import requests
np.random.seed(0)
plt.style.use("ggplot")
import tensorflow as tf
print('Tensorflow version:', tf.__version__)
print('GPU detected:', tf.config.list_physical_devices('GPU'))

Tensorflow version: 2.16.1
GPU detected: []


In [None]:
!git clone https://github.com/VinAIResearch/PhoNER_COVID19.git

Cloning into 'PhoNER_COVID19'...
remote: Enumerating objects: 61, done.[K
remote: Counting objects: 100% (61/61), done.[K
remote: Compressing objects: 100% (42/42), done.[K
remote: Total 61 (delta 24), reused 41 (delta 17), pack-reused 0 (from 0)[K
Receiving objects: 100% (61/61), 3.61 MiB | 7.41 MiB/s, done.
Resolving deltas: 100% (24/24), done.


# **Data Loading**

In [3]:
# TrainSet
data_train = pd.read_csv('train_word.conll', sep='\n\n', header=None)

Sentence_train = []
Word_train = []
Tag_train = []
flag_train = 0

for i in range(data_train.shape[0]):
  s = data_train[0][i].split()
  Sentence_train.append(flag_train)
  Word_train.append(s[0])
  Tag_train.append(s[1])
  if s[0] == '.':
    flag_train += 1

data_train = pd.DataFrame({'Sentence': Sentence_train,
                         'Word': Word_train,
                         'Tag': Tag_train})

# DevSet
data_dev = pd.read_csv('dev_word.conll', sep='\n\n', header=None)

Sentence_dev = []
Word_dev = []
Tag_dev = []
flag_dev = flag_train

for i in range(data_dev.shape[0]):
  s = data_dev[0][i].split()
  Sentence_dev.append(flag_dev)
  Word_dev.append(s[0])
  Tag_dev.append(s[1])
  if s[0] == '.':
    flag_dev += 1

data_dev = pd.DataFrame({'Sentence': Sentence_dev,
                       'Word': Word_dev,
                       'Tag': Tag_dev})

# TestSet
data_test = pd.read_csv('test_word.conll', sep='\n\n', header=None)

Sentence_test = []
Word_test = []
Tag_test = []
flag_test = flag_dev

for i in range(data_test.shape[0]):
  s = data_test[0][i].split()
  Sentence_test.append(flag_test)
  Word_test.append(s[0])
  Tag_test.append(s[1])
  if s[0] == '.':
    flag_test += 1

data_test = pd.DataFrame({'Sentence': Sentence_test,
                        'Word': Word_test,
                        'Tag': Tag_test})

  data_train = pd.read_csv('train_word.conll', sep='\n\n', header=None)
  data_dev = pd.read_csv('dev_word.conll', sep='\n\n', header=None)
  data_test = pd.read_csv('test_word.conll', sep='\n\n', header=None)


In [4]:
data = pd.concat([data_train, data_dev, data_test], axis=0, ignore_index=True)
data

Unnamed: 0,Sentence,Word,Tag
0,0,Đồng_thời,O
1,0,",",O
2,0,bệnh_viện,O
3,0,tiếp_tục,O
4,0,thực_hiện,O
...,...,...,...
274467,9888,nhiệt_đới,I-LOCATION
274468,9888,trung_ương,I-LOCATION
274469,9888,cơ_sở,I-LOCATION
274470,9888,Đông_Anh,I-LOCATION


In [5]:
words = list(set(data["Word"].values))
words.append("ENDPAD")
tags = list(set(data["Tag"].values))

n_words = len(words)
n_tags = len(tags)

print('Tổng số từ duy nhất:', n_words)
print('Số lượng nhãn thực thể:', n_tags)

Tổng số từ duy nhất: 8102
Số lượng nhãn thực thể: 20


In [6]:
class SentenceGetter(object):
    def __init__(self, data):
        self.n_sent = 1
        self.data = data
        self.empty = False
        agg_func = lambda s: [(w, t) for w, t in zip(s["Word"].values.tolist(),
                                                     s["Tag"].values.tolist())]
        self.grouped = self.data.groupby("Sentence").apply(agg_func)
        self.sentences = [s for s in self.grouped]

sentences = SentenceGetter(data).sentences

In [7]:
# padding
from tensorflow.keras.preprocessing.sequence import pad_sequences
max_len=80
tag2idx = {t: i for i, t in enumerate(tags)}
word2idx = {w: i for i, w in enumerate(words)}


X = [[word2idx[w[0]] for w in s] for s in sentences]
y = [[tag2idx[w[1]] for w in s] for s in sentences]

X = pad_sequences(maxlen=max_len, sequences=X, padding="post", value=n_words-1)
y = pad_sequences(maxlen=max_len, sequences=y, padding="post", value=tag2idx["O"])

In [8]:
# train dev test split

X_train = X[0:flag_train]
y_train = y[0:flag_train]

X_dev = X[flag_train:flag_dev]
y_dev = y[flag_train:flag_dev]

X_test = X[flag_dev:]
y_test = y[flag_dev:]

In [9]:
# model
from tensorflow.keras import Model, Input
from tensorflow.keras.layers import GRU, Embedding, Dense
from tensorflow.keras.layers import TimeDistributed, SpatialDropout1D, Bidirectional

In [10]:
# Thông số mô hình

input_word = Input(shape=(max_len,))
model = Embedding(input_dim=n_words, output_dim=80, input_length=max_len)(input_word)
model = SpatialDropout1D(0.1)(model)
model = Bidirectional(GRU(units=100, return_sequences=True, recurrent_dropout=0.1))(model)
out = TimeDistributed(Dense(n_tags, activation="softmax"))(model)
model = Model(input_word, out)
model.summary()



In [11]:
model.compile(optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"])
callback = tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# train model
import time
start_time = time.time()

history = model.fit(X_train, y_train, validation_data=(X_dev, y_dev),
                    batch_size= 32, epochs=20, callbacks=[callback])

print("[{}] Completed!".format(time.time() - start_time))

Epoch 1/20
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 81ms/step - accuracy: 0.8954 - loss: 0.7407 - val_accuracy: 0.9360 - val_loss: 0.2354
Epoch 2/20
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 73ms/step - accuracy: 0.9596 - loss: 0.1422 - val_accuracy: 0.9665 - val_loss: 0.1329
Epoch 3/20
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 72ms/step - accuracy: 0.9807 - loss: 0.0752 - val_accuracy: 0.9780 - val_loss: 0.0921
Epoch 4/20
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 73ms/step - accuracy: 0.9891 - loss: 0.0431 - val_accuracy: 0.9816 - val_loss: 0.0719
Epoch 5/20
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 73ms/step - accuracy: 0.9911 - loss: 0.0337 - val_accuracy: 0.9826 - val_loss: 0.0653
Epoch 6/20
[1m155/155[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 72ms/step - accuracy: 0.9929 - loss: 0.0258 - val_accuracy: 0.9837 - val_loss: 0.0593
Epoch 7/20
[1m1

In [12]:
# evaluation
from seqeval.metrics import f1_score, classification_report

y_true = []
y_pred = []
for i in range(y_test.shape[0]):
  y_true.append(y_test[i])
  p = model.predict(np.array([X_test[i]]))
  p = np.argmax(p, axis=-1)
  y_pred.extend(p)

Y_true = []
Y_pred = []

for i, j in zip(range(len(y_true)), range(len(y_pred))):
  true = []
  pred = []
  for k in range(len(y_true[i])):
    true.append(tags[y_true[i][k]])
    pred.append(tags[y_pred[j][k]])
  Y_true.append(true)
  Y_pred.append(pred)

print('F1-score micro test set: {}%'.format(round(f1_score(Y_true, Y_pred, average='micro')*100,2)))
print('F1-score macro test set: {}%'.format(round(f1_score(Y_true, Y_pred, average='macro')*100,2)))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 801ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 20ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1

In [13]:
# F1-score và Accuracy cho từng nhãn thực thể.
# F1-score
from collections.abc import Iterable
def flatten(lis):
     for item in lis:
         if isinstance(item, Iterable) and not isinstance(item, str):
             for x in flatten(item):
                 yield x
         else:
             yield item

YT = list(flatten(Y_true))
YP = list(flatten(Y_pred))


from sklearn.metrics import classification_report as sklearn_cs

print(sklearn_cs(YT, YP))

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


                       precision    recall  f1-score   support

                B-AGE       0.90      0.96      0.93       568
               B-DATE       0.97      0.97      0.97      1640
             B-GENDER       0.90      0.95      0.93       447
                B-JOB       0.63      0.49      0.55       172
           B-LOCATION       0.93      0.89      0.91      4425
               B-NAME       0.90      0.56      0.69       317
       B-ORGANIZATION       0.86      0.84      0.85       769
         B-PATIENT_ID       0.96      0.91      0.93      1978
B-SYMPTOM_AND_DISEASE       0.91      0.83      0.87      1134
     B-TRANSPORTATION       0.91      0.76      0.83       193
                I-AGE       0.00      0.00      0.00         6
               I-DATE       0.95      0.99      0.97      1726
                I-JOB       0.50      0.05      0.10       114
           I-LOCATION       0.95      0.83      0.89      4892
               I-NAME       0.00      0.00      0.00  

  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
# accuracy
from seqeval.metrics import accuracy_score

accuracy = round(accuracy_score(Y_true, Y_pred)*100,2)
print('Accuracy test = {}%'.format(accuracy))

from sklearn.metrics import confusion_matrix

cm = confusion_matrix(YT, YP)
labels=['B-AGE', 'B-DATE', 'B-GENDER', 'B-JOB', 'B-LOCATION', 'B-NAME', 'B-ORGANIZATION', 'B-PATIENT_ID', 'B-SYMPTOM_AND_DISEASE', 'B-TRANSPORTATION',
        'I-AGE', 'I-DATE', 'I-JOB', 'I-LOCATION', 'I-NAME', 'I-ORGANIZATION', 'I-PATIENT_ID', 'I-SYMPTOM_AND_DISEASE', 'I-TRANSPORTATION', 'O' ]
# Accuracy của từng nhãn thực thể
acc_class = cm.diagonal()/cm.sum(axis=1)
df_acc_class = pd.DataFrame({'Label': labels,
                             'Accuracy': acc_class})
df_acc_class

Accuracy test = 98.38%


Unnamed: 0,Label,Accuracy
0,B-AGE,0.964789
1,B-DATE,0.966463
2,B-GENDER,0.950783
3,B-JOB,0.488372
4,B-LOCATION,0.886554
5,B-NAME,0.55836
6,B-ORGANIZATION,0.837451
7,B-PATIENT_ID,0.911527
8,B-SYMPTOM_AND_DISEASE,0.828924
9,B-TRANSPORTATION,0.756477


In [15]:
model.save_weights('NER_COVID19_GRU_weights_22521117.weights.h5')
model.save('NER_COVID19_GRU_22521117.h5')

