In [1]:
import re
import os
import pandas as pd
import tensorflow as tf
from matplotlib import pyplot as plt

In [2]:
def select_gpu(N):
    """Restrict TensorFlow to the physical GPU at index ``N``.

    Prints the list of physical GPUs. When at least one GPU is present,
    memory growth is switched on for every physical GPU, the GPU at index
    ``N`` is made the only visible one, and the physical/logical GPU counts
    are printed. A ``RuntimeError`` raised by TensorFlow during this setup
    is printed rather than propagated.

    Args:
        N: index into the list of physical GPUs.

    Returns:
        None.
    """
    physical = tf.config.experimental.list_physical_devices('GPU')
    print(physical)
    if not physical:
        # Nothing to configure without a GPU.
        return
    try:
        for device in physical:
            tf.config.experimental.set_memory_growth(device, True)
        tf.config.experimental.set_visible_devices(physical[N], 'GPU')
        logical = tf.config.experimental.list_logical_devices('GPU')
        print(len(physical), "Physical GPUs,", len(logical), "Logical GPUs")
    except RuntimeError as err:
        print(err)

In [3]:
# Run configuration: choose the GPU, then set the training hyper-parameters
# and the location of the pretrained transformer checkpoint.
select_gpu(2)  # make the physical GPU at index 2 the only visible one
epochs = 512  # epoch count passed to model.fit
batch_size = 1024  # batch size passed to model.fit
model_name = 'chinese_roberta_wwm_large_ext_L-24_H-1024_A-16'  # pretrained model folder name
embedding_path = '/home/Danny/pretrain_model/{}'.format(model_name)  # folder joined with vocab/config/checkpoint names later

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:2', device_type='GPU')]
3 Physical GPUs, 1 Logical GPUs


In [4]:
# No need to keep article_id; convert the training file into lists
def training_txt_to_list(path):
    """Parse an annotated training file into per-sentence tokens and tags.

    The file is read as UTF-8 and split into records on lines equal to
    ``--------------------``; blank lines are ignored. Within a record the
    first line is the article text, the second line is skipped, and every
    further line is a tab-separated annotation whose fields 1 and 2 are the
    start/end character offsets and whose last field is the entity type.

    Each article is cut into sentences at the full-width comma, full stop
    and question mark. Whatever follows the last of those separators is
    dropped from both outputs, so ``x`` and ``y`` stay aligned.

    Args:
        path: path of the training text file.

    Returns:
        ``[x, y]`` where ``x`` is a list of sentences (each a list of
        single characters) and ``y`` is the matching list of tag lists
        (``'O'``, ``'B-<type>'`` or ``'I-<type>'``).
    """
    separators = ('，', '。', '？')

    # Explicit encoding: the data is Chinese text and must not depend on
    # the platform's default locale.
    with open(path, 'r', encoding='utf-8') as f:
        txt = f.read()

    text_label_list = list()
    tmp = list()
    for line in txt.split('\n'):
        if line == '--------------------':
            # Skip empty records (e.g. two separators in a row) instead of
            # failing later when the text line is looked up.
            if tmp:
                text_label_list.append(tmp)
            tmp = list()
            continue
        if line == '':
            continue
        tmp.append(line)

    x = list()
    y = list()
    for text_label in text_label_list:
        text = text_label[0]
        label_list = ['O'] * len(text)
        for annotation in text_label[2:]:
            entity = annotation.split('\t')
            start, end = int(entity[1]), int(entity[2])
            if start > end:
                # Inverted span: ignore the annotation.
                continue
            label_list[start] = 'B-{}'.format(entity[-1])
            for j in range(start + 1, end):
                label_list[j] = 'I-{}'.format(entity[-1])

        # Mark separator positions with the separator itself so the tag
        # sequence can be cut at exactly the same places as the text.
        for i, char in enumerate(text):
            if char in separators:
                label_list[i] = char

        # The last piece is the text after the final separator; it has no
        # counterpart in y, so it is left out (without copying all of x).
        text_list = re.split('\uff0c|\u3002|\uff1f', text)
        x.extend(list(sentence) for sentence in text_list[:-1])

        sentence = list()
        for tag in label_list:
            if tag in separators:
                y.append(sentence)
                sentence = list()
            else:
                sentence.append(tag)

    return [x, y]

In [5]:
# Stage-1 sample data -> sentence tokens (x1) and aligned tag sequences (y1).
x1, y1 = training_txt_to_list('/home/Danny/ai-cup-2020/datasets/stage1/SampleData_deid.txt')

In [6]:
# Stage-2 training data -> sentence tokens (x2) and aligned tag sequences (y2).
x2, y2 = training_txt_to_list('/home/Danny/ai-cup-2020/datasets/stage2/train_1_update.txt')

In [7]:
# Stage-4 training data -> sentence tokens (x4) and aligned tag sequences (y4).
x4, y4 = training_txt_to_list('/home/Danny/ai-cup-2020/datasets/stage4/train_2.txt')

In [8]:
# Pool the three datasets into one corpus; the two printed counts show
# how many sentences and how many tag sequences were collected.
x = [*x1, *x2, *x4]
y = [*y1, *y2, *y4]
print(len(x))
print(len(y))

76162
76162


In [9]:
from sklearn.model_selection import train_test_split
# Two-step split with a fixed seed: first hold out 20% as the test set, then
# hold out 20% of the remainder as the validation set.
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2, random_state=42)
train_x, valid_x, train_y, valid_y = train_test_split(train_x, train_y, test_size=0.2, random_state=42)
# Print the sample/label counts of each split (train, validation, test).
print(len(train_x), len(train_y))
print(len(valid_x), len(valid_y))
print(len(test_x), len(test_y))

48743 48743
12186 12186
15233 15233


In [10]:
# Build the pretrained-transformer embedding and wrap it in a BiLSTM-CRF
# sequence-labeling model.
import kashgari
from kashgari.tasks.labeling import BiLSTM_CRF_Model
from kashgari.embeddings import TransformerEmbedding
from keras_radam import RAdam  # only referenced by the commented-out compile call below
# NOTE(review): presumably switches kashgari to cuDNN-backed recurrent cells - confirm against the installed version.
kashgari.config.use_cudnn_cell = True

# Same value as the embedding_path set in the configuration cell.
embedding_path = '/home/Danny/pretrain_model/{}'.format(model_name)
# The three files expected inside the pretrained model folder.
vocab_path = os.path.join(embedding_path, 'vocab.txt')
config_path = os.path.join(embedding_path, 'bert_config.json')
checkpoint_path = os.path.join(embedding_path, 'bert_model.ckpt')
# NOTE(review): the keyword names below (bert_type, sequence_length, trainable,
# task) depend on the kashgari version in use - verify they are honoured.
embedding = TransformerEmbedding(vocab_path, 
                                 config_path, 
                                 checkpoint_path,
                                 bert_type='bert',
                                 sequence_length='auto',
                                 trainable=True,
                                 task='kashgari.LABELING',
                                )

model = BiLSTM_CRF_Model(embedding)
# Disabled alternative: build/compile manually with a RAdam optimizer and
# print the default hyper-parameters.
# model.build_model(train_x, train_y)
# model.compile_model(optimizer=RAdam(lr=3e-5))
# hyper = model.default_hyper_parameters()
# print(hyper)

2020-12-23 00:52:56,041 [DEBUG] kashgari - ------------------------------------------------
2020-12-23 00:52:56,041 [DEBUG] kashgari - Loaded transformer model's vocab
2020-12-23 00:52:56,042 [DEBUG] kashgari - config_path       : /home/Danny/pretrain_model/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16/bert_config.json
2020-12-23 00:52:56,042 [DEBUG] kashgari - vocab_path      : /home/Danny/pretrain_model/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16/vocab.txt
2020-12-23 00:52:56,043 [DEBUG] kashgari - checkpoint_path : /home/Danny/pretrain_model/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16/bert_model.ckpt
2020-12-23 00:52:56,043 [DEBUG] kashgari - Top 50 words    : ['[PAD]', '[unused1]', '[unused2]', '[unused3]', '[unused4]', '[unused5]', '[unused6]', '[unused7]', '[unused8]', '[unused9]', '[unused10]', '[unused11]', '[unused12]', '[unused13]', '[unused14]', '[unused15]', '[unused16]', '[unused17]', '[unused18]', '[unused19]', '[unused20]', '[unused21]', '[unused22]', '[unused23]'

In [11]:
%%time
# Train on the training split and validate on the validation split, using the
# epochs / batch_size values from the configuration cell.
# NOTE(review): `history` is only bound if fit() returns; if the cell is
# interrupted, the later plotting cell has no `history` to read.
history = model.fit(train_x,
                    train_y,
                    valid_x,
                    valid_y,
                    epochs=epochs,
                    batch_size=batch_size,
                   )

Preparing text vocab dict: 100%|██████████| 48743/48743 [00:00<00:00, 381058.02it/s]
Preparing text vocab dict: 100%|██████████| 12186/12186 [00:00<00:00, 393497.54it/s]
2020-12-23 00:52:56,213 [DEBUG] kashgari - --- Build vocab dict finished, Total: 1742 ---
2020-12-23 00:52:56,213 [DEBUG] kashgari - Top-10: ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '：', '師', '是', '民', '眾', '醫']
Preparing text vocab dict: 100%|██████████| 48743/48743 [00:00<00:00, 521512.89it/s]
Preparing text vocab dict: 100%|██████████| 12186/12186 [00:00<00:00, 479045.77it/s]
2020-12-23 00:52:56,336 [DEBUG] kashgari - --- Build vocab dict finished, Total: 28 ---
2020-12-23 00:52:56,337 [DEBUG] kashgari - Top-10: ['[PAD]', 'O', 'I-time', 'B-time', 'I-med_exam', 'I-name', 'I-location', 'B-med_exam', 'I-money', 'B-name']
Calculating sequence length: 100%|██████████| 48743/48743 [00:00<00:00, 1294647.47it/s]
Calculating sequence length: 100%|██████████| 12186/12186 [00:00<00:00, 1274290.41it/s]
2020-12-23 00:53:03,638 [DEBU

Epoch 1/512
Epoch 2/512
Epoch 3/512
Epoch 4/512
Epoch 5/512
Epoch 6/512
Epoch 7/512
Epoch 8/512
Epoch 9/512
Epoch 10/512
Epoch 11/512
Epoch 12/512
Epoch 13/512
Epoch 14/512
Epoch 15/512
Epoch 16/512
Epoch 17/512
Epoch 18/512
Epoch 19/512
Epoch 20/512
Epoch 21/512
Epoch 22/512
Epoch 23/512
Epoch 24/512
Epoch 25/512
Epoch 26/512
Epoch 27/512
Epoch 28/512
Epoch 29/512
Epoch 30/512
Epoch 31/512
Epoch 32/512
Epoch 33/512
Epoch 34/512
Epoch 35/512
Epoch 36/512
Epoch 37/512
Epoch 38/512
Epoch 39/512
Epoch 40/512
Epoch 41/512
Epoch 42/512
Epoch 43/512
Epoch 44/512
Epoch 45/512
Epoch 46/512
Epoch 47/512
Epoch 48/512
Epoch 49/512
Epoch 50/512
Epoch 51/512
Epoch 52/512
Epoch 53/512
Epoch 54/512
Epoch 55/512
Epoch 56/512
Epoch 57/512


Epoch 58/512
Epoch 59/512
Epoch 60/512
Epoch 61/512
Epoch 62/512
Epoch 63/512
Epoch 64/512
Epoch 65/512
Epoch 66/512
Epoch 67/512
Epoch 68/512
Epoch 69/512
Epoch 70/512
Epoch 71/512
Epoch 72/512
Epoch 73/512
Epoch 74/512
Epoch 75/512
Epoch 76/512
Epoch 77/512
Epoch 78/512
Epoch 79/512
Epoch 80/512
Epoch 81/512
Epoch 82/512
Epoch 83/512
Epoch 84/512
Epoch 85/512
Epoch 86/512
Epoch 87/512
Epoch 88/512
Epoch 89/512
Epoch 90/512
Epoch 91/512
Epoch 92/512
Epoch 93/512
Epoch 94/512
Epoch 95/512
Epoch 96/512
Epoch 97/512
Epoch 98/512
Epoch 99/512
Epoch 100/512
Epoch 101/512
Epoch 102/512
Epoch 103/512
Epoch 104/512
Epoch 105/512
Epoch 106/512
Epoch 107/512
Epoch 108/512
Epoch 109/512
Epoch 110/512
Epoch 111/512
Epoch 112/512
Epoch 113/512
Epoch 114/512


Epoch 115/512
Epoch 116/512
Epoch 117/512
Epoch 118/512
Epoch 119/512
Epoch 120/512
Epoch 121/512
Epoch 122/512
Epoch 123/512
Epoch 124/512
Epoch 125/512
Epoch 126/512
Epoch 127/512
Epoch 128/512
Epoch 129/512
Epoch 130/512
Epoch 131/512
Epoch 132/512
Epoch 133/512
Epoch 134/512
Epoch 135/512
Epoch 136/512
Epoch 137/512
Epoch 138/512
Epoch 139/512
Epoch 140/512
Epoch 141/512
Epoch 142/512
Epoch 143/512
Epoch 144/512
Epoch 145/512
Epoch 146/512
Epoch 147/512
Epoch 148/512
Epoch 149/512
Epoch 150/512
Epoch 151/512
Epoch 152/512
Epoch 153/512
Epoch 154/512
Epoch 155/512
Epoch 156/512
Epoch 157/512
Epoch 158/512
Epoch 159/512
Epoch 160/512
Epoch 161/512
Epoch 162/512
Epoch 163/512
Epoch 164/512
Epoch 165/512
Epoch 166/512
Epoch 167/512
Epoch 168/512
Epoch 169/512
Epoch 170/512
Epoch 171/512
Epoch 172/512
Epoch 173/512
Epoch 174/512
Epoch 175/512
Epoch 176/512
Epoch 177/512
Epoch 178/512
Epoch 179/512
Epoch 180/512
Epoch 181/512
Epoch 182/512
Epoch 183/512
Epoch 184/512
Epoch 185/512
Epoch 

Epoch 228/512
Epoch 229/512
Epoch 230/512
Epoch 231/512
Epoch 232/512
Epoch 233/512
Epoch 234/512
Epoch 235/512
Epoch 236/512
Epoch 237/512
Epoch 238/512
Epoch 239/512
Epoch 240/512
Epoch 241/512
Epoch 242/512
Epoch 243/512
Epoch 244/512
Epoch 245/512
Epoch 246/512
Epoch 247/512
Epoch 248/512
Epoch 249/512
Epoch 250/512
Epoch 251/512
Epoch 252/512
Epoch 253/512
Epoch 254/512
Epoch 255/512
Epoch 256/512
Epoch 257/512
Epoch 258/512
Epoch 259/512
Epoch 260/512
Epoch 261/512
Epoch 262/512
Epoch 263/512
Epoch 264/512
Epoch 265/512
Epoch 266/512
Epoch 267/512
Epoch 268/512
Epoch 269/512
Epoch 270/512
Epoch 271/512
Epoch 272/512
Epoch 273/512
Epoch 274/512
Epoch 275/512
Epoch 276/512
Epoch 277/512
Epoch 278/512
Epoch 279/512
Epoch 280/512
Epoch 281/512
Epoch 282/512
Epoch 283/512
Epoch 284/512
Epoch 285/512
Epoch 286/512
Epoch 287/512
Epoch 288/512
Epoch 289/512
Epoch 290/512
Epoch 291/512
Epoch 292/512
Epoch 293/512
Epoch 294/512
Epoch 295/512
Epoch 296/512
Epoch 297/512
Epoch 298/512
Epoch 

KeyboardInterrupt: 

In [None]:
# Save the model under a folder named after the pretrained model and the
# configured epochs / batch size.
# NOTE(review): the name uses the configured `epochs`, not the number of
# epochs actually completed - keep that in mind if training stopped early.
model_path = 'model/{}_epoch_{}_batch_{}'.format(model_name, epochs, batch_size)
model.save(model_path)

In [10]:
# Reload the saved model from disk and score it on the held-out test split.
from kashgari.utils import load_model
# Same path template as the save cell, so this picks up the model written there.
model_path = 'model/{}_epoch_{}_batch_{}'.format(model_name, epochs, batch_size)
model = load_model(model_path)
model.evaluate(test_x, test_y)

  This is separate from the ipykernel package so we can avoid doing imports until
2020-12-23 13:03:59,978 [DEBUG] kashgari - ------------------------------------------------
2020-12-23 13:03:59,979 [DEBUG] kashgari - Loaded transformer model's vocab
2020-12-23 13:03:59,979 [DEBUG] kashgari - config_path       : /home/Danny/pretrain_model/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16/bert_config.json
2020-12-23 13:03:59,979 [DEBUG] kashgari - vocab_path      : /home/Danny/pretrain_model/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16/vocab.txt
2020-12-23 13:03:59,980 [DEBUG] kashgari - checkpoint_path : /home/Danny/pretrain_model/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16/bert_model.ckpt
2020-12-23 13:03:59,980 [DEBUG] kashgari - Top 50 words    : ['[PAD]', '[unused1]', '[unused2]', '[unused3]', '[unused4]', '[unused5]', '[unused6]', '[unused7]', '[unused8]', '[unused9]', '[unused10]', '[unused11]', '[unused12]', '[unused13]', '[unused14]', '[unused15]', '[unused16]', '[unused17]', 



2020-12-23 13:06:17,000 [DEBUG] kashgari - predict output: (15233, 65)
2020-12-23 13:06:17,001 [DEBUG] kashgari - predict output argmax: [[0 1 1 ... 1 1 1]
 [0 1 1 ... 1 1 1]
 [0 1 1 ... 1 1 1]
 ...
 [0 1 1 ... 1 1 1]
 [0 1 1 ... 1 1 1]
 [0 1 1 ... 1 1 1]]



                precision    recall  f1-score   support

      med_exam     0.8219    0.8955    0.8571       134
          time     0.8431    0.8383    0.8407       878
    profession     1.0000    0.4118    0.5833        17
      location     0.9341    0.8333    0.8808       102
         money     0.9487    0.8605    0.9024        43
        family     0.8571    0.5455    0.6667        11
          name     0.9630    0.8864    0.9231        88
       contact     0.8571    0.6667    0.7500        18
     education     1.0000    0.5000    0.6667         2
            ID     1.0000    0.4444    0.6154         9
        others     0.0000    0.0000    0.0000         1
  organization     1.0000    1.0000    1.0000         2
clinical_event     0.0000    0.0000    0.0000         1

     micro avg     0.8601    0.8331    0.8464      1306
     macro avg     0.8622    0.8331    0.8442      1306



{'detail': {'med_exam': {'precision': 0.821917808219178,
   'recall': 0.8955223880597015,
   'f1-score': 0.8571428571428571,
   'support': 134},
  'time': {'precision': 0.843069873997709,
   'recall': 0.8382687927107062,
   'f1-score': 0.8406624785836665,
   'support': 878},
  'profession': {'precision': 1.0,
   'recall': 0.4117647058823529,
   'f1-score': 0.5833333333333334,
   'support': 17},
  'location': {'precision': 0.9340659340659341,
   'recall': 0.8333333333333334,
   'f1-score': 0.8808290155440415,
   'support': 102},
  'money': {'precision': 0.9487179487179487,
   'recall': 0.8604651162790697,
   'f1-score': 0.9024390243902439,
   'support': 43},
  'family': {'precision': 0.8571428571428571,
   'recall': 0.5454545454545454,
   'f1-score': 0.6666666666666665,
   'support': 11},
  'name': {'precision': 0.9629629629629629,
   'recall': 0.8863636363636364,
   'f1-score': 0.923076923076923,
   'support': 88},
  'contact': {'precision': 0.8571428571428571,
   'recall': 0.666666666

In [14]:
# Training curves. Accuracy and loss are drawn on separate panels so each
# axis label matches what is plotted (previously the loss curves shared an
# axis titled and labelled "accuracy").
# Requires `history` from the training cell in the current kernel session.
fig, (ax_acc, ax_loss) = plt.subplots(1, 2, figsize=(12, 4))

ax_acc.plot(history.history['accuracy'])
ax_acc.plot(history.history['val_accuracy'])
ax_acc.set_title('model accuracy')
ax_acc.set_ylabel('accuracy')
ax_acc.set_xlabel('epoch')
ax_acc.legend(['accuracy', 'val_accuracy'])

ax_loss.plot(history.history['loss'])
ax_loss.plot(history.history['val_loss'])
ax_loss.set_title('model loss')
ax_loss.set_ylabel('loss')
ax_loss.set_xlabel('epoch')
ax_loss.legend(['loss', 'val_loss'])

fig.tight_layout()
plt.show()

NameError: name 'history' is not defined

In [15]:
# article_id must be kept; convert the prediction file into a DataFrame
def predicting_txt_to_dataframe(path):
    """Load the articles to predict on into a DataFrame.

    The file is read as UTF-8 and split into records on lines equal to
    ``--------------------``; empty lines are ignored. The first line of a
    record has the ``article_id:`` prefix removed and is converted to
    ``int``; the remaining lines of the record follow it in the row.

    Args:
        path: path of the text file to read.

    Returns:
        A ``pandas.DataFrame`` with columns ``article_id`` and ``text``,
        one row per record.
    """
    # Explicit encoding: the data is Chinese text and must not depend on
    # the platform's default locale.
    with open(path, 'r', encoding='utf-8') as f:
        txt_list = f.read().split('\n')

    row_list = list()
    tmp_list = list()
    for row in txt_list:
        if row == '--------------------':
            # A separator with nothing collected (leading or repeated
            # separator) is skipped instead of raising IndexError.
            if tmp_list:
                tmp_list[0] = int(tmp_list[0].replace('article_id:', ''))
                row_list.append(tmp_list)
            tmp_list = list()
            continue
        if len(row) > 0:
            tmp_list.append(row)
    return pd.DataFrame(row_list, columns=['article_id', 'text'])

In [16]:
def predict_NER(text):
    """Predict one tag per character of ``text``.

    The text is split at the full-width comma, full stop and question mark;
    the pieces are passed as character lists to the notebook-level
    ``model.predict``. The per-sentence tags are concatenated with an
    ``'O'`` standing in for each separator character, so the result has
    exactly ``len(text)`` entries.

    Args:
        text: the article text.

    Returns:
        A flat list of tag strings aligned with the characters of ``text``.
    """
    sentences = [list(piece) for piece in re.split('\uff0c|\u3002|\uff1f', text)]
    predictions = model.predict(sentences)

    y_list = list()
    for tokens, labels in zip(sentences, predictions):
        # The model may return fewer labels than tokens (e.g. when a long
        # sentence is truncated) or more; pad with 'O' / trim so every later
        # character offset stays aligned with the original text.
        labels = list(labels)[:len(tokens)]
        labels.extend(['O'] * (len(tokens) - len(labels)))
        y_list.extend(labels)
        y_list.append('O')  # placeholder for the separator (， 。 ？)
    # No separator follows the final piece.
    return y_list[:-1]

In [None]:
def output_NER(article_id, text, y_list):
    """Convert a tag sequence into tab-separated entity rows.

    An entity starts at a tag beginning with ``'B'`` and ends at the next
    ``'O'`` tag, at the next tag beginning with ``'B'``, or at the end of
    the sequence. Tags that are neither ``'O'`` nor a ``'B'`` tag never
    start an entity on their own.

    Args:
        article_id: identifier written in the first column of every row.
        text: the article text the tags refer to.
        y_list: one tag per character of ``text``.

    Returns:
        A string with one line per entity:
        ``article_id<TAB>start<TAB>end<TAB>entity_text<TAB>entity_type``,
        or an empty string when no entity is found.
    """
    row_format = '{}\t{}\t{}\t{}\t{}\n'
    rows = list()
    start_position = None  # None means no entity is currently open
    entity_type = None

    for i, tag in enumerate(y_list):
        starts_entity = tag != 'O' and tag.startswith('B')
        # Close the open entity on 'O' or when a new entity starts right
        # after it (previously the earlier entity was silently dropped).
        if start_position is not None and (tag == 'O' or starts_entity):
            rows.append(row_format.format(
                article_id, start_position, i, text[start_position:i], entity_type))
            start_position = None
        if starts_entity:
            start_position = i
            entity_type = tag.split('-', 1)[-1]

    # An entity that runs to the end of the sequence has no trailing 'O';
    # emit it here (previously it was lost).
    if start_position is not None:
        end_position = len(y_list)
        rows.append(row_format.format(
            article_id, start_position, end_position,
            text[start_position:end_position], entity_type))

    return ''.join(rows)

In [None]:
# Predict entities for every test article and assemble the submission text.
df = predicting_txt_to_dataframe('/home/Danny/ai-cup-2020/datasets/stage5/test.txt')
# Collect the header and the per-article rows, then join once at the end
# instead of growing one string inside the loop.
output_parts = ["article_id\tstart_position\tend_position\tentity_text\tentity_type\n"]

for article_id, text in zip(df['article_id'], df['text']):
    y_list = predict_NER(text)
    output_parts.append(output_NER(article_id, text, y_list))

output_str = ''.join(output_parts)

In [None]:
# print(output_str)

In [None]:
# Write the submission as a UTF-8 TSV named after the model and run settings.
output_path = 'output/{}_epoch_{}_batch_{}.tsv'.format(model_name, epochs, batch_size)
# Create the output folder first so the write does not fail on a fresh checkout.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as f:
    f.write(output_str)

In [None]:
# Read the written submission back as a tab-separated table and display it
# as a sanity check.
# NOTE(review): this rebinds `df`, which the prediction cell used for the
# test articles.
df = pd.read_csv(output_path, sep='\t')
df