In [1]:
import re
import pandas as pd
import tensorflow as tf
from matplotlib import pyplot as plt

In [2]:
def select_gpu(N):
    gpus = tf.config.experimental.list_physical_devices('GPU')
    print(gpus)
    if gpus:
        try:
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)
            tf.config.experimental.set_visible_devices(gpus[N], 'GPU')
            logical_gpus = tf.config.experimental.list_logical_devices('GPU')
            print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
        except RuntimeError as e:
            print(e)    

In [3]:
select_gpu(0)
epochs = 512
batch_size = 1024
model_name = 'bert-chinese'
embedding_path = '/home/Danny/pretrain_model/{}'.format(model_name)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:2', device_type='GPU')]
3 Physical GPUs, 1 Logical GPUs


In [4]:
# 不用儲存article_id, 轉成list
def training_txt_to_list(path):
    with open(path, 'r') as f:
        txt = str(f.read())
    txt_list = txt.split('\n')
    text_label_list = list()
    tmp = list()
    for line in txt_list:
        if line == '--------------------':
            text_label_list.append(tmp)
            tmp = list()
            continue
        if line == '':
            continue
        tmp.append(line)
    x = list()
    y = list()
    for text_label in text_label_list:
        text = text_label[0]
        label = text_label[2:]
        label_list = ['O' for i in range(len(text))]
        for i in label:
            entity = i.split('\t')
            if int(entity[1]) > int(entity[2]):
                continue
            b = int(entity[1])
            label_list[b] = 'B-{}'.format(entity[-1])
            for j in range(int(entity[1])+1, int(entity[2])):
                label_list[j] = 'I-{}'.format(entity[-1])
        for i, j in enumerate(text):
            if j == '，' or j == '。' or j == '？':
                label_list[i] = j
                
        text_list = re.split('\uff0c|\u3002|\uff1f', text)
        for sentence in text_list:
            x.append([i for i in sentence])
        x = x[:-1]
            
        sentence = list()
        for i in label_list:
            if i == '，' or i == '。' or i == '？':
                y.append(sentence)
                sentence = list()
            else:
                sentence.append(i)
                
    return [x, y]

In [5]:
x1, y1 = training_txt_to_list('/home/Danny/AI-CUP-2020/datasets/stage1/SampleData_deid.txt')

In [6]:
x2, y2 = training_txt_to_list('/home/Danny/AI-CUP-2020/datasets/stage2/train_1_update.txt')

In [7]:
x4, y4 = training_txt_to_list('/home/Danny/AI-CUP-2020/datasets/stage4/train_2.txt')

In [8]:
x = x1 + x2 + x4
y = y1 + y2 + y4
print(len(x))
print(len(y))

76162
76162


In [9]:
df = pd.DataFrame({'x': x, 'y': y})
df

Unnamed: 0,x,y
0,"[醫, 師, ：, 你, 有, 做, 超, 音, 波, 嘛]","[O, O, O, O, O, O, O, O, O, O]"
1,"[那, 我, 們, 來, 看, 報, 告]","[O, O, O, O, O, O, O]"
2,"[有, 些, 部, 分, 有, 紅, 字, 耶]","[O, O, O, O, O, O, O, O]"
3,"[民, 眾, ：, 紅, 字, 是, 甚, 麼, 意, 思]","[O, O, O, O, O, O, O, O, O, O]"
4,"[醫, 師, ：, 就, 是, 肝, 功, 能, 有, 比, 較, 高]","[O, O, O, O, O, O, O, O, O, O, O, O]"
...,...,...
76157,"[醫, 師, ：, 他, 有, 在, 騎, 腳, 踏, 車]","[O, O, O, O, O, O, O, O, O, O]"
76158,"[民, 眾, ：, 恩]","[O, O, O, O]"
76159,"[醫, 師, ：, 騎, 腳, 踏, 車, 可, 以]","[O, O, O, O, O, O, O, O, O]"
76160,"[小, 美]","[B-name, I-name]"


In [10]:
clinical_event = pd.read_csv('/home/Danny/AI-CUP-2020/datasets/augmentation/clinical_event.csv')
contact = pd.read_csv('/home/Danny/AI-CUP-2020/datasets/augmentation/contact.csv')
education = pd.read_csv('/home/Danny/AI-CUP-2020/datasets/augmentation/education.csv')
family = pd.read_csv('/home/Danny/AI-CUP-2020/datasets/augmentation/family.csv')
profession = pd.read_csv('/home/Danny/AI-CUP-2020/datasets/augmentation/profession.csv')

In [11]:
df = df.append(clinical_event)
df = df.append(contact)
df = df.append(education)
df = df.append(family)
df = df.append(profession)
df

Unnamed: 0,x,y
0,"[醫, 師, ：, 你, 有, 做, 超, 音, 波, 嘛]","[O, O, O, O, O, O, O, O, O, O]"
1,"[那, 我, 們, 來, 看, 報, 告]","[O, O, O, O, O, O, O]"
2,"[有, 些, 部, 分, 有, 紅, 字, 耶]","[O, O, O, O, O, O, O, O]"
3,"[民, 眾, ：, 紅, 字, 是, 甚, 麼, 意, 思]","[O, O, O, O, O, O, O, O, O, O]"
4,"[醫, 師, ：, 就, 是, 肝, 功, 能, 有, 比, 較, 高]","[O, O, O, O, O, O, O, O, O, O, O, O]"
...,...,...
495,"['那', '個', '時', '候', '是', '白', '天', '釣', '具', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-profess..."
496,"['因', '為', '我', '是', '還', '是', '網', '球', '球', ...","['O', 'O', 'O', 'O', 'O', 'O', 'B-profession',..."
497,"['醫', '師', '：', '你', '是', '採', '椰', '子', '工', ...","['O', 'O', 'O', 'O', 'O', 'B-profession', 'I-p..."
498,"['安', '養', '院', '工', '作', '人', '員']","['B-profession', 'I-profession', 'I-profession..."


In [13]:
df['x'] = df['x'].apply(lambda row : str(row))
df['y'] = df['y'].apply(lambda row : str(row))
# df = df.drop_duplicates()
df['x'] = df['x'].apply(lambda row : eval(row))
df['y'] = df['y'].apply(lambda row : eval(row))
df

Unnamed: 0,x,y
0,"[醫, 師, ：, 你, 有, 做, 超, 音, 波, 嘛]","[O, O, O, O, O, O, O, O, O, O]"
1,"[那, 我, 們, 來, 看, 報, 告]","[O, O, O, O, O, O, O]"
2,"[有, 些, 部, 分, 有, 紅, 字, 耶]","[O, O, O, O, O, O, O, O]"
3,"[民, 眾, ：, 紅, 字, 是, 甚, 麼, 意, 思]","[O, O, O, O, O, O, O, O, O, O]"
4,"[醫, 師, ：, 就, 是, 肝, 功, 能, 有, 比, 較, 高]","[O, O, O, O, O, O, O, O, O, O, O, O]"
...,...,...
495,"[那, 個, 時, 候, 是, 白, 天, 釣, 具, 製, 造, 工, 人]","[O, O, O, O, O, O, O, B-profession, I-professi..."
496,"[因, 為, 我, 是, 還, 是, 網, 球, 球, 員, 嘛]","[O, O, O, O, O, O, B-profession, I-profession,..."
497,"[醫, 師, ：, 你, 是, 採, 椰, 子, 工, 是, 不, 是]","[O, O, O, O, O, B-profession, I-profession, I-..."
498,"[安, 養, 院, 工, 作, 人, 員]","[B-profession, I-profession, I-profession, I-p..."


In [14]:
x = df['x'].to_list()
y = df['y'].to_list()

In [15]:
from sklearn.model_selection import train_test_split
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2, random_state=42)
train_x, valid_x, train_y, valid_y = train_test_split(train_x, train_y, test_size=0.2, random_state=42)
print(len(train_x), len(train_y))
print(len(valid_x), len(valid_y))
print(len(test_x), len(test_y))

50151 50151
12538 12538
15673 15673


In [None]:
%%time
import kashgari
from kashgari.tasks.labeling import BiLSTM_CRF_Model
from kashgari.embeddings import BertEmbedding
kashgari.config.use_cudnn_cell = True
bert_embedding = BertEmbedding(embedding_path, 
                           sequence_length='auto',
                           trainable=True,
                           task='kashgari.LABELING', 
                          )
model = BiLSTM_CRF_Model(bert_embedding)
history = model.fit(train_x,
                    train_y,
                    valid_x,
                    valid_y,
                    epochs=epochs,
                    batch_size=batch_size,
                   )

2020-12-17 17:05:43,369 [DEBUG] kashgari - ------------------------------------------------
2020-12-17 17:05:43,370 [DEBUG] kashgari - Loaded transformer model's vocab
2020-12-17 17:05:43,370 [DEBUG] kashgari - config_path       : /home/Danny/pretrain_model/bert-chinese/bert_config.json
2020-12-17 17:05:43,370 [DEBUG] kashgari - vocab_path      : /home/Danny/pretrain_model/bert-chinese/vocab.txt
2020-12-17 17:05:43,371 [DEBUG] kashgari - checkpoint_path : /home/Danny/pretrain_model/bert-chinese/bert_model.ckpt
2020-12-17 17:05:43,371 [DEBUG] kashgari - Top 50 words    : ['[PAD]', '[unused1]', '[unused2]', '[unused3]', '[unused4]', '[unused5]', '[unused6]', '[unused7]', '[unused8]', '[unused9]', '[unused10]', '[unused11]', '[unused12]', '[unused13]', '[unused14]', '[unused15]', '[unused16]', '[unused17]', '[unused18]', '[unused19]', '[unused20]', '[unused21]', '[unused22]', '[unused23]', '[unused24]', '[unused25]', '[unused26]', '[unused27]', '[unused28]', '[unused29]', '[unused30]', '[

Epoch 1/512
Epoch 2/512
Epoch 3/512
Epoch 4/512
Epoch 5/512
Epoch 6/512
Epoch 7/512
Epoch 8/512
Epoch 9/512
Epoch 10/512
Epoch 11/512
Epoch 12/512
Epoch 13/512
Epoch 14/512
Epoch 15/512
Epoch 16/512
Epoch 17/512
Epoch 18/512
Epoch 19/512
Epoch 20/512
Epoch 21/512
Epoch 22/512
Epoch 23/512
Epoch 24/512
Epoch 25/512
Epoch 26/512
Epoch 27/512
Epoch 28/512
Epoch 29/512
Epoch 30/512
Epoch 31/512
Epoch 32/512
Epoch 33/512
Epoch 34/512
Epoch 35/512
Epoch 36/512
Epoch 37/512
Epoch 38/512
Epoch 39/512
Epoch 40/512
Epoch 41/512
Epoch 42/512
Epoch 43/512
Epoch 44/512
Epoch 45/512
Epoch 46/512
Epoch 47/512
Epoch 48/512
Epoch 49/512
Epoch 50/512
Epoch 51/512
Epoch 52/512
Epoch 53/512
Epoch 54/512
Epoch 55/512
Epoch 56/512
Epoch 57/512


Epoch 58/512
Epoch 59/512
Epoch 60/512
Epoch 61/512
Epoch 62/512
Epoch 63/512
Epoch 64/512
Epoch 65/512
Epoch 66/512
Epoch 67/512
Epoch 68/512
Epoch 69/512
Epoch 70/512
Epoch 71/512
Epoch 72/512
Epoch 73/512
Epoch 74/512
Epoch 75/512
Epoch 76/512
Epoch 77/512
Epoch 78/512
Epoch 79/512
Epoch 80/512
Epoch 81/512
Epoch 82/512
Epoch 83/512
Epoch 84/512
Epoch 85/512
Epoch 86/512
Epoch 87/512
Epoch 88/512
Epoch 89/512
Epoch 90/512
Epoch 91/512
Epoch 92/512
Epoch 93/512
Epoch 94/512
Epoch 95/512
Epoch 96/512
Epoch 97/512
Epoch 98/512
Epoch 99/512
Epoch 100/512
Epoch 101/512
Epoch 102/512
Epoch 103/512
Epoch 104/512
Epoch 105/512
Epoch 106/512
Epoch 107/512
Epoch 108/512
Epoch 109/512
Epoch 110/512
Epoch 111/512
Epoch 112/512
Epoch 113/512


Epoch 114/512
Epoch 115/512
Epoch 116/512
Epoch 117/512
Epoch 118/512
Epoch 119/512
Epoch 120/512
Epoch 121/512
Epoch 122/512
Epoch 123/512
Epoch 124/512
Epoch 125/512
Epoch 126/512
Epoch 127/512
Epoch 128/512
Epoch 129/512
Epoch 130/512
Epoch 131/512
Epoch 132/512
Epoch 133/512
Epoch 134/512
Epoch 135/512
Epoch 136/512
Epoch 137/512
Epoch 138/512
Epoch 139/512
Epoch 140/512
Epoch 141/512
Epoch 142/512
Epoch 143/512
Epoch 144/512
Epoch 145/512
Epoch 146/512
Epoch 147/512
Epoch 148/512
Epoch 149/512
Epoch 150/512
Epoch 151/512
Epoch 152/512
Epoch 153/512
Epoch 154/512
Epoch 155/512
Epoch 156/512
Epoch 157/512
Epoch 158/512
Epoch 159/512
Epoch 160/512
Epoch 161/512
Epoch 162/512
Epoch 163/512
Epoch 164/512
Epoch 165/512
Epoch 166/512
Epoch 167/512
Epoch 168/512


Epoch 169/512
Epoch 170/512
Epoch 171/512
Epoch 172/512
Epoch 173/512
Epoch 174/512
Epoch 175/512
Epoch 176/512
Epoch 177/512
Epoch 178/512
Epoch 179/512
Epoch 180/512
Epoch 181/512
Epoch 182/512
Epoch 183/512
Epoch 184/512
Epoch 185/512
Epoch 186/512
Epoch 187/512
Epoch 188/512
Epoch 189/512
Epoch 190/512
Epoch 191/512
Epoch 192/512
Epoch 193/512
Epoch 194/512
Epoch 195/512
Epoch 196/512
Epoch 197/512
Epoch 198/512
Epoch 199/512
Epoch 200/512
Epoch 201/512
Epoch 202/512
Epoch 203/512
Epoch 204/512
Epoch 205/512
Epoch 206/512
Epoch 207/512
Epoch 208/512
Epoch 209/512
Epoch 210/512
Epoch 211/512
Epoch 212/512
Epoch 213/512
Epoch 214/512
Epoch 215/512
Epoch 216/512
Epoch 217/512
Epoch 218/512
Epoch 219/512
Epoch 220/512
Epoch 221/512
Epoch 222/512
Epoch 223/512
Epoch 224/512


Epoch 225/512
Epoch 226/512
Epoch 227/512
Epoch 228/512
Epoch 229/512
Epoch 230/512
Epoch 231/512
Epoch 232/512
Epoch 233/512
Epoch 234/512
Epoch 235/512
Epoch 236/512
Epoch 237/512
Epoch 238/512
Epoch 239/512
Epoch 240/512
Epoch 241/512
Epoch 242/512
Epoch 243/512
Epoch 244/512
Epoch 245/512
Epoch 246/512
Epoch 247/512
Epoch 248/512
Epoch 249/512
Epoch 250/512
Epoch 251/512
Epoch 252/512
Epoch 253/512
Epoch 254/512
Epoch 255/512
Epoch 256/512
Epoch 257/512
Epoch 258/512
Epoch 259/512
Epoch 260/512
Epoch 261/512
Epoch 262/512
Epoch 263/512
Epoch 264/512
Epoch 265/512
Epoch 266/512
Epoch 267/512
Epoch 268/512
Epoch 269/512
Epoch 270/512
Epoch 271/512
Epoch 272/512
Epoch 273/512
Epoch 274/512
Epoch 275/512
Epoch 276/512
Epoch 277/512
Epoch 278/512
Epoch 279/512
Epoch 280/512


Epoch 281/512
Epoch 282/512
Epoch 283/512
Epoch 284/512
Epoch 285/512
Epoch 286/512
Epoch 287/512
Epoch 288/512
Epoch 289/512
Epoch 290/512
Epoch 291/512
Epoch 292/512
Epoch 293/512
Epoch 294/512
Epoch 295/512
Epoch 296/512
Epoch 297/512
Epoch 298/512
Epoch 299/512
Epoch 300/512
Epoch 301/512
Epoch 302/512
Epoch 303/512
Epoch 304/512
Epoch 305/512
Epoch 306/512
Epoch 307/512
Epoch 308/512
Epoch 309/512
Epoch 310/512
Epoch 311/512
Epoch 312/512
Epoch 313/512
Epoch 314/512
Epoch 315/512
Epoch 316/512
Epoch 317/512
Epoch 318/512
Epoch 319/512
Epoch 320/512
Epoch 321/512
Epoch 322/512
Epoch 323/512
Epoch 324/512
Epoch 325/512
Epoch 326/512
Epoch 327/512
Epoch 328/512
Epoch 329/512
Epoch 330/512
Epoch 331/512
Epoch 332/512
Epoch 333/512
Epoch 334/512
Epoch 335/512
Epoch 336/512


Epoch 337/512
Epoch 338/512
Epoch 339/512
Epoch 340/512
Epoch 341/512
Epoch 342/512
Epoch 343/512
Epoch 344/512
Epoch 345/512
Epoch 346/512
Epoch 347/512
Epoch 348/512
Epoch 349/512
Epoch 350/512
Epoch 351/512
Epoch 352/512
Epoch 353/512
Epoch 354/512
Epoch 355/512
Epoch 356/512
Epoch 357/512
Epoch 358/512
Epoch 359/512
Epoch 360/512
Epoch 361/512
Epoch 362/512
Epoch 363/512
Epoch 364/512
Epoch 365/512
Epoch 366/512
Epoch 367/512
Epoch 368/512
Epoch 369/512
Epoch 370/512
Epoch 371/512
Epoch 372/512
Epoch 373/512
Epoch 374/512
Epoch 375/512
Epoch 376/512
Epoch 377/512
Epoch 378/512
Epoch 379/512
Epoch 380/512
Epoch 381/512
Epoch 382/512
Epoch 383/512
Epoch 384/512
Epoch 385/512
Epoch 386/512
Epoch 387/512
Epoch 388/512
Epoch 389/512
Epoch 390/512
Epoch 391/512
Epoch 392/512


Epoch 393/512
Epoch 394/512
Epoch 395/512
Epoch 396/512
Epoch 397/512
Epoch 398/512
Epoch 399/512
Epoch 400/512
Epoch 401/512
Epoch 402/512
Epoch 403/512
Epoch 404/512
Epoch 405/512
Epoch 406/512
Epoch 407/512
Epoch 408/512
Epoch 409/512
Epoch 410/512
Epoch 411/512
Epoch 412/512
Epoch 413/512
Epoch 414/512

In [None]:
model.evaluate(test_x, test_y)
model_path = 'model/duplicates_augmentation_{}_epoch_{}_batch_{}'.format(model_name, epochs, batch_size)
model.save(model_path)

In [None]:
plt.plot(history.history['accuracy'])
plt.plot(history.history['val_accuracy'])
plt.plot(history.history['loss'])
plt.plot(history.history['val_loss'])
plt.title('model accuracy')
plt.ylabel('accuracy')
plt.xlabel('epoch')
plt.legend(['accuracy', 'val_accuracy', 'loss', 'val_loss'])
plt.show()

In [None]:
# 需要儲存article_id, 轉成dataframe
def predicting_txt_to_dataframe(path):    
    with open(path, 'r') as f:
        txt = str(f.read())
        txt_list = txt.split('\n')
    row_list = list()
    tmp_list = list()
    for row in txt_list:
        if row == '--------------------':
            tmp_list[0] = tmp_list[0].replace('article_id:', '')
            tmp_list[0] = int(tmp_list[0])
            row_list.append(tmp_list)
            tmp_list = list()
            continue
        if len(row) > 0:
            tmp_list.append(row)
    df = pd.DataFrame(row_list, columns=['article_id','text'])
    return df

In [None]:
def predict_NER(text):
    x_list = list()    
    text_list = re.split('\uff0c|\u3002|\uff1f', text)
    for article in text_list:
        x_list.append([i for i in article])
    y_list_list = model.predict(x_list)
    y_list = list()
    for sentence in y_list_list:
        for word in sentence:
            y_list.append(word)
        y_list.append('O') # append(，。？)
    y_list = y_list[:-1]
    return y_list

In [None]:
def output_NER(article_id, text, y_list):
    output_str = str()
    flag = False
    for i, j in enumerate(y_list):
        if j != 'O':
            if j[0] == 'B':
                start_position = i
                entity_type = j.split('-')[-1]
                flag = True
        elif j == 'O' and flag == True:
            end_position = i
            flag = False
            entity_text = text[start_position: end_position]
            entity = '{}\t{}\t{}\t{}\t{}\n'.format(article_id, start_position, end_position, entity_text, entity_type)  
            output_str += entity
    return output_str

In [None]:
df = predicting_txt_to_dataframe('/home/Danny/AI-CUP-2020/datasets/stage5/test.txt')
output_str = "article_id\tstart_position\tend_position\tentity_text\tentity_type\n"

for article_id, text in zip(df['article_id'], df['text']):
    x_list = [word for word in text]
    y_list = predict_NER(text)
    output_str += output_NER(article_id, text, y_list)

In [None]:
output_path = 'output/duplicates_augmentation_{}_epoch_{}_batch_{}.tsv'.format(model_name, epochs, batch_size)
with open(output_path, 'w', encoding='utf-8') as f:
    f.write(output_str)

In [None]:
df = pd.read_csv(output_path, sep='\t')
df