In [1]:
import tensorflow as tf

def select_gpu(N):
    """Restrict TensorFlow to a single physical GPU and enable memory growth.

    The list of physical GPUs is printed first. When at least one GPU is
    present, memory growth is switched on for every physical GPU, only the
    device at index ``N`` is left visible, and the physical/logical GPU counts
    are printed.

    Args:
        N: Index into the list of physical GPUs of the device to keep visible.

    Returns:
        None.

    A ``RuntimeError`` raised by any of the TensorFlow configuration calls is
    printed instead of being propagated.
    """
    device_config = tf.config.experimental
    physical_gpus = device_config.list_physical_devices('GPU')
    print(physical_gpus)

    # Nothing to configure on a machine without GPUs.
    if not physical_gpus:
        return

    try:
        for device in physical_gpus:
            device_config.set_memory_growth(device, True)
        device_config.set_visible_devices(physical_gpus[N], 'GPU')
        visible_gpus = device_config.list_logical_devices('GPU')
        print(len(physical_gpus), "Physical GPUs,", len(visible_gpus), "Logical GPUs")
    except RuntimeError as err:
        print(err)

In [2]:
# Leave only the physical GPU at index 1 visible to TensorFlow (see select_gpu).
select_gpu(1)
# Passed to the transformer embedding and used as the chunk size at prediction time.
sequence_length = 128
# Number of training epochs handed to model.fit; also part of the saved file names.
epochs = 10
# Name of the pretrained-model sub-directory to load; also part of the saved file names.
model_name = 'albert_xxlarge'

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU')]
2 Physical GPUs, 1 Logical GPUs


In [3]:
import re

def txt_to_list(path, sequence_length):
    """Parse an annotated corpus file into per-sentence characters and tags.

    The file is treated as a series of records, each terminated by a line of
    twenty dashes; blank lines are ignored. Within a record:

    * line 0 is the raw text,
    * line 1 is skipped (presumably a column header -- confirm against the
      data files),
    * every further line is a tab-separated annotation whose fields 1 and 2
      are the start and end character offsets (end exclusive) and whose last
      field is the entity type.

    Each text is tagged character by character ('O', 'B-<type>', 'I-<type>')
    and then cut into sentences at the full-width comma, full stop and
    question mark. The punctuation characters themselves are dropped, so an
    entity that spans one of them is cut at that point.

    Args:
        path: Path of the UTF-8 encoded corpus file.
        sequence_length: Accepted for interface compatibility; not used, the
            sentences are returned at their natural length.

    Returns:
        ``[x, y]`` where ``x`` is a list of sentences (each a list of
        characters) and ``y`` the matching list of tag lists. ``x[i]`` and
        ``y[i]`` always have the same length.

    Raises:
        ValueError / IndexError: if an annotation line does not contain
            integer offsets inside the text.
    """
    record_separator = '--------------------'
    sentence_breaks = ('，', '。', '？')

    # The corpus contains CJK text, so do not rely on the platform default
    # encoding.
    with open(path, 'r', encoding='utf-8') as f:
        lines = f.read().split('\n')

    # Group the non-empty lines into records; empty records are skipped and a
    # final record without a closing separator is kept.
    records = []
    current = []
    for line in lines:
        if line == record_separator:
            if current:
                records.append(current)
            current = []
        elif line != '':
            current.append(line)
    if current:
        records.append(current)

    x = []
    y = []
    for record in records:
        text = record[0]
        tags = ['O'] * len(text)
        for annotation in record[2:]:
            fields = annotation.split('\t')
            start, end = int(fields[1]), int(fields[2])
            # Annotations with inverted offsets are ignored.
            if start > end:
                continue
            tags[start] = 'B-{}'.format(fields[-1])
            for position in range(start + 1, end):
                tags[position] = 'I-{}'.format(fields[-1])

        # Split characters and tags in one pass so they cannot get out of
        # step; the trailing sentence gets its own tags rather than the tag
        # list of the whole text.
        sentence, sentence_tags = [], []
        for char, tag in zip(text, tags):
            if char in sentence_breaks:
                x.append(sentence)
                y.append(sentence_tags)
                sentence, sentence_tags = [], []
            else:
                sentence.append(char)
                sentence_tags.append(tag)
        x.append(sentence)
        y.append(sentence_tags)
    return [x, y]

In [4]:
# Parse the stage-1 sample file into sentence character lists (x1) and their label lists (y1).
x1, y1 = txt_to_list('/home/Danny/AI-CUP-2020/datasets/stage1/SampleData_deid.txt', sequence_length)

In [5]:
# Parse the stage-2 training file into sentence character lists (x2) and their label lists (y2).
x2, y2 = txt_to_list('/home/Danny/AI-CUP-2020/datasets/stage2/train_1_update.txt', sequence_length)

In [6]:
# Parse the stage-4 training file into sentence character lists (x4) and their label lists (y4).
x4, y4 = txt_to_list('/home/Danny/AI-CUP-2020/datasets/stage4/train_2.txt', sequence_length)

In [7]:
# Concatenate the three parsed corpora into a single dataset.
x = x1 + x2 + x4
y = y1 + y2 + y4
# Sanity check: inputs and labels must contain the same number of entries.
print(len(x))
print(len(y))

76508
76508


In [8]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the entries as the test set, then take 20% of the remainder
# for validation (roughly 64/16/20 overall). random_state is fixed so the
# split is reproducible.
# NOTE(review): the split is made per sentence, so sentences that came from
# the same source text can end up in different subsets.
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2, random_state=42)
train_x, valid_x, train_y, valid_y = train_test_split(train_x, train_y, test_size=0.2, random_state=42)
# Report the size of each subset (inputs, labels).
print(len(train_x), len(train_y))
print(len(valid_x), len(valid_y))
print(len(test_x), len(test_y))

48964 48964
12242 12242
15302 15302


In [9]:
import os
import kashgari
from kashgari.tasks.labeling import BiLSTM_CRF_Model
# from kashgari.embeddings import BertEmbedding
from kashgari.embeddings import TransformerEmbedding
# Turn on Kashgari's cuDNN cell option (presumably selects cuDNN-backed
# recurrent layers -- confirm against the Kashgari documentation).
kashgari.config.use_cudnn_cell = True
# Locate the pretrained checkpoint files under the directory named by model_name.
embedding_path = '/home/Danny/pretrain_model/{}'.format(model_name)
vocab_path = os.path.join(embedding_path, 'vocab_chinese.txt')
config_path = os.path.join(embedding_path, 'albert_config.json')
checkpoint_path = os.path.join(embedding_path, 'model.ckpt-best')
# Wrap the pretrained ALBERT checkpoint as an embedding layer.
embedding = TransformerEmbedding(vocab_path, 
                                 config_path, 
                                 checkpoint_path,
                                 bert_type='albert',
                                 sequence_length=sequence_length,
                                )
# Sequence-labelling model (BiLSTM + CRF) on top of the embedding.
model = BiLSTM_CRF_Model(embedding)
# NOTE(review): the output recorded for this cell ends with a
# ResourceExhaustedError (OOM while allocating a [16384, 4096] float tensor),
# so this configuration did not complete in the saved run. A smaller
# batch_size or a smaller checkpoint may be needed -- TODO confirm.
history = model.fit(train_x,
                    train_y,
                    valid_x,
                    valid_y,
                    epochs=epochs,
                    batch_size=256,
                   )

2020-12-03 16:41:56,743 [DEBUG] kashgari - ------------------------------------------------
2020-12-03 16:41:56,743 [DEBUG] kashgari - Loaded transformer model's vocab
2020-12-03 16:41:56,743 [DEBUG] kashgari - config_path       : /home/Danny/pretrain_model/albert_xxlarge/albert_config.json
2020-12-03 16:41:56,744 [DEBUG] kashgari - vocab_path      : /home/Danny/pretrain_model/albert_xxlarge/vocab_chinese.txt
2020-12-03 16:41:56,744 [DEBUG] kashgari - checkpoint_path : /home/Danny/pretrain_model/albert_xxlarge/model.ckpt-best
2020-12-03 16:41:56,744 [DEBUG] kashgari - Top 50 words    : ['[PAD]', '[unused1]', '[unused2]', '[unused3]', '[unused4]', '[unused5]', '[unused6]', '[unused7]', '[unused8]', '[unused9]', '[unused10]', '[unused11]', '[unused12]', '[unused13]', '[unused14]', '[unused15]', '[unused16]', '[unused17]', '[unused18]', '[unused19]', '[unused20]', '[unused21]', '[unused22]', '[unused23]', '[unused24]', '[unused25]', '[unused26]', '[unused27]', '[unused28]', '[unused29]', 

ResourceExhaustedError: OOM when allocating tensor with shape[16384,4096] and type float on /job:localhost/replica:0/task:0/device:GPU:0 by allocator GPU_0_bfc [Op:Add]

In [None]:
# Score the trained model on the held-out test split.
model.evaluate(test_x, test_y)
# Save the model under model/<model_name>-epoch-<epochs>.
model_path = 'model/{}-epoch-{}'.format(model_name, epochs)
model.save(model_path)

In [None]:
from matplotlib import pyplot as plt
# Accuracy and loss are different quantities on different scales, so each gets
# its own panel with its own title and y-axis label instead of sharing one
# plot labelled "accuracy".
fig, (ax_accuracy, ax_loss) = plt.subplots(1, 2, figsize=(12, 4))

ax_accuracy.plot(history.history['accuracy'], label='accuracy')
ax_accuracy.plot(history.history['val_accuracy'], label='val_accuracy')
ax_accuracy.set_title('model accuracy')
ax_accuracy.set_ylabel('accuracy')
ax_accuracy.set_xlabel('epoch')
ax_accuracy.legend()

ax_loss.plot(history.history['loss'], label='loss')
ax_loss.plot(history.history['val_loss'], label='val_loss')
ax_loss.set_title('model loss')
ax_loss.set_ylabel('loss')
ax_loss.set_xlabel('epoch')
ax_loss.legend()

fig.tight_layout()
plt.show()

In [None]:
import pandas as pd
# Read the stage-4 development CSV; later cells use its `article_id` and `text` columns.
df = pd.read_csv('/home/Danny/AI-CUP-2020/datasets/stage4/development_2.csv')
# Bare expression so the notebook displays the frame as the cell output.
df

In [None]:
def predict_NER(model, text, sequence_length):
    """Predict one label per character of ``text``.

    The text is cut into consecutive chunks of at most ``sequence_length``
    characters, the chunks are passed to ``model.predict`` in a single call,
    and the per-chunk label lists are concatenated.

    The label list for each chunk is truncated or padded with 'O' to the
    length of the chunk, so the returned list always has exactly
    ``len(text)`` entries and its indices can be used as character offsets
    into ``text``.
    NOTE(review): some embedding wrappers reserve positions for special
    tokens and may therefore return fewer labels than characters -- confirm
    against the Kashgari version in use; if so, characters at the end of a
    full chunk fall back to 'O' here.

    Args:
        model: Object with a ``predict(list_of_char_lists)`` method returning
            one label sequence per input sequence.
        text: String to label.
        sequence_length: Maximum number of characters per chunk; must be >= 1.

    Returns:
        List of labels, one per character of ``text`` (empty for empty text,
        in which case the model is not called).

    Raises:
        ValueError: if ``sequence_length`` is smaller than 1.
    """
    if sequence_length < 1:
        raise ValueError('sequence_length must be at least 1')
    if not text:
        return []

    chunks = [list(text[start:start + sequence_length])
              for start in range(0, len(text), sequence_length)]
    predictions = model.predict(chunks)

    merged = []
    for chunk, labels in zip(chunks, predictions):
        # Keep labels aligned with character positions whatever length the
        # model returned for this chunk.
        aligned = list(labels)[:len(chunk)]
        aligned.extend(['O'] * (len(chunk) - len(aligned)))
        merged.extend(aligned)
    return merged

In [None]:
def output_NER(article_id, text, y):
    """Turn a per-character tag sequence into tab-separated entity rows.

    An entity starts at a tag beginning with 'B' and ends (exclusive) at the
    next 'O' tag, at the next tag beginning with 'B', or at the end of the
    tag sequence. Any other tag inside an open entity continues it; tags
    outside an entity that are neither 'O' nor 'B...' are ignored. The entity
    type is the part of the opening tag after the first '-'.

    Args:
        article_id: Identifier written in the first column of every row.
        text: Text the tags refer to; entity text is sliced from it.
        y: Sequence of tag strings, where index i describes ``text[i]``.

    Returns:
        A string with one line per entity in the form
        ``article_id\\tstart\\tend\\tentity_text\\tentity_type\\n``; empty when
        no entity is found.
    """
    row_format = '{}\t{}\t{}\t{}\t{}\n'
    rows = []
    start_position = None
    entity_type = None

    for position, tag in enumerate(y):
        opens_entity = tag.startswith('B')
        # An 'O' tag or the start of the next entity closes the open one, so
        # back-to-back entities are both reported.
        if start_position is not None and (tag == 'O' or opens_entity):
            rows.append(row_format.format(article_id, start_position, position,
                                          text[start_position:position], entity_type))
            start_position = None
        if opens_entity:
            start_position = position
            entity_type = tag.split('-', 1)[-1]

    # An entity still open at the end of the sequence ends with it.
    if start_position is not None:
        end_position = len(y)
        rows.append(row_format.format(article_id, start_position, end_position,
                                      text[start_position:end_position], entity_type))

    return ''.join(rows)

In [None]:
# Start with the TSV header row, then append the entity rows predicted for each article.
output="article_id\tstart_position\tend_position\tentity_text\tentity_type\n"
for article_id, text in zip(df['article_id'], df['text']):
    # Label every character of the article, then convert the labels to rows.
    y = predict_NER(model, text, sequence_length)
    output += output_NER(article_id, text, y)

In [None]:
import os

# Write the accumulated TSV text to output/<model_name>_epo_<epochs>.tsv.
output_path = 'output/{}_epo_{}.tsv'.format(model_name, epochs)
# open() does not create missing directories, so make sure the target
# directory exists before writing.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as f:
    f.write(output)