In [1]:
import ast
import os
import re

import pandas as pd
import tensorflow as tf
from matplotlib import pyplot as plt

In [2]:
def select_gpu(N):
    """Make only the N-th physical GPU visible to TensorFlow.

    Prints the list of physical GPUs. When at least one is present, memory
    growth is switched on for every physical GPU, the N-th one is set as the
    only visible GPU, and the physical/logical GPU counts are printed.
    A RuntimeError raised by TensorFlow during this configuration is printed
    instead of propagated.

    Args:
        N: Index into the list of physical GPUs.
    """
    physical_gpus = tf.config.experimental.list_physical_devices('GPU')
    print(physical_gpus)

    # Nothing to configure on a machine without GPUs.
    if not physical_gpus:
        return

    try:
        for device in physical_gpus:
            tf.config.experimental.set_memory_growth(device, True)
        tf.config.experimental.set_visible_devices(physical_gpus[N], 'GPU')
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(physical_gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as err:
        print(err)

In [3]:
# Run configuration: restrict TensorFlow to GPU index 0 and set the training
# hyper-parameters used by model.fit further down.
select_gpu(0)
epochs = 512
batch_size = 2048
# Name of the pretrained checkpoint directory; it is also embedded in the
# saved-model path and the prediction output file name later in the notebook.
model_name = 'chinese_roberta_wwm_large_ext_L-24_H-1024_A-16'
embedding_path = '/home/Danny/pretrain_model/{}'.format(model_name)

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:1', device_type='GPU'), PhysicalDevice(name='/physical_device:GPU:2', device_type='GPU')]
3 Physical GPUs, 1 Logical GPUs


In [4]:
# Training data: article_id is not kept; everything is returned as plain lists.
def training_txt_to_list_one_dialog(path):
    """Parse an annotated dialogue file into per-utterance character/tag lists.

    The file is read as a sequence of records separated by lines that consist
    of exactly '--------------------'. Blank lines are ignored. Within a
    record, the first non-blank line is the dialogue text, the second line is
    skipped, and every following line is a tab-separated annotation whose
    second and third fields are the start and end character offsets and whose
    last field is the entity type.

    Each dialogue is tagged per character (B-<type> at the start offset,
    I-<type> up to but excluding the end offset, 'O' elsewhere) and then cut
    into utterances: the text is split on the full-width colon '：', and each
    piece is cut after its last sentence-ending character, so that one sample
    is "<speaker>：<utterance>".

    Args:
        path: Path of the UTF-8 encoded annotation file.

    Returns:
        [x, y] where x is a list of character lists and y is the list of
        matching tag lists.
    """
    # Characters that close an utterance. The original code compared a single
    # character against the two-character string '⋯⋯', which can never match;
    # the single character is what has to be listed here.
    sentence_end_chars = ('。', '？', '！', '～', '…', '.', '⋯')

    # Explicit encoding: the data is Chinese text and the platform default
    # encoding is not guaranteed to be UTF-8.
    with open(path, 'r', encoding='utf-8') as f:
        txt_list = f.read().split('\n')

    # Group the non-blank lines into one list per record.
    text_label_list = list()
    tmp = list()
    for line in txt_list:
        if line == '--------------------':
            text_label_list.append(tmp)
            tmp = list()
            continue
        if line == '':
            continue
        tmp.append(line)

    x = list()
    y = list()
    for text_label in text_label_list:
        # Two consecutive separators produce an empty record; skip it instead
        # of failing on text_label[0].
        if not text_label:
            continue
        text = text_label[0]

        # Character-level tags for the whole dialogue.
        label_list = ['O'] * len(text)
        for annotation in text_label[2:]:
            entity = annotation.split('\t')
            start, end = int(entity[1]), int(entity[2])
            if start > end:
                continue
            label_list[start] = 'B-{}'.format(entity[-1])
            for j in range(start + 1, end):
                label_list[j] = 'I-{}'.format(entity[-1])

        text_list = re.split('：', text)
        last_end_pos = 0  # where the previous piece's utterance ended
        char_num = 0      # absolute offset in `text` of the next sample
        for idx in range(1, len(text_list)):
            segment = text_list[idx]

            # Scan backwards for the last sentence-ending character; what
            # follows it is carried over as the start of the next sample.
            end_pos = 0
            for pos in range(len(segment) - 1, -1, -1):
                if segment[pos] in sentence_end_chars:
                    end_pos = pos + 1
                    break

            string = list(text_list[idx - 1][last_end_pos:])
            string.append('：')
            string.extend(segment[:end_pos])
            x.append(string)

            labels = label_list[char_num:char_num + len(string)]
            if len(labels) < len(string):
                # Fewer tags than characters: keep the best-effort behaviour
                # of reporting the short tag list rather than raising.
                print(labels)
            y.append(labels)

            char_num += len(string)
            last_end_pos = end_pos

    return [x, y]

In [5]:
# Parse the stage-1 sample file into utterance character lists (x1) and tag lists (y1).
x1, y1 = training_txt_to_list_one_dialog('/home/Danny/ai-cup-2020/datasets/stage1/SampleData_deid.txt')

In [6]:
# Parse the stage-2 training file the same way (x2 / y2).
x2, y2 = training_txt_to_list_one_dialog('/home/Danny/ai-cup-2020/datasets/stage2/train_1_update.txt')

In [7]:
# Parse the stage-4 training file the same way (x4 / y4).
x4, y4 = training_txt_to_list_one_dialog('/home/Danny/ai-cup-2020/datasets/stage4/train_2.txt')

In [8]:
# Load the six augmentation CSV files (one per entity category).
clinical_event_augmentation_df = pd.read_csv('/home/Danny/ai-cup-2020/datasets/data_augmentation_dialog/clinical_event_augmentation_from_train_label_1223.csv')
contact_augmentation_df = pd.read_csv('/home/Danny/ai-cup-2020/datasets/data_augmentation_dialog/contact_augmentation_from_train_label_1223.csv')
education_dialog_augmentation_df = pd.read_csv('/home/Danny/ai-cup-2020/datasets/data_augmentation_dialog/education_dialog_augmentation_train_label_1223.csv')
family_dialog_augmentation_df = pd.read_csv('/home/Danny/ai-cup-2020/datasets/data_augmentation_dialog/family_dialog_augmentation_from_train_label_1223.csv')
med_exam_dialog_augmentation_df = pd.read_csv('/home/Danny/ai-cup-2020/datasets/data_augmentation_dialog/med_exam_dialog_augmentation_from_train_label_1223.csv')
money_dialog_augmentation_df = pd.read_csv('/home/Danny/ai-cup-2020/datasets/data_augmentation_dialog/money_dialog_augmentation_from_train_label_1223.csv')

# Stack all frames in one call. DataFrame.append is deprecated (removed in
# pandas 2.0) and copies the accumulated frame on every call; pd.concat does
# the same job once. Row indices of the individual files are kept, as before.
augmentation_df = pd.concat([
    clinical_event_augmentation_df,
    contact_augmentation_df,
    education_dialog_augmentation_df,
    family_dialog_augmentation_df,
    med_exam_dialog_augmentation_df,
    money_dialog_augmentation_df,
])
augmentation_df

Unnamed: 0,x,y
0,"['醫', '師', '：', '那', '時', '候', '在', '做', 'S', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-cl..."
1,"['醫', '師', '：', '那', '時', '候', '在', '做', 's', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-cl..."
2,"['醫', '師', '：', '那', '時', '候', '在', '做', 'C', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-cl..."
3,"['醫', '師', '：', '那', '時', '候', '在', '做', 'c', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-cl..."
4,"['醫', '師', '：', '那', '時', '候', '在', '做', 'C', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'B-cl..."
...,...,...
495,"['民', '眾', '：', '第', '二', '支', '7', '8', '0', ...","['O', 'O', 'O', 'O', 'O', 'O', 'B-money', 'I-m..."
496,"['民', '眾', '：', '我', '是', '帶', '十', '塊', '錢', ...","['O', 'O', 'O', 'O', 'O', 'O', 'B-money', 'I-m..."
497,"['醫', '師', '：', '那', '是', '1', '2', '0', '0', ...","['O', 'O', 'O', 'O', 'O', 'B-money', 'I-money'..."
498,"['醫', '師', '：', '齁', '所', '以', '你', '今', '天', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."


In [9]:
# Pull the 'x' and 'y' columns out as Python lists.
# NOTE(review): the displayed frame shows quoted elements, so these entries look
# like stringified lists coming from the CSV files rather than real lists — confirm.
augmentation_x_list = augmentation_df['x'].tolist()
augmentation_y_list = augmentation_df['y'].tolist()

In [10]:
# Concatenate the three parsed datasets with the augmentation samples and
# print both lengths so a mismatch between samples and labels is visible.
x = x1 + x2 + x4 + augmentation_x_list
y = y1 + y2 + y4 + augmentation_y_list
print(len(x))
print(len(y))

42623
42623


In [11]:
# Put samples and labels side by side in one frame; the bare `df` displays it.
df = pd.DataFrame({'x': x, 'y': y})
df

Unnamed: 0,x,y
0,"[醫, 師, ：, 你, 有, 做, 超, 音, 波, 嘛, ，, 那, 我, 們, 來, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,"[民, 眾, ：, 紅, 字, 是, 甚, 麼, 意, 思, ？]","[O, O, O, O, O, O, O, O, O, O, O]"
2,"[醫, 師, ：, 就, 是, 肝, 功, 能, 有, 比, 較, 高, ，, 肝, 功, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,"[民, 眾, ：, 它, 會, 自, 動, 修, 復, 阿, 。]","[O, O, O, O, O, O, O, O, O, O, O]"
4,"[醫, 師, ：, 你, 有, 抗, 體, 了, 阿, ，, 所, 以, 你, B, 肝, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
...,...,...
42618,"['民', '眾', '：', '第', '二', '支', '7', '8', '0', ...","['O', 'O', 'O', 'O', 'O', 'O', 'B-money', 'I-m..."
42619,"['民', '眾', '：', '我', '是', '帶', '十', '塊', '錢', ...","['O', 'O', 'O', 'O', 'O', 'O', 'B-money', 'I-m..."
42620,"['醫', '師', '：', '那', '是', '1', '2', '0', '0', ...","['O', 'O', 'O', 'O', 'O', 'B-money', 'I-money'..."
42621,"['醫', '師', '：', '齁', '所', '以', '你', '今', '天', ...","['O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', 'O', ..."


In [12]:
# Lists cannot be hashed, so rows are compared through their string form.
# Entries that are already strings are left unchanged by str().
df['x'] = df['x'].apply(str)
df['y'] = df['y'].apply(str)
# .copy() detaches the de-duplicated frame, so the assignments below no longer
# raise SettingWithCopyWarning.
df = df.drop_duplicates().copy()
# Turn the string form back into lists. ast.literal_eval only accepts Python
# literals, whereas eval would execute whatever expression a CSV cell contains.
df['x'] = df['x'].apply(ast.literal_eval)
df['y'] = df['y'].apply(ast.literal_eval)
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  after removing the cwd from sys.path.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


Unnamed: 0,x,y
0,"[醫, 師, ：, 你, 有, 做, 超, 音, 波, 嘛, ，, 那, 我, 們, 來, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
1,"[民, 眾, ：, 紅, 字, 是, 甚, 麼, 意, 思, ？]","[O, O, O, O, O, O, O, O, O, O, O]"
2,"[醫, 師, ：, 就, 是, 肝, 功, 能, 有, 比, 較, 高, ，, 肝, 功, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
3,"[民, 眾, ：, 它, 會, 自, 動, 修, 復, 阿, 。]","[O, O, O, O, O, O, O, O, O, O, O]"
4,"[醫, 師, ：, 你, 有, 抗, 體, 了, 阿, ，, 所, 以, 你, B, 肝, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, O, ..."
...,...,...
42618,"[民, 眾, ：, 第, 二, 支, 7, 8, 0, 0, 那, 支, 喔, ？]","[O, O, O, O, O, O, B-money, I-money, I-money, ..."
42619,"[民, 眾, ：, 我, 是, 帶, 十, 塊, 錢, 餒, 。]","[O, O, O, O, O, O, B-money, I-money, I-money, ..."
42620,"[醫, 師, ：, 那, 是, 1, 2, 0, 0, 。]","[O, O, O, O, O, B-money, I-money, I-money, I-m..."
42621,"[醫, 師, ：, 齁, 所, 以, 你, 今, 天, 大, 概, 會, 花, 到, 二, ...","[O, O, O, O, O, O, O, O, O, O, O, O, O, O, B-m..."


In [13]:
# Back to plain lists (now de-duplicated) for splitting and training.
x = df['x'].to_list()
y = df['y'].to_list()

In [14]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples as the test set, then hold out 20% of the
# remainder as the validation set. random_state is fixed on both calls.
train_x, test_x, train_y, test_y = train_test_split(x, y, test_size=0.2, random_state=42)
train_x, valid_x, train_y, valid_y = train_test_split(train_x, train_y, test_size=0.2, random_state=42)
# Print the size of each split (samples, labels).
print(len(train_x), len(train_y))
print(len(valid_x), len(valid_y))
print(len(test_x), len(test_y))

12942 12942
3236 3236
4045 4045


In [15]:
import kashgari
from kashgari.tasks.labeling import BiLSTM_CRF_Model
from kashgari.embeddings import TransformerEmbedding
from keras_radam import RAdam
# Set kashgari's use_cudnn_cell configuration flag.
kashgari.config.use_cudnn_cell = True

# Locate the pretrained checkpoint files. embedding_path is assigned the same
# value as in the configuration cell near the top of the notebook.
embedding_path = '/home/Danny/pretrain_model/{}'.format(model_name)
vocab_path = os.path.join(embedding_path, 'vocab.txt')
config_path = os.path.join(embedding_path, 'bert_config.json')
checkpoint_path = os.path.join(embedding_path, 'bert_model.ckpt')
# Build the transformer embedding from those files; it is created with
# trainable=True and sequence_length='auto'.
embedding = TransformerEmbedding(vocab_path, 
                                 config_path, 
                                 checkpoint_path,
                                 bert_type='bert',
                                 sequence_length='auto',
                                 trainable=True,
                                 task='kashgari.LABELING',
                                )

# Sequence-labelling model built on top of the embedding.
model = BiLSTM_CRF_Model(embedding)
# Disabled alternative: build/compile explicitly with an RAdam optimizer and
# print the default hyper-parameters.
# model.build_model(train_x, train_y)
# model.compile_model(optimizer=RAdam(lr=3e-5))
# hyper = model.default_hyper_parameters()
# print(hyper)

2020-12-24 13:05:06,922 [DEBUG] kashgari - ------------------------------------------------
2020-12-24 13:05:06,923 [DEBUG] kashgari - Loaded transformer model's vocab
2020-12-24 13:05:06,924 [DEBUG] kashgari - config_path       : /home/Danny/pretrain_model/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16/bert_config.json
2020-12-24 13:05:06,924 [DEBUG] kashgari - vocab_path      : /home/Danny/pretrain_model/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16/vocab.txt
2020-12-24 13:05:06,925 [DEBUG] kashgari - checkpoint_path : /home/Danny/pretrain_model/chinese_roberta_wwm_large_ext_L-24_H-1024_A-16/bert_model.ckpt
2020-12-24 13:05:06,925 [DEBUG] kashgari - Top 50 words    : ['[PAD]', '[unused1]', '[unused2]', '[unused3]', '[unused4]', '[unused5]', '[unused6]', '[unused7]', '[unused8]', '[unused9]', '[unused10]', '[unused11]', '[unused12]', '[unused13]', '[unused14]', '[unused15]', '[unused16]', '[unused17]', '[unused18]', '[unused19]', '[unused20]', '[unused21]', '[unused22]', '[unused23]'

In [None]:
%%time
# Train on the training split with the validation split passed alongside;
# the returned object is kept in `history` for the plotting cell.
history = model.fit(train_x,
                    train_y,
                    valid_x,
                    valid_y,
                    epochs=epochs,
                    batch_size=batch_size,
                   )

Preparing text vocab dict: 100%|██████████| 12942/12942 [00:00<00:00, 224799.08it/s]
Preparing text vocab dict: 100%|██████████| 3236/3236 [00:00<00:00, 178607.85it/s]
2020-12-24 13:05:07,013 [DEBUG] kashgari - --- Build vocab dict finished, Total: 1572 ---
2020-12-24 13:05:07,013 [DEBUG] kashgari - Top-10: ['[PAD]', '[UNK]', '[CLS]', '[SEP]', '，', '：', '。', '是', '師', '醫']
Preparing text vocab dict: 100%|██████████| 12942/12942 [00:00<00:00, 298596.11it/s]
Preparing text vocab dict: 100%|██████████| 3236/3236 [00:00<00:00, 294842.24it/s]
2020-12-24 13:05:07,071 [DEBUG] kashgari - --- Build vocab dict finished, Total: 28 ---
2020-12-24 13:05:07,071 [DEBUG] kashgari - Top-10: ['[PAD]', 'O', 'I-time', 'B-time', 'I-contact', 'I-money', 'I-med_exam', 'B-money', 'I-family', 'B-family']
Calculating sequence length: 100%|██████████| 12942/12942 [00:00<00:00, 1346730.90it/s]
Calculating sequence length: 100%|██████████| 3236/3236 [00:00<00:00, 961857.26it/s]
2020-12-24 13:05:13,250 [DEBUG] kash

Epoch 1/512
Epoch 2/512
Epoch 3/512
Epoch 4/512
Epoch 5/512

In [None]:
# Evaluate on the held-out test split, then save the model under a path that
# encodes the checkpoint name, epoch count and batch size.
model.evaluate(test_x, test_y)
model_path = 'model/dialog_{}_epoch_{}_batch_{}'.format(model_name, epochs, batch_size)
model.save(model_path)

In [None]:
# from kashgari.utils import load_model
# model_path = 'model/dialog_{}_epoch_{}_batch_{}'.format(model_name, epochs, batch_size)
# model = load_model(model_path)
# model.evaluate(test_x, test_y)

In [None]:
# Plot the training curves. Accuracy and loss share one set of axes, so the
# title and y-axis label name both instead of only "accuracy", and each curve
# carries its own label so the legend cannot drift out of order.
for metric in ('accuracy', 'val_accuracy', 'loss', 'val_loss'):
    plt.plot(history.history[metric], label=metric)
plt.title('model accuracy and loss')
plt.ylabel('value')
plt.xlabel('epoch')
plt.legend()
plt.show()

In [None]:
# Prediction data: article_id has to be kept, so the result is a DataFrame.
def predicting_txt_to_dataframe(path):
    """Read a test file into a DataFrame with columns 'article_id' and 'text'.

    The file is a sequence of records, each terminated by a line that consists
    of exactly '--------------------'. Blank lines are ignored. The first
    non-blank line of a record holds the id, optionally prefixed with
    'article_id:'; it is converted to int. The remaining non-blank line is the
    article text.

    Args:
        path: Path of the UTF-8 encoded test file.

    Returns:
        pandas.DataFrame with one row per record.
    """
    # Explicit encoding: the data is Chinese text and the platform default
    # encoding is not guaranteed to be UTF-8.
    with open(path, 'r', encoding='utf-8') as f:
        txt_list = f.read().split('\n')

    row_list = list()
    tmp_list = list()
    for row in txt_list:
        if row == '--------------------':
            # An empty record (e.g. two consecutive separators) is skipped
            # instead of failing on tmp_list[0].
            if tmp_list:
                tmp_list[0] = int(tmp_list[0].replace('article_id:', ''))
                row_list.append(tmp_list)
            tmp_list = list()
            continue
        if len(row) > 0:
            tmp_list.append(row)
    df = pd.DataFrame(row_list, columns=['article_id', 'text'])
    return df

In [None]:
def predict_NER(text):
    """Predict one tag per character of `text` with the global `model`.

    The text is split on the full-width full stop (U+3002) and the full-width
    question mark (U+FF1F); every piece is passed to model.predict as a list
    of characters. The per-piece tags are concatenated again, with an 'O'
    standing in for each delimiter removed by the split.

    Args:
        text: The article text.

    Returns:
        A list of tags with exactly len(text) entries.
    """
    text_list = re.split('\u3002|\uff1f', text)
    x_list = [list(sentence) for sentence in text_list]
    y_list_list = model.predict(x_list)

    y_list = list()
    for sentence_chars, sentence_tags in zip(x_list, y_list_list):
        # Force one tag per character. If the model returns fewer tags than
        # characters (presumably possible when it truncates long inputs —
        # TODO confirm), the missing tail is filled with 'O'; surplus tags are
        # dropped. Without this, every later offset would be shifted.
        tags = list(sentence_tags)[:len(sentence_chars)]
        tags.extend(['O'] * (len(sentence_chars) - len(tags)))
        y_list.extend(tags)
        y_list.append('O')  # tag for the delimiter removed by re.split

    # The last piece is not followed by a delimiter, so drop its placeholder.
    return y_list[:-1]

In [None]:
def output_NER(article_id, text, y_list):
    """Convert per-character tags into tab-separated entity rows.

    An entity starts at a tag beginning with 'B' and runs until the next 'O'
    tag, the next tag beginning with 'B', or the end of `y_list`. Tags that
    are neither 'O' nor begin with 'B' extend the entity that is currently
    open and are ignored when none is open.

    Args:
        article_id: Identifier written into the first field of every row.
        text: The article text the tags refer to.
        y_list: One tag per character of `text`.

    Returns:
        A string with one line per entity:
        article_id, start_position, end_position, entity_text, entity_type,
        separated by tabs and terminated by a newline.
    """
    rows = []
    start_position = None  # None while no entity is open
    entity_type = None

    for i, tag in enumerate(y_list):
        is_begin = tag != 'O' and tag.startswith('B')

        # An open entity is closed by an 'O' tag or by the start of the next
        # entity; previously a following 'B' tag silently discarded it.
        if start_position is not None and (tag == 'O' or is_begin):
            rows.append('{}\t{}\t{}\t{}\t{}\n'.format(
                article_id, start_position, i, text[start_position:i], entity_type))
            start_position = None

        if is_begin:
            start_position = i
            entity_type = tag.split('-')[-1]

    # An entity that reaches the end of the tags has no closing 'O'; emit it
    # here instead of dropping it.
    if start_position is not None:
        end_position = len(y_list)
        rows.append('{}\t{}\t{}\t{}\t{}\n'.format(
            article_id, start_position, end_position,
            text[start_position:end_position], entity_type))

    return ''.join(rows)

In [None]:
# Load the test articles (this rebinds `df`, which held the training samples
# earlier in the notebook) and start the output with the header row.
df = predicting_txt_to_dataframe('/home/Danny/ai-cup-2020/datasets/stage5/test.txt')
output_str = "article_id\tstart_position\tend_position\tentity_text\tentity_type\n"

# Tag every article and append its entity rows to the output string.
for article_id, text in zip(df['article_id'], df['text']):
    # NOTE(review): x_list is not read anywhere in this loop; predict_NER
    # builds its own character lists from `text`.
    x_list = [word for word in text]
    y_list = predict_NER(text)
    output_str += output_NER(article_id, text, y_list)

In [None]:
# print(output_str)

In [None]:
# Write the predictions to a TSV file whose name encodes the checkpoint name,
# epoch count and batch size.
output_path = 'output/dialog_{}_epoch_{}_batch_{}.tsv'.format(model_name, epochs, batch_size)
# open() does not create missing directories; make sure 'output/' exists so a
# fresh checkout does not fail with FileNotFoundError after a long run.
os.makedirs(os.path.dirname(output_path), exist_ok=True)
with open(output_path, 'w', encoding='utf-8') as f:
    f.write(output_str)

In [None]:
# Read the written TSV back and display it as a check (rebinds `df` again).
df = pd.read_csv(output_path, sep='\t')
df