In [47]:
import os
import copy
import json
import logging

import torch
from torch.utils.data import TensorDataset


import re
import pandas as pd
from bert4keras.snippets import sequence_padding,DataGenerator
from bert4keras.tokenizers import Tokenizer


# Entity type names used throughout this notebook.
entity_labels = [
    'Symptom',
    'Operation',
    'Medical_Examination',
    'Drug_Category',
    'Drug',
]

# Index <-> label lookups; indices follow the sorted order of the label names.
id2label = dict(enumerate(sorted(entity_labels)))
label2id = {label: idx for idx, label in id2label.items()}

# 2 * n + 1 classes -- presumably a B- and an I- tag per entity type plus 'O'
# (TODO confirm against the model head that consumes this).
num_labels = 2 * len(entity_labels) + 1

# WordPiece vocabulary for the BERT checkpoint; the tokenizer lower-cases input.
vocab_path = './bert_weight_file/uncased_L-4_H-768_A-12/vocab.txt'
tokenizer = Tokenizer(vocab_path, do_lower_case=True)



def _merge_bio_spans(tokens, tags):
    """Collapse parallel token / BIO-tag lists into ``[text, label]`` spans.

    Consecutive ``O`` tokens are concatenated into one ``[text, 'O']`` span.
    A ``B-x`` tag opens a new ``[text, 'x']`` span and following ``I-x`` tags
    extend it. An ``I-x`` tag that does not continue a span of the same type
    (it follows ``O``, starts the sequence, or follows a different type) opens
    a new span instead of being dropped. Tokens whose tag is neither ``O`` nor
    starts with ``B``/``I`` are ignored.
    """
    spans = []
    prev_tag = ''
    for token, tag in zip(tokens, tags):
        prefix = tag[:1]
        if tag == 'O':
            if prev_tag == 'O':
                spans[-1][0] += token
            else:
                spans.append([token, 'O'])
        elif prefix == 'B':
            spans.append([token, tag[2:]])
        elif prefix == 'I':
            label = tag[2:]
            # Only extend when the previous token was part of an entity of the
            # same type; otherwise treat the orphan I- tag as an entity start.
            if prev_tag[:1] in ('B', 'I') and spans[-1][1] == label:
                spans[-1][0] += token
            else:
                spans.append([token, label])
        prev_tag = tag
    return spans


def load_data(data_path, max_len):
    """Read a token-per-line BIO file and merge tokens into labelled spans.

    Each non-blank line is split on whitespace; the first field is the token
    and the second its tag. A line with fewer than two fields ends the current
    sentence. Note that a line whose token is itself whitespace therefore
    parses as a single field and also acts as a sentence boundary.

    A sentence that has already reached ``max_len`` tokens is additionally cut
    right after the next token matching the punctuation pattern, and the chunk
    is kept as its own sample.

    Args:
        data_path: Path to the UTF-8 encoded annotation file.
        max_len: Number of buffered tokens after which the next punctuation
            token closes the current sample.

    Returns:
        A tuple ``(datasets, y)``. ``datasets[i]`` is a list of
        ``[text, label]`` spans for sample ``i`` and ``y[i]`` is the raw tag
        sequence of the same sample. Samples with fewer than two tokens are
        omitted from both lists, so the two stay index-aligned.
    """
    split_pattern = re.compile(r'[;；。，、？\.\?!]')

    samples = []  # (tokens, tags) pairs, one per sentence or sentence chunk
    tokens, tags = [], []
    with open(data_path, 'r', encoding='utf8') as f:
        for raw_line in f:
            fields = raw_line.strip().split()
            if len(fields) < 2:
                samples.append((tokens, tags))
                tokens, tags = [], []
                continue
            word, tag = fields[0], fields[1]
            # Decide before appending so the threshold matches the buffered length.
            cut_here = bool(split_pattern.match(word)) and len(tokens) >= max_len
            tokens.append(word)
            tags.append(tag)
            if cut_here:
                # Store the finished chunk instead of discarding it.
                samples.append((tokens, tags))
                tokens, tags = [], []
    if tokens:
        samples.append((tokens, tags))

    datasets, y = [], []
    for sample_tokens, sample_tags in samples:
        if len(sample_tokens) < 2:
            continue
        datasets.append(_merge_bio_spans(sample_tokens, sample_tags))
        y.append(sample_tags)

    return datasets, y

In [48]:
if __name__ == '__main__':
    # Parse the training annotation file, passing 128 as max_len.
    data_path = './ner_data/train/train.txt'
    d = load_data(data_path, max_len=128)
    

In [49]:
# Display the (datasets, label_sequences) tuple returned by load_data.
d

([[['医生：你好我是您的接诊医生', 'O']],
  [['医生：宝贝最近吃奶量可以吗？下降了吗', 'O']],
  [['患者：没有，也没怎么', 'O'], ['哭闹', 'Symptom']],
  [['医生：宝妈有没有吃生冷辛辣刺激食物油腻食物来吗？', 'O']],
  [['医生：宝贝奶粉的话最近换过牌子吗？', 'O']],
  [['医生：宝贝肚子着凉来吗？', 'O']],
  [['患者：喝茶油腻也少，菜吃很多', 'O']],
  [['医生：嗯嗯，宝妈饮食一定注意，生冷辛辣刺激食物不能吃油腻食物不能吃，特别油腻食物的奥，清淡饮食为主，这个时候宝贝胃肠功能可能会有影响，能吃多少吃多少别强喂的奥！',
    'O']],
  [['医生：宝贝最近有没有', 'O'], ['呕吐', 'Symptom'], ['症状呢？', 'O']],
  [['患者：', 'O'],
   ['呕吐', 'Symptom'],
   ['，有时会', 'O'],
   ['吐', 'Symptom'],
   ['，不多', 'O']],
  [['医生：嗯嗯，如果那种漾奶，每次量很少的话就问题不大，这个时候小婴儿胃处于水平位，所以很容易漾奶的奥，看好宝贝别呛到了就可以', 'O']],
  [['医生：宝贝如果', 'O'],
   ['呕吐', 'Symptom'],
   ['量非常大的话就要给宝贝空肚子的奥，至少2小时不要喝水不要吃奶的', 'O']],
  [['医生：宝贝', 'O'], ['拉肚子', 'Symptom'], ['到今天为止总共第几天了呢', 'O']],
  [['患者：现在不敢喂多，但吃不饱宝宝又会闹，就90毫升，加点水这样。', 'O']],
  [['医生：嗯嗯，这个时候您按需哺乳就可以，能喝多少喝多少别强喂就可以的', 'O']],
  [['医生：宝贝大便的话现在一天多少次呢？', 'O']],
  [['患者：好的好的', 'O']],
  [['医生：宝贝到今天为止', 'O'], ['拉肚子', 'Symptom'], ['总共几天了呢？', 'O']],
  [['患者：多多少少十来次', 'O']],
  [['医生：大便什么样子的呢？除了', 'O'],
   ['浠水', 'Symptom']

In [26]:
# Scratch check of in-place deletion on nested lists: removing index 0 of the
# last [text, label] pair drops its text and leaves only the label behind.
a = [[['医生：你好我是您的接诊医生', 'O']],
  [['医生：宝贝最近吃奶量可以吗？下降了吗', 'O']],
  [['患者：没有，也没怎么', 'O'], ['哭闹', 'Symptom']]]
del a[-1][-1][0]

a

[[['医生：你好我是您的接诊医生', 'O']],
 [['医生：宝贝最近吃奶量可以吗？下降了吗', 'O']],
 [['患者：没有，也没怎么', 'O'], ['Symptom']]]

In [64]:
# NOTE(review): the output recorded for this cell disagrees with the earlier
# display of `d` (multi-character entities appear as overlapping fragments and
# one entity is missing), so it presumably came from a different revision of
# load_data -- re-run the notebook from a fresh kernel to confirm.
d

([[['医生：你好我是您的接诊医生', 'O']],
  [['医生：宝贝最近吃奶量可以吗？下降了吗', 'O']],
  [['患者：没有，也没怎么', 'O'], ['哭闹', 'Symptom']],
  [['医生：宝妈有没有吃生冷辛辣刺激食物油腻食物来吗？', 'O']],
  [['医生：宝贝奶粉的话最近换过牌子吗？', 'O']],
  [['医生：宝贝肚子着凉来吗？', 'O']],
  [['患者：喝茶油腻也少，菜吃很多', 'O']],
  [['医生：嗯嗯，宝妈饮食一定注意，生冷辛辣刺激食物不能吃油腻食物不能吃，特别油腻食物的奥，清淡饮食为主，这个时候宝贝胃肠功能可能会有影响，能吃多少吃多少别强喂的奥！',
    'O']],
  [['医生：宝贝最近有没有', 'O'], ['呕吐', 'Symptom'], ['症状呢？', 'O']],
  [['患者：', 'O'], ['呕吐', 'Symptom'], ['，有时会', 'O'], ['，不多', 'O']],
  [['医生：嗯嗯，如果那种漾奶，每次量很少的话就问题不大，这个时候小婴儿胃处于水平位，所以很容易漾奶的奥，看好宝贝别呛到了就可以', 'O']],
  [['医生：宝贝如果', 'O'],
   ['呕吐', 'Symptom'],
   ['量非常大的话就要给宝贝空肚子的奥，至少2小时不要喝水不要吃奶的', 'O']],
  [['医生：宝贝', 'O'],
   ['拉肚', 'Symptom'],
   ['拉子', 'Symptom'],
   ['到今天为止总共第几天了呢', 'O']],
  [['患者：现在不敢喂多，但吃不饱宝宝又会闹，就90毫升，加点水这样。', 'O']],
  [['医生：嗯嗯，这个时候您按需哺乳就可以，能喝多少喝多少别强喂就可以的', 'O']],
  [['医生：宝贝大便的话现在一天多少次呢？', 'O']],
  [['患者：好的好的', 'O']],
  [['医生：宝贝到今天为止', 'O'],
   ['拉肚', 'Symptom'],
   ['拉子', 'Symptom'],
   ['总共几天了呢？', 'O']],
  [['患者：多多少少十来次', 'O']],
  [['医生：大便什么样子的呢？除了', 'O

In [57]:
if __name__ == '__main__':
    # Run the same parser on the ccks.txt file, again with max_len of 128.
    data_path = './ner_data/ccks.txt'
    d1 = load_data(data_path, max_len=128)

In [58]:
# d1 is the (datasets, label_sequences) tuple; show the first raw tag sequence.
d1[-1][0]

['B-prov',
 'I-prov',
 'I-prov',
 'B-city',
 'I-city',
 'I-city',
 'B-district',
 'I-district',
 'I-district',
 'B-town',
 'I-town',
 'I-town',
 'I-town',
 'B-road',
 'I-road',
 'I-road',
 'B-roadno',
 'I-roadno',
 'B-poi',
 'I-poi',
 'I-poi',
 'I-poi',
 'B-roomno',
 'I-roomno']