In [19]:
from itertools import zip_longest
from copy import deepcopy
from evaluating import Metrics
import pickle
import time
import json
import torch
import torch.nn as nn
import torch.optim as optim
import torch.utils.data as Data
from torch.nn.utils.rnn import pad_packed_sequence, pack_padded_sequence

In [3]:
def build_corpus(data_path,make_word2id =True):
    word_lists = []
    tag_lists = []
    with open(data_path,'r',encoding='utf-8') as f:
        word_list = []
        tag_list = []
        for line in f:
            if line !='\n':
                line = line.strip('\n').split()
                if len(line) < 2:
                    continue
                word,tag = line[0],line[1]

                word_list.append(word)
                tag_list.append(tag)
            else:
                word_lists.append(word_list)
                tag_lists.append(tag_list)
                word_list = []
                tag_list = []
    def build_map(lists):
        maps = {}
        for sent in lists:
            for word in sent:
                if word not in maps:
                    maps[word]=len(maps)
        return maps

    if make_word2id:
        word2id = build_map(word_lists)
        tag2id = build_map(tag_lists)
        return word_lists,tag_lists,word2id,tag2id
    else:
        return word_lists,tag_lists

In [4]:
def extend_maps(word2id, tag2id, for_crf=True):
    word2id['<pad>'] = len(word2id)
    tag2id['<pad>'] = len(tag2id)
    # 如果是加了CRF的bilstm  那么还要加入<start> 和 <end>token
    if for_crf:
        word2id['<start>'] = len(word2id)
        word2id['<end>'] = len(word2id)
        tag2id['<start>'] = len(tag2id)
        tag2id['<end>'] = len(tag2id)
    return word2id, tag2id

In [5]:
def prepocess_data_for_lstmcrf(word_lists, tag_lists, test=False):
    assert len(word_lists) == len(tag_lists)
    for i in range(len(word_lists)):
        word_lists[i].append("<end>")
        if not test:  # 如果是测试数据，就不需要加end token了
            tag_lists[i].append("<end>")
    return word_lists, tag_lists

In [6]:
#LSTM模型 工具函数
def tensorized(batch,maps):
    PAD = maps.get('<pad>')

    max_len = len(batch[0])
    batch_size = len(batch)

    batch_tensor = torch.ones(batch_size,max_len).long()*PAD
    for i,sen in enumerate(batch):
        for j,word in enumerate(sen):
            batch_tensor[i][j] = maps.get(word)
    #每个批次的长度
    lengths = [len(sen) for sen in batch]

    return batch_tensor,lengths

In [7]:
def save_model(model, file_name):
    """用于保存模型"""
    # with open(file_name, "wb") as f:
    #     pickle.dump(model, f)
    torch.save(model,file_name)
def load_model(file_name):
    """用于加载模型"""
    # with open(file_name, "rb") as f:
        # model = pickle.load(f)
    model = torch.load(file_name)
    return model

In [8]:
def indexed(targets, tagset_size, start_id):
    """将targets中的数转化为在[T*T]大小序列中的索引,T是标注的种类"""
    batch_size, max_len = targets.size()
    for col in range(max_len-1, 0, -1):
        targets[:, col] += (targets[:, col-1] * tagset_size)
    targets[:, 0] += (start_id * tagset_size)
    return targets

In [9]:
def sort_by_lengths(word_lists,tag_lists):
    pairs = list(zip(word_lists,tag_lists))
    indice = sorted(range(len(pairs)),key = lambda k:len(pairs[k][0]),reverse = True)
    pairs = [pairs[i] for i in indice]
    word_lists,tag_lists = list(zip(*pairs))

    return word_lists,tag_lists,indice

In [10]:
def cal_lstm_crf_loss(crf_scores, targets, tag2id):
    """计算双向LSTM-CRF模型的损失
    该损失函数的计算可以参考:https://arxiv.org/pdf/1603.01360.pdf
    """
    pad_id = tag2id.get('<pad>')
    start_id = tag2id.get('<start>')
    end_id = tag2id.get('<end>')

    device = crf_scores.device

    # targets:[B, L] crf_scores:[B, L, T, T]
    batch_size, max_len = targets.size()
    target_size = len(tag2id)

    # mask = 1 - ((targets == pad_id) + (targets == end_id))  # [B, L]
    mask = (targets != pad_id)
    lengths = mask.sum(dim=1)
    targets = indexed(targets, target_size, start_id)

    # # 计算Golden scores方法１
    # import pdb
    # pdb.set_trace()
    targets = targets.masked_select(mask)  # [real_L]

    flatten_scores = crf_scores.masked_select(
        mask.view(batch_size, max_len, 1, 1).expand_as(crf_scores)
    ).view(-1, target_size*target_size).contiguous()

    golden_scores = flatten_scores.gather(
        dim=1, index=targets.unsqueeze(1)).sum()


    scores_upto_t = torch.zeros(batch_size, target_size).to(device)
    for t in range(max_len):
        # 当前时刻 有效的batch_size（因为有些序列比较短)
        batch_size_t = (lengths > t).sum().item()
        if t == 0:
            scores_upto_t[:batch_size_t] = crf_scores[:batch_size_t,
                                                      t, start_id, :]
        else:
            scores_upto_t[:batch_size_t] = torch.logsumexp(
                crf_scores[:batch_size_t, t, :, :] +
                scores_upto_t[:batch_size_t].unsqueeze(2),
                dim=1
            )
    all_path_scores = scores_upto_t[:, end_id].sum()

    # 训练大约两个epoch loss变成负数，从数学的角度上来说，loss = -logP
    loss = (all_path_scores - golden_scores) / batch_size
    return loss

In [11]:
class BiLSTM(nn.Module):
    def __init__(self, vocab_size, emb_size, hidden_size, out_size):
        """初始化参数：
            vocab_size:字典的大小
            emb_size:词向量的维数
            hidden_size：隐向量的维数
            out_size:标注的种类
        """
        super(BiLSTM, self).__init__()
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.bilstm = nn.LSTM(emb_size, hidden_size,
                              batch_first=True,
                              bidirectional=True)

        self.lin = nn.Linear(2*hidden_size, out_size)

    def forward(self, sents_tensor, lengths):
        emb = self.embedding(sents_tensor)  # [B, L, emb_size]

        packed = pack_padded_sequence(emb, lengths, batch_first=True)
        rnn_out, _ = self.bilstm(packed)
        # rnn_out:[B, L, hidden_size*2]
        rnn_out, _ = pad_packed_sequence(rnn_out, batch_first=True)

        scores = self.lin(rnn_out)  # [B, L, out_size]

        return scores

    def test(self, sents_tensor, lengths, _):
        """第三个参数不会用到，加它是为了与BiLSTM_CRF保持同样的接口"""
        logits = self.forward(sents_tensor, lengths)  # [B, L, out_size]
        batch_max_value, batch_tagids = torch.max(logits, dim=2)

        return batch_tagids

In [12]:
class BiLSTM_CRF(nn.Module):
    def __init__(self, vocab_size, emb_size, hidden_size, out_size):
        """初始化参数：
            vocab_size:字典的大小
            emb_size:词向量的维数
            hidden_size：隐向量的维数
            out_size:标注的种类
        """
        super(BiLSTM_CRF, self).__init__()
        # self.bilstm = BiLSTM(vocab_size, emb_size, hidden_size, out_size)
        self.embedding = nn.Embedding(vocab_size, emb_size)
        self.bilstm = nn.LSTM(emb_size, hidden_size,
                              batch_first=True,
                              bidirectional=True)

        self.lin = nn.Linear(2*hidden_size, out_size)

        # CRF实际上就是多学习一个转移矩阵 [out_size, out_size] 初始化为均匀分布
        self.transition = nn.Parameter(
            torch.ones(out_size, out_size) * 1/out_size)
        # self.transition.data.zero_()

    def forward(self, sents_tensor, lengths):
        # [B, L, out_size]
        emb = self.embedding(sents_tensor)  # [B, L, emb_size]

        packed = pack_padded_sequence(emb, lengths, batch_first=True)
        rnn_out, _ = self.bilstm(packed)
        # rnn_out:[B, L, hidden_size*2]
        rnn_out, _ = pad_packed_sequence(rnn_out, batch_first=True)

        emission = self.lin(rnn_out)
        # emission = self.bilstm(sents_tensor, lengths)

        # 计算CRF scores, 这个scores大小为[B, L, out_size, out_size]
        # 也就是每个字对应对应一个 [out_size, out_size]的矩阵
        # 这个矩阵第i行第j列的元素的含义是：上一时刻tag为i，这一时刻tag为j的分数
        batch_size, max_len, out_size = emission.size()
        crf_scores = emission.unsqueeze(2).expand(-1, -1, out_size, -1) + self.transition.unsqueeze(0)

        return crf_scores

    def test(self, test_sents_tensor, lengths, tag2id):
        """使用维特比算法进行解码"""
        start_id = tag2id['<start>']
        end_id = tag2id['<end>']
        pad = tag2id['<pad>']
        tagset_size = len(tag2id)

        crf_scores = self.forward(test_sents_tensor, lengths)
        device = crf_scores.device
        # B:batch_size, L:max_len, T:target set size
        B, L, T, _ = crf_scores.size()
        # viterbi[i, j, k]表示第i个句子，第j个字对应第k个标记的最大分数
        viterbi = torch.zeros(B, L, T).to(device)
        # backpointer[i, j, k]表示第i个句子，第j个字对应第k个标记时前一个标记的id，用于回溯
        backpointer = (torch.zeros(B, L, T).long() * end_id).to(device)
        lengths = torch.LongTensor(lengths).to(device)
        # 向前递推
        for step in range(L):
            batch_size_t = (lengths > step).sum().item()
            if step == 0:
                # 第一个字它的前一个标记只能是start_id
                viterbi[:batch_size_t, step,
                        :] = crf_scores[: batch_size_t, step, start_id, :]
                backpointer[: batch_size_t, step, :] = start_id
            else:
                max_scores, prev_tags = torch.max(
                    viterbi[:batch_size_t, step-1, :].unsqueeze(2) +
                    crf_scores[:batch_size_t, step, :, :],     # [B, T, T]
                    dim=1
                )
                viterbi[:batch_size_t, step, :] = max_scores
                backpointer[:batch_size_t, step, :] = prev_tags

        # 在回溯的时候我们只需要用到backpointer矩阵
        backpointer = backpointer.view(B, -1)  # [B, L * T]
        tagids = []  # 存放结果
        tags_t = None
        for step in range(L-1, 0, -1):
            batch_size_t = (lengths > step).sum().item()
            if step == L-1:
                index = torch.ones(batch_size_t).long() * (step * tagset_size)
                index = index.to(device)
                index += end_id
            else:
                prev_batch_size_t = len(tags_t)

                new_in_batch = torch.LongTensor(
                    [end_id] * (batch_size_t - prev_batch_size_t)).to(device)
                offset = torch.cat(
                    [tags_t, new_in_batch],
                    dim=0
                )  # 这个offset实际上就是前一时刻的
                index = torch.ones(batch_size_t).long() * (step * tagset_size)
                index = index.to(device)
                index += offset.long()

            try:
                tags_t = backpointer[:batch_size_t].gather(
                    dim=1, index=index.unsqueeze(1).long())
            except RuntimeError:
                import pdb
                pdb.set_trace()
            tags_t = tags_t.squeeze(1)
            tagids.append(tags_t.tolist())

        # tagids:[L-1]（L-1是因为扣去了end_token),大小的liebiao
        # 其中列表内的元素是该batch在该时刻的标记
        # 下面修正其顺序，并将维度转换为 [B, L]
        tagids = list(zip_longest(*reversed(tagids), fillvalue=pad))
        tagids = torch.Tensor(tagids).long()

        # 返回解码的结果
        return tagids

In [13]:
# 设置lstm训练参数
class TrainingConfig(object):
    batch_size = 32
    # 学习速率
    lr = 0.0001
    epoches = 2
    print_step = 5


class LSTMConfig(object):
    emb_size = 128  # 词向量的维数
    hidden_size = 128  # lstm隐向量的维数

In [14]:
class BILSTM_Model(object):
    def __init__(self, vocab_size, out_size, crf=True):
        """功能：对LSTM的模型进行训练与测试
           参数:
            vocab_size:词典大小
            out_size:标注种类
            crf选择是否添加CRF层"""
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        # 加载模型参数
        self.emb_size = LSTMConfig.emb_size
        self.hidden_size = LSTMConfig.hidden_size

        self.crf = crf
        # 根据是否添加crf初始化不同的模型 选择不一样的损失计算函数
        if not crf:
            self.model = BiLSTM(vocab_size, self.emb_size,
                                self.hidden_size, out_size).to(self.device)
            self.cal_loss_func = cal_loss
        else:
            self.model = BiLSTM_CRF(vocab_size, self.emb_size,
                                    self.hidden_size, out_size).to(self.device)
            self.cal_loss_func = cal_lstm_crf_loss

        # 加载训练参数：
        self.epoches = TrainingConfig.epoches
        self.print_step = TrainingConfig.print_step
        self.lr = TrainingConfig.lr
        self.batch_size = TrainingConfig.batch_size

        # 初始化优化器
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)

        # 初始化其他指标
        self.step = 0
        self._best_val_loss = 1e18
        self.best_model = None

    def train(self, word_lists, tag_lists,
              dev_word_lists, dev_tag_lists,
              word2id, tag2id):
        # 对数据集按照长度进行排序
        word_lists, tag_lists, _ = sort_by_lengths(word_lists, tag_lists)
        dev_word_lists, dev_tag_lists, _ = sort_by_lengths(dev_word_lists, dev_tag_lists)

        B = self.batch_size
        for e in range(1, self.epoches+1):
            self.step = 0
            losses = 0.
            for ind in range(0, len(word_lists), B):
                batch_sents = word_lists[ind:ind+B]
                batch_tags = tag_lists[ind:ind+B]

                losses += self.train_step(batch_sents,
                                          batch_tags, word2id, tag2id)

                if self.step % TrainingConfig.print_step == 0:
                    total_step = (len(word_lists) // B + 1)
                    print("Epoch {}, step/total_step: {}/{} {:.2f}% Loss:{:.4f}".format(
                        e, self.step, total_step,
                        100. * self.step / total_step,
                        losses / self.print_step
                    ))
                    losses = 0.

            # 每轮结束测试在验证集上的性能，保存最好的一个
            val_loss = self.validate(dev_word_lists, dev_tag_lists, word2id, tag2id)
            print("Epoch {}, Val Loss:{:.4f}".format(e, val_loss))

    def train_step(self, batch_sents, batch_tags, word2id, tag2id):
        self.model.train()
        self.step += 1
        # 准备数据
        tensorized_sents, lengths = tensorized(batch_sents, word2id)
        tensorized_sents = tensorized_sents.to(self.device)
        targets, lengths = tensorized(batch_tags, tag2id)
        targets = targets.to(self.device)

        # forward
        scores = self.model(tensorized_sents, lengths)

        # 计算损失 更新参数
        self.optimizer.zero_grad()
        loss = self.cal_loss_func(scores, targets, tag2id).to(self.device)
        loss.backward()
        self.optimizer.step()

        return loss.item()

    def validate(self, dev_word_lists, dev_tag_lists, word2id, tag2id):
        self.model.eval()
        with torch.no_grad():
            val_losses = 0.
            val_step = 0
            for ind in range(0, len(dev_word_lists), self.batch_size):
                val_step += 1
                # 准备batch数据
                batch_sents = dev_word_lists[ind:ind+self.batch_size]
                batch_tags = dev_tag_lists[ind:ind+self.batch_size]
                tensorized_sents, lengths = tensorized(
                    batch_sents, word2id)
                tensorized_sents = tensorized_sents.to(self.device)
                targets, lengths = tensorized(batch_tags, tag2id)
                targets = targets.to(self.device)

                # forward
                scores = self.model(tensorized_sents, lengths)

                # 计算损失
                loss = self.cal_loss_func(
                    scores, targets, tag2id).to(self.device)
                val_losses += loss.item()
            val_loss = val_losses / val_step

            if val_loss < self._best_val_loss:
                print("保存模型...")
                self.best_model = deepcopy(self.model)
                self._best_val_loss = val_loss

            return val_loss

    def test(self, word_lists, tag_lists, word2id, tag2id):
        """返回最佳模型在测试集上的预测结果"""
        # 准备数据
        word_lists, tag_lists, indices = sort_by_lengths(word_lists, tag_lists)
        # tensorized_sents, lengths = tensorized(word_lists, word2id)
        # tensorized_sents = tensorized_sents.to(self.device)

        self.best_model.eval()
        # 将id转化为标注
        pred_tag_lists = []
        with torch.no_grad():
            for ind in range(0, len(word_lists), self.batch_size):
                batch_sents = word_lists[ind:ind+self.batch_size]
                tensorized_sents, lengths = tensorized(batch_sents, word2id)
                tensorized_sents = tensorized_sents.to(self.device)
            
                batch_tagids = self.best_model.test(
                    tensorized_sents, lengths, tag2id)

                id2tag = dict((id_, tag) for tag, id_ in tag2id.items())
                for i, ids in enumerate(batch_tagids):
                    tag_list = []
                    if self.crf:
                        for j in range(lengths[i] - 1):  # crf解码过程中，end被舍弃
                            tag_list.append(id2tag[ids[j].item()])
                    else:
                        for j in range(lengths[i]):
                            tag_list.append(id2tag[ids[j].item()])
                    pred_tag_lists.append(tag_list)

        # indices存有根据长度排序后的索引映射的信息
        # 比如若indices = [1, 2, 0] 则说明原先索引为1的元素映射到的新的索引是0，
        # 索引为2的元素映射到新的索引是1...
        # 下面根据indices将pred_tag_lists和tag_lists转化为原来的顺序
        ind_maps = sorted(list(enumerate(indices)), key=lambda e: e[1])
        indices, _ = list(zip(*ind_maps))
        pred_tag_lists = [pred_tag_lists[i] for i in indices]
        tag_lists = [tag_lists[i] for i in indices]

        return pred_tag_lists, tag_lists

In [15]:
def bilstm_train_and_eval(train_data, dev_data, test_data,
                          word2id, tag2id, crf=True, remove_O=False):
    train_word_lists, train_tag_lists = train_data
    dev_word_lists, dev_tag_lists = dev_data
    test_word_lists, test_tag_lists = test_data

    start = time.time()
    vocab_size = len(word2id)
    out_size = len(tag2id)
    bilstm_model = BILSTM_Model(vocab_size, out_size, crf=crf)
    bilstm_model.train(train_word_lists, train_tag_lists,
                       dev_word_lists, dev_tag_lists, word2id, tag2id)

    model_name = "bilstm_crf" if crf else "bilstm"
    save_model(bilstm_model, "./models/"+ "new" +model_name+".pkl")

    print("训练完毕,共用时{}秒.".format(int(time.time()-start)))
    pred_tag_lists, test_tag_lists = bilstm_model.test(
        test_word_lists, test_tag_lists, word2id, tag2id)

    return pred_tag_lists,test_tag_lists

In [108]:
#获取数据
data_path = 'data/all_data.txt'
word_lists,tag_lists,word2id,tag2id = build_corpus(data_path)

In [63]:
#数据处理
crf_word2id, crf_tag2id = extend_maps(word2id, tag2id, for_crf=True)
word_lists, tag_lists = prepocess_data_for_lstmcrf(word_lists, tag_lists)

In [64]:
len(word_lists),len(tag_lists)

(944823, 944823)

In [65]:
train_word_lists,dev_word_lists,test_word_lists = word_lists[:800000],word_lists[800000:900000],word_lists[900000:]
train_tag_lists,dev_tag_lists,test_tag_lists = tag_lists[:800000],tag_lists[800000:900000],tag_lists[900000:]

In [66]:
len(test_tag_lists)

44823

In [67]:
lstmcrf_pred = bilstm_train_and_eval(
    (train_word_lists, train_tag_lists),
    (dev_word_lists, dev_tag_lists),
    (test_word_lists, test_tag_lists),
    crf_word2id, crf_tag2id
)


Epoch 2, step/total_step: 23255/25001 93.02% Loss:0.2653
Epoch 2, step/total_step: 23260/25001 93.04% Loss:0.2574
Epoch 2, step/total_step: 23265/25001 93.06% Loss:0.2393
Epoch 2, step/total_step: 23270/25001 93.08% Loss:0.3358
Epoch 2, step/total_step: 23275/25001 93.10% Loss:0.2706
Epoch 2, step/total_step: 23280/25001 93.12% Loss:0.2356
Epoch 2, step/total_step: 23285/25001 93.14% Loss:0.2687
Epoch 2, step/total_step: 23290/25001 93.16% Loss:0.2401
Epoch 2, step/total_step: 23295/25001 93.18% Loss:0.2604
Epoch 2, step/total_step: 23300/25001 93.20% Loss:0.2813
Epoch 2, step/total_step: 23305/25001 93.22% Loss:0.2158
Epoch 2, step/total_step: 23310/25001 93.24% Loss:0.3183
Epoch 2, step/total_step: 23315/25001 93.26% Loss:0.2260
Epoch 2, step/total_step: 23320/25001 93.28% Loss:0.2508
Epoch 2, step/total_step: 23325/25001 93.30% Loss:0.3585
Epoch 2, step/total_step: 23330/25001 93.32% Loss:0.2698
Epoch 2, step/total_step: 23335/25001 93.34% Loss:0.3328
Epoch 2, step/total_step: 2334

In [68]:
pred_tag_lists,test_tag_lists = lstmcrf_pred

In [69]:
len(pred_tag_lists),len(test_tag_lists)

(44823, 44823)

In [70]:
len(pred_tag_lists[0]),len(test_tag_lists[0])

(47, 48)

In [71]:
for i in range(len(test_tag_lists)):
    del(test_tag_lists[i][-1])

In [72]:
len(pred_tag_lists[2]),len(test_tag_lists[2])

(41, 41)

In [73]:
# pred_tag_lists[1],test_tag_lists[1]

In [74]:
def flatten_lists(lists):
    flatten_list = []
    for l in lists:
        if type(l) == list:
            flatten_list += l
        else:
            flatten_list.append(l)
    return flatten_list

In [75]:
predict_list = flatten_lists(pred_tag_lists)
true_list = flatten_lists(test_tag_lists)

In [76]:
metrics = Metrics(true_list, predict_list, remove_O=False)
metrics.report_scores()
metrics.report_confusion_matrix()

           precision    recall  f1-score   support
    I-HOS     0.9945    0.9546    0.9741    178553
    E-ORG     0.7874    0.9752    0.8713    106388
    B-LOC     0.9533    0.9999    0.9760     23647
    B-HOS     0.9854    0.9548    0.9698     23229
    B-ORG     0.7840    0.9711    0.8676    106388
    I-LOC     0.9906    0.9992    0.9949      3922
    I-ORG     0.9300    0.9829    0.9557    118658
    E-HOS     0.9951    0.9693    0.9821     23229
    E-LOC     0.9535    0.9997    0.9760     23647
        O     0.9945    0.9677    0.9809   1908018
avg/total     0.9729    0.9685    0.9696   2515679

Confusion Matrix:
          I-HOS   E-ORG   B-LOC   B-HOS   B-ORG   I-LOC   I-ORG   E-HOS   E-LOC       O 
  I-HOS  170449     392       1     201     806      18      51       0     141    6494 
  E-ORG      90  103753       0       0      12       0    1736      11       6     780 
  B-LOC       0       1   23644       0       0       1       0       0       0       1 
  B-HOS      

In [16]:
def extract_entity(sent,word2id,model,tag2id):
    sent_list = []
    for i in range(len(sent)):
        sent_list.append(sent[i])
    sent_list.append('<end>')

    pred_tag_lists, test_tag_lists = model.test(
        [sent_list], [[]], word2id, tag2id)
  
    pred_tag_lists = pred_tag_lists[0]

    entity_idxs = []
    location_idxs = []
    hospital_idxs = []
    for i in range(len(pred_tag_lists)):
        entity_idx1 = []
        if pred_tag_lists[i]=='B-ORG':
            entity_idx1.append(i)
            for j in range(i+1,len(pred_tag_lists)):
                if pred_tag_lists[j]=='E-ORG':
                    entity_idx1.append(j)
                    break
        entity_idx2 = []
        if pred_tag_lists[i]=='B-LOC':
            entity_idx2.append(i)
            for j in range(i+1,len(pred_tag_lists)):
                if pred_tag_lists[j]=='E-LOC':
                    entity_idx2.append(j)
                    break
        entity_idx3 = []
        if pred_tag_lists[i]=='B-HOS':
            entity_idx3.append(i)
            for j in range(i+1,len(pred_tag_lists)):
                if pred_tag_lists[j]=='E-HOS':
                    entity_idx3.append(j)
                    break
            
        if entity_idx1!=[]:
            entity_idxs.append(entity_idx1)
        if entity_idx2!=[]:
            location_idxs.append(entity_idx2)
        if entity_idx3!=[]:
            hospital_idxs.append(entity_idx3)
    
    entity_list = []
    for pos in entity_idxs:
        words = sent[pos[0]:(pos[1]+1)]
        entity_list.append(words)
    
    location_list = []
    for pos in location_idxs:
        words = sent[pos[0]:(pos[1]+1)]
        location_list.append(words)

    hospital_list = []
    for pos in hospital_idxs:
        words = sent[pos[0]:(pos[1]+1)]
        hospital_list.append(words)
    data_dict = {'项目':list(set(entity_list)),'地区':list(set(location_list)),'机构':list(set(hospital_list))}
    return data_dict

In [78]:
import json
json.dump(crf_word2id,open('data/new_word2id.txt','w'))
json.dump(crf_tag2id,open('data/new_tag2id.txt','w'))

In [17]:
model = load_model('models/pro_city_hosbilstm_crf.pkl')

In [23]:
crf_word2id = json.load(open('data/pro_city_hos_word2id.txt'))
crf_tag2id = json.load(open('data/pro_city_hos_tag2id.txt'))

In [25]:
crf_tag2id

{'O': 0,
 'B-ORG': 1,
 'I-ORG': 2,
 'E-ORG': 3,
 'B-LOC': 4,
 'E-LOC': 5,
 'I-LOC': 6,
 '<pad>': 7,
 '<start>': 8,
 '<end>': 9}

In [26]:
sent = '我的鼻子非常扁平，而且鼻子也非常不好看，听说我这种情况去做综合隆鼻的手术是比较好的，想问一下北京的综合隆鼻价格是多少？请问扬州广陵艾菲斯医疗美容门诊部的价格怎么样?'
extract_entity(sent,crf_word2id,model,crf_tag2id)

IndexError: list index out of range

In [114]:
sent = '武汉轻工大学怎么样?'
extract_entity(sent,crf_word2id,model,crf_tag2id)

{'项目': [], '地区': ['武汉'], '机构': []}