## 书中中文分词示例的MindSpore实现

该实现完全参考了MindSpore官网上r1.7的“LSTM+CRF实现序列标注”教程。该教程网址为：https://www.mindspore.cn/tutorials/application/zh-CN/r1.7/nlp/sequence_labeling.html

其中条件随机场部分代码的原理，可参考官网和原版书相关内容。

In [1]:
def compute_score(emissions, tags, seq_ends, mask, trans, start_trans, end_trans):
    """Score the given tag sequences (the CRF numerator, in log space).

    Args:
        emissions: emission scores, shape (seq_length, batch_size, num_tags).
        tags: tag indices of the path to score, shape (seq_length, batch_size).
        seq_ends: index of the last valid step of every sequence, used to
            pick each sequence's final tag.
        mask: validity mask, shape (seq_length, batch_size).
        trans: tag-to-tag transition scores, indexed as [previous, current].
        start_trans: start transition scores, indexed by tag.
        end_trans: end transition scores, indexed by tag.

    Returns:
        Path score of every sequence, shape (batch_size,).
    """
    num_steps, batch_size = tags.shape
    # Cast once so the mask can be multiplied with the float scores.
    weights = mask.astype(emissions.dtype)
    # Row selector that pairs every batch element with its own tag.
    batch_index = mnp.arange(batch_size)

    # Start transition into the first tag.
    # shape: (batch_size,)
    first_tags = tags[0]
    score = start_trans[first_tags]
    # Emission of the first tag.
    # shape: (batch_size,)
    score += emissions[0, batch_index, first_tags]

    for step in range(1, num_steps):
        step_weight = weights[step]
        current_tags = tags[step]

        # Transition from the previous tag to the current one; it only
        # counts where the mask is 1.
        # shape: (batch_size,)
        score += trans[tags[step - 1], current_tags] * step_weight

        # Emission of the current tag; it only counts where the mask is 1.
        # shape: (batch_size,)
        score += emissions[step, batch_index, current_tags] * step_weight

    # Tag found at the last valid position of every sequence.
    # shape: (batch_size,)
    final_tags = tags[seq_ends, batch_index]
    # End transition out of that tag.
    # shape: (batch_size,)
    score += end_trans[final_tags]

    return score

In [2]:
def _log_sum_exp(values, axis):
    """Return log(sum(exp(values))) along `axis` without overflowing.

    The maximum along `axis` is subtracted before exponentiating, so the
    largest exponent is exp(0) = 1. Assumes the scores are finite.
    """
    peak = values.max(axis=axis)
    shifted = values - peak.expand_dims(axis)
    return peak + mnp.log(mnp.sum(mnp.exp(shifted), axis=axis))


def compute_normalizer(emissions, mask, trans, start_trans, end_trans):
    """Compute the log partition function (the CRF denominator).

    Args:
        emissions: emission scores, shape (seq_length, batch_size, num_tags).
        mask: validity mask, shape (seq_length, batch_size).
        trans: tag-to-tag transition scores, shape (num_tags, num_tags).
        start_trans: start transition scores, shape (num_tags,).
        end_trans: end transition scores, shape (num_tags,).

    Returns:
        Log-sum-exp of the scores of all possible tag paths, shape
        (batch_size,).
    """
    seq_length = emissions.shape[0]

    # Start transition plus the first emission.
    # shape: (batch_size, num_tags)
    score = start_trans + emissions[0]

    for i in range(1, seq_length):
        # Add a trailing axis so the score broadcasts over the next tag.
        # shape: (batch_size, num_tags, 1)
        broadcast_score = score.expand_dims(2)

        # Add a middle axis so the emission broadcasts over the previous tag.
        # shape: (batch_size, 1, num_tags)
        broadcast_emissions = emissions[i].expand_dims(1)

        # Score of reaching every current tag from every previous tag.
        # broadcast_score already holds the log-sum-exp of all paths from
        # step 0 up to the previous step.
        # shape: (batch_size, num_tags, num_tags)
        next_score = broadcast_score + trans + broadcast_emissions

        # Marginalise over the previous tag. The stable form is required:
        # a plain exp() overflows once accumulated scores grow large.
        # shape: (batch_size, num_tags)
        next_score = _log_sum_exp(next_score, 1)

        # Only advance the score where the mask is 1.
        # shape: (batch_size, num_tags)
        score = mnp.where(mask[i].expand_dims(1), next_score, score)

    # Add the end transition.
    # shape: (batch_size, num_tags)
    score += end_trans
    # Log-sum-exp over the final tag covers every possible path.
    # shape: (batch_size,)
    return _log_sum_exp(score, 1)

In [3]:
def viterbi_decode(emissions, mask, trans, start_trans, end_trans):
    """Run the forward pass of Viterbi decoding.

    Args:
        emissions: emission scores, shape (seq_length, batch_size, num_tags).
        mask: validity mask, shape (seq_length, batch_size).
        trans: tag-to-tag transition scores.
        start_trans: start transition scores.
        end_trans: end transition scores.

    Returns:
        A tuple (score, history): the best score per final tag including the
        end transition, and a tuple holding one back-pointer tensor for
        every step after the first.
    """
    num_steps = mask.shape[0]

    best_score = start_trans + emissions[0]
    backpointers = ()

    for step in range(1, num_steps):
        # Score of every (previous tag, current tag) pair at this step.
        candidates = best_score.expand_dims(2) + trans + emissions[step].expand_dims(1)

        # Remember which previous tag gives the best score for each
        # current tag.
        backpointers += (candidates.argmax(axis=1),)

        # Only advance the score where the mask is 1.
        step_best = candidates.max(axis=1)
        best_score = mnp.where(mask[step].expand_dims(1), step_best, best_score)

    best_score += end_trans

    return best_score, backpointers

def post_decode(score, history, seq_length):
    """Backtrack through the Viterbi history to get the best tag paths.

    Args:
        score: final scores as returned by `viterbi_decode`, indexed by
            batch element.
        history: back-pointer tensors as returned by `viterbi_decode`.
        seq_length: actual length of every sequence, shape (batch_size,).

    Returns:
        A list with one list of tag indices (as Python ints, in forward
        order) per batch element.
    """
    num_samples = seq_length.shape[0]
    last_positions = seq_length - 1
    decoded = []

    # Decode the batch one sample at a time.
    for sample in range(num_samples):
        # Start from the tag with the highest final score.
        tag = score[sample].argmax(axis=0)
        path = [int(tag.asnumpy())]

        # Walk the back-pointers of the valid steps from last to first,
        # each time following the most recently chosen tag.
        valid_history = history[:last_positions[sample]]
        for pointers in reversed(valid_history):
            tag = pointers[sample][path[-1]]
            path.append(int(tag.asnumpy()))

        # The path was collected back to front; restore forward order.
        path.reverse()
        decoded.append(path)

    return decoded

In [4]:
import mindspore
import mindspore.nn as nn
import mindspore.numpy as mnp
from mindspore import Parameter
from mindspore.common.initializer import initializer, Uniform

def sequence_mask(seq_length, max_length, batch_first=False):
    """Build a 0/1 mask from the actual sequence lengths.

    Position p of a sequence is marked 1 when p is smaller than that
    sequence's length. The result is int64, laid out batch-major when
    `batch_first` is true and with the first two axes swapped otherwise.
    """
    positions = mnp.arange(0, max_length, 1, seq_length.dtype)
    # Trailing axis of size 1 so the lengths broadcast against positions.
    lengths_column = seq_length.view(seq_length.shape + (1,))
    mask = (positions < lengths_column).astype(mindspore.int64)
    return mask if batch_first else mask.swapaxes(0, 1)

class CRF(nn.Cell):
    """Linear-chain conditional random field layer.

    Called with tags it returns the negative log-likelihood of those tags
    (a loss); called without tags it returns the Viterbi score and history,
    which `post_decode` turns into tag sequences.

    Args:
        num_tags: number of distinct tags; must be positive.
        batch_first: whether inputs are laid out (batch, seq, ...) instead of
            (seq, batch, ...).
        reduction: how the per-sequence loss is reduced: 'none', 'sum',
            'mean' (over the batch) or 'token_mean' (over unmasked tokens).

    Raises:
        ValueError: if `num_tags` is not positive or `reduction` is unknown.
    """

    def __init__(self, num_tags: int, batch_first: bool = False, reduction: str = 'sum') -> None:
        if num_tags <= 0:
            raise ValueError(f'invalid number of tags: {num_tags}')
        super().__init__()
        if reduction not in ('none', 'sum', 'mean', 'token_mean'):
            raise ValueError(f'invalid reduction: {reduction}')
        self.num_tags = num_tags
        self.batch_first = batch_first
        self.reduction = reduction
        self.start_transitions = Parameter(initializer(Uniform(0.1), (num_tags,)), name='start_transitions')
        self.end_transitions = Parameter(initializer(Uniform(0.1), (num_tags,)), name='end_transitions')
        self.transitions = Parameter(initializer(Uniform(0.1), (num_tags, num_tags)), name='transitions')

    def construct(self, emissions, tags=None, seq_length=None):
        """Return the loss when `tags` is given, otherwise decode."""
        if tags is None:
            return self._decode(emissions, seq_length)
        return self._forward(emissions, tags, seq_length)

    def _forward(self, emissions, tags=None, seq_length=None):
        """Compute the negative log-likelihood of `tags`, then reduce it."""
        if self.batch_first:
            batch_size, max_length = tags.shape
            # The score helpers expect a sequence-major layout.
            emissions = emissions.swapaxes(0, 1)
            tags = tags.swapaxes(0, 1)
        else:
            max_length, batch_size = tags.shape

        if seq_length is None:
            # No lengths given: treat every sequence as full length.
            seq_length = mnp.full((batch_size,), max_length, mindspore.int64)

        # shape: (max_length, batch_size)
        mask = sequence_mask(seq_length, max_length)

        # Score of the given path.
        # shape: (batch_size,)
        numerator = compute_score(emissions, tags, seq_length - 1, mask,
                                  self.transitions, self.start_transitions, self.end_transitions)
        # Log partition function over all paths.
        # shape: (batch_size,)
        denominator = compute_normalizer(emissions, mask,
                                         self.transitions, self.start_transitions, self.end_transitions)
        # Negative log-likelihood, so smaller is better.
        # shape: (batch_size,)
        nll = denominator - numerator

        if self.reduction == 'none':
            return nll
        if self.reduction == 'sum':
            return nll.sum()
        if self.reduction == 'mean':
            return nll.mean()
        # 'token_mean': average over the number of unmasked tokens.
        return nll.sum() / mask.astype(emissions.dtype).sum()

    def _decode(self, emissions, seq_length=None):
        """Run Viterbi decoding and return (score, history)."""
        if self.batch_first:
            batch_size, max_length = emissions.shape[:2]
            emissions = emissions.swapaxes(0, 1)
        else:
            # Sequence-major input: the first axis is the sequence length,
            # the second the batch size.
            max_length, batch_size = emissions.shape[:2]

        if seq_length is None:
            # No lengths given: treat every sequence as full length.
            seq_length = mnp.full((batch_size,), max_length, mindspore.int64)

        mask = sequence_mask(seq_length, max_length)

        return viterbi_decode(emissions, mask, self.transitions, self.start_transitions, self.end_transitions)

In [5]:
class BiLSTM_CRF(nn.Cell):
    """Sequence tagger: embedding -> bidirectional LSTM -> dense -> CRF.

    Args:
        vocab_size: number of rows in the embedding table.
        embedding_dim: size of each embedding vector.
        hidden_dim: input width of the dense projection; each LSTM
            direction is built with hidden_dim // 2 units.
        num_tags: number of output tags.
        padding_idx: index handed to the embedding as its padding index.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_tags, padding_idx=0):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=padding_idx)
        # Each direction gets half of hidden_dim.
        units_per_direction = hidden_dim // 2
        self.lstm = nn.LSTM(embedding_dim, units_per_direction, bidirectional=True, batch_first=True)
        self.hidden2tag = nn.Dense(hidden_dim, num_tags, 'he_uniform')
        self.crf = CRF(num_tags, batch_first=True)

    def construct(self, inputs, seq_length, tags=None):
        """Return the CRF output: a loss with `tags`, decode results without."""
        embedded = self.embedding(inputs)
        lstm_out, _ = self.lstm(embedded, seq_length=seq_length)
        emissions = self.hidden2tag(lstm_out)
        return self.crf(emissions, tags, seq_length)

## 以上为官网上实现的模型，下面用该模型来实现中文分词示例

In [6]:
import numpy as np

# Sentence segmented by the trained model at the end of the notebook.
test_str = "中国首次火星探测任务天问一号探测器实施近火捕获制动"

# Each entry is a one-element list: the unsegmented sentence, and its
# per-character label string (S = single, B = begin, M = middle, E = end).
new_sents = []
sents_labels = []
# The `with` block closes the corpus file even if parsing fails.
with open("traindata.txt", encoding='utf-8') as file:
    for line in file:
        # Every line holds one sentence whose words are separated by
        # whitespace.
        words = line.split()
        new_sent = ''.join(words)
        sent_labels = ''.join(
            'S' if len(word) == 1 else 'B' + 'M' * (len(word) - 2) + 'E'
            for word in words
        )
        # Skip blank lines.
        if new_sent != '':
            new_sents.append([new_sent])
            sents_labels.append([sent_labels])
print("训练样本准备完毕！")
print('共有数据 %d 条' % len(new_sents))
print('平均长度：', np.mean([len(d[0]) for d in new_sents]))

训练样本准备完毕！
共有数据 62946 条
平均长度： 8.67100371747212


In [7]:
import re
import numpy as np

# Key hyper-parameters.
# Tag vocabulary: S = 0, B = 1, M = 2, E = 3, X = 4, where X labels padding.
tags = {label: index for index, label in enumerate('SBMEX')}
# Size of the character embedding vectors.
embedding_size = 32
# Sequence length: longer sentences are truncated, shorter ones padded with 0.
maxlen = 32
# Hidden size handed to the model.
hidden_size = 32

In [8]:
# 1. Collect every character that occurs in the corpus and build the
#    character <-> id dictionaries.
stat = {}
for sent_item in new_sents:
    for ch in sent_item[0]:
        stat[ch] = stat.get(ch, 0) + 1
# Most frequent characters first.
stat = sorted(stat.items(), key=lambda item: item[1], reverse=True)
vocab = [entry[0] for entry in stat]
print("不同字的个数：" + str(len(vocab)))
# Id 0 is reserved for padding, so real characters are numbered from 1.
char2id = {}
id2char = {}
for idx, ch in enumerate(vocab, start=1):
    char2id[ch] = idx
    id2char[idx] = ch
print("字典创建完毕！")

不同字的个数：3878
字典创建完毕！


In [9]:
# 2. Turn the training sentences into fixed-length training samples.
trainX = []
trainX_len = []
trainY = []
for sent_item, label_item in zip(new_sents, sents_labels):
    sent = sent_item[0]
    labe = label_item[0]
    # Sentences longer than maxlen are truncated.
    replace_len = min(len(sent), maxlen)
    x = [0] * maxlen          # default: padding id
    y = [tags['X']] * maxlen  # default: padding tag X
    x[:replace_len] = [char2id[ch] for ch in sent[:replace_len]]
    y[:replace_len] = [tags[lab] for lab in labe[:replace_len]]
    trainX.append(x)
    # Store the truncated length: the model uses it to index into the
    # maxlen-wide tensors, so it must never exceed maxlen.
    trainX_len.append(replace_len)
    trainY.append(y)
trainX = np.array(trainX)
trainX_len = np.array(trainX_len)
trainY = np.array(trainY)
print("训练样本准备完毕，训练样本共" + str(len(trainX)) + "句。")

训练样本准备完毕，训练样本共62946句。


In [10]:
# Convert the three training arrays to int64 MindSpore tensors.
trainX, trainX_len, trainY = (
    mindspore.Tensor(array, mindspore.int64)
    for array in (trainX, trainX_len, trainY)
)

In [11]:
# 3. Build the model and prepare it for training.

# One extra embedding row because id 0 is the padding id.
vocab_size = len(char2id) + 1
model = BiLSTM_CRF(
    vocab_size=vocab_size,
    embedding_dim=embedding_size,
    hidden_dim=hidden_size,
    num_tags=len(tags),
)
optimizer = nn.Adam(model.trainable_params(), learning_rate=0.01, weight_decay=1e-4)

# Wrap the model and optimizer into a cell that performs one training step
# per call.
train_one_step = nn.TrainOneStepCell(model, optimizer)

# Compile ahead of time with the training inputs.
train_one_step.compile(trainX, trainX_len, trainY)

In [15]:
from tqdm import tqdm

# Number of training steps; every step feeds the whole training set at once.
steps = 3
with tqdm(total=steps) as progress:
    for step in range(steps):
        loss = train_one_step(trainX, trainX_len, trainY)
        # Show the latest loss next to the progress bar.
        progress.set_postfix(loss=loss)
        progress.update(1)

100%|████████████████████████| 10/10 [1:03:49<00:00, 382.92s/it, loss=578476.4]


In [18]:
# 4. Segment text with the trained model.
def predict(testsent):
    """Segment `testsent` into words and print them separated by spaces.

    Only the first `maxlen` characters are tagged by the model, matching the
    truncation applied to the training data; any remaining characters are
    printed unsegmented after the tagged part. Characters missing from
    `char2id` are fed to the model as the padding id 0.
    """
    if not testsent:
        # Nothing to segment; skip the model call on a zero-length input.
        print('')
        return

    # Convert the sentence into the fixed-length id sequence the model expects.
    replace_len = min(len(testsent), maxlen)
    x = [0] * maxlen
    for j in range(replace_len):
        x[j] = char2id.get(testsent[j], 0)

    # Run the model. The length passed on must be the truncated one, because
    # the input tensor is only maxlen wide.
    xx = mindspore.Tensor([x], mindspore.int64)
    xx_len = mindspore.Tensor([replace_len], mindspore.int64)
    score, history = model(xx, xx_len)
    label = post_decode(score, history, xx_len)[0]

    # Cut the sentence according to the predicted tags.
    pieces = []
    for ch, tag in zip(testsent, label):
        if tag == 0 or tag == 3:
            # S (single character) and E (word end) close a word.
            pieces.append(ch + ' ')
        else:
            # B, M, and the padding tag X keep the character attached to the
            # following one, so no input character is dropped.
            pieces.append(ch)
    # Characters beyond maxlen were not tagged; keep them as they are.
    pieces.append(testsent[replace_len:])
    print(''.join(pieces))

In [19]:
# Segment the sample sentence stored in `test_str` and print the result.
predict(test_str)

中国 首次 火星 探测 任务 天问 一号 探测器 实施 近火 捕获 制动 
