### 训练样本预处理 

In [1]:
import numpy as np

# Sample sentence that the segmenters in the later cells are run on.
test_str = "中国首次火星探测任务天问一号探测器实施近火捕获制动"

# Turn the whitespace-segmented corpus into parallel lists:
#   new_sents[i]    -> [sentence with the spaces removed]
#   sents_labels[i] -> [one S/B/M/E tag per character of that sentence]
# Each entry is wrapped in a one-element list because the later cells
# index it as new_sents[i][0] / sents_labels[i][0].
new_sents = []
sents_labels = []
# The context manager closes the corpus file even if parsing fails
# (the previous version never closed it).
with open("traindata.txt", encoding='utf-8') as file:
    for line in file:
        new_sent = ''
        sent_labels = ''
        for word in line.split():
            new_sent += word
            if len(word) == 1:
                # Single-character word.
                sent_labels += 'S'
            elif len(word) >= 2:
                # Begin, zero or more Middle, End.
                sent_labels += 'B' + 'M' * (len(word) - 2) + 'E'
        if new_sent != '':  # skip blank lines
            new_sents.append([new_sent])
            sents_labels.append([sent_labels])
print("训练样本准备完毕！")
print('共有数据 %d 条' % len(new_sents))
print('平均长度：', np.mean([len(d[0]) for d in new_sents]))

训练样本准备完毕！
共有数据 62946 条
平均长度： 8.67100371747212


In [2]:
# Display the first training sentence; each entry is a one-element list holding the text with spaces removed.
new_sents[0]

['坚持从严治党落实管党治党责任']

In [3]:
# Display the tag string for the first sentence: one S/B/M/E character per character of the sentence.
sents_labels[0]

['BEBEBEBESSBEBE']

### 隐马模型实现

In [4]:
# Estimate the initial-state distribution pi from the tag of each
# sentence's first character. Only 'S' and 'B' are counted, so the
# entries for 'M' and 'E' stay at zero.
state = ['S', 'B', 'M', 'E']
pi = np.zeros(4)
for label_entry in sents_labels:
    first_tag = label_entry[0][0]
    if first_tag == 'S':
        pi[0] += 1
    elif first_tag == 'B':
        pi[1] += 1
# Turn the counts into probabilities.
pi /= np.sum(pi)

In [5]:
# Accumulate raw counts for the transition matrix A (tag -> next tag)
# and the emission matrix B (tag -> character).
A = np.zeros((4, 4))
# B has 65536 columns; a character's column is its ord() value.
B = np.zeros((4, 65536))
for sent_entry, label_entry in zip(new_sents, sents_labels):
    tag_ids = [state.index(tag) for tag in label_entry[0]]
    # Emission counts: one (tag, character) pair per position.
    for tag_id, ch in zip(tag_ids, sent_entry[0]):
        B[tag_id, ord(ch)] += 1
    # Transition counts: one (tag, following tag) pair per adjacent position.
    for prev_id, next_id in zip(tag_ids, tag_ids[1:]):
        A[prev_id, next_id] += 1

In [6]:
# Normalise each row of A into a probability distribution; rows with no
# observed transitions are left as all zeros.
for row in A:
    row_total = np.sum(row)
    if row_total != 0:
        row /= row_total  # `row` is a view, so this updates A in place
print(A)

[[0.33219523 0.66780477 0.         0.        ]
 [0.         0.         0.13972527 0.86027473]
 [0.         0.         0.29685786 0.70314214]
 [0.34044534 0.65955466 0.         0.        ]]


In [7]:
# Normalise each row of B into a probability distribution over characters.
# Rows with no observations are skipped, matching the guard used for A;
# dividing by a zero sum would fill the row with NaN.
for i in range(4):
    row_total = np.sum(B[i])
    if row_total != 0:
        B[i] /= row_total

In [8]:
from hmmlearn import hmm

# The observations passed to this model are single integer symbols per time
# step, i.e. a categorical emission model. Newer hmmlearn releases expose that
# model as CategoricalHMM and repurpose MultinomialHMM for count vectors
# (see the hmmlearn changelog), so prefer CategoricalHMM when it exists and
# fall back to MultinomialHMM on older releases that lack it.
_hmm_cls = getattr(hmm, "CategoricalHMM", hmm.MultinomialHMM)
model = _hmm_cls(n_components=4)
# Plug in the parameters estimated by counting instead of fitting with EM.
model.startprob_ = pi
model.emissionprob_ = B
model.transmat_ = A

In [9]:
# Encode every character of the test sentence as its ord() value, shaped as
# a column vector (one observation per row), then decode the tag sequence.
test_data = np.array([ord(ch) for ch in test_str]).reshape(-1, 1)
states = model.predict(test_data)
print(states)

[1 3 1 3 1 3 1 3 1 3 0 1 2 3 1 2 3 1 3 1 3 1 3 1 3]


In [10]:
# Rebuild the sentence, inserting a space after every character tagged
# 0 ('S') or 3 ('E'), i.e. at the end of each word.
pieces = []
for ch, tag in zip(test_str, states):
    pieces.append((ch + ' ') if (tag == 0 or tag == 3) else ch)
test_out = "".join(pieces).strip()
print(test_out)

中国 首次 火星 探测 任务 天 问一号 探测器 实施 近火 捕获 制动


### 条件随机场实现 

In [37]:
# Write the training corpus in CRF++ format to crf_train_file:
# one "<char> <tag>" line per character, with a blank line after each sentence.
# `with` guarantees the file is flushed and closed even if a write fails.
crf_train_file = "crf_train_file"
with open(crf_train_file, 'w', encoding='utf-8') as output_file:
    for sent_entry, label_entry in zip(new_sents, sents_labels):
        for ch, tag in zip(sent_entry[0], label_entry[0]):
            output_file.write(ch + ' ' + tag + '\n')
        output_file.write('\n')

# Write the test sentence in CRF++ format to crf_test_file:
# one character per line, no tag column.
crf_test_file = "crf_test_file"
with open(crf_test_file, 'w', encoding='utf-8') as output_file:
    for ch in test_str:
        output_file.write(ch + '\n')

#### 将 crf_learn.exe、crf_test.exe、libcrfpp.dll 文件拷贝到当前工作目录下，并定义一个模板文件 template。在控制台环境下，执行 `crf_learn template crf_train_file crf_model` 命令进行训练，得到模型文件 crf_model。

#### 在控制台环境下，执行 `crf_test -m crf_model crf_test_file > crf_test_output` 命令，得到测试语句的输出文件 crf_test_output。

In [43]:
# Convert the CRF++ output into a readable, space-separated segmentation.
# Lines that split into exactly two fields are treated as "<char> <tag>";
# anything else (e.g. blank separator lines) is ignored.
# The result is kept in `segmented` rather than `str`: rebinding the builtin
# `str` breaks the later cells that call str(...).
crf_test_output = "crf_test_output"
segmented_parts = []
with open(crf_test_output, encoding='utf-8') as input_file:
    for line in input_file:
        fields = line.split()
        if len(fields) == 2:
            char, tag = fields
            # A word ends on 'E' (end) or 'S' (single), so add a space there.
            if tag == 'E' or tag == 'S':
                segmented_parts.append(char + ' ')
            else:
                segmented_parts.append(char)
segmented = "".join(segmented_parts)
print(segmented)

中国 首次 火星 探测 任务 天问 一 号 探测器 实施 近火 捕获 制动 


### TensorFlow2框架下循环神经网络实现 

In [4]:
import re
import numpy as np
import tensorflow as tf
import tensorflow.keras as keras

# Key hyper-parameters
# Tag -> class id; 'X' is the extra class used to label padded positions.
tags = {label: index for index, label in enumerate('SBMEX')}
embedding_size = 32  # character embedding dimension
maxlen = 32  # fixed sequence length: longer inputs are truncated, shorter ones padded with 0
hidden_size = 32
batch_size = 64
epochs = 1
checkpointfilepath = 'weights.best.hdf5'  # file for intermediate (checkpoint) weights
modepath = 'dz.h5'  # file the final model is saved to

In [5]:
# 1. Collect every character used in the corpus and build the lookup tables.
stat = {}
for sent_entry in new_sents:
    for ch in sent_entry[0]:
        stat[ch] = stat.get(ch, 0) + 1
# Order characters from most to least frequent.
stat = sorted(stat.items(), key=lambda item: item[1], reverse=True)
vocab = [ch for ch, count in stat]
print("不同字的个数：" + str(len(vocab)))
# Id 0 is reserved as the padding value, so real characters start at 1.
char2id = {ch: idx for idx, ch in enumerate(vocab, start=1)}
id2char = {idx: ch for idx, ch in enumerate(vocab, start=1)}
print("字典创建完毕！")

不同字的个数：3878
字典创建完毕！


In [6]:
# 2. Convert the training sentences into fixed-length id / tag sequences.
trainX = []
trainY = []
for sent_entry, label_entry in zip(new_sents, sents_labels):
    # Truncate to maxlen, then right-pad up to maxlen.
    sent = sent_entry[0][:maxlen]
    labe = label_entry[0][:maxlen]
    pad = maxlen - len(sent)
    trainX.append([char2id[ch] for ch in sent] + [0] * pad)  # 0 = padding id
    trainY.append([tags[tag] for tag in labe] + [4] * pad)   # 4 = tag 'X'
trainX = np.array(trainX)
# One-hot encode the tags over the 5 classes.
trainY = tf.keras.utils.to_categorical(trainY, 5)
print("训练样本准备完毕，训练样本共" + str(len(trainX)) + "句。")

训练样本准备完毕，训练样本共62946句。


In [17]:
# 3. Build the model: embedding -> 2 x (BiLSTM + dropout) -> per-step softmax.
from tensorflow.keras.layers import Input, Dense, Embedding, LSTM, Dropout, TimeDistributed, Bidirectional
from tensorflow.keras.models import Model
from tensorflow.keras.callbacks import ModelCheckpoint

X = Input(shape=(maxlen,), dtype='int32')
# input_dim is len(vocab) + 1 because id 0 is the padding value; mask_zero=True
# makes the downstream layers skip those padded positions.
embedding = Embedding(
    input_dim=len(vocab) + 1,
    output_dim=embedding_size,
    input_length=maxlen,
    mask_zero=True,
)(X)
blstm = embedding
# Two identical stages: a bidirectional LSTM returning the full sequence
# (forward and backward outputs concatenated), followed by dropout.
for stage in range(2):
    blstm = Bidirectional(LSTM(hidden_size, return_sequences=True), merge_mode='concat')(blstm)
    blstm = Dropout(0.4)(blstm)
# Per-time-step distribution over the 5 tag classes.
output = TimeDistributed(Dense(5, activation='softmax'))(blstm)
model = Model(X, output)
model.summary()

Model: "model"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 32)]              0         
_________________________________________________________________
embedding (Embedding)        (None, 32, 32)            124128    
_________________________________________________________________
bidirectional (Bidirectional (None, 32, 64)            16640     
_________________________________________________________________
dropout (Dropout)            (None, 32, 64)            0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, 32, 64)            24832     
_________________________________________________________________
dropout_1 (Dropout)          (None, 32, 64)            0         
_________________________________________________________________
time_distributed (TimeDistri (None, 32, 5)             325   

In [18]:
import os

# Resume from the weights saved by a previous run, if a checkpoint exists;
# together with the ModelCheckpoint callback below this keeps training progress.
if os.path.exists(checkpointfilepath):
    print("加载前次训练模型参数。。。")
    model.load_weights(checkpointfilepath)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
# `monitor` must match the key Keras logs for the metric. With
# metrics=['accuracy'] under TF2 that key is 'accuracy'; the old 'acc' key is
# never logged, so save_best_only would skip every checkpoint.
checkpoint = ModelCheckpoint(checkpointfilepath, monitor='accuracy', verbose=1, save_best_only=True,
                            mode='max')
model.fit(trainX, trainY, batch_size=batch_size, epochs=epochs, callbacks=[checkpoint])
model.save(modepath)
#print(model.evaluate(trainX, trainY, batch_size=batch_size))



In [19]:
# 4.利用训练好的模型进行分词
def predict(testsent):
    """Segment ``testsent`` with the trained network and print the result.

    The sentence is processed in consecutive windows of ``maxlen`` characters,
    so inputs longer than ``maxlen`` are handled instead of raising
    ``IndexError``. A word that straddles a window boundary may be split there.

    Args:
        testsent: The unsegmented sentence (a string).

    Returns:
        None. The segmented sentence is printed, with a space after every
        character tagged as a single-character word or as a word end.
    """
    pieces = []
    for start in range(0, len(testsent), maxlen):
        chunk = testsent[start:start + maxlen]
        # Encode the window as ids, right-padded with 0. Characters missing
        # from the vocabulary also map to 0 rather than raising KeyError.
        x = [0] * maxlen
        for j, ch in enumerate(chunk):
            x[j] = char2id.get(ch, 0)
        # One batch of a single sequence; take that sequence's per-step scores.
        label = np.array(model.predict(np.array([x])))[0]
        for j, ch in enumerate(chunk):
            # Choose among the four real tags only (S, B, M, E). The padding
            # tag (index 4) is ignored so no input character is ever dropped.
            tag = int(np.argmax(label[j][:4]))
            if tag == 0 or tag == 3:  # single-character word or word end
                pieces.append(ch + ' ')
            else:  # word begin or middle
                pieces.append(ch)
    s = ''.join(pieces)
    print(s)

In [20]:
# Segment the sample sentence with the trained network; predict() prints the result.
predict(test_str)

中国 首次 火星 探测 任务 天问 一 号 探测器 实施 近火 捕获 制动 
