操作系统：Linux

Python版本: 3.6.7

tensorflow版本：'1.13.0-rc1'

参与者 github： 
1. [zxxwin](https://github.com/zxxwin?tab=repositories)    
2. [bubblezhong](https://github.com/bubblezhong?tab=repositories)

In [1]:
!wget https://spaces.ac.cn/usr/uploads/2016/10/1372394625.zip
!unzip 1372394625.zip
!ls

--2019-02-17 04:49:13--  https://spaces.ac.cn/usr/uploads/2016/10/1372394625.zip
Resolving spaces.ac.cn (spaces.ac.cn)... 114.215.107.121
Connecting to spaces.ac.cn (spaces.ac.cn)|114.215.107.121|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 6386880 (6.1M) [application/zip]
Saving to: ‘1372394625.zip.1’


2019-02-17 04:49:16 (3.60 MB/s) - ‘1372394625.zip.1’ saved [6386880/6386880]

Archive:  1372394625.zip
replace msr_train.txt? [y]es, [n]o, [A]ll, [N]one, [r]ename: y
  inflating: msr_train.txt           
1372394625.zip	1372394625.zip.1  ckpt	data  msr_train.txt  sample_data


In [0]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
from tqdm import tqdm
import time

In [3]:
# 以字符串的形式读入所有数据
with open('msr_train.txt', 'rb') as msr_data:
    texts = msr_data.read().decode('gbk')
sentences = texts.split('\r\n')  # 根据换行切分


# 将不规范的内容（如每行的开头）去掉
def clean(s): 
    if '“/s' not in s: 
        return s.replace(' ”/s', '')
    elif '”/s' not in s:
        return s.replace('“/s ', '')
    elif '‘/s' not in s:
        return s.replace(' ’/s', '')
    elif '’/s' not in s:
        return s.replace('‘/s ', '')
    else:
        return s
# 把所有的句子拼接起来
texts = ''.join(map(clean, sentences)) 
print('Length of texts is %d' % len(texts))
print('Example of texts: \n', texts[:300])

Length of texts is 20247877
Example of texts: 
  人/b  们/e  常/s  说/s  生/b  活/e  是/s  一/s  部/s  教/b  科/m  书/e  ，/s  而/s  血/s  与/s  火/s  的/s  战/b  争/e  更/s  是/s  不/b  可/m  多/m  得/e  的/s  教/b  科/m  书/e  ，/s  她/s  确/b  实/e  是/s  名/b  副/m  其/m  实/e  的/s  ‘/s  我/s  的/s  大/b  学/e  ’/s  。/s   心/s  静/s  渐/s  知/s  春/s  似/s  海/s  ，/s  花/s  深/s  每/s  觉/s  影/s


In [4]:
# 重新以标点来划分，标点的形式如： ，/s      "/e      "/s
sentences = re.split('[，。！？、‘’“”]/[bems]', texts)
print('Sentences number:', len(sentences))
print('Sentence Example:\n', sentences[0])
print('Sentence Example:\n', sentences[1])

Sentences number: 331739
Sentence Example:
  人/b  们/e  常/s  说/s  生/b  活/e  是/s  一/s  部/s  教/b  科/m  书/e  
Sentence Example:
   而/s  血/s  与/s  火/s  的/s  战/b  争/e  更/s  是/s  不/b  可/m  多/m  得/e  的/s  教/b  科/m  书/e  


In [5]:
# 将一句话的文字和对应的标签分离
def get_words_labels(sentence):
  # sentence 可能是空格组成的字符串，因此返回的 words_labels 可能为空数组
  words_labels = re.findall('(.)/(.)', sentence)
  if words_labels:
    words_labels = np.asarray(words_labels)
    words = words_labels[:,0]
    labels = words_labels[:, 1]
    return words, labels
  else:
    return None

sentence_words = []
sentence_labels = []

# Tqdm 可以在 Python 长循环中添加一个进度提示信息，用户只需要封装任意的迭代器 tqdm(iterator)。
for sentence in tqdm(iter(sentences)):
    words_labels = get_words_labels(sentence)
    # 保证返回的 words_labels 非空 
    if words_labels:
      sentence_words.append(words_labels[0])
      sentence_labels.append(words_labels[1])


print("sentence_words 长度：", len(sentence_words))
print("sentence_words示例：")
print(sentence_words[1])
print("sentence_labels示例：")
print(sentence_labels[1])

331739it [00:04, 82584.86it/s]

sentence_words 长度： 321533
sentence_words示例：
['而' '血' '与' '火' '的' '战' '争' '更' '是' '不' '可' '多' '得' '的' '教' '科' '书']
sentence_labels示例：
['s' 's' 's' 's' 's' 'b' 'e' 's' 's' 'b' 'm' 'm' 'e' 's' 'b' 'm' 'e']





In [6]:
df_data = pd.DataFrame(index = range(len(sentence_words)))
df_data["sentence_words"] = sentence_words
df_data["sentence_labels"] = sentence_labels
sentence_length = list(map(lambda sentence: len(sentence), sentence_words))
df_data["sentence_length"] = sentence_length
df_data.head(5)

# 将所有句子的字全部合并到数组all_words中
all_words = []
for sentence_word in tqdm(iter(sentence_words)):
  all_words.extend(sentence_word)

pd_all_words = pd.Series(all_words)
# 对每个值进行计数并且排序
pd_all_words_count = pd_all_words.value_counts()
# pd_all_words_count.head(5)

# 得到pd_all_words_count的索引，也就得到了未重复的字的序列，并将其作为字典
dict_words = pd_all_words_count.index
# 得到与dict_words长度相同的序列，作为id。 
# 从 1 开始是因为我们用 0 来填充不满32个字符的空位
dict_words_ids = range(1, len(dict_words) + 1)

# x 用来填充不满32个字符的空位
word_labels = [ 'x', 's', 'b', 'm', 'e']
word_labels_ids = range(len(word_labels))

# 以文字为索引，方便得到文字的id
word2id = pd.Series(dict_words_ids, index = dict_words)
# 以id为索引，方便得到id对应的文字
id2word = pd.Series(dict_words, index = dict_words_ids)
# 对 labels 进行同样处理
label2id = pd.Series(word_labels_ids, index = word_labels)
id2label = pd.Series(word_labels, index = word_labels_ids)

# 字典长度
dict_size = len(dict_words)
print("字典的长度", dict_size)

321533it [00:01, 205511.44it/s]


字典的长度 5158


In [7]:
maxLen = 32

# 把句子中的文字转换成id，固定长度为32，不足32的部分用0补齐
def sentence_padding(sentence):
  sentence_ids = list(word2id[sentence])
  if len(sentence_ids) > maxLen:
    return sentence_ids[: maxLen]
  sentence_ids.extend( [0] * (maxLen - len(sentence_ids)) )
  return sentence_ids


# 把文字对应的标记转换成id，固定长度为32，不足32的部分用标记x对应的0补齐
def label_padding(labels):
  label_ids = list(label2id[labels])
  if len(label_ids) > maxLen:
    return label_ids[: maxLen]
  label_ids.extend( [0] * (maxLen - len(label_ids)) )
  return label_ids
  

%time df_data['X'] = df_data["sentence_words"].apply(sentence_padding)
%time df_data['Y'] = df_data["sentence_labels"].apply(label_padding)

CPU times: user 2min 40s, sys: 5.62 s, total: 2min 45s
Wall time: 2min 37s
CPU times: user 2min 40s, sys: 5.72 s, total: 2min 45s
Wall time: 2min 37s


In [8]:
X = np.asarray(list( df_data['X'].values ))
Y = np.asarray(list( df_data['Y'].values ))

print("X.shape", X.shape)
print("Y.shape", Y.shape)
print("X样例",X[0])
print("Y样例",Y[0])

X.shape (321533, 32)
Y.shape (321533, 32)
X样例 [  8  43 320  88  36 198   7   2  41 163 124 245   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0]
Y样例 [2 4 1 1 2 4 1 1 1 2 3 4 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]


In [9]:
import pickle
import os

# 保存处理好的数据

if not os.path.exists('data/'):
  os.makedirs('data/')

with open('data/data.pkl', 'wb') as output:
  %time pickle.dump(X, output)
  %time pickle.dump(Y, output)
  pickle.dump(word2id, output)
  pickle.dump(id2word, output)
  pickle.dump(label2id, output)
  pickle.dump(id2label, output)
print("数据已保存至data/data.pkl")

CPU times: user 37.7 ms, sys: 129 ms, total: 167 ms
Wall time: 167 ms
CPU times: user 32.6 ms, sys: 94.2 ms, total: 127 ms
Wall time: 190 ms
数据已保存至data/data.pkl


In [10]:
# 导入数据
import pickle

with open('data/data.pkl', 'rb') as _input:
  X = pickle.load(_input)
  Y = pickle.load(_input)
  word2id = pickle.load(_input)
  id2word = pickle.load(_input)
  label2id = pickle.load(_input)
  id2label = pickle.load(_input)



# 划分测试集/训练集/验证集
dataLen = len(X)
testNum = int(dataLen * 0.2)
X_test = X[dataLen - testNum:dataLen, :]
Y_test = Y[dataLen - testNum:dataLen, :]


X_train_valid = X[0:dataLen - testNum, :]
Len = len(X_train_valid)
validNum = int(Len * 0.2)
X_valid = X[Len - validNum: Len,:]
Y_valid = Y[Len - validNum: Len,:]

X_train = X[:Len - validNum,:]
Y_train = Y[:Len - validNum,:]
print(X_test.shape)
print(X_valid.shape)
print(X_train.shape)


(64306, 32)
(51445, 32)
(205782, 32)


In [11]:
import tensorflow as tf
from tensorflow.contrib import rnn
import numpy as np


# 重置计算图，避免变量重复
tf.reset_default_graph()

decay = 0.85
max_epoch = 5
max_max_epoch = 10

timestep_size = maxLen = 32
# 样本中不同字的个数+1(padding 0)，根据处理数据的时候得到
dict_size = 5158 + 1

input_size = embedding_size = 64
class_num = 5
hidden_size = 128
# bi-lstm 层数
layer_num = 2
max_grad_norm = 5.0
model_save_path = 'ckpt/bi-lstm.ckpt'


lr = tf.placeholder(tf.float32, name="lr")
keep_prob = tf.placeholder(tf.float32, name="keep_prob")
batch_size = tf.placeholder(tf.int32, name="batch_size")

embedding = tf.get_variable("embedding", [dict_size, embedding_size], dtype=tf.float32)

def lstm_cell():
  cell = rnn.LSTMCell(hidden_size)
  cell = rnn.DropoutWrapper(cell, output_keep_prob = keep_prob)
  return cell

def bi_lstm(X):
  # [batchsize, timestep_size] => [batchsize, timestep_size, embedding_size]
  inputs = tf.nn.embedding_lookup(embedding, X)
  # print(inputs.shape) # (?, 32, 64)
  
  # 2层前向lstm
  cell_fw = rnn.MultiRNNCell( [lstm_cell() for _ in range(layer_num)], state_is_tuple = True )
  # 2层反向lstm
  cell_bw = rnn.MultiRNNCell( [lstm_cell() for _ in range(layer_num)], state_is_tuple = True )
  
  
  # 将inputs转换为static_bidirectional_rnn要求的形式
  # 即，把inputs.shape = [batchsize, timestep_size, embedding_size] 转为：
  # timestep_size 个张量, 每个张量的shape = [batchsize, embedding_size]
  inputs = tf.unstack(inputs, timestep_size, 1)
  
  # 由 timestep_num=32 个 shape 为 [batch_size, hidden_size] 的张量构成的列表
  # 32 个时间步，每个时间步都输入了batch_size个数据，并且每个数据都输出hidden_size大小的输出
  try:
    outputs,_,_ = tf.contrib.rnn.static_bidirectional_rnn(cell_fw, cell_bw, inputs, dtype=tf.float32)
  except Exception:
    # 旧版tensorflow的返回
    outputs = tf.contrib.rnn.static_bidirectional_rnn(cell_fw, cell_bw, inputs, dtype=tf.float32)
  # 32，32个时间步
  # print(len(outputs))  
  
  # 注意，bi-lstm中正向有128个输出，反向有128个输出，最终两个输出会连接在一起:256=128+128
  # print(outputs[0].shape)  # (batch_size, 256) 
  # print(outputs[1].shape)  # (batch_size, 256) 
  # 这是最后一个时间步的输出
  # print(outputs[-1].shape)  # (batch_size, 256) 
  # ---------------------------------------------------------------------
  # 将timestep_num个维度为[batch_size, hidden_size]的矩阵按hidden_size维度拼接
  # 得到的矩阵维度：[batch_size, hidden_size * timestep_num]
  output = tf.concat(outputs, 1) # shape (64306, 32*256 = 8192)
  #  将outputs转换成 [T, hidden_size * 2]
  output = tf.reshape(output, [-1, hidden_size * 2]) 
  return output

X_input = tf.placeholder(tf.int32, [None, timestep_size], name="X_input")
Y_input = tf.placeholder(tf.int32, [None, timestep_size], name="Y_input")

bilstm_output = bi_lstm(X_input)

softmax_w = tf.Variable(tf.truncated_normal(shape = [hidden_size * 2, class_num], mean = 0, stddev = 0.5))
softmax_b = tf.Variable(tf.truncated_normal(shape = [class_num], mean = 0, stddev = 0.5))

y_pred = tf.add( tf.matmul(bilstm_output, softmax_w), softmax_b, name="y_pred" )

# 注意Y_input.shape = [batch_size, timestep_size]，需要“拉直”
correct_prediction = tf.equal(tf.cast(tf.argmax(y_pred, 1), tf.int32), tf.reshape(Y_input, [-1]))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32), name="accuracy")
cost = tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels = tf.reshape(Y_input, [-1]), logits = y_pred), name="cost")


# ***** 梯度裁剪 *******
tvars = tf.trainable_variables()  # 获取模型的所有参数
grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars), max_grad_norm)  # 获取损失函数对于每个参数的梯度
optimizer = tf.train.AdamOptimizer(learning_rate=lr)   # 优化器

# 梯度下降计算
train_op = optimizer.apply_gradients( zip(grads, tvars),
    global_step=tf.contrib.framework.get_or_create_global_step())
# print('Finished creating the bi-lstm model.')

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
This class is equivalent as tf.keras.layers.LSTMCell, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
This class is equivalent as tf.keras.layers.StackedRNNCells, and will be replaced by that in Tensorflow 2.0.
Instructions for updating:
Please use `keras.layers.Bidirectional(keras.layers.RNN(cell, unroll=True))`, which is equivalent to this API
Instructions for updating:
Please use `keras.layers.RNN(cell, unroll=True)`, which is equivalent to this API
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.
Instructions for updating:
Use tf.cast instead.
Instructions for updating:
Please switch to tf.train.get_or_create_global_step


In [12]:
# 将输入特征和label分割成batch_size大小
def get_batch_X_Y(features, labels, batch_size):
  for start in range(0, len(features), batch_size):
    end = min(start + batch_size, len(features))
    yield features[start:end], labels[start:end], int(start/batch_size)

# 计算验证集的准确率
def show_valid_acc(sess):
  valid_batch_size = 512
  valid_accuracy = 0.0
  for valid_batch_x, valid_batch_y, valid_batch_index in get_batch_X_Y(X_valid, Y_valid, valid_batch_size):
    valid_acc = sess.run(accuracy, 
              feed_dict={
                X_input: valid_batch_x,
                Y_input: valid_batch_y,
                lr:1e-5,
                batch_size:valid_batch_size,
                keep_prob: 1.0
              })
    valid_accuracy += valid_acc
  valid_accuracy /= int(len(Y_valid) / valid_batch_size)
  return valid_accuracy

# 开始训练
with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  train_batch_size = 128
  epoch_num = 6
  # 每个 epoch 中包含的 batch 数
  train_batch_num = int( len(X_train) / train_batch_size)  
  # print(train_batch_num) # 1607
  display_num = 50
  display_batch =  int( len(X_train) / display_num) 
  saver = tf.train.Saver() 

  learning_rate = 1e-4
  decay = 0.85
  mid_epoch = 5
  total_epoch = 6

  for epoch in range(total_epoch):
    if epoch > mid_epoch:
      learning_rate = learning_rate * ( (decay) ** (epoch - mid_epoch) )
    print("epoch %d :" % (epoch + 1))
    start_time = time.time()
    train_loss = 0
    valid_accr = 0
    for batch_x, batch_y, batch_index in get_batch_X_Y(X_train, Y_train, train_batch_size):
      feed_dict = {
              X_input: batch_x,
              Y_input: batch_y,
              lr:learning_rate,
              batch_size:train_batch_size,
              keep_prob: 0.5
            }
      _, loss, accr = sess.run([train_op, cost, accuracy], feed_dict=feed_dict)
      if (batch_index + 1) % 400 == 0:
        print("   准确率 =", show_valid_acc(sess), "loss =", loss)
  save_path = saver.save(sess, model_save_path)



epoch 1 :
   准确率 = 0.761001033782959 loss = 0.62942934
   准确率 = 0.7805332505702972 loss = 0.36704862
   准确率 = 0.7735305523872376 loss = 0.5307258
   准确率 = 0.8031831902265548 loss = 0.40916482
epoch 2 :
   准确率 = 0.8464404124021531 loss = 0.45108005
   准确率 = 0.8739796763658524 loss = 0.25456482
   准确率 = 0.9029435861110687 loss = 0.3392535
   准确率 = 0.9135383301973343 loss = 0.2291247
epoch 3 :
   准确率 = 0.9202643746137619 loss = 0.29349682
   准确率 = 0.9231205457448959 loss = 0.20426863
   准确率 = 0.9240581953525543 loss = 0.2872843
   准确率 = 0.9291651558876037 loss = 0.19206083
epoch 4 :
   准确率 = 0.9341693586111068 loss = 0.24466524
   准确率 = 0.9367320609092712 loss = 0.18867947
   准确率 = 0.9381745254993439 loss = 0.24587512
   准确率 = 0.9415771108865738 loss = 0.15257683
epoch 5 :
   准确率 = 0.9434335339069366 loss = 0.20684972
   准确率 = 0.9439535462856292 loss = 0.17195609
   准确率 = 0.9429237735271454 loss = 0.23020278
   准确率 = 0.9455888968706131 loss = 0.1468554
epoch 6 :
   准确率 = 0.946572838425636

In [13]:
import pickle
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf

# 重置计算图，避免变量重复
tf.reset_default_graph()

loaded_graph = tf.Graph()

save_model_path = "ckpt/bi-lstm.ckpt"

with tf.Session(graph=loaded_graph) as sess:
  loader = tf.train.import_meta_graph(save_model_path + '.meta')
  loader.restore(sess, save_model_path)
  
  loaded_x = loaded_graph.get_tensor_by_name('X_input:0')
  loaded_y = loaded_graph.get_tensor_by_name('Y_input:0')
  loaded_keep_prob = loaded_graph.get_tensor_by_name('keep_prob:0')
  loaded_lr = loaded_graph.get_tensor_by_name('lr:0')
  loaded_batch_size = loaded_graph.get_tensor_by_name('batch_size:0')
  loaded_acc = loaded_graph.get_tensor_by_name('accuracy:0')
  
  # 测试集合的准确率
  test_batch_size = 512
  test_accuracy = 0.0
  for test_batch_x, test_batch_y, test_batch_index in get_batch_X_Y(X_test, Y_test, test_batch_size):
    test_acc = sess.run(loaded_acc, 
              feed_dict={
                loaded_x: test_batch_x,
                loaded_y: test_batch_y,
                loaded_lr:1e-5,
                loaded_batch_size:test_batch_size,
                loaded_keep_prob: 1.0
              })
    test_accuracy += test_acc
  test_accuracy /= int(len(Y_test) / test_batch_size)
  print("测试集准确率 =", test_accuracy)
# sess.close()

Instructions for updating:
Use standard file APIs to check for files with this prefix.
INFO:tensorflow:Restoring parameters from ckpt/bi-lstm.ckpt
测试集准确率 = 0.9486892585754394


In [14]:
# A统计状态转移的频数
A = {
      'sb':0,
      'ss':0,
      'be':0,
      'bm':0,
      'me':0,
      'mm':0,
      'eb':0,
      'es':0
     }
# zy 表示转移概率矩阵
zy = {}
for label in sentence_labels:
    for t in range(len(label) - 1):
        key = label[t] + label[t+1]
        A[key] += 1.0
        
zy['sb'] = A['sb'] / (A['sb'] + A['ss'])
zy['ss'] = 1.0 - zy['sb']
zy['be'] = A['be'] / (A['be'] + A['bm'])
zy['bm'] = 1.0 - zy['be']
zy['me'] = A['me'] / (A['me'] + A['mm'])
zy['mm'] = 1.0 - zy['me']
zy['eb'] = A['eb'] / (A['eb'] + A['es'])
zy['es'] = 1.0 - zy['eb']
keys = sorted(zy.keys())
print('the transition probability: ')
for key in keys:
    print(key, zy[key])


zy = {i:np.log(zy[i]) for i in zy.keys()}
with open('data/data_zy.pkl', 'wb') as output:
  pickle.dump(zy, output)

the transition probability: 
be 0.8287395142819345
bm 0.1712604857180655
eb 0.5923696618295927
es 0.4076303381704073
me 0.5048718297888326
mm 0.4951281702111674
sb 0.6232520322915978
ss 0.37674796770840224


In [22]:
import numpy as np
import re
import tensorflow as tf
import pickle
import pandas as pd


with open('data/data.pkl', 'rb') as _input:
  X = pickle.load(_input)
  Y = pickle.load(_input)
  word2id = pickle.load(_input)
  id2word = pickle.load(_input)
  label2id = pickle.load(_input)
  id2label = pickle.load(_input)


with open('data/data_zy.pkl', 'rb') as _input:
  zy = pickle.load(_input)

maxLen = 32

# 重置计算图，避免变量重复
tf.reset_default_graph()

loaded_graph = tf.Graph()

save_model_path = "ckpt/bi-lstm.ckpt"

with tf.Session(graph=loaded_graph) as sess:
  loader = tf.train.import_meta_graph(save_model_path + '.meta')
  loader.restore(sess, save_model_path)
  
  loaded_x = loaded_graph.get_tensor_by_name('X_input:0')
  loaded_y = loaded_graph.get_tensor_by_name('Y_input:0')
  loaded_keep_prob = loaded_graph.get_tensor_by_name('keep_prob:0')
  loaded_lr = loaded_graph.get_tensor_by_name('lr:0')
  loaded_batch_size = loaded_graph.get_tensor_by_name('batch_size:0')
  loaded_y_pred = loaded_graph.get_tensor_by_name('y_pred:0')
  
  
  def viterbi(nodes):
      """
      维特比译码：除了第一层以外，每一层有4个节点。
      计算当前层（第一层不需要计算）四个节点的最短路径：
         对于本层的每一个节点，计算出路径来自上一层的各个节点的新的路径长度（概率）。保留最大值（最短路径）。
         上一层每个节点的路径保存在 paths 中。计算本层的时候，先用paths_ 暂存，然后把本层的最大路径保存到 paths 中。
         paths 采用字典的形式保存（路径：路径长度）。
         一直计算到最后一层，得到四条路径，将长度最短（概率值最大的路径返回）
      """
      paths = {'b': nodes[0]['b'], 's':nodes[0]['s']} # 第一层，只有两个节点
      for layer in range(1, len(nodes)):  # 后面的每一层
          paths_ = paths.copy()  # 先保存上一层的路径
          # node_now 为本层节点， node_last 为上层节点
          paths = {}  # 清空 path 
          for node_now in nodes[layer].keys():
              # 对于本层的每个节点，找出最短路径
              sub_paths = {} 
              # 上一层的每个节点到本层节点的连接
              for path_last in paths_.keys():
                  if path_last[-1] + node_now in zy.keys(): # 若转移概率不为 0 
                      sub_paths[path_last + node_now] = paths_[path_last] + nodes[layer][node_now] + zy[path_last[-1] + node_now]
              # 最短路径,即概率最大的那个
              sr_subpaths = pd.Series(sub_paths)
              sr_subpaths = sr_subpaths.sort_values()  # 升序排序
              node_subpath = sr_subpaths.index[-1]  # 最短路径
              node_value = sr_subpaths[-1]   # 最短路径对应的值
              # 把 node_now 的最短路径添加到 paths 中
              paths[node_subpath] = node_value
      # 所有层求完后，找出最后一层中各个节点的路径最短的路径
      sr_paths = pd.Series(paths)
      sr_paths = sr_paths.sort_values()  # 按照升序排序
      return sr_paths.index[-1]  # 返回最短路径（概率值最大的路径）


  def text2ids(text):
      """把字片段text转为 ids."""
      words = list(text)
      ids = list(word2id[words])
      if len(ids) >= maxLen:  # 长则弃掉
          print(u'输出片段超过%d部分无法处理' % (maxLen)) 
          return ids[:maxLen]
      ids.extend([0]*(maxLen-len(ids))) # 短则补全
      ids = np.asarray(ids).reshape([-1, maxLen])
      return ids


  def simple_cut(text):
      """对一个片段text（标点符号把句子划分为多个片段）进行预测。"""
      if text:
          text_len = len(text)
          X_batch = text2ids(text)  # 这里每个 batch 是一个样本
          fetches = [loaded_y_pred]
          feed_dict = {loaded_x:X_batch, loaded_lr:1.0, loaded_batch_size:1, loaded_keep_prob:1.0}
          _y_pred = sess.run(fetches, feed_dict)[0][:text_len]  # padding填充的部分直接丢弃
          nodes = [dict(zip(['s','b','m','e'], each[1:])) for each in _y_pred]
          tags = viterbi(nodes)
          words = []
          for i in range(len(text)):
              if tags[i] in ['s', 'b']:
                  words.append(text[i])
              else:
                  words[-1] += text[i]
          return words
      else:
          return []


  def cut_word(sentence):
      """首先将一个sentence根据标点和英文符号/字符串划分成多个片段text，然后对每一个片段分词。"""
      not_cuts = re.compile('([0-9\da-zA-Z ]+)|[。，、？！.\.\?,!]')
      result = []
      start = 0
      for seg_sign in not_cuts.finditer(sentence):
          result.extend(simple_cut(sentence[start:seg_sign.start()]))
          result.append(sentence[seg_sign.start():seg_sign.end()])
          start = seg_sign.end()
      result.extend(simple_cut(sentence[start:]))
      return result
  # 例子
  sentence = '新京报讯 （记者 滕朝）2月17日，由郭帆导演，吴京特别出演，屈楚萧、李光洁、吴孟达、赵今麦主演的《流浪地球》票房达到36.51亿（含预售票房），超过《红海行动》的36.50亿，成为目前内地影史票房亚军，正在向内地影史票房冠军《战狼2》的56.8亿目标冲刺。至此，演员吴京的作品包揽了内地票房榜的前两位。'
  result = cut_word(sentence)
  rss = ''
  for each in result:
      rss = rss + each + ' / '
  print(rss)

INFO:tensorflow:Restoring parameters from ckpt/bi-lstm.ckpt
新京 / 报讯 /   / （ / 记者 /   / 滕朝 / ） / 2 / 月 / 17 / 日 / ， / 由 / 郭帆导演 / ， / 吴京 / 特别 / 出演 / ， / 屈楚 / 萧 / 、 / 李光洁 / 、 / 吴孟达 / 、 / 赵今麦 / 主演 / 的 / 《 / 流浪 / 地球 / 》 / 票房 / 达到 / 36 / . / 51 / 亿（ / 含 / 预售 / 票房 / ） / ， / 超过 / 《 / 红海 / 行动 / 》 / 的 / 36 / . / 50 / 亿 / ， / 成为 / 目前 / 内地 / 影史 / 票房 / 亚军 / ， / 正在 / 向 / 内地 / 影史 / 票房 / 冠军 / 《 / 战狼 / 2 / 》 / 的 / 56 / . / 8 / 亿目标 / 冲刺 / 。 / 至此 / ， / 演员 / 吴京 / 的 / 作品 / 包揽 / 了 / 内 / 地票 / 房榜 / 的 / 前 / 两位 / 。 / 


参考链接：
1. http://spaces.ac.cn/archives/3924/ 
2. https://github.com/yongyehuang/deepnlp/blob/master/deepnlp/pos/pos_model_bilstm.py
3. https://blog.csdn.net/Jerr__y/article/details/70471066