# 基于字符的中文分词

> 参考资料
> 
> 《TensorFlow入门与实战》
> 
> 

In [1]:
import os
import sys
import time

import codecs
import re
import numpy as np
import tensorflow as tf

import random

In [2]:
flags = tf.app.flags
flags.DEFINE_integer("batch_size", 10, "Numbers of training examples each step processes ")
flags.DEFINE_integer("embedding_size", 200, " ")

flags.DEFINE_string("word2vec_path", "/Users/sunlu/Workspaces/PyCharm/Github/tensorflowbook/Chapter6/wordseg/embeding.txt", " ")
flags.DEFINE_string("train_file", "/Users/sunlu/Workspaces/PyCharm/Github/tensorflowbook/Chapter6/wordseg/msr_training.utf8", " ")
flags.DEFINE_string("test_file", "/Users/sunlu/Workspaces/PyCharm/Github/tensorflowbook/Chapter6/wordseg/msr_test_gold.utf8", " ")
flags.DEFINE_string("model_save_path", "./save/", " ")

flags.DEFINE_float("dropout", 0.75, " ")
flags.DEFINE_float("gpu_memory_fraction", 0.1, " ")
FLAGS = flags.FLAGS

In [3]:
INITIAL_LEARNING_RATE = 0.003

In [4]:
word2vec_dict={}
word2vec_num=0
word2vec_size=FLAGS.embedding_size
    
train_list = []
label_list = []

test_list = []
test_label_list = []
test_index_in_epoch = 0      #取test batch的时候记录位置
train_index_in_epoch = 0     #取test train batch的时候记录位置
    
batch_size = FLAGS.batch_size
embedding_size = FLAGS.embedding_size
window_size = 5

In [5]:
def stringlist2floatlist(list):
    floatlist=[]
    for i in range(len(list)):
        floatlist.append(float(list[i]))
    return floatlist

In [6]:
def init_word2vec_dict(word2vec_path):
    '''
    将word2vec训练好的字向量加载到内存
    '''
    if not os.path.exists(word2vec_path):
        print("file %s not exist" % word2vec_path)
        return
    with codecs.open(word2vec_path, "r","utf-8") as f:
        #第一行是大小值，先略过
        isfirstline = True
        for line in f:
            list = line.strip().split(' ')
            if isfirstline == True:
                isfirstline=False
                word2vec_num=int(list[0])
                word2vec_size=int(list[1])
                continue
            key=list[0]
            value=list[1:]
            value=stringlist2floatlist(value)
            word2vec_dict[key]=value

In [7]:
init_word2vec_dict(FLAGS.word2vec_path)
max_feature_len = 600

In [8]:
def load_input_and_label(file_path):
    '''
    将数据读入内存。
    每一句话的前面用三个PAD填充
    Args:
      file_path: 输入的经过转换的文件
    '''
    all_data_list = []
    all_label_list = []

    with codecs.open(file_path, 'r', 'utf-8') as input_data:
      
      for line in input_data:
            char_list = []
            label_list = []
            
            word_list = line.strip().split()
            if len(word_list) ==0:
                continue
            
            char_list.append("PAD")
            char_list.append("PAD")
            char_list.append("PAD")
            label_list.append("PAD")
            label_list.append("PAD")
            label_list.append("PAD")
            
            for word in word_list:
                word_array = word.split('/')
                char_list.append(word_array[0])
                label_list.append(word_array[1])
            
            char_list.append("PAD")
            char_list.append("PAD")
            char_list.append("PAD")
            label_list.append("PAD")
            label_list.append("PAD")
            label_list.append("PAD")
            
            all_data_list.append(char_list)
            all_label_list.append(label_list)
            
    return all_data_list, all_label_list

In [9]:
def read_train_file():
    ''' 
    读取训练文件， 
    首先根据训练文件中分词的情况，
    将训练文件中每个字做一个S/B/M/E的标记
    然后将数据的正确输入和label保存到内存中
    '''
    converted_file = "/Users/sunlu/Workspaces/PyCharm/Github/tensorflowbook/Chapter6/wordseg/msr_training.convert"
    #self._character_tagging(FLAGS.train_file, converted_file)
    train_list, label_list = load_input_and_label(converted_file)

In [10]:
def read_test_file():
    ''' 读取一个测试文件， 读取测试数据'''
    converted_file = "/Users/sunlu/Workspaces/PyCharm/Github/tensorflowbook/Chapter6/wordseg/msr_test_gold.convert"
    #self._character_tagging(FLAGS.test_file,converted_file)
    test_list, test_label_list  = load_input_and_label(converted_file)

In [11]:
def inference(input_data, seq_len):
    ''' 向前计算过程描述
    Args:
        layer_number:表示训练的层数， 用来在进行逐层训练的时候指定
    '''
    num_hidden = 120
    num_layers = 2

    w3 = tf.get_variable("w3", [num_hidden*2, 256],
                         initializer=tf.random_normal_initializer(stddev=0.1))
    b3 = tf.get_variable("b3", [256],
                         initializer=tf.constant_initializer(0.0))
    w2 = tf.get_variable("w2", [256, 4],
                         initializer=tf.random_normal_initializer(stddev=0.1))
    b2 = tf.get_variable("b2", [4],
                         initializer=tf.constant_initializer(0.0))

    fw_cell = tf.contrib.rnn.LSTMCell(
        num_hidden, initializer=tf.random_normal_initializer(stddev=0.1))
    bw_cell = tf.contrib.rnn.LSTMCell(
        num_hidden, initializer=tf.random_normal_initializer(stddev=0.1))
    
    fw_cells = [fw_cell] * num_layers
    bw_cells = [bw_cell] * num_layers

    outputs, _, _ = tf.contrib.rnn.stack_bidirectional_dynamic_rnn(
        fw_cells, bw_cells, input_data, sequence_length=seq_len, dtype=tf.float32
    )
    outputs = tf.reshape(outputs, [-1, num_hidden * 2])

    fc3 = tf.nn.relu(tf.nn.xw_plus_b(outputs, w3, b3))
    #再做一个线性变化
    logits = tf.nn.xw_plus_b(fc3,w2 , b2)
    logits = tf.reshape(logits, [batch_size, -1, 4])
    return logits

In [12]:
def get_vector(key):
    '''
    通过一个字获取这个字的字向量， 
    如果没有的话， 就随机生成一个， 并且将随机生成的保存
    '''

    if key=="PAD":
        return get_zero_list(word2vec_size)

    if key not in word2vec_dict:
        #如果没有这个key，就生成一个随机的， 放到字典里
        value=[]
        for i in range(word2vec_size):
            value.append(random.uniform(-1,1))
        word2vec_dict[key]=value
        word2vec_num+=1
        return value
    return word2vec_dict[key]

In [13]:
def gen_label_value(k):
    s = [1,0,0,0]
    b = [0,1,0,0]
    m = [0,0,1,0]
    e = [0,0,0,1]

    if k=='S':
        return 0
    if k=='B':
        return 1
    if k=='M':
        return 2
    if k=='E':
        return 3
    return 0

In [14]:
def next_train_batch(batch_size):
    '''读取测试训练数据本身的数据'''
    start = train_index_in_epoch

    context_size = window_size // 2
    data_set = []
    label_set = []
    seq_len_list = []

    getsize = 0
    while getsize < batch_size:
        seq_len = 0
        if start >= len(train_list):
            start = 0
            
        char_list = train_list[start]
        char_vector_list = []
        char_label_list = []
        
        for i in range(len(char_list)):
            char = char_list[i]
            if char == "PAD":
                continue
            
            word_context=[]
            for k in range(-context_size, -context_size + window_size):
                char = char_list[i+k]
                word_context.extend(get_vector(char))
                
            char_vector_list.append(word_context)
            
            char_label_list.append(gen_label_value(label_list[start][i]))
            seq_len += 1
            
        #补0
        pad_vector = []
        for k in range(window_size):
            pad_vector.extend(get_vector("PAD"))

        
        pad_number = max_feature_len - seq_len
        
        char_vector_list = char_vector_list + [pad_vector] * pad_number
        
        char_label_list = char_label_list + [0] * pad_number
        
        if seq_len > max_feature_len:
            start += 1
            continue
        
        data_set.append(char_vector_list)
        label_set.append(char_label_list)
        seq_len_list.append(seq_len)
        start += 1
        getsize += 1
    train_index_in_epoch = start
    return data_set, label_set, seq_len_list

In [15]:
def next_test_batch(batch_size):
    ''' 读取训练数据一个batch ,得到字向量 '''
    start = test_index_in_epoch

    context_size = int(window_size / 2)
    data_set = []
    label_set = []
    seq_len_list = []

    getsize = 0
    while getsize < batch_size:
        seq_len = 0
        if start >= len(test_list):
            start = 0
            
        char_list = test_list[start]
        char_vector_list = []
        char_label_list = []
        
        for i in range(len(char_list)):
            char = char_list[i]
            if char == "PAD":
                continue
            
            word_context=[]
            for k in range(-context_size, -context_size + window_size):
                char = char_list[i + k]
                word_context.extend(get_vector(char))
                
            char_vector_list.append(word_context)
            char_label_list.append(gen_label_value(test_label_list[start][i]))
            seq_len += 1
            
        #补0
        pad_vector = []
        for k in range(window_size):
            pad_vector.extend(get_vector("0"))
        
        pad_number = max_feature_len - seq_len
        
        char_vector_list = char_vector_list + [pad_vector] * pad_number
        char_label_list = char_label_list + [0] * pad_number
        
        if seq_len > max_feature_len:
            start += 1
            continue
            
        data_set.append(char_vector_list)
        label_set.append(char_label_list)
        seq_len_list.append(seq_len)
        start += 1
        getsize += 1
        
    test_index_in_epoch = start
    return data_set, label_set, seq_len_list

In [16]:
def run_training():
    '''
    训练过程
    '''
    read_train_file()
    read_test_file()
    print("read file complete")

    global_step = tf.get_variable('global_step', [],initializer=tf.constant_initializer(0), trainable=False)
    opt = tf.train.GradientDescentOptimizer(INITIAL_LEARNING_RATE)
    input_data = tf.placeholder(tf.float32, [batch_size, max_feature_len, window_size * embedding_size])
    labels = tf.placeholder(tf.int32, [batch_size, max_feature_len])
    seq_len = tf.placeholder(tf.int32, [batch_size])
    logits = inference( input_data, seq_len)
    cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits( labels=labels, logits=logits, name="cross_entropy")
    cross_entropy_mean = tf.reduce_mean(cross_entropy, name='xentropy_mean')
    
    grads = opt.compute_gradients(cross_entropy_mean)
    apply_gradient_op = opt.apply_gradients(grads, global_step=global_step)

    train_op = tf.group(apply_gradient_op )
    
    total_correct = 0
    for i in range(batch_size):
        per_logits_reshape = logits[i][:seq_len[i]]
        per_labels_reshape = labels[i][:seq_len[i]]
        correct = tf.nn.in_top_k(per_logits_reshape, per_labels_reshape, 1)
        eval_correct = tf.reduce_sum(tf.cast(correct, tf.int32))
        total_correct += eval_correct
    saver = tf.train.Saver()
      
    init = tf.initialize_all_variables()
    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=FLAGS.gpu_memory_fraction)
    with tf.Session(config=tf.ConfigProto(
        allow_soft_placement=True,
        log_device_placement=False,
        gpu_options=gpu_options)) as sess:
        
        sess.run(init)
        print("init variable complete")
        
        for i in range(2000):#10000000
            train_batch, label_batch, seq_len_list = next_train_batch(batch_size )
            
            start_time = time.time()
            
            feed_dict ={input_data:train_batch, labels:label_batch, seq_len: seq_len_list}
            _, loss_value = sess.run([train_op,cross_entropy_mean],feed_dict=feed_dict)
            duration = time.time() - start_time
            if i % 50 == 0 and i > 0:
                print("step: %d loss %f" % (i, loss_value))
            if i % 100 == 0 and i > 0:
                test_batch, test_label_batch, test_seq_len_list = next_test_batch(batch_size  )
                feed_dict ={input_data:test_batch, labels:test_label_batch, seq_len: test_seq_len_list}
                _, loss_value,true_count   = sess.run([train_op,cross_entropy_mean, total_correct],feed_dict=feed_dict)
                total_count = 0
                
                for k in range(len(test_seq_len_list)):
                    total_count += test_seq_len_list[k]
                print("test: %d loss %f correct_ratio %f" % (i, loss_value, true_count/total_count))
            if i % 1000 == 0 and i > 0:
                saver.save(sess, os.path.join(FLAGS.model_save_path, "model.ckpt"), global_step=i)
                print("save model at %s, step: %d" %( os.path.join(FLAGS.model_save_path, "model.ckpt"), i))

In [17]:
def main(_):
    run_training()

In [18]:
if __name__ == "__main__":
    tf.app.run()

read file complete


  from ._conv import register_converters as _register_converters


ValueError: Trying to share variable stack_bidirectional_rnn/cell_0/bidirectional_rnn/fw/lstm_cell/kernel, but specified shape (360, 480) and found shape (1120, 480).