In [1]:
# -*- coding: utf-8 -*-
import numpy as np
import tensorflow as tf
from tensorflow.contrib import rnn
import random
import time
from collections import Counter

start_time = time.time()


def elapsed(sec):
    if sec < 60:
        return str(sec) + " sec"
    elif sec < (60 * 60):
        return str(sec / 60) + " min"
    else:
        return str(sec / (60 * 60)) + " hr"


# Target log path
tf.reset_default_graph()
training_file = 'wordstest.txt'


# 中文多文件
def readalltxt(txt_files):
    labels = []
    for txt_file in txt_files:
        target = get_ch_lable(txt_file)
        labels.append(target)
    return labels


# 中文字
def get_ch_lable(txt_file):
    labels = ""
    with open(txt_file, 'rb') as f:
        for label in f:
            labels = labels + label.decode('utf-8')
            # labels = labels + label.decode('gb2312')
    return labels


# 优先转文件里的字符到向量
def get_ch_lable_v(txt_file, word_num_map, txt_label=None):
    words_size = len(word_num_map)
    to_num = lambda word: word_num_map.get(word, words_size)
    if txt_file != None:
        txt_label = get_ch_lable(txt_file)

    labels_vector = list(map(to_num, txt_label))
    return labels_vector

In [2]:
training_data = get_ch_lable(training_file)
print("Loaded training data...")

counter = Counter(training_data)
words = sorted(counter)
words_size = len(words)
word_num_map = dict(zip(words, range(words_size)))

print('字表大小:', words_size)
wordlabel = get_ch_lable_v(training_file, word_num_map)

Loaded training data...
字表大小: 69


In [3]:
# 定义参数
learning_rate = 0.001
training_iters = 10000
display_step = 1000
n_input = 4

n_hidden1 = 256
n_hidden2 = 512
n_hidden3 = 512
# 定义占位符
x = tf.placeholder("float", [None, n_input, 1])
wordy = tf.placeholder("float", [None, words_size])

In [4]:
x1 = tf.reshape(x, [-1, n_input])
x2 = tf.split(x1, n_input, 1)
# 2-layer LSTM，每层有 n_hidden 个units
rnn_cell = rnn.MultiRNNCell([rnn.LSTMCell(n_hidden1), rnn.LSTMCell(n_hidden2), rnn.LSTMCell(n_hidden3)])

# 通过RNN得到输出
outputs, states = rnn.static_rnn(rnn_cell, x2, dtype=tf.float32)

# 通过全连接输出指定维度
pred = tf.contrib.layers.fully_connected(outputs[-1], words_size, activation_fn=None)

In [5]:
# 定义loss与优化器
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=pred, labels=wordy))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)

# 模型评估
correct_pred = tf.equal(tf.argmax(pred, 1), tf.argmax(wordy, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See `tf.nn.softmax_cross_entropy_with_logits_v2`.



In [None]:
savedir = "C:\\Users\\zq\\NLP"
saver = tf.train.Saver(max_to_keep=1)  # 生成saver

# 启动session
with tf.Session() as session:
    session.run(tf.global_variables_initializer())
    step = 0
    offset = random.randint(0, n_input + 1)
    end_offset = n_input + 1
    acc_total = 0
    loss_total = 0

    kpt = tf.train.latest_checkpoint(savedir)
    print("kpt:", kpt)
    startepo = 0
    if kpt != None:
        saver.restore(session, kpt)
        ind = kpt.find("-")
        startepo = int(kpt[ind + 1:])
        print(startepo)
        step = startepo

    while step < training_iters:

        # 随机取一个位置偏移
        if offset > (len(training_data) - end_offset):
            offset = random.randint(0, n_input + 1)

        inwords = [[wordlabel[i]] for i in range(offset, offset + n_input)]  # 按照指定的位置偏移获得后4个文字向量，当作输入

        inwords = np.reshape(np.array(inwords), [-1, n_input, 1])

        out_onehot = np.zeros([words_size], dtype=float)
        out_onehot[wordlabel[offset + n_input]] = 1.0
        out_onehot = np.reshape(out_onehot, [1, -1])  # 所有的字都变成onehot

        _, acc, lossval, onehot_pred = session.run([optimizer, accuracy, loss, pred],
                                                   feed_dict={x: inwords, wordy: out_onehot})
        loss_total += lossval
        acc_total += acc
        if (step + 1) % display_step == 0:
            print("Iter= " + str(step + 1) + ", Average Loss= " + \
                  "{:.6f}".format(loss_total / display_step) + ", AverageAccuracy= " + \
                  "{:.2f}%".format(100 * acc_total / display_step))
            acc_total = 0
            loss_total = 0
            in2 = [words[wordlabel[i]] for i in range(offset, offset + n_input)]
            out2 = words[wordlabel[offset + n_input]]
            out_pred = words[int(tf.argmax(onehot_pred, 1).eval())]
            print("%s - [%s] vs [%s]" % (in2, out2, out_pred))
            saver.save(session, savedir + "rnnwordtest.cpkt", global_step=step)
        step += 1
        offset += (n_input + 1)  # 调整下一次迭代使用的偏移量

    print("Finished!")
    saver.save(session, savedir + "rnnwordtest.cpkt", global_step=step)
    print("Elapsed time: ", elapsed(time.time() - start_time))
    
    
    while True:
        prompt = "请输入%s个字: " % n_input
        sentence = input(prompt)
        inputword = sentence.strip()

        if len(inputword) != n_input:
            print("您输入的字符长度为：", len(inputword), "请输入4个字")
            continue
        try:
          
            inputword = get_ch_lable_v(None, word_num_map, inputword)

            for i in range(32):
                keys = np.reshape(np.array(inputword), [-1, n_input, 1])
                onehot_pred = session.run(pred, feed_dict={x: keys})
                onehot_pred_index = int(tf.argmax(onehot_pred, 1).eval())
                sentence = "%s%s" % (sentence, words[onehot_pred_index])
                inputword = inputword[1:]
                inputword.append(onehot_pred_index)
            print(sentence)
        except:
            print("该字我还没学会")

kpt: None
Iter= 1000, Average Loss= 3.643567, AverageAccuracy= 9.10%
['再', '琐', '碎', '，'] - [我] vs [蕴]
Iter= 2000, Average Loss= 1.866052, AverageAccuracy= 43.80%
['信', '念', '，', '默'] - [守] vs [守]
Iter= 3000, Average Loss= 0.697497, AverageAccuracy= 74.30%
['要', '心', '头', '悬'] - [挂] vs [挂]
Iter= 4000, Average Loss= 0.377227, AverageAccuracy= 86.00%
['琐', '碎', '，', '我'] - [们] vs [们]
Iter= 5000, Average Loss= 0.213252, AverageAccuracy= 92.10%
['心', '，', '前', '行'] - [的] vs [的]
Iter= 6000, Average Loss= 0.158144, AverageAccuracy= 94.20%
['的', '灯', '光', '，'] - [我] vs [我]
Iter= 7000, Average Loss= 0.320147, AverageAccuracy= 90.40%
['、', '再', '普', '通'] - [、] vs [，]
Iter= 8000, Average Loss= 0.188951, AverageAccuracy= 92.80%
['默', '守', '一', '种'] - [精] vs [精]
Iter= 9000, Average Loss= 0.174070, AverageAccuracy= 94.80%
['都', '要', '坚', '持'] - [一] vs [一]
Iter= 10000, Average Loss= 0.197675, AverageAccuracy= 92.80%
['，', '只', '要', '心'] - [头] vs [头]
Finished!
Elapsed time:  6.947554747263591 min
请输入