In [1]:
from collections import Counter
import numpy as np
import tensorflow.contrib.keras as kr
import tensorflow as tf
import time
from datetime import timedelta
import os
from sklearn import metrics

import moxing as mox
mox.file.shift('os', 'mox')

INFO:root:Using MoXing-v1.14.1-ddfd6c9a
INFO:root:Using OBS-Python-SDK-3.1.2


In [2]:
trainDataPath = "s3://corpus-text-classification1/data/train_5500.label.txt"
testDataPath = "s3://corpus-text-classification1/data/TREC_10.label.txt"
# vocabPath = "s3://corpus-text-classification1/data/vocab_freq.txt"
savePath = "s3://corpus-text-classification1/saveModel_adv_biLSTM/saveModel_adv_biLSTM"

In [10]:
def readfile(filePath):
    """读取文件内容，返回文本和标签列表"""
    contents, labels = [], []
    with open(filePath, 'r', encoding='utf-8', errors='ignore') as f:
        for line in f:
            try:
                word = line.strip().split()
                label = word[0].split(":")[0]
                content = word[1:]
                
                contents.append(content)
                labels.append(label)
            except:
                pass
    return contents, labels


def readCategory():
    """读取分类目录，固定id"""
    categories = ['ABBR', 'DESC', 'ENTY', 'HUM', 'LOC', 'NUM']
    cat_to_id = dict(zip(categories, range(len(categories))))
    return categories, cat_to_id


def read_vocab(vocabPath):
    """读取词汇表"""
    with open(vocabPath, 'r', encoding='utf-8', errors='ignore') as fp:
        words = [_.strip() for _ in fp.readlines()]
    word_to_id = dict(zip(words, range(len(words))))
    return words, word_to_id


def buildVocab(contents_train, contents_test):
    """根据训练集构建词汇表，存储"""
    # extend都可以用来合并两个列表，不同点在于extend是在原列表修改，而 + 是生成新的列表
    contents_all = contents_train + contents_test
    all_data = []
    for content in contents_all:
        all_data.extend(content)

    # 遍历得到每个单词及其出现的次数，组成字典返回
    counter = Counter(all_data)   # Counter({'?': 2, ',': 2, 'NUM:dist': 1, 'How': 1})
    # 统计得到出现频率最高的前    
    count_pairs = counter.most_common()  # [('?', 2), (',', 2), ('NUM:dist', 1), ('How', 1)]
    words, freqs = zip(*count_pairs)  # zip(*) 可以看做是解压，即与zip()相反   ('?', ',', 'NUM:dist', 'How')
    # 添加一个 <PAD> 来将所有文本pad为同一长度
    words = list(words)   # list(Counter(all_data).keys())，但要保证顺序，只能这样了 Counter(a).keys()会改变原来的顺序
    # open(vocabPath, 'w', encoding='utf-8', errors='ignore').write('\n'.join(words) + '\n')
    word_to_id = dict(zip(words, range(len(words))))
    return list(freqs), words, word_to_id

In [4]:
# 构建adversarailLSTM模型
class AdversarailLSTM(object):

    def __init__(self, wordEmbedding, inputX, inputY, dropout, indexFreqs):
        # 定义输入
        self.inputX = inputX
        self.inputY = inputY

        self.dropoutKeepProb = dropout

        # 根据词频计算权重
        # indexFreqs[0], indexFreqs[1] = 20000, 10000
        weights = tf.cast(tf.reshape(indexFreqs / tf.reduce_sum(indexFreqs), [1, len(indexFreqs)]), dtype=tf.float32)

        # 词嵌入层
        with tf.name_scope("wordEmbedding"):
            # 利用预训练的词向量初始化词嵌入矩阵
            normWordEmbedding = self._normalize(tf.cast(wordEmbedding, dtype=tf.float32, name="word2vec"), weights)
            # self.W = tf.Variable(tf.cast(wordEmbedding, dtype=tf.float32, name="word2vec"), name="W")
            # normWordEmbedding = wordEmbedding
            self.embeddedWords = tf.nn.embedding_lookup(normWordEmbedding, self.inputX)

        # 计算softmax交叉熵损失
        with tf.name_scope("loss"):
            with tf.variable_scope("Bi-LSTM", reuse=None):
                self.predictions = self._Bi_LSTMAttention(self.embeddedWords)
                # self.y_pred_cls = tf.cast(tf.greater_equal(self.predictions, 0.5), tf.float32, name="binaryPreds")
                self.y_pred_cls = tf.argmax(tf.nn.softmax(self.predictions),1)  # 预测类别 tf.argmax：返回每一行或每一列的最大值 1为里面（每一行），0为外面（每一列）
                # losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=self.predictions, labels=self.inputY)
                losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.predictions, labels=self.inputY)
                loss = tf.reduce_mean(losses)

        
        with tf.name_scope("perturloss"):
            with tf.variable_scope("Bi-LSTM", reuse=True):
                perturWordEmbedding = self._addPerturbation(self.embeddedWords, loss)
                print("perturbSize:{}".format(perturWordEmbedding))
                perturPredictions = self._Bi_LSTMAttention(perturWordEmbedding)
                # perturLosses = tf.nn.sigmoid_cross_entropy_with_logits(logits=perturPredictions, labels=self.inputY)
                perturLosses = tf.nn.softmax_cross_entropy_with_logits(logits=perturPredictions, labels=self.inputY)
                perturLoss = tf.reduce_mean(perturLosses)

        self.loss = loss + perturLoss
        
        self.loss = loss
        
        
    def _Bi_LSTMAttention(self, embeddedWords):
        # 定义两层双向LSTM的模型结构
        with tf.name_scope("Bi-LSTM"):
            fwHiddenLayers = []
            bwHiddenLayers = []
            for idx, hiddenSize in enumerate(hiddenSizes):
                with tf.name_scope("Bi-LSTM" + str(idx)):
                    # 定义前向网络结构
                    lstmFwCell = tf.nn.rnn_cell.DropoutWrapper(
                        tf.nn.rnn_cell.LSTMCell(num_units=hiddenSize, state_is_tuple=True),
                        output_keep_prob=self.dropoutKeepProb)

                    # 定义反向网络结构
                    lstmBwCell = tf.nn.rnn_cell.DropoutWrapper(
                        tf.nn.rnn_cell.LSTMCell(num_units=hiddenSize, state_is_tuple=True),
                        output_keep_prob=self.dropoutKeepProb)

                fwHiddenLayers.append(lstmFwCell)
                bwHiddenLayers.append(lstmBwCell)

            # 实现多层的LSTM结构， state_is_tuple=True，则状态会以元祖的形式组合(h, c)，否则列向拼接
            fwMultiLstm = tf.nn.rnn_cell.MultiRNNCell(cells=fwHiddenLayers, state_is_tuple=True)
            bwMultiLstm = tf.nn.rnn_cell.MultiRNNCell(cells=bwHiddenLayers, state_is_tuple=True)
            # 采用动态rnn，可以动态地输入序列的长度，若没有输入，则取序列的全长
            # outputs是一个元组(output_fw, output_bw), 其中两个元素的维度都是[batch_size, max_time, hidden_size], fw和bw的hiddensize一样
            # self.current_state是最终的状态，二元组(state_fw, state_bw), state_fw=[batch_size, s], s是一个元组(h, c)
            outputs, self.current_state = tf.nn.bidirectional_dynamic_rnn(fwMultiLstm, bwMultiLstm,
                                                                          self.embeddedWords, dtype=tf.float32,
                                                                          scope="bi-lstm" + str(idx))

        # 在bi-lstm+attention论文中，将前向和后向的输出相加
        with tf.name_scope("Attention"):
            H = outputs[0] + outputs[1]

            # 得到attention的输出
            output = self.attention(H)
            outputSize = hiddenSizes[-1]
            print("outputSize:{}".format(outputSize))

        # 全连接层的输出
        with tf.name_scope("output"):
            outputW = tf.get_variable(
                "outputW",
                shape=[outputSize, num_classes],
                initializer=tf.contrib.layers.xavier_initializer())

            outputB = tf.Variable(tf.constant(0.1, shape=[num_classes]), name="outputB")

            predictions = tf.nn.xw_plus_b(output, outputW, outputB, name="predictions")

            return predictions

    def attention(self, H):
        """
        利用Attention机制得到句子的向量表示
        """
        # 获得最后一层lstm神经元的数量
        hiddenSize = hiddenSizes[-1]

        # 初始化一个权重向量，是可训练的参数
        W = tf.Variable(tf.random_normal([hiddenSize], stddev=0.1))

        # 对bi-lstm的输出用激活函数做非线性转换
        M = tf.tanh(H)

        # 对W和M做矩阵运算，W=[batch_size, time_step, hidden_size], 计算前做维度转换成[batch_size * time_step, hidden_size]
        # newM = [batch_size, time_step, 1], 每一个时间步的输出由向量转换成一个数字
        newM = tf.matmul(tf.reshape(M, [-1, hiddenSize]), tf.reshape(W, [-1, 1]))

        # 对newM做维度转换成[batch_size, time_step]
        restoreM = tf.reshape(newM, [-1, seq_length])

        # 用softmax做归一化处理[batch_size, time_step]
        self.alpha = tf.nn.softmax(restoreM)

        # 利用求得的alpha的值对H进行加权求和，用矩阵运算直接操作
        r = tf.matmul(tf.transpose(H, [0, 2, 1]), tf.reshape(self.alpha, [-1, seq_length, 1]))

        # 将三维压缩成二维sequeezeR = [batch_size, hissen_size]
        sequeezeR = tf.squeeze(r)

        sentenceRepren = tf.tanh(sequeezeR)

        # 对attention的输出可以做dropout处理
        output = tf.nn.dropout(sentenceRepren, self.dropoutKeepProb)

        return output

    def _normalize(self, wordEmbedding, weights):
        """
        对word embedding 结合权重做标准化处理
        """
        mean = tf.matmul(weights, wordEmbedding)
        powWordEmbedding = tf.pow(wordEmbedding - mean, 2.)

        var = tf.matmul(weights, powWordEmbedding)
        stddev = tf.sqrt(1e-6 + var)

        return (wordEmbedding - mean) / stddev

    def _addPerturbation(self, embedded, loss):
        """
        添加波动到word embedding
        """
        grad, = tf.gradients(
            loss,
            embedded,
            aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N)
        grad = tf.stop_gradient(grad)
        perturb = self._scaleL2(grad, epsilon)
        # print("perturbSize:{}".format(embedded+perturb))
        return embedded + perturb

    def _scaleL2(self, x, norm_length):
        # shape(x) = [batch, num_step, d]
        # divide x by max(abs(x)) for a numerically stable L2 norm
        # 2norm(x) = a * 2norm(x/a)
        # scale over the full sequence, dim(1, 2)
        alpha = tf.reduce_max(tf.abs(x), (1, 2), keep_dims=True) + 1e-12
        l2_norm = alpha * tf.sqrt(tf.reduce_sum(tf.pow(x / alpha, 2), (1, 2), keep_dims=True) + 1e-6)
        x_unit = x / l2_norm
        return norm_length * x_unit

In [5]:
contents_train, labels_train = readfile(trainDataPath)
contents_test, labels_test = readfile(testDataPath)
contents_val, labels_val = contents_train[-452:], labels_train[-452:]
contents_train, labels_train = contents_train[:-452], labels_train[:-452]

In [6]:
len(contents_val),len(labels_train),len(contents_test)

(452, 5000, 500)

In [7]:
contents_train[:2]

[['How',
  'did',
  'serfdom',
  'develop',
  'in',
  'and',
  'then',
  'leave',
  'Russia',
  '?'],
 ['What', 'films', 'featured', 'the', 'character', 'Popeye', 'Doyle', '?']]

In [8]:
category_test = set(labels_test)
category_train = set(labels_train)
print(len(labels_test),len(category_test))
print(len(labels_train),len(category_train))
category_test,category_train

500 6
5000 6


({'ABBR', 'DESC', 'ENTY', 'HUM', 'LOC', 'NUM'},
 {'ABBR', 'DESC', 'ENTY', 'HUM', 'LOC', 'NUM'})

In [11]:
indexFreqs,words, word_to_id = buildVocab(contents_train, contents_test)
categories, cat_to_id = readCategory()  # cat_to_id {'ABBR': 0, 'DESC': 1, 'ENTY': 2, 'HUM': 3, 'LOC': 4, 'NUM': 5}
# words, word_to_id = read_vocab(vocabPath)
vocab_size = len(words)
num_classes = len(categories)

In [12]:
num_classes, vocab_size, words[:3], indexFreqs[:3]

(6, 9318, ['?', 'the', 'What'], [5407, 3615, 3325])

In [13]:
contents_all = contents_train + contents_test
seq_length = 0
for content in contents_all:
    if seq_length < len(content):
        seq_length = len(content)   # seq_length = 37

In [14]:
seq_length

37

In [15]:
cat_to_id

{'ABBR': 0, 'DESC': 1, 'ENTY': 2, 'HUM': 3, 'LOC': 4, 'NUM': 5}

In [16]:
def process_file(contents, labels, word_to_id, cat_to_id, pad_max_length):
    """
    将文件转换为id表示,并且将每个单独的样本长度固定为pad_max_lengtn
    """
    # contents, labels = readfile(filePath)
    data_id, label_id = [], []
    # 将文本内容转换为对应的id形式
    for i in range(len(contents)):
        data_id.append([word_to_id[x] for x in contents[i] if x in word_to_id])
        label_id.append(cat_to_id[labels[i]])
    # 使用keras提供的pad_sequences来将文本pad为固定长度
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id, pad_max_length)
    ''' https://blog.csdn.net/TH_NUM/article/details/80904900
    pad_sequences(sequences, maxlen=None, dtype=’int32’, padding=’pre’, truncating=’pre’, value=0.) 
        sequences：浮点数或整数构成的两层嵌套列表
        maxlen：None或整数，为序列的最大长度。大于此长度的序列将被截短，小于此长度的序列将在后部填0.
        dtype：返回的numpy array的数据类型
        padding：‘pre’或‘post’，确定当需要补0时，在序列的起始还是结尾补
        truncating：‘pre’或‘post’，确定当需要截断序列时，从起始还是结尾截断
        value：浮点数，此值将在填充时代替默认的填充值0
    '''
    y_pad = kr.utils.to_categorical(label_id, num_classes=len(cat_to_id))  # 将标签转换为one-hot表示
    ''' https://blog.csdn.net/nima1994/article/details/82468965
    to_categorical(y, num_classes=None, dtype='float32')
        将整型标签转为onehot。y为int数组，num_classes为标签类别总数，大于max(y)（标签从0开始的）。
        返回：如果num_classes=None，返回len(y) * [max(y)+1]（维度，m*n表示m行n列矩阵，下同），否则为len(y) * num_classes。
    '''
    return x_pad, y_pad


def get_time_dif(start_time):
    """获取已使用时间"""
    end_time = time.time()
    time_dif = end_time - start_time
    return timedelta(seconds=int(round(time_dif)))

In [17]:
print("Loading training and validation and testing data...")
start_time = time.time()
x_train, y_train = process_file(contents_train, labels_train, word_to_id, cat_to_id, seq_length)  # seq_length = 600
x_val, y_val = process_file(contents_val, labels_val, word_to_id, cat_to_id, seq_length)
x_test, y_test = process_file(contents_test, labels_test, word_to_id, cat_to_id, seq_length)
time_dif = get_time_dif(start_time)
print("Loading data Time usage:", time_dif)

Loading training and validation and testing data...
Loading data Time usage: 0:00:00


In [18]:
contents_val[:3]

[['What', 'is', 'a', 'person', "'s", 'socioeconomic', 'position', '?'],
 ['What',
  'do',
  'you',
  'say',
  'to',
  'a',
  'friend',
  'who',
  'ignores',
  'you',
  'for',
  'other',
  'friends',
  '?'],
 ['How', 'many', 'yards', 'are', 'in', '1', 'mile', '?']]

In [19]:
x_val[:3]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    2,    3,    6,
         254,    8, 1746,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    2,   20,   28,  194,   10,    6,  893,   85,
          28,   14,  386,    0],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0,    0,    7,   21, 6469,   13,
           5,  216,  535,    0]], dtype=int32)

In [20]:
seq_length,num_classes

(37, 6)

In [21]:
embedding_dim = 300
dropout_keep_prob = 0.5
hiddenSizes = [128]  # 定义LSTM的隐藏层（一层，128个神经元）
epsilon = 5
learning_rate = 0.001
num_epochs = 10
batch_size = 64
print_per_batch = 20  # 每多少轮输出一次结果

In [22]:
inputX = tf.placeholder(tf.int32, [None, seq_length], name="inputX")
inputY = tf.placeholder(tf.float32, [None, num_classes], name="inputY")
dropoutKeepProb = tf.placeholder(tf.float32, name="dropoutKeepProb")
embedding = tf.get_variable('embedding', [vocab_size, embedding_dim])

In [23]:
sess = tf.Session()
lstm = AdversarailLSTM(embedding,inputX,inputY,dropout_keep_prob,indexFreqs)

outputSize:128
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See @{tf.nn.softmax_cross_entropy_with_logits_v2}.



Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See @{tf.nn.softmax_cross_entropy_with_logits_v2}.



Instructions for updating:
keep_dims is deprecated, use keepdims instead


Instructions for updating:
keep_dims is deprecated, use keepdims instead


Instructions for updating:
keep_dims is deprecated, use keepdims instead


Instructions for updating:
keep_dims is deprecated, use keepdims instead


perturbSize:Tensor("perturloss/Bi-LSTM/add_2:0", shape=(?, 37, 300), dtype=float32)
outputSize:128


In [28]:
saver = tf.train.Saver()

In [24]:
num_filters = 256
kernel_size = 5
hidden_dim = 128
learning_rate = 1e-3
dropout_keep_prob = 0.5

num_epochs = 20
batch_size = 64
print_per_batch = 20  # 每多少轮输出一次结果
save_per_batch = 5  # 每多少轮存入tensorboard

In [25]:
globalStep = tf.Variable(0, name="globalStep", trainable=False)
# 定义优化函数，传入学习速率参数
optimizer = tf.train.AdamOptimizer(learning_rate)
# 计算梯度,得到梯度和变量
gradsAndVars = optimizer.compute_gradients(lstm.loss)
# 将梯度应用到变量下，生成训练器
trainOp = optimizer.apply_gradients(gradsAndVars, global_step=globalStep)

# 准确率
correct_pred = tf.equal(tf.argmax(lstm.inputY, 1), lstm.y_pred_cls)
acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# 初始化所有变量
sess.run(tf.global_variables_initializer())

In [26]:
def batch_iter(x_pad, y_pad, batch_size):
    """生成批次数据"""
    data_len = len(x_pad)
    num_batch = int((data_len - 1) / batch_size) + 1
    # np.arange()生成0到data_len的等差数列，默认等差为1；np.random.permutation()打乱生成的等差序列的顺序
    # 下面三句语句是为了将训练或测试文本的顺序打乱，因为原文本中每个分类的样本全部挨在一起，这样每个batch训练的都是同一个分类，不太好，打乱后每个batch可包含不同分类
    indices = np.random.permutation(np.arange(data_len))
    x_shuffle = x_pad[indices]
    y_shuffle = y_pad[indices]

    # 返回所有batch的数据
    for i in range(num_batch):
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        yield x_shuffle[start_id:end_id], y_shuffle[start_id:end_id]
        
        
def evaluate(sess, x_pad, y_pad, loss1, acc1):
    """评估在某一数据上的准确率和损失"""
    data_len = len(x_pad)
    batch_eval = batch_iter(x_pad, y_pad, batch_size)  # 128
    total_loss = 0.0
    total_acc = 0.0
    for x_batch1, y_batch1 in batch_eval:
        batch_len = len(x_batch1)
        feed_dict1 = {inputX: x_batch1, inputY: y_batch1, dropoutKeepProb: 1.0}
        lossTmp, accTmp = sess.run([loss1, acc1], feed_dict=feed_dict1)
        total_loss += lossTmp * batch_len
        total_acc += accTmp * batch_len

    return total_loss / data_len, total_acc / data_len

In [29]:
print('Training and evaluating...')
start_time = time.time()
total_batch = 0  # 总批次
best_acc_val = 0.0  # 最佳验证集准确率
last_improved = 0  # 记录上一次提升批次
require_improvement = 500  # 如果超过500轮未提升，提前结束训练
flag = False

for epoch in range(num_epochs):
    print('Epoch:', epoch + 1)
    batch_train = batch_iter(x_train, y_train, batch_size)
    for x_batch, y_batch in batch_train:
        feed_dict = {inputX: x_batch, inputY: y_batch, dropoutKeepProb: dropout_keep_prob}
        sess.run(trainOp, feed_dict=feed_dict)  # 运行优化
        total_batch += 1

        if total_batch % print_per_batch == 0:
            # 每多少轮次输出在训练集和验证集上的性能
            feed_dict[dropoutKeepProb] = 1.0
            loss_train, acc_train = sess.run([lstm.loss, acc], feed_dict=feed_dict)
            loss_val, acc_val = evaluate(sess, x_val, y_val, lstm.loss, acc)
            if acc_val > best_acc_val:
                # 保存最好结果
                best_acc_val = acc_val
                last_improved = total_batch
                saver.save(sess=sess, save_path=savePath)
                improved_str = '*'  # 对最好结果进行标记
            else:
                improved_str = ''

            time_dif = get_time_dif(start_time)
            msg = 'Iter: {0:>6}, Train Loss: {1:>6.2}, Train Acc: {2:>7.2%},' \
                  + ' Val Loss: {3:>6.2}, Val Acc: {4:>7.2%}, Time: {5} {6}'
            print(msg.format(total_batch, loss_train, acc_train, loss_val, acc_val, time_dif, improved_str))

        if total_batch - last_improved > require_improvement:
            # 验证集正确率长期不提升，提前结束训练
            print("No optimization for a long time, auto-stopping...")
            flag = True
            break  # 跳出循环
    if flag:  # 同上
        break

Training and evaluating...
Epoch: 1
Iter:     20, Train Loss:   0.59, Train Acc:  76.56%, Val Loss:   0.71, Val Acc:  73.67%, Time: 0:00:10 *
Iter:     40, Train Loss:   0.37, Train Acc:  90.62%, Val Loss:   0.55, Val Acc:  81.19%, Time: 0:00:20 *
Iter:     60, Train Loss:    0.4, Train Acc:  84.38%, Val Loss:    0.5, Val Acc:  82.08%, Time: 0:00:30 *
Epoch: 2
Iter:     80, Train Loss:   0.07, Train Acc: 100.00%, Val Loss:   0.45, Val Acc:  84.07%, Time: 0:00:39 *
Iter:    100, Train Loss:  0.099, Train Acc:  95.31%, Val Loss:   0.48, Val Acc:  83.19%, Time: 0:00:48 
Iter:    120, Train Loss:  0.059, Train Acc:  96.88%, Val Loss:   0.46, Val Acc:  85.18%, Time: 0:00:58 *
Iter:    140, Train Loss:   0.14, Train Acc:  96.88%, Val Loss:   0.45, Val Acc:  85.18%, Time: 0:01:06 
Epoch: 3
Iter:    160, Train Loss:  0.013, Train Acc: 100.00%, Val Loss:   0.48, Val Acc:  85.40%, Time: 0:01:16 *
Iter:    180, Train Loss:  0.027, Train Acc: 100.00%, Val Loss:   0.49, Val Acc:  85.18%, Time: 0:01

In [30]:
def evaluate_model():
    # 读取保存的模型
    saver.restore(sess=sess, save_path=savePath)
    start_time = time.time()
    print('Testing...')
    loss_test, acc_test = evaluate(sess, x_test, y_test, lstm.loss, acc)
    msg = 'Test Loss: {0:>6.2}, Test Acc: {1:>7.2%}'
    print(msg.format(loss_test, acc_test))

    test_data_len = len(x_test)
    test_num_batch = int((test_data_len - 1) / batch_size) + 1

    y_test_cls = np.argmax(y_test, 1)  # 获得类别
    y_test_pred_cls = np.zeros(shape=len(x_test), dtype=np.int32)  # 保存预测结果  len(x_test) 表示有多少个文本

    for i in range(test_num_batch):  # 逐批次处理
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, test_data_len)
        feed_dict = {
            inputX: x_test[start_id:end_id],
            dropoutKeepProb: 1.0
        }
        y_test_pred_cls[start_id:end_id] = sess.run(lstm.y_pred_cls, feed_dict=feed_dict)

    # 评估
    print("Precision, Recall and F1-Score...")
    print(metrics.classification_report(y_test_cls, y_test_pred_cls, target_names=categories))
    '''
    sklearn中的classification_report函数用于显示主要分类指标的文本报告．在报告中显示每个类的精确度，召回率，F1值等信息。
        y_true：1维数组，或标签指示器数组/稀疏矩阵，目标值。 
        y_pred：1维数组，或标签指示器数组/稀疏矩阵，分类器返回的估计值。 
        labels：array，shape = [n_labels]，报表中包含的标签索引的可选列表。 
        target_names：字符串列表，与标签匹配的可选显示名称（相同顺序）。 
        原文链接：https://blog.csdn.net/akadiao/article/details/78788864
    '''

    # 混淆矩阵
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_test_cls, y_test_pred_cls)
    '''
    混淆矩阵是机器学习中总结分类模型预测结果的情形分析表，以矩阵形式将数据集中的记录按照真实的类别与分类模型作出的分类判断两个标准进行汇总。
    这个名字来源于它可以非常容易的表明多个类别是否有混淆（也就是一个class被预测成另一个class）
    https://blog.csdn.net/u011734144/article/details/80277225
    '''
    print(cm)

    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)

In [31]:
evaluate_model()

INFO:tensorflow:Restoring parameters from s3://corpus-text-classification1/saveModel_adv_biLSTM/saveModel_adv_biLSTM


INFO:tensorflow:Restoring parameters from s3://corpus-text-classification1/saveModel_adv_biLSTM/saveModel_adv_biLSTM


Testing...
Test Loss:   0.45, Test Acc:  88.80%
Precision, Recall and F1-Score...
              precision    recall  f1-score   support

        ABBR       1.00      0.78      0.88         9
        DESC       0.88      0.92      0.90       138
        ENTY       0.78      0.78      0.78        94
         HUM       0.88      0.92      0.90        65
         LOC       0.90      0.90      0.90        81
         NUM       0.96      0.90      0.93       113

   micro avg       0.88      0.88      0.88       500
   macro avg       0.90      0.87      0.88       500
weighted avg       0.89      0.88      0.88       500

Confusion Matrix...
[[  7   2   0   0   0   0]
 [  0 127   7   1   2   1]
 [  0   9  73   7   3   2]
 [  0   1   3  60   1   0]
 [  0   1   6   0  73   1]
 [  0   5   4   0   2 102]]
Time usage: 0:00:02


In [37]:
def predict(predict_sentences, word_to_id, cat_to_id, pad_max_length):
    """
    将文件转换为id表示,并且将每个单独的样本长度固定为pad_max_lengtn
    """
    
    data_id = []
    # 将文本内容转换为对应的id形式
    for i in range(len(predict_sentences)):
        data_id.append([word_to_id[x] for x in predict_sentences[i].strip().split() if x in word_to_id])
        
    # 使用keras提供的pad_sequences来将文本pad为固定长度
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id, pad_max_length)
    ''' https://blog.csdn.net/TH_NUM/article/details/80904900
    pad_sequences(sequences, maxlen=None, dtype=’int32’, padding=’pre’, truncating=’pre’, value=0.) 
        sequences：浮点数或整数构成的两层嵌套列表
        maxlen：None或整数，为序列的最大长度。大于此长度的序列将被截短，小于此长度的序列将在后部填0.
        dtype：返回的numpy array的数据类型
        padding：‘pre’或‘post’，确定当需要补0时，在序列的起始还是结尾补
        truncating：‘pre’或‘post’，确定当需要截断序列时，从起始还是结尾截断
        value：浮点数，此值将在填充时代替默认的填充值0
    '''
    feed_dict = {
        inputX: x_pad,
        dropoutKeepProb: 1.0
    }
    predict_result = sess.run(lstm.y_pred_cls, feed_dict=feed_dict)
    return predict_result

In [33]:
cat_to_id

{'ABBR': 0, 'DESC': 1, 'ENTY': 2, 'HUM': 3, 'LOC': 4, 'NUM': 5}

In [38]:
'''
LOC:mount Where are the Rocky Mountains ?
DESC:def What are invertebrates ?
NUM:temp What is the temperature at the center of the earth ?
'''
predict(["Where are the Rocky Mountains ?","What are invertebrates ?","What is the temperature at the center of the earth ?"],word_to_id, cat_to_id, seq_length)

array([4, 1, 5])