In [1]:
import numpy as np
import tensorflow as tf
import sys
import time
from datetime import timedelta
import tensorflow.contrib.keras as kr
from sklearn import metrics
from sklearn.model_selection import KFold

import moxing as mox
mox.file.shift('os', 'mox')

INFO:root:Using MoXing-v1.14.1-ddfd6c9a
INFO:root:Using OBS-Python-SDK-3.1.2


In [2]:
trainDataPath = "s3://corpus-2/dataset/corpus_hf.txt"
vocabPath = "s3://corpus-text-classification1/data/glove.6B.100d.txt"
savePath = "s3://corpus-2/model/ad_biLSTM2/lstm_model"

In [3]:
def dataset_split(trainDataPath):

    train_data = []
    seq_length = 0
    with open(trainDataPath, 'r', encoding='utf-8') as f:
        for line in f.readlines():
            word = line.split()
            label = int(word[0].split(":")[0])
            content = word[1:]
            train_data.append([content,label])
            
            if len(content) > seq_length:
                seq_length = len(content)
            
    np.random.shuffle(train_data)
    return np.asarray(train_data), seq_length


def loadGloVe(filename):
    vocab = []
    embd = []
    print('Loading GloVe!')
    # vocab.append('unk') #装载不认识的词
    # embd.append([0] * emb_size) #这个emb_size可能需要指定
    file = open(filename,'r',encoding='utf-8')
    for line in file.readlines():
        row = line.strip().split(' ')
        vocab.append(row[0])
        embd.append([float(ei) for ei in row[1:]])
    file.close()
    print('Completed!')
    return vocab,embd


def process_file(contents, labels, word_to_id, num_classes, pad_max_length):
    """
    将文件转换为id表示,并且将每个单独的样本长度固定为pad_max_lengtn
    """
    # contents, labels = readfile(filePath)
    data_id, label_id = [], []
    # 将文本内容转换为对应的id形式
    for i in range(len(contents)):
        data_id.append([word_to_id[x] for x in contents[i] if x in word_to_id])
        label_id.append(labels[i] - 1)  # label_id.append(cat_to_id[labels[i]])
    # 使用keras提供的pad_sequences来将文本pad为固定长度
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id, pad_max_length)
    ''' https://blog.csdn.net/TH_NUM/article/details/80904900
    pad_sequences(sequences, maxlen=None, dtype=’int32’, padding=’pre’, truncating=’pre’, value=0.) 
        sequences：浮点数或整数构成的两层嵌套列表
        maxlen：None或整数，为序列的最大长度。大于此长度的序列将被截短，小于此长度的序列将在后部填0.
        dtype：返回的numpy array的数据类型
        padding：‘pre’或‘post’，确定当需要补0时，在序列的起始还是结尾补
        truncating：‘pre’或‘post’，确定当需要截断序列时，从起始还是结尾截断
        value：浮点数，此值将在填充时代替默认的填充值0
    '''
    y_pad = kr.utils.to_categorical(label_id, num_classes=num_classes)  # 将标签转换为one-hot表示
    ''' https://blog.csdn.net/nima1994/article/details/82468965
    to_categorical(y, num_classes=None, dtype='float32')
        将整型标签转为onehot。y为int数组，num_classes为标签类别总数，大于max(y)（标签从0开始的）。
        返回：如果num_classes=None，返回len(y) * [max(y)+1]（维度，m*n表示m行n列矩阵，下同），否则为len(y) * num_classes。
    '''
    return x_pad, y_pad

In [4]:
categories = ['Retrieve Value', 'Filter', 'Compute Derived Value', 'Find Extremum', 'Sort', 
                  'Determine Range', 'Characterize Distribution', 'Find Anomalies', 'Cluster', 'Correlate']
num_classes = len(categories)

vocab, embd = loadGloVe(vocabPath)
vocab_size = len(vocab)
embedding_dim = len(embd[0])
embedding = np.asarray(embd)
word_to_id = dict(zip(vocab, range(vocab_size)))

train_data, seq_length = dataset_split(trainDataPath)
x_train, y_train = process_file(train_data[:,0], train_data[:,1], word_to_id, num_classes, seq_length)

print(len(embedding),embedding_dim,vocab_size, seq_length)

Loading GloVe!
Completed!
400000 100 400000 41


In [5]:
def batch_iter(x_pad, y_pad, batch_size):
    """生成批次数据"""
    data_len = len(x_pad)
    num_batch = int((data_len - 1) / batch_size) + 1
    # np.arange()生成0到data_len的等差数列，默认等差为1；np.random.permutation()打乱生成的等差序列的顺序
    # 下面三句语句是为了将训练或测试文本的顺序打乱，因为原文本中每个分类的样本全部挨在一起，这样每个batch训练的都是同一个分类，不太好，打乱后每个batch可包含不同分类
    indices = np.random.permutation(np.arange(data_len))
    x_shuffle = x_pad[indices]
    y_shuffle = y_pad[indices]

    # 返回所有batch的数据
    for i in range(num_batch):
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, data_len)
        yield x_shuffle[start_id:end_id], y_shuffle[start_id:end_id]
        
        
def evaluate(sess, model, x_pad, y_pad, loss1, acc1, batch_size):
    """评估在某一数据上的准确率和损失"""
    data_len = len(x_pad)
    batch_eval = batch_iter(x_pad, y_pad, batch_size)  # 128
    total_loss = 0.0
    total_acc = 0.0
    for x_batch1, y_batch1 in batch_eval:
        batch_len = len(x_batch1)
        feed_dict1 = {model.inputX: x_batch1, model.inputY: y_batch1, model.dropoutKeepProb: 1.0}
        lossTmp, accTmp = sess.run([loss1, acc1], feed_dict=feed_dict1)
        total_loss += lossTmp * batch_len
        total_acc += accTmp * batch_len

    return total_loss / data_len, total_acc / data_len


def model_train(model, x_train, y_train, categories):
    
    # save_path = "%s/%s/%s/%s" % (savePath, split_type, fold_id, fold_id)
    # 创建session
    session = tf.Session()
    session.run(tf.global_variables_initializer())
    
    saver = tf.train.Saver()

    print('Training and evaluating...')
    
    total_batch = 0  # 总批次
    best_acc_train = 0.0  # 最佳验证集准确率
    last_improved = 0  # 记录上一次提升批次
    require_improvement = 200  # 如果超过1000轮未提升，提前结束训练
    print_per_batch = 100
    flag = False

    for epoch in range(num_epochs):  # 20
        start_time = time.time()
        
        print('Epoch:', epoch + 1)
        batch_train = batch_iter(x_train, y_train, batch_size)
        for x_batch, y_batch in batch_train:
            feed_dict = {model.inputX: x_batch, model.inputY: y_batch, model.dropoutKeepProb: dropout_keep_prob}
            session.run(model.trainOp, feed_dict=feed_dict)  # 运行优化
            total_batch += 1

            if total_batch % print_per_batch == 0:
                # 每多少轮次输出在训练集和验证集上的性能
                # feed_dict[model.dropoutKeepProb] = 1.0
                # loss_train, acc_train = session.run([model.loss, model.acc], feed_dict=feed_dict)
                
                loss_train, acc_train = evaluate(session, model, x_train, y_train, model.loss, model.acc, batch_size)
                
                if acc_train > best_acc_train:
                    # 保存最好结果
                    best_acc_train = acc_train
                    last_improved = total_batch
                    improved_str = '*'
                    
                    if best_acc_train > 0.9:
                        saver.save(sess=session, save_path=savePath)
                else:
                    improved_str = ''
                
                duration = time.time() - start_time
                output = 'Iter: {:>1}, Train Loss: {:>6.4}, Train Acc: {:>6.2%}, Time: {:.2f}s {}'
                print(output.format(total_batch, loss_train, acc_train, duration, improved_str))

            if total_batch - last_improved > require_improvement:
                # 验证集正确率长期不提升，提前结束训练
                print("No optimization for a long time, auto-stopping...")
                
                train_data_len = len(x_train)
                train_num_batch = int((train_data_len - 1) / batch_size) + 1

                y_train_cls = np.argmax(y_train, 1)  # 获得类别
                y_train_pred_cls = np.zeros(shape=len(x_train), dtype=np.int32)  # 保存预测结果  len(x_test) 表示有多少个文本

                for i in range(train_num_batch):  # 逐批次处理
                    start_id = i * batch_size
                    end_id = min((i + 1) * batch_size, train_data_len)
                    feed_dict = {
                        model.inputX: x_train[start_id:end_id],
                        model.dropoutKeepProb: 1.0
                    }
                    y_train_pred_cls[start_id:end_id] = session.run(model.y_pred_cls, feed_dict=feed_dict)

                accuracy_score = metrics.accuracy_score(y_train_cls, y_train_pred_cls)
                # 评估
                print("Precision, Recall and F1-Score...")
                print(metrics.classification_report(y_train_cls, y_train_pred_cls, target_names=categories))
                '''
                sklearn中的classification_report函数用于显示主要分类指标的文本报告．在报告中显示每个类的精确度，召回率，F1值等信息。
                    y_true：1维数组，或标签指示器数组/稀疏矩阵，目标值。 
                    y_pred：1维数组，或标签指示器数组/稀疏矩阵，分类器返回的估计值。 
                    labels：array，shape = [n_labels]，报表中包含的标签索引的可选列表。 
                    target_names：字符串列表，与标签匹配的可选显示名称（相同顺序）。 
                    原文链接：https://blog.csdn.net/akadiao/article/details/78788864
                '''

                # 混淆矩阵
                print("Confusion Matrix...")
                cm = metrics.confusion_matrix(y_train_cls, y_train_pred_cls)
                '''
                混淆矩阵是机器学习中总结分类模型预测结果的情形分析表，以矩阵形式将数据集中的记录按照真实的类别与分类模型作出的分类判断两个标准进行汇总。
                这个名字来源于它可以非常容易的表明多个类别是否有混淆（也就是一个class被预测成另一个class）
                https://blog.csdn.net/u011734144/article/details/80277225
                '''
                print(cm)
                
                flag = True
                break  # 跳出循环
        if flag:  # 同上
            break

    session.close()
    return accuracy_score

In [6]:
# 构建adversarailLSTM模型
class AdversarailLSTM(object):

    def __init__(self, wordEmbedding):
        # 定义输入
        self.inputX = tf.placeholder(tf.int32, [None, seq_length], name='inputX')
        self.inputY = tf.placeholder(tf.int32, [None, num_classes], name='inputY')

        self.dropoutKeepProb = tf.placeholder(tf.float64, name='keep_prob')

        # 词嵌入层
        with tf.name_scope("wordEmbedding"):
            wordEmbedding = tf.Variable(initial_value=wordEmbedding)
            self.embeddedWords = tf.nn.embedding_lookup(wordEmbedding, self.inputX)

        # 计算softmax交叉熵损失
        with tf.name_scope("loss"):
            with tf.variable_scope("Bi-LSTM", reuse=None):
                self.predictions = self._Bi_LSTMAttention(self.embeddedWords)
                # self.y_pred_cls = tf.cast(tf.greater_equal(self.predictions, 0.5), tf.float32, name="binaryPreds")
                self.y_pred_cls = tf.argmax(tf.nn.softmax(self.predictions),1)  # 预测类别 tf.argmax：返回每一行或每一列的最大值 1为里面（每一行），0为外面（每一列）
                # losses = tf.nn.sigmoid_cross_entropy_with_logits(logits=self.predictions, labels=self.inputY)
                losses = tf.nn.softmax_cross_entropy_with_logits(logits=self.predictions, labels=self.inputY)
                loss = tf.reduce_mean(losses)

        
        with tf.name_scope("perturloss"):
            with tf.variable_scope("Bi-LSTM", reuse=True):
                perturWordEmbedding = self._addPerturbation(self.embeddedWords, loss)
                print("perturbSize:{}".format(perturWordEmbedding))
                perturPredictions = self._Bi_LSTMAttention(perturWordEmbedding)
                # perturLosses = tf.nn.sigmoid_cross_entropy_with_logits(logits=perturPredictions, labels=self.inputY)
                perturLosses = tf.nn.softmax_cross_entropy_with_logits(logits=perturPredictions, labels=self.inputY)
                perturLoss = tf.reduce_mean(perturLosses)

        self.loss = loss + perturLoss
        
        globalStep = tf.Variable(0, name="globalStep", trainable=False)
        # 定义优化函数，传入学习速率参数
        optimizer = tf.train.AdamOptimizer(learning_rate)
        # 计算梯度,得到梯度和变量
        gradsAndVars = optimizer.compute_gradients(self.loss)
        # 将梯度应用到变量下，生成训练器
        self.trainOp = optimizer.apply_gradients(gradsAndVars, global_step=globalStep)

        # 准确率
        correct_pred = tf.equal(tf.argmax(self.inputY, 1), self.y_pred_cls)
        self.acc = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
        
        # self.loss = loss
        
        
    def _Bi_LSTMAttention(self, embeddedWords):
        # 定义两层双向LSTM的模型结构
        with tf.name_scope("Bi-LSTM"):
            fwHiddenLayers = []
            bwHiddenLayers = []
            for idx, hiddenSize in enumerate(hiddenSizes):
                with tf.name_scope("Bi-LSTM" + str(idx)):
                    # 定义前向网络结构
                    lstmFwCell = tf.nn.rnn_cell.DropoutWrapper(
                        tf.nn.rnn_cell.LSTMCell(num_units=hiddenSize, state_is_tuple=True),
                        output_keep_prob=self.dropoutKeepProb)

                    # 定义反向网络结构
                    lstmBwCell = tf.nn.rnn_cell.DropoutWrapper(
                        tf.nn.rnn_cell.LSTMCell(num_units=hiddenSize, state_is_tuple=True),
                        output_keep_prob=self.dropoutKeepProb)

                fwHiddenLayers.append(lstmFwCell)
                bwHiddenLayers.append(lstmBwCell)

            # 实现多层的LSTM结构， state_is_tuple=True，则状态会以元祖的形式组合(h, c)，否则列向拼接
            fwMultiLstm = tf.nn.rnn_cell.MultiRNNCell(cells=fwHiddenLayers, state_is_tuple=True)
            bwMultiLstm = tf.nn.rnn_cell.MultiRNNCell(cells=bwHiddenLayers, state_is_tuple=True)
            # 采用动态rnn，可以动态地输入序列的长度，若没有输入，则取序列的全长
            # outputs是一个元组(output_fw, output_bw), 其中两个元素的维度都是[batch_size, max_time, hidden_size], fw和bw的hiddensize一样
            # self.current_state是最终的状态，二元组(state_fw, state_bw), state_fw=[batch_size, s], s是一个元组(h, c)
            outputs, self.current_state = tf.nn.bidirectional_dynamic_rnn(fwMultiLstm, bwMultiLstm,
                                                                          self.embeddedWords, dtype=tf.float64,
                                                                          scope="bi-lstm" + str(idx))

        # 在bi-lstm+attention论文中，将前向和后向的输出相加
        with tf.name_scope("Attention"):
            H = outputs[0] + outputs[1]

            # 得到attention的输出
            output = self.attention(H)
            outputSize = hiddenSizes[-1]
            print("outputSize:{}".format(outputSize))

        # 全连接层的输出
        with tf.name_scope("output"):
            outputW = tf.get_variable(
                "outputW", dtype=tf.float64,
                shape=[outputSize, num_classes],
                initializer=tf.contrib.layers.xavier_initializer())

            outputB = tf.Variable(tf.constant(0.1, dtype=tf.float64, shape=[num_classes]), name="outputB")

            predictions = tf.nn.xw_plus_b(output, outputW, outputB, name="predictions")

            return predictions

    def attention(self, H):
        """
        利用Attention机制得到句子的向量表示
        """
        # 获得最后一层lstm神经元的数量
        hiddenSize = hiddenSizes[-1]

        # 初始化一个权重向量，是可训练的参数
        W = tf.Variable(tf.random_normal([hiddenSize], stddev=0.1, dtype=tf.float64))

        # 对bi-lstm的输出用激活函数做非线性转换
        M = tf.tanh(H)

        # 对W和M做矩阵运算，W=[batch_size, time_step, hidden_size], 计算前做维度转换成[batch_size * time_step, hidden_size]
        # newM = [batch_size, time_step, 1], 每一个时间步的输出由向量转换成一个数字
        newM = tf.matmul(tf.reshape(M, [-1, hiddenSize]), tf.reshape(W, [-1, 1]))

        # 对newM做维度转换成[batch_size, time_step]
        restoreM = tf.reshape(newM, [-1, seq_length])

        # 用softmax做归一化处理[batch_size, time_step]
        self.alpha = tf.nn.softmax(restoreM)

        # 利用求得的alpha的值对H进行加权求和，用矩阵运算直接操作
        r = tf.matmul(tf.transpose(H, [0, 2, 1]), tf.reshape(self.alpha, [-1, seq_length, 1]))

        # 将三维压缩成二维sequeezeR = [batch_size, hissen_size]
        sequeezeR = tf.squeeze(r)

        sentenceRepren = tf.tanh(sequeezeR)

        # 对attention的输出可以做dropout处理
        output = tf.nn.dropout(sentenceRepren, self.dropoutKeepProb)

        return output

    def _normalize(self, wordEmbedding, weights):
        """
        对word embedding 结合权重做标准化处理
        """
        mean = tf.matmul(weights, wordEmbedding)
        powWordEmbedding = tf.pow(wordEmbedding - mean, 2.)

        var = tf.matmul(weights, powWordEmbedding)
        stddev = tf.sqrt(1e-6 + var)

        return (wordEmbedding - mean) / stddev

    def _addPerturbation(self, embedded, loss):
        """
        添加波动到word embedding
        """
        grad, = tf.gradients(
            loss,
            embedded,
            aggregation_method=tf.AggregationMethod.EXPERIMENTAL_ACCUMULATE_N)
        grad = tf.stop_gradient(grad)
        perturb = self._scaleL2(grad, epsilon)
        # print("perturbSize:{}".format(embedded+perturb))
        return embedded + perturb

    def _scaleL2(self, x, norm_length):
        # shape(x) = [batch, num_step, d]
        # divide x by max(abs(x)) for a numerically stable L2 norm
        # 2norm(x) = a * 2norm(x/a)
        # scale over the full sequence, dim(1, 2)
        alpha = tf.reduce_max(tf.abs(x), (1, 2), keep_dims=True) + 1e-12
        l2_norm = alpha * tf.sqrt(tf.reduce_sum(tf.pow(x / alpha, 2), (1, 2), keep_dims=True) + 1e-6)
        x_unit = x / l2_norm
        return norm_length * x_unit

In [7]:
hiddenSizes = [128]  # 定义LSTM的隐藏层（一层，128个神经元）
epsilon = 5

num_filters = 256
kernel_size = 5
hidden_dim = 128
learning_rate = 1e-3
dropout_keep_prob = 0.5

num_epochs = 50
batch_size = 64
print_per_batch = 30  # 每多少轮输出一次结果

lstm = AdversarailLSTM(embedding)

outputSize:128
Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See @{tf.nn.softmax_cross_entropy_with_logits_v2}.



Instructions for updating:

Future major versions of TensorFlow will allow gradients to flow
into the labels input on backprop by default.

See @{tf.nn.softmax_cross_entropy_with_logits_v2}.



Instructions for updating:
keep_dims is deprecated, use keepdims instead


Instructions for updating:
keep_dims is deprecated, use keepdims instead


Instructions for updating:
keep_dims is deprecated, use keepdims instead


Instructions for updating:
keep_dims is deprecated, use keepdims instead


perturbSize:Tensor("perturloss/Bi-LSTM/add_2:0", shape=(?, 41, 100), dtype=float64)
outputSize:128


In [8]:
model_train(lstm, x_train, y_train, categories)

Training and evaluating...
Epoch: 1
Iter: 100, Train Loss:  2.929, Train Acc: 53.89%, Time: 28.94s *
Iter: 200, Train Loss:  1.748, Train Acc: 73.14%, Time: 56.43s *
Epoch: 2
Iter: 300, Train Loss:  1.337, Train Acc: 79.07%, Time: 24.64s *
Iter: 400, Train Loss: 0.9882, Train Acc: 85.27%, Time: 52.17s *
Epoch: 3
Iter: 500, Train Loss: 0.7826, Train Acc: 88.14%, Time: 21.94s *
Iter: 600, Train Loss: 0.6662, Train Acc: 90.35%, Time: 66.12s *
Epoch: 4
Iter: 700, Train Loss: 0.5454, Train Acc: 92.23%, Time: 39.04s *
Iter: 800, Train Loss: 0.4667, Train Acc: 93.35%, Time: 98.81s *
Epoch: 5
Iter: 900, Train Loss: 0.4527, Train Acc: 93.44%, Time: 28.79s *
Iter: 1000, Train Loss: 0.2972, Train Acc: 96.15%, Time: 76.83s *
Iter: 1100, Train Loss: 0.3076, Train Acc: 95.75%, Time: 104.19s 
Epoch: 6
Iter: 1200, Train Loss: 0.2374, Train Acc: 96.76%, Time: 44.35s *
Iter: 1300, Train Loss: 0.2157, Train Acc: 97.06%, Time: 93.98s *
Epoch: 7
Iter: 1400, Train Loss: 0.2283, Train Acc: 96.94%, Time: 24.4

InternalError: : Unable to connect to endpoint
	 [[Node: save/SaveV2 = SaveV2[dtypes=[DT_DOUBLE, DT_DOUBLE, DT_DOUBLE, DT_DOUBLE, DT_DOUBLE, ..., DT_DOUBLE, DT_DOUBLE, DT_DOUBLE, DT_DOUBLE, DT_DOUBLE], _device="/job:localhost/replica:0/task:0/device:CPU:0"](_arg_save/Const_0_0, save/SaveV2/tensor_names, save/SaveV2/shape_and_slices, Bi-LSTM/bi-lstm0/bw/multi_rnn_cell/cell_0/lstm_cell/bias/_85, Bi-LSTM/bi-lstm0/bw/multi_rnn_cell/cell_0/lstm_cell/bias/Adam/_87, Bi-LSTM/bi-lstm0/bw/multi_rnn_cell/cell_0/lstm_cell/bias/Adam_1/_89, Bi-LSTM/bi-lstm0/bw/multi_rnn_cell/cell_0/lstm_cell/kernel/_91, Bi-LSTM/bi-lstm0/bw/multi_rnn_cell/cell_0/lstm_cell/kernel/Adam/_93, Bi-LSTM/bi-lstm0/bw/multi_rnn_cell/cell_0/lstm_cell/kernel/Adam_1/_95, Bi-LSTM/bi-lstm0/fw/multi_rnn_cell/cell_0/lstm_cell/bias/_97, Bi-LSTM/bi-lstm0/fw/multi_rnn_cell/cell_0/lstm_cell/bias/Adam/_99, Bi-LSTM/bi-lstm0/fw/multi_rnn_cell/cell_0/lstm_cell/bias/Adam_1/_101, Bi-LSTM/bi-lstm0/fw/multi_rnn_cell/cell_0/lstm_cell/kernel/_103, Bi-LSTM/bi-lstm0/fw/multi_rnn_cell/cell_0/lstm_cell/kernel/Adam/_105, Bi-LSTM/bi-lstm0/fw/multi_rnn_cell/cell_0/lstm_cell/kernel/Adam_1/_107, Bi-LSTM/outputW/_109, Bi-LSTM/outputW/Adam/_111, Bi-LSTM/outputW/Adam_1/_113, beta1_power/_115, beta2_power/_117, globalStep, loss/Bi-LSTM/Attention/Variable/_119, loss/Bi-LSTM/Attention/Variable/Adam/_121, loss/Bi-LSTM/Attention/Variable/Adam_1/_123, loss/Bi-LSTM/output/outputB/_125, loss/Bi-LSTM/output/outputB/Adam/_127, loss/Bi-LSTM/output/outputB/Adam_1/_129, perturloss/Bi-LSTM/Attention/Variable/_131, perturloss/Bi-LSTM/Attention/Variable/Adam/_133, perturloss/Bi-LSTM/Attention/Variable/Adam_1/_135, perturloss/Bi-LSTM/output/outputB/_137, perturloss/Bi-LSTM/output/outputB/Adam/_139, perturloss/Bi-LSTM/output/outputB/Adam_1/_141, wordEmbedding/Variable/_143, wordEmbedding/Variable/Adam/_145, wordEmbedding/Variable/Adam_1/_147)]]

Caused by op 'save/SaveV2', defined at:
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 478, in start
    self.io_loop.start()
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/tornado/ioloop.py", line 832, in start
    self._run_callback(self._callbacks.popleft())
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/tornado/ioloop.py", line 605, in _run_callback
    ret = callback()
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 536, in <lambda>
    self.io_loop.add_callback(lambda : self._handle_events(self.socket, 0))
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 450, in _handle_events
    self._handle_recv()
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 480, in _handle_recv
    self._run_callback(callback, msg)
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 432, in _run_callback
    callback(*args, **kwargs)
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2728, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2856, in run_ast_nodes
    if self.run_code(code, result):
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-8-fbcb6bdbf79a>", line 1, in <module>
    model_train(lstm, x_train, y_train, categories)
  File "<ipython-input-5-6470836b0686>", line 41, in model_train
    saver = tf.train.Saver()
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/tensorflow/python/training/saver.py", line 1338, in __init__
    self.build()
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/tensorflow/python/training/saver.py", line 1347, in build
    self._build(self._filename, build_save=True, build_restore=True)
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/tensorflow/python/training/saver.py", line 1384, in _build
    build_save=build_save, build_restore=build_restore)
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/tensorflow/python/training/saver.py", line 832, in _build_internal
    save_tensor = self._AddSaveOps(filename_tensor, saveables)
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/tensorflow/python/training/saver.py", line 350, in _AddSaveOps
    save = self.save_op(filename_tensor, saveables)
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/tensorflow/python/training/saver.py", line 266, in save_op
    tensors)
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/tensorflow/python/ops/gen_io_ops.py", line 1687, in save_v2
    shape_and_slices=shape_and_slices, tensors=tensors, name=name)
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 3392, in create_op
    op_def=op_def)
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1718, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

InternalError (see above for traceback): : Unable to connect to endpoint
	 [[Node: save/SaveV2 = SaveV2[dtypes=[DT_DOUBLE, DT_DOUBLE, DT_DOUBLE, DT_DOUBLE, DT_DOUBLE, ..., DT_DOUBLE, DT_DOUBLE, DT_DOUBLE, DT_DOUBLE, DT_DOUBLE], _device="/job:localhost/replica:0/task:0/device:CPU:0"](_arg_save/Const_0_0, save/SaveV2/tensor_names, save/SaveV2/shape_and_slices, Bi-LSTM/bi-lstm0/bw/multi_rnn_cell/cell_0/lstm_cell/bias/_85, Bi-LSTM/bi-lstm0/bw/multi_rnn_cell/cell_0/lstm_cell/bias/Adam/_87, Bi-LSTM/bi-lstm0/bw/multi_rnn_cell/cell_0/lstm_cell/bias/Adam_1/_89, Bi-LSTM/bi-lstm0/bw/multi_rnn_cell/cell_0/lstm_cell/kernel/_91, Bi-LSTM/bi-lstm0/bw/multi_rnn_cell/cell_0/lstm_cell/kernel/Adam/_93, Bi-LSTM/bi-lstm0/bw/multi_rnn_cell/cell_0/lstm_cell/kernel/Adam_1/_95, Bi-LSTM/bi-lstm0/fw/multi_rnn_cell/cell_0/lstm_cell/bias/_97, Bi-LSTM/bi-lstm0/fw/multi_rnn_cell/cell_0/lstm_cell/bias/Adam/_99, Bi-LSTM/bi-lstm0/fw/multi_rnn_cell/cell_0/lstm_cell/bias/Adam_1/_101, Bi-LSTM/bi-lstm0/fw/multi_rnn_cell/cell_0/lstm_cell/kernel/_103, Bi-LSTM/bi-lstm0/fw/multi_rnn_cell/cell_0/lstm_cell/kernel/Adam/_105, Bi-LSTM/bi-lstm0/fw/multi_rnn_cell/cell_0/lstm_cell/kernel/Adam_1/_107, Bi-LSTM/outputW/_109, Bi-LSTM/outputW/Adam/_111, Bi-LSTM/outputW/Adam_1/_113, beta1_power/_115, beta2_power/_117, globalStep, loss/Bi-LSTM/Attention/Variable/_119, loss/Bi-LSTM/Attention/Variable/Adam/_121, loss/Bi-LSTM/Attention/Variable/Adam_1/_123, loss/Bi-LSTM/output/outputB/_125, loss/Bi-LSTM/output/outputB/Adam/_127, loss/Bi-LSTM/output/outputB/Adam_1/_129, perturloss/Bi-LSTM/Attention/Variable/_131, perturloss/Bi-LSTM/Attention/Variable/Adam/_133, perturloss/Bi-LSTM/Attention/Variable/Adam_1/_135, perturloss/Bi-LSTM/output/outputB/_137, perturloss/Bi-LSTM/output/outputB/Adam/_139, perturloss/Bi-LSTM/output/outputB/Adam_1/_141, wordEmbedding/Variable/_143, wordEmbedding/Variable/Adam/_145, wordEmbedding/Variable/Adam_1/_147)]]


In [9]:
with tf.Session() as session:
    
    session.run(tf.global_variables_initializer())
    
    saver = tf.train.Saver()
    saver.restore(sess=session, save_path=savePath)
    
    train_data_len = len(x_train)
    train_num_batch = int((train_data_len - 1) / batch_size) + 1

    y_train_cls = np.argmax(y_train, 1)  # 获得类别
    y_train_pred_cls = np.zeros(shape=len(x_train), dtype=np.int32)  # 保存预测结果  len(x_test) 表示有多少个文本

    for i in range(train_num_batch):  # 逐批次处理
        start_id = i * batch_size
        end_id = min((i + 1) * batch_size, train_data_len)
        feed_dict = {
            lstm.inputX: x_train[start_id:end_id],
            lstm.dropoutKeepProb: 1.0
        }
        y_train_pred_cls[start_id:end_id] = session.run(lstm.y_pred_cls, feed_dict=feed_dict)

    accuracy_score = metrics.accuracy_score(y_train_cls, y_train_pred_cls)
    print(accuracy_score)
    # 评估
    print("Precision, Recall and F1-Score...")
    print(metrics.classification_report(y_train_cls, y_train_pred_cls, target_names=categories))
    '''
    sklearn中的classification_report函数用于显示主要分类指标的文本报告．在报告中显示每个类的精确度，召回率，F1值等信息。
        y_true：1维数组，或标签指示器数组/稀疏矩阵，目标值。 
        y_pred：1维数组，或标签指示器数组/稀疏矩阵，分类器返回的估计值。 
        labels：array，shape = [n_labels]，报表中包含的标签索引的可选列表。 
        target_names：字符串列表，与标签匹配的可选显示名称（相同顺序）。 
        原文链接：https://blog.csdn.net/akadiao/article/details/78788864
    '''

    # 混淆矩阵
    print("Confusion Matrix...")
    cm = metrics.confusion_matrix(y_train_cls, y_train_pred_cls)
    print(cm)

INFO:tensorflow:Restoring parameters from s3://corpus-2/model/ad_biLSTM2/lstm_model


INFO:tensorflow:Restoring parameters from s3://corpus-2/model/ad_biLSTM2/lstm_model


DataLossError: Checksum does not match: stored 999530730 vs. calculated on the restored bytes 2317565041
	 [[Node: save_1/RestoreV2 = RestoreV2[dtypes=[DT_DOUBLE, DT_DOUBLE, DT_DOUBLE, DT_DOUBLE, DT_DOUBLE, ..., DT_DOUBLE, DT_DOUBLE, DT_DOUBLE, DT_DOUBLE, DT_DOUBLE], _device="/job:localhost/replica:0/task:0/device:CPU:0"](_arg_save_1/Const_0_0, save_1/RestoreV2/tensor_names, save_1/RestoreV2/shape_and_slices)]]
	 [[Node: save_1/RestoreV2/_17 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device_incarnation=1, tensor_name="edge_18_save_1/RestoreV2", tensor_type=DT_DOUBLE, _device="/job:localhost/replica:0/task:0/device:GPU:0"]()]]

Caused by op 'save_1/RestoreV2', defined at:
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/runpy.py", line 193, in _run_module_as_main
    "__main__", mod_spec)
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/runpy.py", line 85, in _run_code
    exec(code, run_globals)
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/ipykernel_launcher.py", line 16, in <module>
    app.launch_new_instance()
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/traitlets/config/application.py", line 658, in launch_instance
    app.start()
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/ipykernel/kernelapp.py", line 478, in start
    self.io_loop.start()
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/tornado/ioloop.py", line 888, in start
    handler_func(fd_obj, events)
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 450, in _handle_events
    self._handle_recv()
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 480, in _handle_recv
    self._run_callback(callback, msg)
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/zmq/eventloop/zmqstream.py", line 432, in _run_callback
    callback(*args, **kwargs)
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/tornado/stack_context.py", line 277, in null_wrapper
    return fn(*args, **kwargs)
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 283, in dispatcher
    return self.dispatch_shell(stream, msg)
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 233, in dispatch_shell
    handler(stream, idents, msg)
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/ipykernel/kernelbase.py", line 399, in execute_request
    user_expressions, allow_stdin)
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/ipykernel/ipkernel.py", line 208, in do_execute
    res = shell.run_cell(code, store_history=store_history, silent=silent)
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/ipykernel/zmqshell.py", line 537, in run_cell
    return super(ZMQInteractiveShell, self).run_cell(*args, **kwargs)
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2728, in run_cell
    interactivity=interactivity, compiler=compiler, result=result)
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2850, in run_ast_nodes
    if self.run_code(code, result):
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/IPython/core/interactiveshell.py", line 2910, in run_code
    exec(code_obj, self.user_global_ns, self.user_ns)
  File "<ipython-input-9-7010d69d8e96>", line 5, in <module>
    saver = tf.train.Saver()
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/tensorflow/python/training/saver.py", line 1338, in __init__
    self.build()
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/tensorflow/python/training/saver.py", line 1347, in build
    self._build(self._filename, build_save=True, build_restore=True)
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/tensorflow/python/training/saver.py", line 1384, in _build
    build_save=build_save, build_restore=build_restore)
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/tensorflow/python/training/saver.py", line 835, in _build_internal
    restore_sequentially, reshape)
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/tensorflow/python/training/saver.py", line 472, in _AddRestoreOps
    restore_sequentially)
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/tensorflow/python/training/saver.py", line 886, in bulk_restore
    return io_ops.restore_v2(filename_tensor, names, slices, dtypes)
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/tensorflow/python/ops/gen_io_ops.py", line 1463, in restore_v2
    shape_and_slices=shape_and_slices, dtypes=dtypes, name=name)
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/tensorflow/python/framework/op_def_library.py", line 787, in _apply_op_helper
    op_def=op_def)
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 3392, in create_op
    op_def=op_def)
  File "/home/ma-user/anaconda3/envs/TensorFlow-1.8/lib/python3.6/site-packages/tensorflow/python/framework/ops.py", line 1718, in __init__
    self._traceback = self._graph._extract_stack()  # pylint: disable=protected-access

DataLossError (see above for traceback): Checksum does not match: stored 999530730 vs. calculated on the restored bytes 2317565041
	 [[Node: save_1/RestoreV2 = RestoreV2[dtypes=[DT_DOUBLE, DT_DOUBLE, DT_DOUBLE, DT_DOUBLE, DT_DOUBLE, ..., DT_DOUBLE, DT_DOUBLE, DT_DOUBLE, DT_DOUBLE, DT_DOUBLE], _device="/job:localhost/replica:0/task:0/device:CPU:0"](_arg_save_1/Const_0_0, save_1/RestoreV2/tensor_names, save_1/RestoreV2/shape_and_slices)]]
	 [[Node: save_1/RestoreV2/_17 = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device_incarnation=1, tensor_name="edge_18_save_1/RestoreV2", tensor_type=DT_DOUBLE, _device="/job:localhost/replica:0/task:0/device:GPU:0"]()]]


In [None]:
def predict(predict_sentences, word_to_id, pad_max_length):
    """
    将文件转换为id表示,并且将每个单独的样本长度固定为pad_max_lengtn
    """
    
    data_id = []
    # 将文本内容转换为对应的id形式
    for i in range(len(predict_sentences)):
        data_id.append([word_to_id[x] for x in predict_sentences[i].lower().strip().split() if x in word_to_id])
        
    # 使用keras提供的pad_sequences来将文本pad为固定长度
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id, pad_max_length)
    ''' https://blog.csdn.net/TH_NUM/article/details/80904900
    pad_sequences(sequences, maxlen=None, dtype=’int32’, padding=’pre’, truncating=’pre’, value=0.) 
        sequences：浮点数或整数构成的两层嵌套列表
        maxlen：None或整数，为序列的最大长度。大于此长度的序列将被截短，小于此长度的序列将在后部填0.
        dtype：返回的numpy array的数据类型
        padding：‘pre’或‘post’，确定当需要补0时，在序列的起始还是结尾补
        truncating：‘pre’或‘post’，确定当需要截断序列时，从起始还是结尾截断
        value：浮点数，此值将在填充时代替默认的填充值0
    '''
    feed_dict = {
        lstm.inputX: x_test[start_id:end_id],
        lstm.dropoutKeepProb: 1.0
    }
    predict_result = sess.run(lstm.y_pred_cls, feed_dict=feed_dict)
    predict_result = [i+1 for i in predict_result]
    return predict_result

session = tf.Session()

def predict11(predict_sentences, probability_threshold=0.26):  # 0.26189747
    """
    将文件转换为id表示,并且将每个单独的样本长度固定为pad_max_lengtn
    """
    data_id = []
    # 将文本内容转换为对应的id形式
    for psi in predict_sentences:

        data_id.append([word_to_id[x] for x in preprocess_sentence(psi).split() if x in word_to_id])

    # 使用keras提供的pad_sequences来将文本pad为固定长度
    x_pad = kr.preprocessing.sequence.pad_sequences(data_id, seq_length)
    feed_dict = {
        lstm.inputX: x_pad,
        lstm.dropoutKeepProb: 1.0
    }
    predict_result = session.run(tf.nn.softmax(lstm.predictions), feed_dict=feed_dict)
    # print(predict_result)
    result = []
    for i in predict_result:
        if max(i) > probability_threshold:
            result.append(i.argmax()+1)
        else:
            result.append(0)
    return result

In [None]:
def preprocess_sentence(sent):
    new_sent = ''
    for i in range(len(sent)):
        if sent[i] in string.punctuation:
            if i > 0 and i < len(sent) - 1:
                if sent[i] in ",." and sent[i-1].isdigit() and sent[i+1].isdigit():
                    new_sent += sent[i]
                    continue
                if sent[i] == "%" and sent[i-1].isdigit():
                    new_sent += sent[i]
                    continue
                if sent[i] == "$" and (sent[i-1].isdigit() or sent[i+1].isdigit()):
                    new_sent += sent[i]
                    continue
                if sent[i-1] != ' ':
                    new_sent += ' ' + sent[i]
                elif sent[i+1] != ' ':
                    new_sent += sent[i] + ' '
                else:
                    new_sent += sent[i]
            elif i == 0:
                if sent[i] == "$" and sent[i+1].isdigit():
                    new_sent += sent[i]
                    continue
                if sent[i+1] != ' ':
                    new_sent += sent[i] + ' '
                else:
                    new_sent += sent[i]
            else:
                if sent[i] == "%" and sent[i-1].isdigit():
                    new_sent += sent[i]
                    continue
                if sent[i] == "$" and sent[i-1].isdigit():
                    new_sent += sent[i]
                    continue
                if sent[i-1] != ' ':
                    new_sent += ' ' + sent[i]
                else:
                    new_sent += sent[i]
        else:
            new_sent += sent[i]
    return new_sent.strip().lower()

In [None]:
predict_sentences = ["In the sixtieth ceremony , where were all of the winners from ?",  # 7
                         "On how many devices has the app \" CF SHPOP ! \" been installed ?",  # 1
                         "List center - backs by what their transfer _ fee was .",  # 5
                         "can you tell me what is arkansas 's population on the date july 1st of 2002 ?",  # 1
                         "show the way the number of likes were distributed .",  # 7
                         "is it true that people living on average depends on higher gdp of a country"  # 10
                         ]

print(predict11(predict_sentences))