In [1]:
import tensorflow as tf
import numpy as np

  from ._conv import register_converters as _register_converters


### 定义rnn解码部分

In [2]:
def rnn_decoder_with_attention(decoder_inputs, initial_state, cell, attention_states, batch_size):
    '''
    rnn解码过程——
      对decoder_inputs中的每个input:
        1.计算attention: context_vector = softmax(query * key) * value
        2.rnn cell运行: output, state = cell(input, state, context_vector)
    输入——
      decoder_inputs: 一个2D-tensor的列表，每个tensor的shape=[batch_size, input_size]
      initial_state: cell的初始状态，此处使用的是由encoder最后一个输出计算的结果，shape=[batch_size, cell.state_size]，
                     此处state_size=hidden_size * 2
      cell: 使用的rnn_cell，此处使用带attention的双向gru cell
      loop_function: 测试的时候使用的，TODO
      attention_states: encoder的outputs，是一个3D-tensor，shape=[batch_size, sentence_len, hidden_size * 2]
    '''
    #定义一些量
    state = initial_state
    attention_states_origin = attention_states #[batch_size, sentence_len, state_size]
    _, sentence_len, state_size = attention_states.get_shape().as_list()
    outputs = []
    for idx in range(len(decoder_inputs)):
        #训练时把上一轮的state作为输入
        #1.计算attention
#         Q = tf.get_variable('Q', shape=[state_size, state_size], initializer=tf.random_normal_initializer(stddev=0.1))
#         K = tf.get_variable('K', shape=[state_size, state_size], initializer=tf.random_normal_initializer(stddev=0.1))
#         V = tf.get_variable('V', shape=[state_size, state_size], initializer=tf.random_normal_initializer(stddev=0.1))
        query = tf.reshape(state, (-1, 1, state_size)) #[batch_size, 1, state_size]
        key = attention_states #[batch_size, state_size, sentence_len]
        attention_logits = [tf.matmul(query[i], tf.transpose(key[i])) for i in range(batch_size)] #batch中的每个query与输入的key计算匹配程度
        attention_logits = tf.random_normal([batch_size, sentence_len])
        attention_logits = tf.reshape(attention_logits, [batch_size, sentence_len]) #[batch_size, sentence_len]
        p_attention = tf.nn.softmax(attention_logits) #计算概率，[batch_size, sentence_len]
        p_attention = tf.reshape(p_attention, [batch_size, sentence_len, 1])
        context_vector = tf.multiply(attention_states_origin, p_attention) #按位乘，在第三维自动扩展，[batch_size, sentence_len, state_size]
        context_vector = tf.reduce_sum(context_vector, axis=1) #[batch_size, state_size]
        #2.运行cell
        output, state = cell(decoder_inputs[idx], state, context_vector)
        outputs.append(output)
    return outputs, state

### 定义seq2seq

In [5]:
class Seq2Seq_with_attention:
    def __init__(self, num_classes, batch_size, sentence_len, 
                 embed_size, vocab_size, hidden_size, learning_rate, 
                 decay_steps, decay_rate, is_training=True, decode_sent_length=5):
        #初始化hyperparameter
        self.num_classes = num_classes
        self.batch_size = batch_size
        self.sentence_len = sentence_len
        self.embed_size = embed_size
        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.learning_rate = learning_rate
        self.is_training = is_training
        self.decode_sent_length = decode_sent_length
        self.initializer = tf.random_normal_initializer(stddev=0.1)
        
        #输入
        self.input_x = tf.placeholder(tf.int32, [None, self.sentence_len], 'input_x')
        self.decoder_input = tf.placeholder(tf.int32, [None, self.decode_sent_length], 'decoder_input') #
        self.input_y = tf.placeholder(tf.int32, [None, self.decode_sent_length], 'input_y')
#         self.dropout_keep_prob = tf.placeholder(tf.float32, name='dropout_keep_prob')
        
        #epoch信息
        self.global_step = tf.Variable(0, trainable=False, dtype=tf.int32, name='global_step')
        self.epoch_step = tf.Variable(0, trainable=False, dtype=tf.int32, name='epoch_step')
        self.epoch_increment = tf.assign(self.epoch_step, tf.add(self.epoch_step, tf.constant(1)))
        self.decay_steps, self.decay_rate = decay_steps, decay_rate
        
        self.init_weight()
        self.logits = self.inference()
        
        self.predictions = tf.argmax(self.logits, axis=2,name='prediction')
        self.loss_val = self.loss()
        self.train_op = self.train()
    
    def inference(self):
        '''
        embedding --> encode using gru --> decode using gru with attention --> linear classifier
        '''
        #1. embedding
        self.embedding_sentence = tf.nn.embedding_lookup(self.Embedding, self.input_x) #[batch_size, sentence_len, embed_size]
        #2. encoder with GRU
        h_t_list_forward = self.gru_cell_run(self.embedding_sentence, self.gru_cell) #list,每个元素为[batch_size, hidden_size]
        h_t_list_backward = self.gru_cell_run(self.embedding_sentence, self.gru_cell, forward=False) 
        encoder_outputs_list = [tf.concat((f,b), axis=1) for f, b in zip(h_t_list_forward, h_t_list_backward)]
        #list,每个元素为[batch_size, hidden_size * 2]
        encoder_outputs = tf.stack(encoder_outputs_list, axis=1) #[batch_size, sentence_len, hidden_size * 2]
        #3. decode using GRU with attention
        #a. 初始化decoder的第一个状态
        initial_state = tf.tanh(tf.matmul(h_t_list_backward[0], self.W_init_state) + self.b_init_state) #[batch_size, hidden_size * 2]
        #b. embedding decoder input
        embedding_decoder_input = tf.nn.embedding_lookup(self.Embedding_label, self.decoder_input) #[batch_size, deccode_len, embed_size]
        embedding_decoder_input_split = tf.split(embedding_decoder_input, self.decode_sent_length, axis=1)
        embedding_decoder_input_list = [tf.squeeze(x,axis=1) for x in embedding_decoder_input_split]#list,每个元素为[batch_size, embed_size]
        #c. run cell with attention
        cell = self.gru_cell_decode
        attention_states = encoder_outputs
        loop_function = None
        outputs, state = rnn_decoder_with_attention(embedding_decoder_input_list, initial_state, 
                                                    cell, attention_states, self.batch_size)
        # outputs: list，每个元素为[batch_size, output_size]，这里output_size = hidden_size*2(只用了一个gru，注意维度)
        decoder_output = tf.stack(outputs, axis=1) #[batch_size, decode_len, hidden_size * 2]
        decoder_output = tf.reshape(decoder_output, [-1, self.hidden_size * 2]) #[batch_size * decode_len, hidden_size * 2]
        logits = tf.matmul(decoder_output, self.W_logits) + self.b_logits   #[batch_size * decode_len, num_classes]
        logits = tf.reshape(logits, [self.batch_size, self.decode_sent_length, self.num_classes])# [batch_size, decode_len, num_classes]
        return logits
        
    def loss(self, l2_lambda=0.001):
        losses = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=self.input_y, logits=self.logits)#[batch_size, decode_sen_len]
        loss1 = tf.reduce_mean(tf.reduce_sum(losses, axis=1))
        l2_loss = tf.add_n([tf.nn.l2_loss(t) for t in tf.trainable_variables()]) * l2_lambda
        return loss1 + l2_loss
        
    def train(self):
        learning_rate = tf.train.exponential_decay(self.learning_rate,self.global_step,self.decay_steps,self.decay_rate,True)
        train_op = tf.contrib.layers.optimize_loss(self.loss_val, self.global_step,learning_rate, 'Adam')
        return train_op
    
    def gru_cell(self, X_t, h_t_minus_1):
        #reset gate
        r_t = tf.sigmoid(tf.matmul(h_t_minus_1, self.W_r) + tf.matmul(X_t, self.U_r) + self.b_r) #[batch_size, hidden_size]
        #生成候选h_t
        h_t_cand = tf.tanh(tf.matmul(r_t * h_t_minus_1, self.W_c) + tf.matmul(X_t, self.U_c) + self.b_c) #[batch_size, hidden_size]
        #update gate
        z_t = tf.sigmoid(tf.matmul(h_t_minus_1, self.W_z) + tf.matmul(X_t, self.U_z) + self.b_z) #[batch_size, hidden_size]
        #生成新的h_t
        h_t = (1 - z_t) * h_t_minus_1 + z_t * h_t_cand #[batch_size, hidden_size]
        return h_t
        
    def gru_cell_run(self, embedding_sentence, gru_cell, forward=True):
        '''
        input：embedding_sentence: [batch_size, sentence_len, embed_size]
        output：list of h_t,每个h_t的shape: [batch_size, hidden_size]
        '''
        embedding_sentence_split = tf.split(embedding_sentence, self.sentence_len, axis=1) #a list, each element is [batch_size,1,embed_size]
        embedding_sentence_list = [tf.squeeze(x, axis=1) for x in embedding_sentence_split] #a list, each element is [batch_size,embed_size]
        h_t = tf.ones((self.batch_size, self.hidden_size)) #初始化
        h_t_list = []
        if forward == False:
            embedding_sentence_list.reverse()
        for i in range(self.sentence_len):
            h_t = gru_cell(embedding_sentence_list[i], h_t)
            h_t_list.append(h_t)
        if forward == False:
            h_t_list.reverse()
        return h_t_list
    
    def gru_cell_decode(self, X_t, h_t_minus_1, context_vector):
        #reset gate
        r_t = tf.sigmoid(tf.matmul(h_t_minus_1, self.W_r_decode) + tf.matmul(X_t, self.U_r_decode) + self.b_r_decode) #[batch_size, hidden_size * 2]
        #生成候选h_t
        h_t_cand = tf.tanh(tf.matmul(r_t * h_t_minus_1, self.W_c_decode) + tf.matmul(X_t, self.U_c_decode)+ self.b_c_decode) #[batch_size, hidden_size * 2]
        #update gate
        z_t = tf.sigmoid(tf.matmul(h_t_minus_1, self.W_z_decode) + tf.matmul(X_t, self.U_z_decode) + self.b_z_decode) #[batch_size, hidden_size * 2]
        #生成新的h_t
        h_t = (1 - z_t) * h_t_minus_1 + z_t * h_t_cand #[batch_size, hidden_size * 2]
        #print(h_t.get_shape())
        #print(h_t_minus_1.get_shape())
        return h_t, h_t
        #return h_t_minus_1, h_t_minus_1
        
        
    def init_weight(self):
        '''定义所有weights'''
        #单词的embedding和label的embedding，其中label的embedding负责将decoder_input做embedding
        self.Embedding = tf.get_variable('Embedding', [self.vocab_size, self.embed_size], tf.float32, self.initializer)
        self.Embedding_label = tf.get_variable('Embedding_label', [self.num_classes, self.embed_size], tf.float32, self.initializer)
        #将encoder的第一个output映射为decoder的初始state
        self.W_init_state = tf.get_variable('W_init_state', [self.hidden_size, self.hidden_size * 2], tf.float32, self.initializer)
        self.b_init_state = tf.get_variable('b_init_state', [self.hidden_size * 2], tf.float32, self.initializer)
        #将decoder的outputs映射为logits
        self.W_logits = tf.get_variable('W_logits', [self.hidden_size * 2, self.num_classes], tf.float32, self.initializer)
        self.b_logits = tf.get_variable('b_logits', [self.num_classes], tf.float32, self.initializer)
        #encoder的gru门所需参数
            #reset gate
        self.W_r = tf.get_variable('W_r', [self.hidden_size, self.hidden_size], tf.float32, self.initializer)
        self.U_r = tf.get_variable('U_r', [self.embed_size, self.hidden_size], tf.float32, self.initializer)
        self.b_r = tf.get_variable('b_r', [self.hidden_size], tf.float32, self.initializer)
            #生成候选h_t
        self.W_c = tf.get_variable('W_c', [self.hidden_size, self.hidden_size], tf.float32, self.initializer)
        self.U_c = tf.get_variable('U_c', [self.embed_size, self.hidden_size], tf.float32, self.initializer)
        self.b_c = tf.get_variable('b_c', [self.hidden_size], tf.float32, self.initializer)
            #update gate
        self.W_z = tf.get_variable('W_z', [self.hidden_size, self.hidden_size], tf.float32, self.initializer)
        self.U_z = tf.get_variable('U_z', [self.embed_size, self.hidden_size], tf.float32, self.initializer)
        self.b_z = tf.get_variable('b_z', [self.hidden_size], tf.float32, self.initializer)
        #dncoder的gru门所需参数
            #reset gate
        self.W_r_decode = tf.get_variable('W_r_decode', [self.hidden_size * 2, self.hidden_size * 2], tf.float32, self.initializer)
        self.U_r_decode = tf.get_variable('U_r_decode', [self.embed_size, self.hidden_size * 2], tf.float32, self.initializer)
        self.C_r_decode = tf.get_variable('C_r_decode', [self.hidden_size * 2, self.hidden_size * 2], tf.float32, self.initializer)
        self.b_r_decode = tf.get_variable('b_r_decode', [self.hidden_size * 2], tf.float32, self.initializer)
            #生成候选h_t
        self.W_c_decode = tf.get_variable('W_c_decode', [self.hidden_size * 2, self.hidden_size * 2], tf.float32, self.initializer)
        self.U_c_decode = tf.get_variable('U_c_decode', [self.embed_size, self.hidden_size * 2], tf.float32, self.initializer)
        self.C_c_decode = tf.get_variable('C_c_decode', [self.hidden_size * 2, self.hidden_size * 2], tf.float32, self.initializer)
        self.b_c_decode = tf.get_variable('b_c_decode', [self.hidden_size * 2], tf.float32, self.initializer)
            #update gate
        self.W_z_decode = tf.get_variable('W_z_decode', [self.hidden_size * 2, self.hidden_size * 2], tf.float32, self.initializer)
        self.U_z_decode = tf.get_variable('U_z_decode', [self.embed_size, self.hidden_size * 2], tf.float32, self.initializer)
        self.C_z_decode = tf.get_variable('C_z_decode', [self.hidden_size * 2, self.hidden_size * 2], tf.float32, self.initializer)
        self.b_z_decode = tf.get_variable('b_z_decode', [self.hidden_size * 2], tf.float32, self.initializer)

#### 测试

In [10]:
import copy
def test():
    # below is a function test; if you use this for text classifiction, you need to tranform sentence to indices of vocabulary first. then feed data to the graph.
    num_classes = 9+2 #additional two classes:one is for _GO, another is for _END
    learning_rate = 0.0001
    batch_size = 1
    decay_steps = 1000
    decay_rate = 0.9
    sequence_length = 5
    vocab_size = 300
    embed_size = 100 #100
    hidden_size = 100
    is_training = True
    dropout_keep_prob = 1  # 0.5 #num_sentences
    decoder_sent_length=6
    l2_lambda=0.0001
    model = Seq2Seq_with_attention(num_classes, batch_size, sequence_length, 
                 embed_size, vocab_size, hidden_size, learning_rate, 
                 decay_steps, decay_rate, is_training, decoder_sent_length)
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for i in range(1500):
            #input_x = np.zeros((batch_size, sequence_length),dtype=np.int32) #[None, self.sequence_length]
            label_list=get_unique_labels()
            input_x = np.array([label_list],dtype=np.int32) #[2,3,4,5,6]
            label_list_original=copy.deepcopy(label_list)
            label_list.reverse()
            decoder_input=np.array([[0]+label_list],dtype=np.int32) #[[0,2,3,4,5,6]]
            input_y_label=np.array([label_list+[1]],dtype=np.int32) #[[2,3,4,5,6,1]]
            loss, predict, _ = sess.run([model.loss_val, model.predictions, model.train_op],
                                        feed_dict={model.input_x:input_x,model.decoder_input:decoder_input, model.input_y: input_y_label})
            print(i,"loss:", loss, "label_list_original as input x:",label_list_original,";input_y_label:", input_y_label, "prediction:", predict)
            
import random
def get_unique_labels():
    x=[2,3,4,5,6]
    random.shuffle(x)
    return x

tf.reset_default_graph()
test()

0 loss: 16.829098 label_list_original as input x: [6, 3, 2, 4, 5] ;input_y_label: [[5 4 2 3 6 1]] prediction: [[ 7  3  3 10  7 10]]
1 loss: 16.98711 label_list_original as input x: [4, 2, 6, 5, 3] ;input_y_label: [[3 5 6 2 4 1]] prediction: [[ 7  7  3 10 10 10]]
2 loss: 17.034191 label_list_original as input x: [2, 6, 5, 4, 3] ;input_y_label: [[3 4 5 6 2 1]] prediction: [[ 7  3  3  3  7 10]]
3 loss: 17.53306 label_list_original as input x: [5, 4, 2, 3, 6] ;input_y_label: [[6 3 2 4 5 1]] prediction: [[ 7  7 10 10 10 10]]
4 loss: 17.095333 label_list_original as input x: [5, 2, 3, 6, 4] ;input_y_label: [[4 6 3 2 5 1]] prediction: [[ 7  3  7 10 10 10]]
5 loss: 16.707285 label_list_original as input x: [4, 2, 3, 6, 5] ;input_y_label: [[5 6 3 2 4 1]] prediction: [[ 7  3  3  3 10 10]]
6 loss: 16.595322 label_list_original as input x: [4, 2, 5, 6, 3] ;input_y_label: [[3 6 5 2 4 1]] prediction: [[ 3  3  7 10 10 10]]
7 loss: 16.501522 label_list_original as input x: [2, 4, 3, 5, 6] ;input_y_lab

101 loss: 11.333114 label_list_original as input x: [6, 5, 4, 3, 2] ;input_y_label: [[2 3 4 5 6 1]] prediction: [[5 4 2 2 1 1]]
102 loss: 10.997601 label_list_original as input x: [3, 6, 4, 5, 2] ;input_y_label: [[2 5 4 6 3 1]] prediction: [[5 4 4 3 1 1]]
103 loss: 11.406214 label_list_original as input x: [5, 3, 2, 6, 4] ;input_y_label: [[4 6 2 3 5 1]] prediction: [[5 2 4 4 1 1]]
104 loss: 11.393845 label_list_original as input x: [2, 5, 4, 3, 6] ;input_y_label: [[6 3 4 5 2 1]] prediction: [[5 5 5 2 1 1]]
105 loss: 11.154627 label_list_original as input x: [6, 5, 2, 4, 3] ;input_y_label: [[3 4 2 5 6 1]] prediction: [[5 5 2 4 1 1]]
106 loss: 10.929858 label_list_original as input x: [3, 5, 6, 4, 2] ;input_y_label: [[2 4 6 5 3 1]] prediction: [[5 4 2 4 1 1]]
107 loss: 10.700422 label_list_original as input x: [3, 4, 5, 6, 2] ;input_y_label: [[2 6 5 4 3 1]] prediction: [[5 4 4 4 1 1]]
108 loss: 10.562973 label_list_original as input x: [3, 6, 5, 4, 2] ;input_y_label: [[2 4 5 6 3 1]] pred

203 loss: 9.620945 label_list_original as input x: [3, 4, 5, 6, 2] ;input_y_label: [[2 6 5 4 3 1]] prediction: [[3 4 5 4 1 1]]
204 loss: 10.3951 label_list_original as input x: [6, 2, 3, 4, 5] ;input_y_label: [[5 4 3 2 6 1]] prediction: [[3 2 2 6 6 1]]
205 loss: 10.1506195 label_list_original as input x: [2, 3, 5, 4, 6] ;input_y_label: [[6 4 5 3 2 1]] prediction: [[3 4 5 6 6 1]]
206 loss: 10.100275 label_list_original as input x: [5, 4, 3, 2, 6] ;input_y_label: [[6 2 3 4 5 1]] prediction: [[3 2 5 5 5 1]]
207 loss: 9.965739 label_list_original as input x: [4, 3, 6, 5, 2] ;input_y_label: [[2 5 6 3 4 1]] prediction: [[3 4 4 4 1 1]]
208 loss: 9.83842 label_list_original as input x: [2, 4, 6, 5, 3] ;input_y_label: [[3 5 6 4 2 1]] prediction: [[3 6 6 4 1 1]]
209 loss: 10.042435 label_list_original as input x: [4, 6, 2, 3, 5] ;input_y_label: [[5 3 2 6 4 1]] prediction: [[3 2 2 5 4 1]]
210 loss: 10.515951 label_list_original as input x: [2, 5, 3, 6, 4] ;input_y_label: [[4 6 3 5 2 1]] predictio

307 loss: 9.348182 label_list_original as input x: [2, 5, 4, 6, 3] ;input_y_label: [[3 6 4 5 2 1]] prediction: [[5 5 5 2 2 1]]
308 loss: 9.518299 label_list_original as input x: [6, 5, 2, 3, 4] ;input_y_label: [[4 3 2 5 6 1]] prediction: [[5 5 2 6 6 1]]
309 loss: 9.159875 label_list_original as input x: [2, 6, 4, 3, 5] ;input_y_label: [[5 3 4 6 2 1]] prediction: [[5 6 6 2 2 1]]
310 loss: 9.092233 label_list_original as input x: [4, 6, 2, 5, 3] ;input_y_label: [[3 5 2 6 4 1]] prediction: [[5 5 6 6 4 1]]
311 loss: 9.63146 label_list_original as input x: [2, 5, 3, 4, 6] ;input_y_label: [[6 4 3 5 2 1]] prediction: [[5 4 5 2 2 1]]
312 loss: 9.704812 label_list_original as input x: [4, 3, 2, 5, 6] ;input_y_label: [[6 5 2 3 4 1]] prediction: [[5 5 4 4 2 1]]
313 loss: 9.564499 label_list_original as input x: [6, 5, 4, 2, 3] ;input_y_label: [[3 2 4 5 6 1]] prediction: [[5 5 5 2 6 1]]
314 loss: 9.647152 label_list_original as input x: [6, 5, 3, 4, 2] ;input_y_label: [[2 4 3 5 6 1]] prediction: [

410 loss: 9.001268 label_list_original as input x: [5, 3, 6, 2, 4] ;input_y_label: [[4 2 6 3 5 1]] prediction: [[6 6 6 3 5 1]]
411 loss: 8.86809 label_list_original as input x: [3, 4, 6, 5, 2] ;input_y_label: [[2 5 6 4 3 1]] prediction: [[4 6 6 3 3 1]]
412 loss: 8.737765 label_list_original as input x: [3, 2, 4, 5, 6] ;input_y_label: [[6 5 4 2 3 1]] prediction: [[4 4 3 3 3 1]]
413 loss: 9.070168 label_list_original as input x: [4, 6, 5, 2, 3] ;input_y_label: [[3 2 5 6 4 1]] prediction: [[3 6 6 6 3 1]]
414 loss: 9.268518 label_list_original as input x: [6, 2, 5, 4, 3] ;input_y_label: [[3 4 5 2 6 1]] prediction: [[3 6 2 6 3 1]]
415 loss: 9.575106 label_list_original as input x: [6, 3, 4, 2, 5] ;input_y_label: [[5 2 4 3 6 1]] prediction: [[5 6 3 3 2 1]]
416 loss: 8.753754 label_list_original as input x: [5, 3, 2, 4, 6] ;input_y_label: [[6 4 2 3 5 1]] prediction: [[6 4 2 3 5 1]]
417 loss: 8.784008 label_list_original as input x: [2, 4, 6, 3, 5] ;input_y_label: [[5 3 6 4 2 1]] prediction: [

513 loss: 8.231101 label_list_original as input x: [4, 6, 5, 2, 3] ;input_y_label: [[3 2 5 6 4 1]] prediction: [[2 5 5 6 4 1]]
514 loss: 8.736057 label_list_original as input x: [6, 2, 4, 5, 3] ;input_y_label: [[3 5 4 2 6 1]] prediction: [[2 5 6 6 6 1]]
515 loss: 8.923291 label_list_original as input x: [5, 3, 2, 6, 4] ;input_y_label: [[4 6 2 3 5 1]] prediction: [[2 2 3 3 5 1]]
516 loss: 9.187911 label_list_original as input x: [4, 3, 5, 2, 6] ;input_y_label: [[6 2 5 3 4 1]] prediction: [[2 5 5 3 6 1]]
517 loss: 7.7687135 label_list_original as input x: [4, 3, 6, 5, 2] ;input_y_label: [[2 5 6 3 4 1]] prediction: [[2 5 6 3 4 1]]
518 loss: 9.037735 label_list_original as input x: [4, 5, 2, 3, 6] ;input_y_label: [[6 3 2 5 4 1]] prediction: [[2 5 5 5 4 1]]
519 loss: 8.103937 label_list_original as input x: [3, 6, 2, 5, 4] ;input_y_label: [[4 5 2 6 3 1]] prediction: [[2 5 6 6 3 1]]
520 loss: 7.82983 label_list_original as input x: [3, 4, 6, 5, 2] ;input_y_label: [[2 5 6 4 3 1]] prediction: 

615 loss: 8.9083605 label_list_original as input x: [4, 5, 3, 2, 6] ;input_y_label: [[6 2 3 5 4 1]] prediction: [[2 5 5 5 6 1]]
616 loss: 8.971346 label_list_original as input x: [4, 2, 5, 3, 6] ;input_y_label: [[6 3 5 2 4 1]] prediction: [[5 5 5 2 6 1]]
617 loss: 8.277754 label_list_original as input x: [5, 6, 4, 2, 3] ;input_y_label: [[3 2 4 6 5 1]] prediction: [[2 2 5 5 5 1]]
618 loss: 7.666077 label_list_original as input x: [6, 4, 5, 2, 3] ;input_y_label: [[3 2 5 4 6 1]] prediction: [[2 5 5 6 6 1]]
619 loss: 8.314017 label_list_original as input x: [4, 3, 2, 6, 5] ;input_y_label: [[5 6 2 3 4 1]] prediction: [[5 2 2 3 5 1]]
620 loss: 7.986639 label_list_original as input x: [3, 5, 6, 2, 4] ;input_y_label: [[4 2 6 5 3 1]] prediction: [[4 2 5 5 3 1]]
621 loss: 7.736133 label_list_original as input x: [3, 2, 4, 6, 5] ;input_y_label: [[5 6 4 2 3 1]] prediction: [[4 4 4 2 3 1]]
622 loss: 7.9207172 label_list_original as input x: [5, 3, 2, 4, 6] ;input_y_label: [[6 4 2 3 5 1]] prediction

708 loss: 6.9660316 label_list_original as input x: [3, 2, 5, 6, 4] ;input_y_label: [[4 6 5 2 3 1]] prediction: [[4 6 2 2 3 1]]
709 loss: 7.343308 label_list_original as input x: [2, 3, 4, 5, 6] ;input_y_label: [[6 5 4 3 2 1]] prediction: [[4 4 4 3 2 1]]
710 loss: 7.5960298 label_list_original as input x: [5, 2, 4, 3, 6] ;input_y_label: [[6 3 4 2 5 1]] prediction: [[6 4 2 2 5 1]]
711 loss: 7.5512238 label_list_original as input x: [3, 5, 4, 2, 6] ;input_y_label: [[6 2 4 5 3 1]] prediction: [[4 4 5 5 3 1]]
712 loss: 7.4875717 label_list_original as input x: [2, 4, 3, 6, 5] ;input_y_label: [[5 6 3 4 2 1]] prediction: [[3 6 3 2 2 1]]
713 loss: 7.164849 label_list_original as input x: [3, 4, 5, 2, 6] ;input_y_label: [[6 2 5 4 3 1]] prediction: [[4 4 5 4 3 1]]
714 loss: 7.407901 label_list_original as input x: [2, 3, 6, 4, 5] ;input_y_label: [[5 4 6 3 2 1]] prediction: [[4 6 6 3 2 1]]
715 loss: 7.159929 label_list_original as input x: [5, 3, 2, 4, 6] ;input_y_label: [[6 4 2 3 5 1]] predicti

809 loss: 7.1108646 label_list_original as input x: [4, 2, 3, 6, 5] ;input_y_label: [[5 6 3 2 4 1]] prediction: [[6 6 3 2 4 1]]
810 loss: 7.077905 label_list_original as input x: [2, 5, 6, 4, 3] ;input_y_label: [[3 4 6 5 2 1]] prediction: [[6 6 6 5 2 1]]
811 loss: 7.1176634 label_list_original as input x: [2, 6, 4, 5, 3] ;input_y_label: [[3 5 4 6 2 1]] prediction: [[3 5 6 6 2 1]]
812 loss: 7.1398697 label_list_original as input x: [2, 6, 5, 3, 4] ;input_y_label: [[4 3 5 6 2 1]] prediction: [[3 6 6 6 2 1]]
813 loss: 6.1919765 label_list_original as input x: [6, 3, 5, 4, 2] ;input_y_label: [[2 4 5 3 6 1]] prediction: [[2 5 5 6 6 1]]
814 loss: 6.507755 label_list_original as input x: [5, 4, 6, 2, 3] ;input_y_label: [[3 2 6 4 5 1]] prediction: [[2 2 6 5 5 1]]
815 loss: 6.840268 label_list_original as input x: [2, 5, 6, 4, 3] ;input_y_label: [[3 4 6 5 2 1]] prediction: [[3 6 6 5 2 1]]
816 loss: 6.1538095 label_list_original as input x: [4, 5, 2, 6, 3] ;input_y_label: [[3 6 2 5 4 1]] predict

910 loss: 6.118777 label_list_original as input x: [4, 2, 6, 3, 5] ;input_y_label: [[5 3 6 2 4 1]] prediction: [[3 6 6 2 4 1]]
911 loss: 6.2769537 label_list_original as input x: [6, 2, 5, 3, 4] ;input_y_label: [[4 3 5 2 6 1]] prediction: [[5 5 5 2 6 1]]
912 loss: 6.173663 label_list_original as input x: [6, 5, 2, 3, 4] ;input_y_label: [[4 3 2 5 6 1]] prediction: [[4 5 2 5 6 1]]
913 loss: 6.54642 label_list_original as input x: [2, 5, 3, 4, 6] ;input_y_label: [[6 4 3 5 2 1]] prediction: [[6 4 5 2 2 1]]
914 loss: 5.998345 label_list_original as input x: [4, 5, 3, 6, 2] ;input_y_label: [[2 6 3 5 4 1]] prediction: [[2 6 3 4 4 1]]
915 loss: 5.725356 label_list_original as input x: [6, 3, 5, 4, 2] ;input_y_label: [[2 4 5 3 6 1]] prediction: [[2 5 5 3 6 1]]
916 loss: 5.9125395 label_list_original as input x: [3, 4, 5, 6, 2] ;input_y_label: [[2 6 5 4 3 1]] prediction: [[2 5 5 4 3 1]]
917 loss: 5.3904934 label_list_original as input x: [3, 6, 5, 2, 4] ;input_y_label: [[4 2 5 6 3 1]] prediction

1007 loss: 4.6150537 label_list_original as input x: [6, 3, 2, 5, 4] ;input_y_label: [[4 5 2 3 6 1]] prediction: [[5 5 2 3 6 1]]
1008 loss: 5.2841415 label_list_original as input x: [3, 6, 4, 2, 5] ;input_y_label: [[5 2 4 6 3 1]] prediction: [[5 2 4 6 3 1]]
1009 loss: 5.2285757 label_list_original as input x: [2, 3, 6, 4, 5] ;input_y_label: [[5 4 6 3 2 1]] prediction: [[5 6 6 3 2 1]]
1010 loss: 5.0070806 label_list_original as input x: [4, 2, 5, 6, 3] ;input_y_label: [[3 6 5 2 4 1]] prediction: [[3 6 5 2 4 1]]
1011 loss: 5.628211 label_list_original as input x: [5, 3, 6, 2, 4] ;input_y_label: [[4 2 6 3 5 1]] prediction: [[2 2 6 3 5 1]]
1012 loss: 4.8755884 label_list_original as input x: [3, 6, 2, 4, 5] ;input_y_label: [[5 4 2 6 3 1]] prediction: [[5 4 2 6 3 1]]
1013 loss: 5.019257 label_list_original as input x: [5, 2, 4, 3, 6] ;input_y_label: [[6 3 4 2 5 1]] prediction: [[6 3 4 2 5 1]]
1014 loss: 5.339367 label_list_original as input x: [2, 5, 4, 6, 3] ;input_y_label: [[3 6 4 5 2 1]]

1108 loss: 5.0969276 label_list_original as input x: [2, 4, 3, 6, 5] ;input_y_label: [[5 6 3 4 2 1]] prediction: [[3 6 3 4 2 1]]
1109 loss: 4.5758924 label_list_original as input x: [3, 6, 2, 4, 5] ;input_y_label: [[5 4 2 6 3 1]] prediction: [[4 4 2 6 3 1]]
1110 loss: 4.5740304 label_list_original as input x: [5, 3, 4, 2, 6] ;input_y_label: [[6 2 4 3 5 1]] prediction: [[6 2 4 3 5 1]]
1111 loss: 5.0715146 label_list_original as input x: [2, 6, 5, 3, 4] ;input_y_label: [[4 3 5 6 2 1]] prediction: [[3 3 5 6 2 1]]
1112 loss: 4.2397556 label_list_original as input x: [5, 4, 3, 6, 2] ;input_y_label: [[2 6 3 4 5 1]] prediction: [[2 6 3 4 5 1]]
1113 loss: 4.480014 label_list_original as input x: [2, 4, 5, 6, 3] ;input_y_label: [[3 6 5 4 2 1]] prediction: [[3 6 5 4 2 1]]
1114 loss: 4.000188 label_list_original as input x: [3, 5, 2, 6, 4] ;input_y_label: [[4 6 2 5 3 1]] prediction: [[6 6 2 5 3 1]]
1115 loss: 4.1744823 label_list_original as input x: [3, 6, 5, 2, 4] ;input_y_label: [[4 2 5 6 3 1]

1210 loss: 4.4678235 label_list_original as input x: [6, 5, 3, 4, 2] ;input_y_label: [[2 4 3 5 6 1]] prediction: [[2 4 3 5 6 1]]
1211 loss: 3.7991064 label_list_original as input x: [2, 6, 5, 4, 3] ;input_y_label: [[3 4 5 6 2 1]] prediction: [[3 4 5 6 2 1]]
1212 loss: 3.6879013 label_list_original as input x: [6, 3, 4, 2, 5] ;input_y_label: [[5 2 4 3 6 1]] prediction: [[5 2 4 3 6 1]]
1213 loss: 4.2777767 label_list_original as input x: [2, 3, 4, 5, 6] ;input_y_label: [[6 5 4 3 2 1]] prediction: [[6 4 4 3 2 1]]
1214 loss: 4.2854805 label_list_original as input x: [5, 6, 3, 2, 4] ;input_y_label: [[4 2 3 6 5 1]] prediction: [[2 2 3 6 5 1]]
1215 loss: 4.4033427 label_list_original as input x: [4, 3, 6, 2, 5] ;input_y_label: [[5 2 6 3 4 1]] prediction: [[2 2 6 3 4 1]]
1216 loss: 3.8913016 label_list_original as input x: [3, 6, 4, 5, 2] ;input_y_label: [[2 5 4 6 3 1]] prediction: [[2 5 4 6 3 1]]
1217 loss: 4.0361423 label_list_original as input x: [4, 6, 2, 5, 3] ;input_y_label: [[3 5 2 6 4 

1310 loss: 3.1712751 label_list_original as input x: [4, 5, 3, 6, 2] ;input_y_label: [[2 6 3 5 4 1]] prediction: [[2 6 3 5 4 1]]
1311 loss: 3.2243946 label_list_original as input x: [6, 3, 2, 4, 5] ;input_y_label: [[5 4 2 3 6 1]] prediction: [[5 4 2 3 6 1]]
1312 loss: 3.5163693 label_list_original as input x: [5, 3, 6, 4, 2] ;input_y_label: [[2 4 6 3 5 1]] prediction: [[2 4 6 3 5 1]]
1313 loss: 3.3127136 label_list_original as input x: [4, 5, 2, 3, 6] ;input_y_label: [[6 3 2 5 4 1]] prediction: [[6 3 2 5 4 1]]
1314 loss: 3.2280824 label_list_original as input x: [5, 6, 2, 4, 3] ;input_y_label: [[3 4 2 6 5 1]] prediction: [[3 4 2 6 5 1]]
1315 loss: 4.1063213 label_list_original as input x: [2, 5, 3, 6, 4] ;input_y_label: [[4 6 3 5 2 1]] prediction: [[6 6 3 5 2 1]]
1316 loss: 3.3476648 label_list_original as input x: [3, 6, 4, 5, 2] ;input_y_label: [[2 5 4 6 3 1]] prediction: [[2 5 4 6 3 1]]
1317 loss: 4.2024508 label_list_original as input x: [3, 4, 6, 2, 5] ;input_y_label: [[5 2 6 4 3 

1411 loss: 3.264902 label_list_original as input x: [6, 5, 2, 3, 4] ;input_y_label: [[4 3 2 5 6 1]] prediction: [[4 3 2 5 6 1]]
1412 loss: 3.124485 label_list_original as input x: [6, 3, 4, 2, 5] ;input_y_label: [[5 2 4 3 6 1]] prediction: [[5 2 4 3 6 1]]
1413 loss: 2.9499183 label_list_original as input x: [4, 2, 6, 5, 3] ;input_y_label: [[3 5 6 2 4 1]] prediction: [[3 5 6 2 4 1]]
1414 loss: 2.9860272 label_list_original as input x: [4, 2, 5, 6, 3] ;input_y_label: [[3 6 5 2 4 1]] prediction: [[3 6 5 2 4 1]]
1415 loss: 3.1669226 label_list_original as input x: [5, 3, 4, 6, 2] ;input_y_label: [[2 6 4 3 5 1]] prediction: [[2 6 4 3 5 1]]
1416 loss: 3.272561 label_list_original as input x: [5, 6, 3, 4, 2] ;input_y_label: [[2 4 3 6 5 1]] prediction: [[2 4 3 6 5 1]]
1417 loss: 2.9991589 label_list_original as input x: [5, 4, 6, 2, 3] ;input_y_label: [[3 2 6 4 5 1]] prediction: [[3 2 6 4 5 1]]
1418 loss: 3.2087045 label_list_original as input x: [2, 3, 4, 5, 6] ;input_y_label: [[6 5 4 3 2 1]]