In [1]:
import os
import sys
import tensorflow as tf
from tensorflow.io import gfile
import tensorflow.compat.v1.logging as logging
import pprint
import pickle
import numpy as np
import cv2 as cv
from tensorflow import keras
import time
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

In [4]:
# # 设置内存自增长
# tf.debugging.set_log_device_placement(True)
# gpus = tf.config.experimental.list_physical_devices('GPU')
# # 打印物理GPU有几个，就是电脑实际装的个数
# print(len(gpus))
# for gpu in gpus:
#     # 设置 GPU 所占用内存自动增长
#     tf.config.experimental.set_memory_growth(gpu, True)

In [5]:
# Caption file; parse_token_file reads each line as "<image name>#<n>\t<caption>".
input_description_file = "../dataset/results_20130124.token"
# Directory whose files are loaded by ImageCaptionData as pickled image features.
input_img_feature_dir = "../dataset/feature_extraction_inception_v3"
# Vocabulary file; Vocab reads each line as "<word>\t<count>".
input_vocab_file = "../dataset/vocab.txt"

<div  align="center">    
    <img src='../image/词频.png' width="600" height="200">
</div>

In [6]:
class Vocab(object):
    """Two-way mapping between words and integer ids, loaded from a vocab file.

    Each line of the file is ``<word>\\t<count>``. Ids are assigned in file
    order, starting at 0, to every word that is kept.
    """

    # Words with a special role, mapped to the attribute that stores their id.
    _SPECIAL_TOKEN_FIELDS = {'<UNK>': '_unk', '<s>': '_start', '.': '_eos'}

    def __init__(self, filename, word_num_threshold):
        """
        Args:
            filename: path of the vocabulary file.
            word_num_threshold: words whose count is below this value are
                dropped (``<UNK>`` is always kept).
        """
        # Minimum count a word needs in order to be kept.
        self._word_num_threshold = word_num_threshold
        # Encoding table: word -> id.
        self._word_to_id = {}
        # Decoding table: id -> word.
        self._id_to_word = {}
        # Ids of the unknown / start / end-of-sentence markers; -1 until seen.
        self._unk = -1
        self._start = -1
        self._eos = -1
        self._read_dict(filename)

    def _read_dict(self, filename):
        """Fill both lookup tables from the vocabulary file."""
        with gfile.GFile(filename, 'r') as vocab_file:
            entries = vocab_file.readlines()
        for entry in entries:
            token, count = entry.strip('\r\n').split('\t')
            count = int(count)
            # Keep <UNK> unconditionally; keep other words only if frequent enough.
            keep = token == '<UNK>' or count >= self._word_num_threshold
            if not keep:
                continue
            # The next free id equals the number of words stored so far.
            next_id = len(self._id_to_word)
            field = self._SPECIAL_TOKEN_FIELDS.get(token)
            if field is not None:
                setattr(self, field, next_id)
            if next_id in self._id_to_word or token in self._word_to_id:
                raise Exception("重复添加！！！")
            self._word_to_id[token] = next_id
            self._id_to_word[next_id] = token

    @property
    def unk(self):
        """Id of ``<UNK>`` (-1 if the file did not contain it)."""
        return self._unk

    @property
    def start(self):
        """Id of ``<s>`` (-1 if it was not kept)."""
        return self._start

    @property
    def eos(self):
        """Id of ``.``, used as the end-of-sentence marker (-1 if not kept)."""
        return self._eos

    def word_to_id(self, word):
        """Return the id of ``word``, or the ``<UNK>`` id for unknown words."""
        return self._word_to_id.get(word, self.unk)

    def id_to_word(self, cur_id):
        """Return the word for ``cur_id``, or ``'<UNK>'`` for unknown ids."""
        return self._id_to_word.get(cur_id, '<UNK>')

    def size(self):
        """Number of words in the vocabulary."""
        return len(self._word_to_id)

    def encode(self, sentence):
        """Split ``sentence`` on single spaces and map every piece to its id."""
        return list(map(self.word_to_id, sentence.split(' ')))

    def decode(self, sentence_id):
        """Map a sequence of ids back to words joined by single spaces."""
        return ' '.join(self.id_to_word(token_id) for token_id in sentence_id)

In [7]:
# Build the image-name -> captions lookup from the raw caption file.
def parse_token_file(token_file):
    """Group captions by image name.

    Every line of ``token_file`` must look like ``<image name>#<n>\\t<caption>``.

    Returns:
        dict mapping each image name to the list of its captions, in file order.
    """
    with gfile.GFile(token_file, 'r') as token_stream:
        raw_lines = token_stream.readlines()
    captions_by_image = {}
    for raw_line in raw_lines:
        img_id, description = raw_line.strip('\r\n').split('\t')
        # The part after '#' is not used.
        img_name, _ = img_id.split('#')
        captions_by_image.setdefault(img_name, []).append(description)
    return captions_by_image

In [8]:
# Encode every caption of every image into a list of word ids.
def convert_token_to_id(img_name_to_tokens, vocab):
    """Return ``{image name: [ids of caption 0, ids of caption 1, ...]}``.

    Args:
        img_name_to_tokens: dict mapping an image name to its caption strings.
        vocab: object whose ``encode(sentence)`` returns the ids of a caption.
    """
    return {
        img_name: [vocab.encode(description) for description in descriptions]
        for img_name, descriptions in img_name_to_tokens.items()
    }

In [9]:
# Words that occur fewer than 4 times are left out of the vocabulary, so they encode to <UNK>.
vocab = Vocab(input_vocab_file, 4)
vocab_size = vocab.size()
logging.info("vocab_size: %d" % vocab_size)

# Read the raw captions, then encode each caption into word ids.
img_name_to_tokens = parse_token_file(input_description_file)
img_name_to_token_ids = convert_token_to_id(img_name_to_tokens, vocab)

# Sanity check: image count, the first 10 image names and the captions of one image,
# first as text and then as ids.
logging.info("num of all images: %d" % len(img_name_to_tokens))
pprint.pprint(list(img_name_to_tokens.keys())[0:10])
pprint.pprint(img_name_to_tokens['2778832101.jpg'])
logging.info("num of all images: %d" % len(img_name_to_token_ids))
pprint.pprint(list(img_name_to_token_ids.keys())[0:10])
pprint.pprint(img_name_to_token_ids['2778832101.jpg'])

INFO:tensorflow:vocab_size: 9223
INFO:tensorflow:num of all images: 31783
['1000092795.jpg',
 '10002456.jpg',
 '1000268201.jpg',
 '1000344755.jpg',
 '1000366164.jpg',
 '1000523639.jpg',
 '1000919630.jpg',
 '10010052.jpg',
 '1001465944.jpg',
 '1001545525.jpg']
['A man in jeans is reclining on a green metal bench along a busy sidewalk and '
 'crowded street .',
 'A white male with a blue sweater and gray pants laying on a sidewalk bench .',
 'A man in a blue shirt and gray pants is sleeping on a sidewalk bench .',
 'A person is sleeping on a bench , next to cars .',
 'A man sleeping on a bench in a city area .']
INFO:tensorflow:num of all images: 31783
['1000092795.jpg',
 '10002456.jpg',
 '1000268201.jpg',
 '1000344755.jpg',
 '1000366164.jpg',
 '1000523639.jpg',
 '1000919630.jpg',
 '10010052.jpg',
 '1001465944.jpg',
 '1001545525.jpg']
[[4, 10, 5, 133, 9, 3533, 7, 2, 49, 338, 147, 140, 2, 245, 94, 8, 381, 37, 3],
 [4, 21, 180, 12, 2, 27, 285, 8, 121, 129, 298, 7, 2, 94, 147, 3],
 [4, 10, 

<div  align="center">    
    <img src='../image/编码.png' width="600" height="200">
</div>

产生批量数据

In [None]:
class ImageCaptionData(object):
    """Serve batches of image features together with fixed-length caption ids.

    All files found in ``img_feature_dir`` are loaded in the constructor; each
    one is unpickled as a ``(filenames, features)`` pair. Captions are looked up
    by file name in ``img_name_to_token_ids`` and the first caption of every
    image is used.
    """

    # Ids used only when the vocabulary reports a marker as missing (-1).
    # They are the values this class used to hard-code.
    _FALLBACK_START_ID = 1
    _FALLBACK_EOS_ID = 3

    def __init__(self,
                 img_name_to_token_ids,
                 img_feature_dir,
                 num_timesteps,
                 vocab,
                 deterministic = False):
        """
        Args:
            img_name_to_token_ids: dict, image name -> list of captions, each
                caption being a list of word ids. It is never modified.
            img_feature_dir: directory holding the pickled feature files.
            num_timesteps: captions are truncated / padded to this many tokens
                (the start and end markers are added on top of it).
            vocab: vocabulary object providing the ``start`` and ``eos`` ids.
            deterministic: if True the data is never shuffled; by default it is
                shuffled once after loading and again every time ``next`` wraps
                around.
        """
        self._vocab = vocab
        self._all_img_feature_filepaths = [
            os.path.join(img_feature_dir, filename)
            for filename in gfile.listdir(img_feature_dir)
        ]
        pprint.pprint(self._all_img_feature_filepaths)

        self._img_name_to_token_ids = img_name_to_token_ids
        # Fixed caption length (without the start / end markers).
        self._num_timesteps = num_timesteps
        # Index of the first sample of the next batch.
        self._indicator = 0
        # True disables shuffling.
        self._deterministic = deterministic

        self._img_feature_filenames = []
        self._img_feature_data = []
        self._load_img_feature_pickle()
        # Shuffle exactly once here; loading itself no longer shuffles.
        if not self._deterministic:
            self._random_shuffle()

    def _load_img_feature_pickle(self):
        """Load every feature file and stack the features into one array."""
        for filepath in self._all_img_feature_filepaths:
            with gfile.GFile(filepath, 'rb') as f:
                # SECURITY: pickle.load executes arbitrary code from the file;
                # only point img_feature_dir at feature files you produced yourself.
                filenames, features = pickle.load(f, encoding='iso-8859-1')
                self._img_feature_filenames += filenames
                self._img_feature_data.append(features)
        if not self._img_feature_data:
            raise ValueError("no image feature files found")
        # [features_0, features_1, ...] -> one array with all rows stacked.
        self._img_feature_data = np.vstack(self._img_feature_data)
        self._img_feature_filenames = np.asarray(self._img_feature_filenames)
        print("img_feature_data shape is: ", self._img_feature_data.shape)
        print("img_feature_filenames shape is :", self._img_feature_filenames.shape)

    def size(self):
        """Number of images that were loaded."""
        return len(self._img_feature_filenames)

    def img_feature_size(self):
        """Size of axis 1 of the stacked feature array."""
        return self._img_feature_data.shape[1]

    def _random_shuffle(self):
        """Apply one random permutation to both file names and features."""
        p = np.random.permutation(self.size())
        self._img_feature_filenames = self._img_feature_filenames[p]
        self._img_feature_data = self._img_feature_data[p]

    @staticmethod
    def _resolve_marker(marker_id, fallback):
        """Return ``marker_id`` unless the vocabulary reported it as missing (< 0)."""
        return marker_id if marker_id >= 0 else fallback

    def _img_desc(self, filenames):
        """Build caption ids and loss weights for a batch of image names.

        Every row has ``num_timesteps + 2`` entries: the start id, the caption
        truncated or padded with the eos id to ``num_timesteps`` tokens, and a
        final eos id. The weight is 1 for real caption tokens and 0 for the
        start marker, the padding and the final eos marker.

        Returns:
            ``(batch_sentence_ids, batch_weights)``, two int32 arrays of shape
            ``(len(filenames), num_timesteps + 2)``.
        """
        start_id = self._resolve_marker(getattr(self._vocab, 'start', -1),
                                        self._FALLBACK_START_ID)
        eos_id = self._resolve_marker(getattr(self._vocab, 'eos', -1),
                                      self._FALLBACK_EOS_ID)
        batch_sentence_ids = []
        batch_weights = []
        for filename in filenames:
            # An image has several captions; the first one is used.
            # The slice makes a copy, so the caller's caption list is never
            # modified (the in-place padding used to corrupt it on reuse).
            token_ids = list(self._img_name_to_token_ids[filename][0][:self._num_timesteps])
            weight = [1] * len(token_ids)
            padding = self._num_timesteps - len(token_ids)
            token_ids += [eos_id] * padding
            weight += [0] * padding
            batch_sentence_ids.append([start_id] + token_ids + [eos_id])
            batch_weights.append([0] + weight + [0])
        # Fixed dtype so the result does not depend on the platform's default int.
        batch_sentence_ids = np.asarray(batch_sentence_ids, dtype=np.int32)
        batch_weights = np.asarray(batch_weights, dtype=np.int32)
        return batch_sentence_ids, batch_weights

    def next(self, batch_size):
        """Return the next ``batch_size`` samples.

        When fewer than ``batch_size`` samples are left, the data is reshuffled
        (unless deterministic) and reading restarts from the beginning.

        Returns:
            ``(batch_img_features, batch_sentence_ids, batch_weights,
            batch_img_names)``.
        """
        end_indicator = self._indicator + batch_size
        if end_indicator > self.size():
            if not self._deterministic:
                self._random_shuffle()
            self._indicator = 0
            end_indicator = self._indicator + batch_size
        assert end_indicator <= self.size(), "batch_size is larger than the dataset"

        batch_img_features = self._img_feature_data[self._indicator: end_indicator]
        batch_img_names = self._img_feature_filenames[self._indicator: end_indicator]
        batch_sentence_ids, batch_weights = self._img_desc(batch_img_names)

        self._indicator = end_indicator
        return batch_img_features, batch_sentence_ids, batch_weights, batch_img_names


# Smoke test: build the data provider with 10 timesteps and inspect one batch of 5 samples.
caption_data = ImageCaptionData(img_name_to_token_ids, input_img_feature_dir, 10, vocab)
img_feature_dim = caption_data.img_feature_size()
caption_data_size = caption_data.size()
logging.info("img_feature_dim: %d" % img_feature_dim)
logging.info("caption_data_size: %d" % caption_data_size)

batch_img_features, batch_sentence_ids, batch_weights, batch_img_names = caption_data.next(5)
pprint.pprint(batch_img_features)
pprint.pprint(batch_sentence_ids)
pprint.pprint(batch_weights)
pprint.pprint(batch_img_names)

['../dataset/feature_extraction_inception_v3\\image_features-0.pickle',
 '../dataset/feature_extraction_inception_v3\\image_features-1.pickle',
 '../dataset/feature_extraction_inception_v3\\image_features-10.pickle',
 '../dataset/feature_extraction_inception_v3\\image_features-100.pickle',
 '../dataset/feature_extraction_inception_v3\\image_features-101.pickle',
 '../dataset/feature_extraction_inception_v3\\image_features-102.pickle',
 '../dataset/feature_extraction_inception_v3\\image_features-103.pickle',
 '../dataset/feature_extraction_inception_v3\\image_features-104.pickle',
 '../dataset/feature_extraction_inception_v3\\image_features-105.pickle',
 '../dataset/feature_extraction_inception_v3\\image_features-106.pickle',
 '../dataset/feature_extraction_inception_v3\\image_features-107.pickle',
 '../dataset/feature_extraction_inception_v3\\image_features-108.pickle',
 '../dataset/feature_extraction_inception_v3\\image_features-109.pickle',
 '../dataset/feature_extraction_inception_v

decoder网络的搭建

<div  align="center">    
    <img src='../image/框架图1.png' width="600" height="200">
    <img src='../image/框架图2.png' width="600" height="200">
</div>

In [2]:
# 我们的图像经过特征提取网络出来之后是 7*7*512
# show attention and tell 是 14*14*512， 但是原理一样就好

In [3]:
# Turns the image features into the encoder output that attention works on.
class Encoder(tf.keras.Model):
    """Project image features with one Dense(512) layer.

    ``call`` reshapes its input to rows of 512 values, applies the dense layer
    to every row and regroups the rows as ``(-1, 7*7, 512)``.
    """

    def __init__(self):
        super().__init__()
        self.dense = keras.layers.Dense(512)

    @tf.function
    def call(self, x):
        # One row per feature vector: (-1, 512).
        flat_features = tf.reshape(x, [-1, 512])
        projected = self.dense(flat_features)
        # Regroup the rows into 7*7 positions per sample: (batch_size, 7*7, 512).
        return tf.reshape(projected, [-1, 7*7, 512])


# Smoke test: run the encoder on a random batch; its output is what the attention layer attends over.
inputs = tf.random.normal([32, 7, 7, 512])
encoder = Encoder()
encoder_output = encoder(inputs)
print("encoder_output size is :", encoder_output.shape)

encoder_output size is : (32, 49, 512)


In [4]:
# Additive attention over the positions of the encoder output.
class BahdanauAttention(tf.keras.Model):
    """Score every encoder position against the decoder state and pool them."""

    def __init__(self):
        super().__init__()
        self.W1 = tf.keras.layers.Dense(512)
        self.W2 = tf.keras.layers.Dense(512)
        self.V = tf.keras.layers.Dense(1)

    def call(self, hidden_state, encoder_output):
        """
        Args:
            hidden_state: decoder hidden state, (batch_size, channel_size).
            encoder_output: encoder output, (batch_size, 7*7, channel_size).

        Returns:
            context_vector: (batch_size, channel_size), the attention-weighted
                sum of the encoder positions.
            attention_weights: (batch_size, 7*7, 1), softmax over the positions.
        """
        # Add an axis so the state broadcasts over the positions:
        # (batch_size, 1, channel_size).
        query = tf.expand_dims(hidden_state, 1)

        projected_positions = self.W1(encoder_output)
        projected_query = self.W2(query)
        # score shape == (batch_size, 7*7, 1)
        score = self.V(tf.nn.tanh(projected_positions + projected_query))

        # Normalise over axis 1, i.e. over the 7*7 positions.
        attention_weights = tf.nn.softmax(score, axis=1)

        # Weight every position and sum the positions away.
        context_vector = tf.reduce_sum(attention_weights * encoder_output, axis=1)

        return context_vector, attention_weights
    
# Smoke test: attend over encoder_output with a random hidden state and print the result shapes.
attention = BahdanauAttention()
encoder_hidden = tf.random.normal([32, 512])
attention_result, attention_weights = attention(encoder_hidden, encoder_output)
print("attention_result shape is :", attention_result.shape)
print("attention_weights shape is :", attention_weights.shape)

attention_result shape is : (32, 512)
attention_weights shape is : (32, 49, 1)


In [None]:
class Decoder(tf.keras.Model):
    """One-step caption decoder: attention + embedding + GRU + softmax layer."""

    def __init__(self, vocab_size, embedding_dim, decoding_units, channel_size, batch_size):
        """
        Args:
            vocab_size: vocabulary size; also the width of the output distribution.
            embedding_dim: dimension of the word embeddings.
            decoding_units: number of GRU units.
            channel_size: width of the initial hidden state built by
                ``initialize_hidden_state``.
            batch_size: batch size of the initial hidden state.
        """
        super().__init__()
        self.batch_size = batch_size
        self.channel_size = channel_size
        self.decoding_units = decoding_units
        self.embedding_layer = keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = keras.layers.GRU(self.decoding_units,
                                    return_sequences=True,
                                    return_state=True,
                                    recurrent_initializer='glorot_uniform')
        # Maps the context vector to the embedding width before concatenation.
        self.fc_context = keras.layers.Dense(embedding_dim)
        # Applies softmax, so its output holds probabilities, not raw logits.
        self.fc_logits = keras.layers.Dense(vocab_size, activation='softmax')

        self.attention = BahdanauAttention()

    def call(self, x, hidden, encoding_output):
        """Run one decoding step.

        Args:
            x: ids of the current input words, one per sample.
            hidden: hidden state from the previous step (or the initial state).
            encoding_output: encoder output the attention layer attends over.

        Returns:
            ``(probabilities, state, attention_weights)`` where probabilities
            has shape (batch_size, vocab_size).
        """
        # context_vector shape == (batch_size, channel_size)
        context_vector, attention_weights = self.attention(hidden, encoding_output)

        # embedded shape == (batch_size, embedding_dim)
        embedded = self.embedding_layer(x)
        # context_vector shape == (batch_size, embedding_dim) after projection.
        context_vector = self.fc_context(context_vector)
        # gru_input shape == (batch_size, 1, embedding_dim + embedding_dim)
        gru_input = tf.concat(
            [tf.expand_dims(context_vector, 1), tf.expand_dims(embedded, 1)],
            axis=-1)

        # NOTE(review): the GRU is called without initial_state, so `hidden`
        # reaches this step only through the attention layer - confirm intended.
        # output shape == (batch_size, 1, decoding_units)
        output, state = self.gru(gru_input)

        # Drop the length-1 time axis: (batch_size * 1, decoding_units).
        output = tf.reshape(output, (-1, output.shape[2]))

        # Probability of every vocabulary word: (batch_size, vocab_size).
        word_probabilities = self.fc_logits(output)

        return word_probabilities, state, attention_weights

    def initialize_hidden_state(self):
        """Return an all-zero hidden state of shape (batch_size, channel_size)."""
        return tf.zeros((self.batch_size, self.channel_size))
    
# Smoke test: one decoder step on 32 input ids (all ones), reusing encoder_hidden and
# encoder_output from the cells above.
decoder = Decoder(vocab_size, 64, 1024, 512, 32)
decoder_output, _, _ = decoder(np.ones((32, 1)).reshape(-1,), encoder_hidden, encoder_output)
print ('Decoder output shape: ', decoder_output.shape)

训练流程

In [None]:
# Custom learning-rate schedule with warm-up.
class CustomizedSchedule(
    keras.optimizers.schedules.LearningRateSchedule):
    """Learning rate ``rsqrt(d_model) * min(rsqrt(step), step * warmup_steps**-1.5)``.

    Args:
        d_model: scale of the schedule; the rate is multiplied by rsqrt(d_model).
        warmup_steps: the rate grows linearly with ``step`` while
            ``step * warmup_steps**-1.5`` is the smaller term and decays with
            ``rsqrt(step)`` afterwards.
    """

    def __init__(self, d_model, warmup_steps=4000):
        super(CustomizedSchedule, self).__init__()
        self.d_model = tf.cast(d_model, tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        # The optimizer may pass its integer iteration counter; rsqrt and the
        # multiplication below need a float tensor.
        step = tf.cast(step, tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.warmup_steps ** (-1.5))
        arg3 = tf.math.rsqrt(self.d_model)
        return arg3 * tf.math.minimum(arg1, arg2)


# Loss: the decoder output already went through softmax, hence from_logits=False.
# reduction='none' keeps the per-sample losses so they can be multiplied by the weights.
loss_object = keras.losses.SparseCategoricalCrossentropy(from_logits=False, reduction='none')

batch_size = 100
embedding_dim = 64
decoding_units, channel_size = 512, 512
num_timesteps = 20


# Build the networks used for training.
encoder = Encoder()
decoder = Decoder(vocab.size(), embedding_dim, decoding_units, channel_size, batch_size)
# Learning-rate schedule with warm-up.
learning_rate = CustomizedSchedule(128)
# Optimizer.
optimizer = keras.optimizers.Adam(learning_rate,
                                  beta_1=0.9,
                                  beta_2=0.98,
                                  epsilon=1e-9)

@tf.function
def train_step(batch_img_features, batch_sentence_ids, batch_weights, encoder_hidden):
    """Run one optimisation step on a batch and return ``(loss, accuracy)``.

    Uses the module-level ``encoder``, ``decoder``, ``loss_object`` and
    ``optimizer``.

    Args:
        batch_img_features: image features fed to the encoder.
        batch_sentence_ids: (batch_size, L) caption ids including the start and
            end markers.
        batch_weights: (batch_size, L) weights; positions with weight 0 are
            ignored by the loss and the accuracy.
        encoder_hidden: initial decoder hidden state.

    Returns:
        batch_loss: sum over the steps of the batch-mean weighted loss, divided by L.
        batch_acc: fraction of weighted positions whose argmax prediction
            equals the label.
    """
    # Sequence length taken from the batch itself instead of a global.
    sequence_length = batch_sentence_ids.shape[1]
    loss = 0
    correct_count = 0
    weight_total = 0
    with tf.GradientTape() as tape:
        encoding_output = encoder(batch_img_features)
        # Start from the state passed in (previously a global was read here).
        decoding_hidden = encoder_hidden
        # Teacher forcing: the token at position t is the input, the token at
        # position t + 1 is the label, e.g. "<s> i love" -> "i love you".
        for t in range(sequence_length - 1):
            # decoding_input shape == (batch_size,)
            decoding_input = batch_sentence_ids[:, t]
            # predictions shape == (batch_size, vocab_size)
            predictions, decoding_hidden, _ = decoder(decoding_input, decoding_hidden, encoding_output)
            labels = batch_sentence_ids[:, t+1]
            loss_ = loss_object(labels, predictions)
            labels_weight = tf.cast(batch_weights[:, t+1], loss_.dtype)
            # Mask the ignored positions, then average over the batch.
            loss += tf.reduce_mean(loss_ * labels_weight)

            # Accuracy: a position counts when the argmax word equals the label.
            pred_word_id = tf.argmax(predictions, 1, output_type = tf.int32)
            # tf.equal needs both operands to have the same dtype.
            correct_pred = tf.equal(pred_word_id, tf.cast(labels, pred_word_id.dtype))
            correct_count += tf.reduce_sum(tf.cast(correct_pred, loss_.dtype) * labels_weight)
            weight_total += tf.reduce_sum(labels_weight)
        batch_loss = loss / sequence_length
    # Normalise by the number of counted positions; guard against an all-zero mask.
    batch_acc = correct_count / tf.maximum(weight_total, 1.0)

    trainable_variables = encoder.trainable_variables + decoder.trainable_variables
    gradients = tape.gradient(batch_loss, trainable_variables)
    optimizer.apply_gradients(zip(gradients, trainable_variables))
    return batch_loss, batch_acc
# Number of passes over the training data.
epoch_size = 50
# Build the batch provider once: constructing it loads (and prints) every feature
# file, so doing it inside the epoch loop repeated that work on every epoch.
# next() reshuffles by itself when it runs out of samples, i.e. at every epoch boundary.
imgcaption_data = ImageCaptionData(img_name_to_token_ids, input_img_feature_dir, num_timesteps, vocab)
# Number of full batches in one pass over the data.
batchs_per_epoch = imgcaption_data.size() // batch_size
for epoch in range(epoch_size):
    start = time.time()

    # Fresh initial hidden state for this epoch.
    encoding_hidden = decoder.initialize_hidden_state()
    total_loss = 0
    total_acc = 0
    for batch in range(batchs_per_epoch):
        # The image names are not needed for training.
        batch_img_features, batch_sentence_ids, batch_weights, _ = \
                                                            imgcaption_data.next(batch_size)
        batch_loss, batch_acc = train_step(batch_img_features, batch_sentence_ids,
                                           batch_weights, encoding_hidden)
        total_loss += batch_loss
        total_acc += batch_acc

    # Report the per-batch averages of this epoch.
    print('Epoch [{}/{}]  Loss {:.10f}, Acc {:.10f}'.format(epoch + 1, epoch_size,
                                                            total_loss.numpy()/batchs_per_epoch ,
                                                            total_acc.numpy()/batchs_per_epoch))
    print('Time take for 1 epoch: {} secs\n'.format(time.time() - start))

模型推理

In [None]:
# Load one feature file and pick its second entry as the test image.
with gfile.GFile('../dataset/feature_extraction_inception_v3/image_features-0.pickle', 'rb') as f:
    # The features were stored with pickle, so they are read back with pickle.
    filenames, features = pickle.load(f, encoding='iso-8859-1')
test_feature = features[1]
print(filenames[1])
print(test_feature.shape)
# Add a leading batch axis of size 1.
test_feature = np.expand_dims(test_feature, 0)
print(test_feature.shape)

In [None]:
# At inference time the decoder is unrolled one word at a time:
# <s>            => first word
# first word     => second word
# second word    => third word
#    ...         => .

# decoder.initialize_hidden_state() builds a (batch_size, channel_size) state,
# but the batch size here is 1, so the zero state is created by hand.
hidden = tf.zeros((1, channel_size))
encoding_out = encoder(test_feature)

decoding_hidden = hidden
# The start token is the first decoder input.
decoding_input = np.array([vocab.start]).reshape(-1,)

result = []
# NOTE(review): the step limit is encoding_out.shape[1] (the number of attended
# positions), not a caption length such as num_timesteps - confirm this is intended.
for t in range(encoding_out.shape[1]):
    predictions, decoding_hidden, attention_weights = decoder(
        decoding_input, decoding_hidden, encoding_out)
    predict_idx = tf.argmax(predictions[0]).numpy()
    # Stop as soon as the end-of-sentence token is predicted.
    if predict_idx == vocab.eos:
        break
    
    result.append(predict_idx)
    # The predicted word is the input of the next step.
    decoding_input  = predict_idx.reshape(-1,)

In [None]:
# Predicted ids next to the first reference caption of a hard-coded image name.
# NOTE(review): presumably '10002456.jpg' is the image printed as filenames[1] above - confirm.
print("预测的id: ", result)
print("真实的id", img_name_to_token_ids['10002456.jpg'][0])

In [None]:
# Same comparison as above, decoded back into words.
print("预测的结果:", vocab.decode(result))
print("真实的结果：", vocab.decode(img_name_to_token_ids['10002456.jpg'][0]))