In [1]:
import tensorflow as tf
# GPU setup: enable memory growth on every physical GPU, then make only the
# first one visible to this process.
gpus = tf.config.experimental.list_physical_devices('GPU')
if gpus:
    try:
        # Currently, memory growth needs to be the same across GPUs
        for gpu in gpus:
            tf.config.experimental.set_memory_growth(gpu, True)
        # Only gpus[0] is exposed to TensorFlow from here on.
        tf.config.experimental.set_visible_devices(gpus[0], 'GPU')
        logical_gpus = tf.config.experimental.list_logical_devices('GPU')
        print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
    except RuntimeError as e:
        # Memory growth must be set before GPUs have been initialized
        print(e)

1 Physical GPUs, 1 Logical GPUs


In [2]:
# You'll generate plots of attention in order to see which parts of an image
# our model focuses on during captioning
import matplotlib.pyplot as plt

# Scikit-learn includes many helpful utilities
from sklearn.model_selection import train_test_split
from sklearn.utils import shuffle

import re
import numpy as np
import os
import time
import json
from glob import glob
from PIL import Image
import pickle
import tqdm
from pathlib import Path

## Load data
把training跟validation data的file path存入img_names，把caption存入captions  
testing data的file path存入test_img_names

In [3]:
import glob
# Collect the labelled samples. Each usable line of the spec file holds an
# image id followed by its label; the label is split into single characters so
# the caption becomes "<start> c1 c2 ... <end>".
img_names = []
captions = []
with open("./words_captcha/spec_train_val.txt") as f:
    for line in f:
        spec = line.strip().split()
        if len(spec) < 2:
            # Blank or malformed line: skip it instead of raising IndexError.
            continue
        img_names.append(f'./words_captcha/{spec[0]}.png')
        captions.append('<start> ' + ' '.join(spec[1]) + ' <end>')

# Every PNG in the directory that is not listed in the spec file is a test
# image. Paths are normalised to the "./dir/name.png" form used in img_names so
# that the set difference matches regardless of the OS path separator.
all_img_path = glob.glob('./words_captcha/*.png')
all_img_path = ["./" + Path(path).as_posix() for path in all_img_path]
# Sorted so that the test order (and the prediction file written from it) is
# reproducible; iterating a set gives an arbitrary order.
test_img_names = sorted(set(all_img_path) - set(img_names))
print("Number of training images: ", len(img_names))

Number of training images:  120000


## Preprocess and tokenization 
這邊參考助教給的 notebook，但 tokenizer 不限制詞彙量（不設定 `num_words`，也就是不取 top-k）

In [4]:
# Tokenizer over the space-separated caption tokens. '<' and '>' are not in the
# filter list, so "<start>" / "<end>" survive as single tokens. The backslash is
# written as "\\" to avoid an invalid "\]" escape; the string value is unchanged.
tokenizer = tf.keras.preprocessing.text.Tokenizer(
    oov_token="<unk>",
    filters='!"#$%&()*+.,-/:;=?@[\\]^_`{|}~ ')
tokenizer.fit_on_texts(captions)

# Register the padding token only AFTER fitting: fit_on_texts rebuilds
# word_index and index_word from scratch, so entries added beforehand are lost
# and index_word[0] would raise KeyError later on.
# Note: word_index now contains '<pad>', so len(tokenizer.word_index) is one
# larger than before this fix.
tokenizer.word_index['<pad>'] = 0
tokenizer.index_word[0] = '<pad>'

# Convert captions to id sequences and right-pad them with 0 to equal length.
train_seqs = tokenizer.texts_to_sequences(captions)
cap_vector = tf.keras.preprocessing.sequence.pad_sequences(train_seqs, padding='post')
# Length of every padded caption, including the <start> and <end> ids.
max_length = len(cap_vector[0])

## Split training, validating, and testing data 
使用train test split把data分成training跟validation data

In [5]:
# Hold out 20,000 samples for validation and train on the other 100,000.
# random_state is fixed so the split is the same on every run.
img_names_train, img_names_val, cap_train, cap_val = train_test_split(img_names, cap_vector, test_size=20000, train_size=100000, random_state=0)
print(len(img_names_train), len(img_names_val))
print(len(cap_train), len(cap_val))

100000 20000
100000 20000


## Parameter settings

In [6]:
# Target (height, width) that every image is resized to before entering the model.
IMAGE_SIZE = (160, 300)
BATCH_SIZE = 100
# Size of the tf.data shuffle buffer used for the training set.
BUFFER_SIZE = 5000
EPOCHS = 15
# Output width of the encoder Dense layer and of the decoder embedding.
EMBEDDING_DIM = 256
# Width of the GRU and of the attention projections.
UNITS = 512
# Size of the decoder's embedding table and output layer.
VOCAB_SIZE = len(tokenizer.word_index) + 1
# Number of full training batches per epoch (used as the progress-bar total).
STEPS = len(img_names_train) // BATCH_SIZE
# NOTE(review): LEARNING_RATE is not referenced by the optimizer cell, which
# builds Adam() with its default learning rate -- confirm which one is intended.
LEARNING_RATE = 1e-4

## Build dataset

In [7]:
def map_func(img_path, cap):
    """Load one training sample for the tf.data pipeline.

    Args:
        img_path: path of a PNG file.
        cap: caption belonging to the image; passed through untouched.

    Returns:
        (image, cap) where image is the 3-channel picture resized to
        IMAGE_SIZE with values rescaled from [0, 255] to [-1, 1].
    """
    raw_bytes = tf.io.read_file(img_path)
    pixels = tf.image.resize(tf.image.decode_png(raw_bytes, channels=3), IMAGE_SIZE)
    # [0, 255] -> [0, 1] -> [0, 2] -> [-1, 1]
    return pixels / 255.0 * 2 - 1, cap

In [8]:
# Training pipeline. The (path, caption) pairs are shuffled *before* the images
# are decoded, so the shuffle buffer holds BUFFER_SIZE small records instead of
# BUFFER_SIZE decoded float images (several GB at this resolution).
dataset_train = (
    tf.data.Dataset.from_tensor_slices((img_names_train, cap_train))
    .shuffle(BUFFER_SIZE)
    .map(map_func, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

# Validation pipeline: same preprocessing, no shuffling.
dataset_valid = (
    tf.data.Dataset.from_tensor_slices((img_names_val, cap_val))
    .map(map_func, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .batch(BATCH_SIZE, drop_remainder=True)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

In [9]:
# Sanity check: print the caption ids (second element) of one training batch.
print(next(iter(dataset_train))[1])

tf.Tensor(
[[ 2  7 11 13  3  0  0]
 [ 2  6  9 17 13  3  0]
 [ 2  9  8 19 18  9  3]
 [ 2  5  6 18  3  0  0]
 [ 2 24  5 25  3  0  0]
 [ 2  8 10  5  3  0  0]
 [ 2 15  8 14  3  0  0]
 [ 2  8 15  7 13  3  0]
 [ 2 18  8 10  4  3  0]
 [ 2 19  7 12  4  3  0]
 [ 2 14 15 17  3  0  0]
 [ 2 16  8  6 14  3  0]
 [ 2 15  5 16  3  0  0]
 [ 2 14 11  4  5 10  3]
 [ 2 15 10  8 28  4  3]
 [ 2  7 24 12  4 10  3]
 [ 2 16  5 27  3  0  0]
 [ 2 15 25 14  3  0  0]
 [ 2 19 20  5  3  0  0]
 [ 2 20  5 23  4 10  3]
 [ 2  6  7 14  3  0  0]
 [ 2 15  7  6  9  6  3]
 [ 2 14 17 20  4  3  0]
 [ 2  6  8 28  4  6  3]
 [ 2 21  7 27  3  0  0]
 [ 2 10  7 13  3  0  0]
 [ 2 24  4 13  3  0  0]
 [ 2 14 18  4 14 23  3]
 [ 2 17 15  7 12  3  0]
 [ 2 20  7  7  9  6  3]
 [ 2 15  7 15  4  3  0]
 [ 2  7 24 12  4 10  3]
 [ 2  5  6 15  3  0  0]
 [ 2  8 13  4  3  0  0]
 [ 2 12  5  9  7  3  0]
 [ 2 13  4 15  9  3  0]
 [ 2 11  7 19  7  3  0]
 [ 2  5 19  7  3  0  0]
 [ 2 10  7 11 11  3  0]
 [ 2 18  8  9  6  3  0]
 [ 2  6 24  8  6  6  3]
 [ 2 

## Model

### Feature extraction
這裡我使用 YOLO 風格的卷積網路，取最後一層卷積區塊輸出的 feature map 當作 extract 出來的 feature

In [10]:
from tensorflow import keras
from tensorflow.keras import layers

In [11]:
def conv_leaky_relu(inputs, filters, size, stride):
    """Apply Conv2D -> BatchNormalization -> LeakyReLU(0.1) to ``inputs``.

    Args:
        inputs: Keras tensor to transform.
        filters: number of convolution filters.
        size: convolution kernel size.
        stride: convolution stride.

    Returns:
        The Keras tensor produced by the three layers. The convolution uses
        "same" padding and a TruncatedNormal kernel initializer.
    """
    conv = layers.Conv2D(
        filters,
        size,
        stride,
        padding="same",
        kernel_initializer=tf.keras.initializers.TruncatedNormal(),
    )
    normalized = layers.BatchNormalization()(conv(inputs))
    return layers.LeakyReLU(0.1)(normalized)

In [12]:
# Layer plan of the backbone, applied in order. A tuple is the
# (filters, kernel size, stride) of one conv_leaky_relu block; "pool" is a
# MaxPool2D layer with default arguments.
_BACKBONE_SPEC = [
    (64, 7, 2), "pool",
    (192, 3, 1), "pool",
    (128, 1, 1), (256, 3, 1), (256, 1, 1), (512, 3, 1), "pool",
    (256, 1, 1), (512, 3, 1),
    (256, 1, 1), (512, 3, 1),
    (256, 1, 1), (512, 3, 1),
    (256, 1, 1), (512, 3, 1),
    (512, 1, 1), (1024, 3, 1), "pool",
    (512, 1, 1), (1024, 3, 1),
    (512, 1, 1), (1024, 3, 1),
    (1024, 3, 1), (1024, 3, 2),
    (1024, 3, 1), (1024, 3, 1),
]

img_inputs = keras.Input(shape=(IMAGE_SIZE[0], IMAGE_SIZE[1], 3))
x = img_inputs
for layer_spec in _BACKBONE_SPEC:
    if layer_spec == "pool":
        x = layers.MaxPool2D()(x)
    else:
        x = conv_leaky_relu(x, *layer_spec)
outputs = x

# The final feature map is the model output; it is trained from scratch
# together with the encoder/decoder.
feature_extractor = keras.Model(inputs=img_inputs, outputs=outputs, name="YOLO")

In [13]:
# Print the layer-by-layer output shapes and parameter counts of the backbone.
feature_extractor.summary()

Model: "YOLO"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input_1 (InputLayer)        [(None, 160, 300, 3)]     0         
                                                                 
 conv2d (Conv2D)             (None, 80, 150, 64)       9472      
                                                                 
 batch_normalization (BatchN  (None, 80, 150, 64)      256       
 ormalization)                                                   
                                                                 
 leaky_re_lu (LeakyReLU)     (None, 80, 150, 64)       0         
                                                                 
 max_pooling2d (MaxPooling2D  (None, 40, 75, 64)       0         
 )                                                               
                                                                 
 conv2d_1 (Conv2D)           (None, 40, 75, 192)       110784 

### CNN encoder
參考助教的notebook

In [14]:
class CNN_Encoder(tf.keras.Model):
    """Projects extracted image features to ``embedding_dim`` with Dense + ReLU."""

    def __init__(self, embedding_dim):
        super().__init__()
        self.fc = tf.keras.layers.Dense(embedding_dim)

    def call(self, x):
        """Return relu(fc(x)); only the last axis changes, to embedding_dim."""
        projected = self.fc(x)
        return tf.nn.relu(projected)

### RNN decoder
參考助教的notebook

In [15]:
class BahdanauAttention(tf.keras.Model):
    """Additive attention over the encoder features, conditioned on a hidden state."""

    def __init__(self, units):
        super().__init__()
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, features, hidden):
        """Compute the attention-weighted summary of ``features``.

        Args:
            features: encoder output, indexed as (batch, location, channel).
            hidden: decoder state, indexed as (batch, hidden_size).

        Returns:
            (context_vector, attention_weights): the features summed over the
            location axis after weighting, and the weights themselves, which
            keep a trailing axis of size 1.
        """
        # Insert a length-1 axis so the state broadcasts against every location.
        query = tf.expand_dims(hidden, 1)

        # Additive score; the last axis has the size of the W1/W2 layers.
        energy = tf.nn.tanh(self.W1(features) + self.W2(query))

        # V maps each location to one scalar; softmax normalises over axis 1.
        attention_weights = tf.nn.softmax(self.V(energy), axis=1)

        # Weighted sum over the location axis.
        context_vector = tf.reduce_sum(attention_weights * features, axis=1)

        return context_vector, attention_weights

In [16]:
class RNN_Decoder(tf.keras.Model):
    """Attention-based GRU decoder that predicts one token per call.

    Args:
        embedding_dim: size of the token embedding.
        units: size of the GRU state, the attention projections and fc1.
        vocab_size: number of token ids; size of the output logits.
    """

    def __init__(self, embedding_dim, units, vocab_size):
        super(RNN_Decoder, self).__init__()
        self.units = units

        self.embedding = tf.keras.layers.Embedding(vocab_size, embedding_dim)
        self.gru = tf.keras.layers.GRU(self.units,
                                       return_sequences=True,
                                       return_state=True,
                                       recurrent_initializer='glorot_uniform')
        self.fc1 = tf.keras.layers.Dense(self.units)
        self.fc2 = tf.keras.layers.Dense(vocab_size)

        self.attention = BahdanauAttention(self.units)

    def call(self, x, features, hidden):
        """Run one decoding step.

        Args:
            x: previous token ids, indexed as (batch, 1).
            features: encoder output attended over.
            hidden: decoder state from the previous step, (batch, units).

        Returns:
            (logits, state, attention_weights): logits has one row per input
            token and vocab_size columns; state is the new GRU state to pass
            to the next step.
        """
        # Attend over the image features using the previous decoder state.
        context_vector, attention_weights = self.attention(features, hidden)

        # Token ids -> embeddings, keeping the length-1 time axis.
        x = self.embedding(x)

        # Prepend the context vector along the feature axis.
        x = tf.concat([tf.expand_dims(context_vector, 1), x], axis=-1)

        # Fix: feed the previous state into the GRU. Previously the GRU was
        # called without initial_state, so it restarted from zeros at every
        # step and `hidden` only influenced the attention weights.
        output, state = self.gru(x, initial_state=hidden)

        x = self.fc1(output)

        # Merge the batch and time axes before the final projection.
        x = tf.reshape(x, (-1, x.shape[2]))

        # Logits over the vocabulary.
        x = self.fc2(x)

        return x, state, attention_weights

    def reset_state(self, batch_size):
        """Return an all-zero initial state of shape (batch_size, units)."""
        return tf.zeros((batch_size, self.units))

In [17]:
# Build the trainable encoder/decoder pair from the configured sizes.
encoder = CNN_Encoder(EMBEDDING_DIM)
decoder = RNN_Decoder(EMBEDDING_DIM, UNITS, VOCAB_SIZE)

### Define loss

In [18]:
# NOTE(review): Adam() is built with its default learning rate; the
# LEARNING_RATE constant from the parameter cell is not passed in -- confirm
# which one is intended.
optimizer = tf.keras.optimizers.Adam()
# reduction='none' keeps one loss value per example so that loss_function can
# mask out padding positions before averaging.
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
    from_logits=True, reduction='none')

def loss_function(real, pred):
    """Cross-entropy for one decoding step with padding positions zeroed out.

    Args:
        real: target token ids; id 0 marks padding.
        pred: logits produced by the decoder for the same positions.

    Returns:
        Scalar mean of the per-example losses, where examples whose target is
        0 contribute a loss of 0 (they still count in the mean's denominator).
    """
    per_example = loss_object(real, pred)
    keep = tf.cast(tf.math.not_equal(real, 0), dtype=per_example.dtype)
    return tf.reduce_mean(per_example * keep)

### Checkpoints
參考助教的notebook，不過要多存feature extractor，因為我不是用pre-trained的model

In [19]:
# Track all three trainable models plus the optimizer in one checkpoint; the
# feature extractor is included because it is trained here, not pre-trained.
checkpoint_path = "./checkpoints/train"
ckpt = tf.train.Checkpoint(feature_extractor=feature_extractor,
                           encoder=encoder,
                           decoder=decoder,
                           optimizer = optimizer)
# Keep only the five most recent checkpoints on disk.
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

In [20]:
# Training always starts from epoch 0. The disabled lines below would instead
# resume the epoch counter from the number in the latest checkpoint's name.
start_epoch = 0
# if ckpt_manager.latest_checkpoint:
#     start_epoch = int(ckpt_manager.latest_checkpoint.split('-')[-1])

### Training

In [21]:
# Kept in its own cell on purpose: re-running the training cell then does not
# reset the loss history collected so far.
loss_plot = []

參考助教的notebook

In [22]:
@tf.function
def train_step(img_tensor, target):
    """Run one teacher-forced optimisation step on a batch.

    Args:
        img_tensor: batch of preprocessed images.
        target: padded caption ids, indexed as (batch, time), beginning with
            the <start> id.

    Returns:
        The summed per-step loss divided by the caption length.
    """
    # Use the batch dimension of the data itself for both the initial state and
    # the first decoder input, rather than mixing it with the global BATCH_SIZE.
    batch_size = target.shape[0]
    loss = 0

    # initializing the hidden state for each batch
    # because the captions are not related from image to image
    hidden = decoder.reset_state(batch_size=batch_size)

    dec_input = tf.expand_dims([tokenizer.word_index['<start>']] * batch_size, 1)

    with tf.GradientTape() as tape:
        # The backbone is trained too, so run it in training mode.
        features = feature_extractor(img_tensor, training=True)
        # Flatten the spatial grid into one axis of locations.
        features = tf.reshape(features, (features.shape[0], -1, features.shape[3]))
        features = encoder(features)

        for i in range(1, target.shape[1]):
            # passing the features through the decoder
            predictions, hidden, _ = decoder(dec_input, features, hidden)

            loss += loss_function(target[:, i], predictions)

            # using teacher forcing: the ground-truth token is the next input
            dec_input = tf.expand_dims(target[:, i], 1)

    total_loss = loss / int(target.shape[1])

    trainable_variables = (feature_extractor.trainable_variables
                           + encoder.trainable_variables
                           + decoder.trainable_variables)

    # Gradients are taken of the summed loss, not the length-normalised one.
    gradients = tape.gradient(loss, trainable_variables)

    optimizer.apply_gradients(zip(gradients, trainable_variables))

    return total_loss

預測img的caption，然後把預測的caption中的start，end，padding都拿掉

In [23]:
def predict(img_tensor):
    """Greedily decode a caption string for every image in the batch.

    Args:
        img_tensor: batch of preprocessed images.

    Returns:
        A list with one string per image: the predicted tokens up to (not
        including) the first <end>, with padding ids dropped.
    """
    batch_size = img_tensor.shape[0]
    start_id = tokenizer.word_index['<start>']
    end_id = tokenizer.word_index['<end>']

    # Inference mode is requested explicitly for the BatchNormalization layers.
    features = feature_extractor(img_tensor, training=False)
    features = tf.reshape(features, (features.shape[0], -1, features.shape[3]))
    features = encoder(features)

    hidden = decoder.reset_state(batch_size=batch_size)
    dec_input = tf.expand_dims([start_id] * batch_size, 1)

    # One array of predicted ids per decoding step; collected in a list and
    # stacked once, which avoids concatenating tensors of different int dtypes.
    step_ids = []
    for _ in range(max_length):
        predictions, hidden, _ = decoder(dec_input, features, hidden)
        predicted_id = tf.argmax(predictions, axis=1).numpy()
        step_ids.append(predicted_id)
        # Greedy decoding: the prediction becomes the next input.
        dec_input = tf.expand_dims(predicted_id, 1)

    # Rows are images, columns are decoding steps.
    id_matrix = np.stack(step_ids, axis=1)

    actual_result = []
    for row in id_matrix:
        pieces = []
        for token_id in row:
            if token_id == end_id:
                break
            if token_id == 0:
                # Padding id: never part of the text. Looking it up could
                # raise KeyError or emit the literal "<pad>".
                continue
            pieces.append(tokenizer.index_word.get(int(token_id), ''))
        actual_result.append(''.join(pieces))

    return actual_result

在每個epoch都做一次evaluation看validation accuracy多少  
在evaluation要exact match才能算correct

In [None]:
# Train for EPOCHS epochs; after each epoch save a checkpoint and measure the
# exact-match accuracy on the validation set.
start = time.time()
end_id = tokenizer.word_index['<end>']
for epoch in range(start_epoch, EPOCHS):

    # ---- training ----
    loss = 0
    pbar = tqdm.tqdm(enumerate(dataset_train), total=STEPS,
                     desc=f'Epoch {epoch + 1}/{EPOCHS}')
    for (batch, (img_tensor, target)) in pbar:
        loss += train_step(img_tensor, target)
        # Running mean of the per-batch loss.
        pbar.set_postfix({'loss': loss.numpy() / (batch + 1)})

    # storing the epoch end loss value to plot later
    loss_plot.append(loss / STEPS)
    # Save once per epoch so the later restore cell loads this run's weights.
    ckpt_manager.save()

    # ---- validation ----
    # Runs once per epoch, outside the batch loop, and the counter is reset
    # for every epoch.
    correct = 0
    for img_tensor, target in dataset_valid:
        pred_list = predict(img_tensor)

        # Decode the ground-truth ids the same way predictions are decoded:
        # drop the leading <start> and stop at <end>.
        real_list = []
        for r in target.numpy():
            seq = ""
            for s in r[1:]:
                if s == end_id:
                    break
                seq += tokenizer.index_word[s]
            real_list.append(seq)

        # Only an exact string match counts as correct.
        for pred, real in zip(pred_list, real_list):
            if pred == real:
                correct += 1
    print(f'Validation accuracy: {correct/len(img_names_val):.2f}')

print('Time taken for {} epoch {} sec\n'.format(EPOCHS, time.time() - start))

  0%|          | 0/1000 [00:00<?, ?it/s]

## Predict testing data

雖然loss一直有在降，但validation accuracy卻是會上下震盪  
在testing我用最後一個epoch的model來predict，因為他的validation accuracy有達到0.94

In [None]:
# Load the most recent checkpoint known to ckpt_manager into the tracked
# feature extractor, encoder, decoder and optimizer before predicting.
ckpt.restore(ckpt_manager.latest_checkpoint)

<tensorflow.python.checkpoint.checkpoint.CheckpointLoadStatus at 0x1d20862f7c0>

In [None]:
def map_test(img_path):
    """Load one test image for the tf.data pipeline.

    Args:
        img_path: path of a PNG file.

    Returns:
        (image, img_path): the 3-channel picture resized to IMAGE_SIZE and
        rescaled from [0, 255] to [-1, 1], plus the path so that predictions
        can be matched back to their file.
    """
    decoded = tf.image.decode_png(tf.io.read_file(img_path), channels=3)
    scaled = tf.image.resize(decoded, IMAGE_SIZE) / 255 * 2 - 1
    return scaled, img_path

In [None]:
# Test pipeline: decode every test image in file-list order, 100 per batch.
# The last batch is kept even if it is smaller.
dataset_test = (
    tf.data.Dataset.from_tensor_slices(test_img_names)
    .map(map_test, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    .batch(100)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

In [None]:
import re

# Write one "<image id> <predicted text>" line per test image.
# Ceiling division: the test dataset keeps a final partial batch of 100, so
# floor division would make the progress bar total one too small.
num_test_batches = -(-len(test_img_names) // 100)
with open('./Lab12-2_113062556.txt', 'w') as fout:
    for img_tensor, img_path in tqdm.tqdm(dataset_test, total=num_test_batches):
        pred_list = predict(img_tensor)
        for path, pred in zip(img_path, pred_list):
            # Paths come back from tf.data as byte strings.
            path = path.numpy().decode('utf-8')
            # The image id is the file name without directory and extension.
            name = os.path.splitext(os.path.basename(path))[0]
            fout.write(f'{name} {pred}\n')
    

100%|██████████| 200/200 [00:54<00:00,  3.70it/s]
