# 02 Model Training

This notebook builds the CNN+LSTM captioning model and trains it on the Flickr8k dataset.

In [None]:
import os
import numpy as np
import pickle
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam

### Load Preprocessed Data and Tokenizer

In [None]:
def _load_pickle(path):
    """Deserialize and return the object stored in the pickle file at `path`."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# NOTE: pickle.load can execute arbitrary code; only load artifacts that were
# produced by this project's own preprocessing step.
mapping = _load_pickle('../data/Flickr8k_text/mapping.pkl')
features = _load_pickle('../data/Flickr8k_text/features.pkl')
tokenizer = _load_pickle('../data/Flickr8k_text/tokenizer.pkl')

# One slot more than the number of known words
# (presumably because id 0 is reserved for padding - TODO confirm).
vocab_size = len(tokenizer.word_index) + 1

# Longest caption, measured in whitespace-separated tokens, across every image.
max_length = max(
    len(caption.split())
    for captions in mapping.values()
    for caption in captions
)
print(f"Vocab Size: {vocab_size}, Max Length: {max_length}")

### Data Generator for Training

In [None]:
def data_generator(data_keys, mapping, features, tokenizer, max_length, vocab_size, batch_size):
    """Endlessly yield training batches for the captioning model.

    Every caption is expanded into one sample per word position: the padded
    prefix ``seq[:i]`` is the text input and the one-hot encoding of ``seq[i]``
    is the target. A batch is emitted once ``batch_size`` *images* (not
    samples) have been processed, so the number of rows per batch varies with
    the number and length of the captions of those images.

    Args:
        data_keys: Iterable of image keys to cycle over; each must be a key of
            both ``mapping`` and ``features``.
        mapping: Dict mapping an image key to a list of caption strings.
        features: Dict mapping an image key to an array whose first element
            (``features[key][0]``) is the feature vector fed to the model.
        tokenizer: Object exposing ``texts_to_sequences`` (a fitted Keras
            ``Tokenizer`` in this notebook).
        max_length: Length every caption prefix is padded to.
        vocab_size: Number of classes for the one-hot targets.
        batch_size: Number of images whose samples are grouped into one batch.

    Yields:
        ``((image_features, padded_prefixes), one_hot_targets)`` as NumPy
        arrays.

    Raises:
        ValueError: On the first ``next()`` call, if ``data_keys`` is empty or
            ``batch_size`` is smaller than 1. Without this check the loop
            below would spin forever without ever yielding.
    """
    # Materialize once so a one-shot iterable can be replayed on every pass.
    data_keys = list(data_keys)
    if not data_keys:
        raise ValueError("data_keys must contain at least one image key")
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    X1, X2, y = [], [], []
    n = 0  # images accumulated in the current batch
    while True:
        for key in data_keys:
            for caption in mapping[key]:
                seq = tokenizer.texts_to_sequences([caption])[0]
                # Each prefix seq[:i] predicts the following token seq[i].
                for i in range(1, len(seq)):
                    in_seq = pad_sequences([seq[:i]], maxlen=max_length)[0]
                    out_seq = to_categorical([seq[i]], num_classes=vocab_size)[0]
                    X1.append(features[key][0])
                    X2.append(in_seq)
                    y.append(out_seq)
            n += 1
            if n >= batch_size:
                # The two inputs are grouped in a tuple rather than a list:
                # newer Keras versions derive a tf.data signature from the
                # generator output and reject Python lists inside it. A tuple
                # unpacks and indexes exactly like the former list.
                yield (np.array(X1), np.array(X2)), np.array(y)
                X1, X2, y = [], [], []
                n = 0

### Define CNN+LSTM Model

In [None]:
# Image branch: a 4096-dimensional feature vector per image
# (presumably the pre-extracted CNN features loaded from features.pkl - TODO confirm),
# regularized with 50% dropout and projected down to 256 units.
inputs1 = Input(shape=(4096,))
fe1 = Dropout(0.5)(inputs1)
fe2 = Dense(256, activation='relu')(fe1)

# Text branch: a caption prefix of max_length token ids, embedded into
# 256-dimensional vectors. mask_zero=True makes downstream layers skip id 0,
# so the padded positions are ignored by the LSTM.
inputs2 = Input(shape=(max_length,))
se1 = Embedding(vocab_size, 256, mask_zero=True)(inputs2)
se2 = Dropout(0.5)(se1)
se3 = LSTM(256)(se2)

# Decoder: merge the two 256-unit branches by element-wise addition, then
# predict a probability distribution over the vocabulary for the next word.
decoder1 = add([fe2, se3])
decoder2 = Dense(256, activation='relu')(decoder1)
outputs = Dense(vocab_size, activation='softmax')(decoder2)

# categorical_crossentropy matches the one-hot targets produced by data_generator.
model = Model(inputs=[inputs1, inputs2], outputs=outputs)
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001))
model.summary()

### Train Model

In [None]:
# Run settings, named once so the generator's batch size and the
# steps-per-epoch computation cannot drift apart.
NUM_TRAIN_IMAGES = 6000
BATCH_SIZE = 64  # counted in images per batch, as data_generator does
EPOCHS = 10
MODEL_PATH = '../models/decoder/caption_model.h5'

# Create the output directory up front: if it is missing, model.save would
# otherwise fail only after the whole training run has finished.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)

train = list(mapping.keys())[:NUM_TRAIN_IMAGES]
# Whole batches per epoch; leftover images roll over into the next epoch
# because the generator cycles through the keys indefinitely.
steps = len(train) // BATCH_SIZE
generator = data_generator(train, mapping, features, tokenizer, max_length, vocab_size, BATCH_SIZE)
model.fit(generator, epochs=EPOCHS, steps_per_epoch=steps, verbose=1)

model.save(MODEL_PATH)
print(f"Model saved at {MODEL_PATH}")