# Model Training
This notebook implements and trains a CNN-RNN model with attention for image captioning using 10% of the Flickr8k dataset for initial testing.

In [1]:
import os
# Configure TensorFlow's native log level through the environment. This cell
# runs before the cell that imports tensorflow, so the variable is already set
# when the library is first loaded.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'  # Suppress TensorFlow warnings

In [2]:
# Install TensorFlow if not already installed
# %pip install tensorflow

import os
import pickle
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, Add
from tensorflow.keras.models import Model
from tensorflow.keras.applications import VGG16
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.preprocessing.sequence import pad_sequences



## Load and Sample Preprocessed Data
Load the preprocessed data and sample 10% for initial training.

In [None]:
# Load preprocessed data
processed_dir = '../data/processed/'

# Share of each split kept for this quick experiment.
sample_fraction = 0.1


def load_pickle(filename):
    """Return the object unpickled from `processed_dir/filename`.

    NOTE(security): unpickling can execute arbitrary code, so these files must
    come from this project's own preprocessing step, never from an untrusted
    source.
    """
    with open(os.path.join(processed_dir, filename), 'rb') as f:
        return pickle.load(f)


def sample_ids(ids, fraction):
    """Draw `fraction` of `ids` without replacement using NumPy's global RNG.

    At least one id is kept whenever `ids` is non-empty, so a small split
    never collapses into an empty dataset.
    """
    sample_size = min(len(ids), max(1, int(len(ids) * fraction)))
    return np.random.choice(ids, size=sample_size, replace=False)


train_features = load_pickle('train_features.pkl')
train_captions = load_pickle('train_captions.pkl')
val_features = load_pickle('val_features.pkl')
val_captions = load_pickle('val_captions.pkl')
tokenizer = load_pickle('tokenizer.pkl')

# Calculate vocabulary size and maximum sequence length
# (+1 presumably leaves room for index 0, which the model masks as padding)
vocab_size = len(tokenizer.word_index) + 1
# Calculate max length from training captions (whitespace-separated words,
# measured on the full training set before sampling)
max_length = max(
    len(caption.split())
    for captions_list in train_captions.values()
    for caption in captions_list
)

print(f'Vocabulary size: {vocab_size}')
print(f'Maximum sequence length: {max_length}')

# Sample a fixed fraction of the training data
np.random.seed(42)  # for reproducibility
train_ids = list(train_features.keys())
sampled_train_ids = sample_ids(train_ids, sample_fraction)

# Sample the same fraction of the validation data
val_ids = list(val_features.keys())
sampled_val_ids = sample_ids(val_ids, sample_fraction)

# Create sampled datasets (features and captions share the same sampled ids)
sampled_train_features = {k: train_features[k] for k in sampled_train_ids}
sampled_train_captions = {k: train_captions[k] for k in sampled_train_ids}
sampled_val_features = {k: val_features[k] for k in sampled_val_ids}
sampled_val_captions = {k: val_captions[k] for k in sampled_val_ids}

print(f'Original training samples: {len(train_features)}')
print(f'Sampled training samples: {len(sampled_train_features)}')
print(f'Original validation samples: {len(val_features)}')
print(f'Sampled validation samples: {len(sampled_val_features)}')

# Replace original data with sampled data (later cells read these names)
train_features = sampled_train_features
train_captions = sampled_train_captions
val_features = sampled_val_features
val_captions = sampled_val_captions

# Bring every image feature to the spatial size the model input expects
def preprocess_image_features(features):
    """Return a new dict in which every feature has been size-normalised.

    Entries whose `.shape` already equals (224, 224, 3) are passed through
    untouched; every other entry is replaced by the result of
    `tf.image.resize(entry, (224, 224))`. The input dict itself is not
    modified.

    NOTE(review): only height and width are changed by the resize, so this
    presumably expects 3-channel image-like arrays - confirm against the
    preprocessing step that wrote the feature files.
    """
    target_shape = (224, 224, 3)
    target_size = target_shape[:2]
    return {
        img_id: (
            tf.image.resize(feature, target_size)
            if feature.shape != target_shape
            else feature
        )
        for img_id, feature in features.items()
    }

# Normalise the feature shapes of both splits (training first, then validation)
train_features, val_features = map(
    preprocess_image_features, (train_features, val_features)
)

print('\nData sampling and preprocessing completed successfully')

Original training samples: 6000
Sampled training samples: 600
Original validation samples: 1000
Sampled validation samples: 100

Data sampling and preprocessing completed successfully


## Define the CNN-RNN Model with Attention
Create the model architecture, which combines CNN image features with an RNN decoder and an attention mechanism.

In [4]:
class BahdanauAttention(tf.keras.layers.Layer):
    """Additive attention over a set of encoder feature vectors.

    Scores every feature location against the decoder state with
    `V(tanh(W1(features) + W2(hidden)))`, normalises the scores over the
    location axis and returns the weighted sum of the features.

    Args:
        units: Width of the two projection layers used to compute the score.
        **kwargs: Standard Keras layer arguments (e.g. `name`), forwarded to
            the base class; these are also what Keras passes back in when the
            layer is rebuilt from its config.
    """

    def __init__(self, units, **kwargs):
        super().__init__(**kwargs)
        # Kept so get_config() can report how the layer was constructed.
        self.units = units
        self.W1 = tf.keras.layers.Dense(units)
        self.W2 = tf.keras.layers.Dense(units)
        self.V = tf.keras.layers.Dense(1)

    def call(self, features, hidden):
        """Return `(context_vector, attention_weights)` for one decoder state.

        Args:
            features: Encoder output, expected as
                (batch_size, num_locations, feature_dim).
            hidden: Decoder state, expected as (batch_size, hidden_size).
        """
        # hidden_with_time_axis shape == (batch_size, 1, hidden_size), so it
        # broadcasts against every feature location in the addition below.
        hidden_with_time_axis = tf.expand_dims(hidden, 1)

        # score shape == (batch_size, num_locations, 1)
        score = self.V(tf.nn.tanh(
            self.W1(features) + self.W2(hidden_with_time_axis)))

        # attention_weights shape == (batch_size, num_locations, 1);
        # normalised across locations (axis 1)
        attention_weights = tf.nn.softmax(score, axis=1)

        # context_vector shape after sum == (batch_size, feature_dim)
        context_vector = attention_weights * features
        context_vector = tf.reduce_sum(context_vector, axis=1)

        return context_vector, attention_weights

    def get_config(self):
        """Return the layer config, including `units`.

        Without this override a model containing the layer cannot be
        serialized (saved to HDF5 or rebuilt from config), because the
        constructor takes an argument the base config knows nothing about.
        """
        config = super().get_config()
        config.update({'units': self.units})
        return config

In [5]:
def create_model(embedding_dim, units, vocab_size, max_length, features_shape):
    """Build the (uncompiled) captioning model that predicts the next word.

    Args:
        embedding_dim: Size of the word embedding and of the projected image
            features.
        units: Number of LSTM units; also the width of the attention layer.
        vocab_size: Number of output classes and embedding rows.
        max_length: Length of the padded input token sequence.
        features_shape: Per-sample shape of the image input, without the
            batch axis. Any rank >= 1 is accepted; the last axis is treated
            as the channel/feature axis.

    Returns:
        A Keras `Model` taking `[image_features, token_sequence]` and
        returning a softmax distribution over `vocab_size` words.

    Raises:
        ValueError: If `features_shape` is empty.
    """
    features_shape = tuple(features_shape)
    if not features_shape:
        raise ValueError('features_shape must have at least one dimension')

    # Image encoder
    inputs1 = Input(shape=features_shape)
    image_features = inputs1
    if len(features_shape) != 2:
        # The attention layer reduces over axis 1 only, so it needs
        # (batch, locations, channels). Collapse all leading axes into a
        # single "locations" axis; a flat vector becomes one location.
        # Without this, e.g. a (224, 224, 3) input leaves a rank-3 context
        # vector that cannot be concatenated with the rank-2 LSTM state.
        image_features = tf.keras.layers.Reshape(
            (-1, features_shape[-1]))(inputs1)
    fe1 = Dropout(0.4)(image_features)
    fe2 = Dense(embedding_dim, activation='relu')(fe1)

    # Sequence encoder (index 0 is masked as padding)
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, embedding_dim, mask_zero=True)(inputs2)
    se2 = Dropout(0.4)(se1)

    # Decoder: two stacked LSTMs; the second returns only its final state
    decoder1 = LSTM(units, return_sequences=True)(se2)
    decoder2 = LSTM(units)(decoder1)

    # Attention mechanism: final decoder state attends over image locations
    attention = BahdanauAttention(units)
    context_vector, attention_weights = attention(fe2, decoder2)

    # Combine context vector with decoder output. A Keras layer is used
    # instead of a raw tf op so the graph is built purely from layers.
    decoder3 = tf.keras.layers.Concatenate(axis=-1)([context_vector, decoder2])

    # Dense layers
    outputs = Dense(512, activation='relu')(decoder3)
    outputs = Dropout(0.4)(outputs)
    outputs = Dense(vocab_size, activation='softmax')(outputs)

    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    return model

# Hyper-parameters of the network
embedding_dim, units = 256, 512
features_shape = (224, 224, 3)  # VGG16 input shape

# Build the network; vocab_size and max_length come from the data-loading cell
model = create_model(embedding_dim, units, vocab_size, max_length, features_shape)

# Targets are one-hot encoded next words, hence categorical cross-entropy
adam = tf.keras.optimizers.Adam(learning_rate=0.001)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

model.summary()

NameError: name 'vocab_size' is not defined

## Data Generator
Create a generator to feed data to the model during training.

In [None]:
def data_generator(features, captions, tokenizer, max_length, vocab_size, batch_size):
    """Endlessly yield `([images, input_sequences], next_words)` batches.

    Every pass shuffles the image ids, walks them in groups of `batch_size`
    images, picks one random caption per image and expands it into one
    training sample per word: the padded caption prefix is the input and the
    one-hot encoded following word is the target. The number of samples in a
    yielded batch therefore depends on the caption lengths, not only on
    `batch_size`.

    Args:
        features: Mapping image id -> image array fed to the model.
        captions: Mapping image id -> list of caption strings.
        tokenizer: Object whose `texts_to_sequences` turns a caption into a
            list of integer word ids.
        max_length: Length every input sequence is padded to.
        vocab_size: Number of classes for the one-hot targets.
        batch_size: Number of images (not samples) grouped into one batch.

    Yields:
        `([X1, X2], y)` where `X1` holds the image of each sample, `X2` the
        padded prefixes and `y` the one-hot next words, all as NumPy arrays.

    Raises:
        ValueError: If `batch_size` is not positive, or if a complete pass
            over `captions` produced no sample at all (e.g. `captions` is
            empty). Without this check the generator would spin forever
            without ever yielding.
        KeyError: If an image id in `captions` is missing from `features`.
    """
    if batch_size < 1:
        raise ValueError(f'batch_size must be a positive integer, got {batch_size}')

    # Get image IDs and their corresponding captions
    all_image_ids = list(captions.keys())

    while True:
        # Shuffle image IDs at the start of each epoch
        np.random.shuffle(all_image_ids)
        yielded_any = False

        for start in range(0, len(all_image_ids), batch_size):
            batch_image_ids = all_image_ids[start:start + batch_size]

            # Initialize batch arrays
            X1 = []  # Images
            X2 = []  # Input sequences
            y = []   # Output words

            # Process each image in the batch
            for image_id in batch_image_ids:
                image = features[image_id]
                captions_list = captions[image_id]
                if len(captions_list) == 0:
                    # Nothing to learn from an image without captions
                    continue

                # Randomly select one caption for the image
                caption = np.random.choice(captions_list)

                # Convert caption to sequence
                seq = tokenizer.texts_to_sequences([caption])[0]

                # Generate input-output pairs: words before `split_at` are
                # the input, the word at `split_at` is the target
                for split_at in range(1, len(seq)):
                    # Pad input sequence
                    in_seq = pad_sequences([seq[:split_at]], maxlen=max_length)[0]

                    # One-hot encode output word
                    out_seq = to_categorical([seq[split_at]], num_classes=vocab_size)[0]

                    # Add to batch
                    X1.append(image)
                    X2.append(in_seq)
                    y.append(out_seq)

            if X1:  # Only yield if we have data
                yielded_any = True
                yield ([np.array(X1), np.array(X2)], np.array(y))

        if not yielded_any:
            raise ValueError(
                'data_generator produced no samples in a full pass; '
                'check that captions is non-empty and captions have at least two tokens'
            )

## Train the Model
Set up training parameters and train the model with callbacks for checkpointing and early stopping.

In [None]:
# Training parameters - adjusted batch size
batch_size = 32  # Reduced from 64 to handle smaller dataset better
epochs = 20  # Increased epochs since we have more data now

# Flatten the per-image caption lists into one list per split
# (a single pass; avoids the quadratic list concatenation of sum(..., []))
train_descriptions = [desc for captions in train_captions.values() for desc in captions]
val_descriptions = [desc for captions in val_captions.values() for desc in captions]

# Steps are derived from the number of captions. The generator groups
# `batch_size` images per step with one random caption each, so one "epoch"
# here spans several shuffled passes over the images.
# max(1, ...) keeps the step counts valid when the sampled split is tiny.
steps_per_epoch = max(1, len(train_descriptions) // batch_size)
validation_steps = max(1, len(val_descriptions) // batch_size)

print(f'Number of training descriptions: {len(train_descriptions)}')
print(f'Number of validation descriptions: {len(val_descriptions)}')
print(f'Steps per epoch: {steps_per_epoch}')
print(f'Validation steps: {validation_steps}')

# Create model checkpoint callback (keeps only the best model by val_loss)
checkpoint_path = '../models/model_checkpoint.h5'
os.makedirs('../models', exist_ok=True)

checkpoint = ModelCheckpoint(
    checkpoint_path,
    monitor='val_loss',
    verbose=1,
    save_best_only=True,
    mode='min'
)

# Create early stopping callback
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True
)

# Create training and validation generators
train_generator = data_generator(
    train_features,
    train_captions,
    tokenizer,
    max_length,
    vocab_size,
    batch_size
)

val_generator = data_generator(
    val_features,
    val_captions,
    tokenizer,
    max_length,
    vocab_size,
    batch_size
)

# Train the model
try:
    history = model.fit(
        train_generator,
        steps_per_epoch=steps_per_epoch,
        epochs=epochs,
        validation_data=val_generator,
        validation_steps=validation_steps,
        callbacks=[checkpoint, early_stopping],
        verbose=1
    )
    print('Training completed successfully')
except Exception as e:
    print(f'Error during training: {str(e)}')
    # Re-raise: swallowing the error would let the following cell save an
    # untrained model and then fail on the undefined `history`.
    raise

## Save the Model
Save the trained model and training history.

In [None]:
# Persist the trained network in HDF5 format
model.save('../models/cnn_rnn_attention.h5')
print('Model saved successfully.')

# Persist the per-epoch metrics recorded by model.fit
with open('../models/training_history.pkl', 'wb') as history_file:
    pickle.dump(history.history, history_file)
print('Training history saved successfully.')