<a href="https://colab.research.google.com/github/tadiwamark/CaptionCraft/blob/main/image_captioning_R204445V_v2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Image Captioning Model

This code is structured to create a model that can generate captions for images.
The primary dataset used here is the `coco-2017-dataset` from Kaggle, and
we utilize a fraction of this dataset for training due to the limitation in computational resources.

### Libraries and Modules

In [None]:
import os
import json
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf
import re
import pickle
from tqdm import tqdm
from textwrap import wrap
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.applications import MobileNetV2
from tensorflow.keras import Model, Sequential
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Concatenate, Dropout, Flatten
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

### Setup and Data Retrieval

We start by setting up the Kaggle directory and retrieving the dataset.

In [None]:
# Colab-only cell: obtain Kaggle API credentials, then fetch and unpack COCO 2017.
# The shell steps below depend on each other and must run in this order.
from google.colab import files
files.upload() # upload kaggle.json (the file is saved into the current working directory)

# Setup Kaggle Dir
# Copy the uploaded token to ~/.kaggle and make it readable/writable by the
# owner only (chmod 600), since the file holds an API credential.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

# Download the dataset
# NOTE: the archive is large (the recorded output of this cell shows 25.0G).
!kaggle datasets download -d awsaf49/coco-2017-dataset

# Unzip the downloaded file
# -q suppresses the per-file listing; later cells read from /content/coco2017.
!unzip -q coco-2017-dataset.zip

Saving kaggle.json to kaggle.json
Downloading coco-2017-dataset.zip to /content
100% 25.0G/25.0G [03:12<00:00, 166MB/s]
100% 25.0G/25.0G [03:12<00:00, 140MB/s]


### Data Preparation

We are using only a fraction (20%) of the dataset and reducing the image resolution to manage computational load.


In [None]:
# Mount Google Drive
# Later cells write the tokenizer, the extracted features and the trained
# model under '/content/drive/My Drive/Colab Notebooks/', so this mount must
# succeed before they run.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Use a smaller fraction of the dataset
frac = 0.2

# Seed for the subsampling below. Without a fixed seed every run draws a
# different subset, so a features pickle produced by an earlier run no longer
# matches the rows in `data`.
seed = 42

# Reduce image resolution
# NOTE: a later cell resets img_size to 224 before feature extraction, so this
# value is not the resolution the feature extractor actually uses.
img_size = 64

# Setup paths
image_path = '/content/coco2017/train2017'
annotations_path = '/content/coco2017/annotations/captions_train2017.json'

# Load Annotations (only the 'annotations' list of the captions file is used)
with open(annotations_path, 'r', encoding='utf-8') as f:
    annotations = json.load(f)['annotations']

# Extracting Image-Caption Pairs: the image file name is the image id
# zero-padded to 12 digits plus '.jpg'.
img_cap_pairs = [
    ['%012d.jpg' % sample['image_id'], sample['caption']]
    for sample in annotations
]

# Create a DataFrame from the image-caption pairs, keep a reproducible random
# fraction of the rows, and expand file names into full paths.
data = pd.DataFrame(img_cap_pairs, columns=['image', 'caption']).sample(frac=frac, random_state=seed)
data['image'] = data['image'].apply(lambda x: os.path.join(image_path, x))

### Text Preprocessing

Text captions are converted to lowercase, stripped of punctuation and other non-alphabetic characters, and cleared of single-character words. Each caption is then prefixed with 'startseq' and suffixed with 'endseq' to mark the start and the end of the sequence respectively.


In [None]:
# Text Preprocessing
def text_preprocessing(data):
    """Normalise the ``caption`` column so it is ready for tokenisation.

    Each caption is lower-cased, every character outside ``A-Z``/``a-z`` is
    replaced by a space, runs of whitespace are collapsed, single-character
    words are dropped, and the result is wrapped in ``startseq`` / ``endseq``
    markers.

    Args:
        data: DataFrame with a string ``caption`` column.

    Returns:
        A new DataFrame with the cleaned ``caption`` column; all other columns
        are carried over unchanged. The frame passed in is left untouched.
    """
    def clean(caption):
        # Raw-string patterns: "\s" in a plain string literal is an invalid
        # escape sequence and triggers a warning on recent Python versions.
        text = re.sub(r"[^A-Za-z]", " ", caption.lower())
        text = re.sub(r"\s+", " ", text)
        return " ".join(word for word in text.split() if len(word) > 1)

    # Work on a copy so the caller's frame is not modified as a side effect.
    data = data.copy()
    data['caption'] = "startseq " + data['caption'].apply(clean) + " endseq"
    return data

# NOTE: this reassigns `data`, so re-running this cell on its own wraps the
# captions in startseq/endseq a second time; re-run the data preparation cell
# first when repeating it.
data = text_preprocessing(data)
captions = data['caption'].tolist()

# Tokenization with a reduced vocabulary size
num_words = 5000
tokenizer = Tokenizer(num_words=num_words)
tokenizer.fit_on_texts(captions)

# `word_index` lists every word seen, but with num_words set the tokenizer only
# emits ids below num_words. Size the vocabulary from that cap so the embedding
# and softmax layers do not carry classes that can never occur in the data.
vocab_size = min(num_words, len(tokenizer.word_index) + 1)

# Longest caption in whitespace-separated words; used as the padded input length.
max_length = max(len(caption.split()) for caption in captions)

# Save the Tokenizer
with open('/content/drive/My Drive/Colab Notebooks/tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Resolution used for feature extraction. This replaces the 64 assigned in the
# data preparation cell; the value is passed to MobileNetV2 (input_shape) and
# to load_img (target_size) in the next cell.
img_size = 224

In [None]:
# Using MobileNetV2 for feature extraction
base_model = MobileNetV2(input_shape=(img_size, img_size, 3), include_top=False, pooling='avg', weights='imagenet')

# Project the pooled MobileNetV2 output to the 4096-d vector that the
# captioning model and the data generator expect.
# NOTE: this Dense layer is randomly initialised and never trained, so the
# features depend on the weights drawn in this session.
x = Dense(4096, activation='relu')(base_model.output)
feature_extractor = Model(inputs=base_model.input, outputs=x)

# Persist the extractor next to the other artifacts; without it the same
# features cannot be recomputed for new images at inference time.
feature_extractor.save('/content/drive/My Drive/Colab Notebooks/feature_extractor.h5')

# The ImageNet weights expect MobileNetV2's own input scaling ([-1, 1]);
# dividing by 255 would feed the network a range it was not trained on.
preprocess_input = tf.keras.applications.mobilenet_v2.preprocess_input

# Extracting features in batches: one predict() call per image is dominated by
# per-call overhead.
extract_batch_size = 64
image_files = data['image'].unique().tolist()

features = {}
for start in tqdm(range(0, len(image_files), extract_batch_size)):
    batch_files = image_files[start:start + extract_batch_size]
    batch = np.stack([
        img_to_array(load_img(path, target_size=(img_size, img_size)))
        for path in batch_files
    ])
    batch_features = feature_extractor.predict(preprocess_input(batch), verbose=0)
    for path, feature in zip(batch_files, batch_features):
        # flatten() copies, so each entry owns its data instead of keeping the
        # whole batch array alive.
        features[path] = feature.flatten()

Downloading data from https://storage.googleapis.com/tensorflow/keras-applications/mobilenet_v2/mobilenet_v2_weights_tf_dim_ordering_tf_kernels_1.0_224_no_top.h5


100%|██████████| 79719/79719 [1:23:12<00:00, 15.97it/s]


In [None]:
# Persist the extracted image features (dict keyed by image path) to Drive.
features_path = '/content/drive/My Drive/Colab Notebooks/transformed_features.pkl'
with open(features_path, 'wb') as handle:
    pickle.dump(features, handle)

### Model Construction

Construct a model consisting of an image model and a text model that are concatenated and connected to a final dense layer with softmax activation. Regularization is applied to prevent overfitting.

In [None]:
# Constructing the Model with less complexity

# Image branch: 4096-d precomputed feature -> 256-d, L2-regularised, with dropout.
input_img = Input(shape=(4096,), name='image_input')
img_model = Dense(256, activation='relu', kernel_regularizer='l2')(input_img)
img_model = Dropout(0.5)(img_model)

# Text branch: padded token ids -> embedding -> LSTM -> 256-d, L2-regularised,
# with dropout. mask_zero=True makes the LSTM ignore the padding id 0.
input_txt = Input(shape=(max_length,), name='text_input')
embedding_layer = Embedding(input_dim=vocab_size, output_dim=256, mask_zero=True)(input_txt)
lstm_layer = LSTM(256)(embedding_layer)
txt_model = Dense(256, activation='relu', kernel_regularizer='l2')(lstm_layer)
txt_model = Dropout(0.5)(txt_model)

# The regularised branches must exist BEFORE they are merged. Layers created
# after `output` is built are not part of the graph and would have no effect.
merged = Concatenate(axis=-1)([img_model, txt_model])
output = Dense(vocab_size, activation='softmax')(merged)

# base_model (MobileNetV2) is not part of this model, because image features
# are precomputed. Freezing its layers here would not change training, so no
# freeze loop is needed.

# Hyperparameter tuning
opt = Adam(learning_rate=0.0001)

# Compiling the Model (pass the configured optimizer; the string 'adam' would
# silently fall back to the default learning rate)
model = Model(inputs=[input_img, input_txt], outputs=output)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])


### Data Generator and Model Training

Define a generator to yield batches of input-output pairs and train the model using this generator with early stopping.


In [None]:
def generator():
    """Yield one next-word training sample per caption prefix.

    For every row of ``data`` the image feature is looked up in ``features``
    (rows without a feature are reported and skipped). The caption is turned
    into token ids, and for each position ``i`` the first ``i`` ids (padded to
    ``max_length``) are paired with the one-hot encoding of id ``i``.

    Yields:
        ({'image_input': feature, 'text_input': padded_prefix}, one_hot_target)
    """
    for _, row in data.iterrows():
        image_key = row['image']
        if image_key not in features:
            # No extracted feature for this image: report it and move on.
            print(f"Feature for {row['image']} not found. Skipping...")
            continue
        image_vector = features[image_key].squeeze()

        token_ids = tokenizer.texts_to_sequences([row['caption']])[0]

        for split_at in range(1, len(token_ids)):
            padded_prefix = pad_sequences([token_ids[:split_at]], maxlen=max_length)[0]
            next_word = to_categorical([token_ids[split_at]], num_classes=vocab_size)[0]

            yield {'image_input': image_vector, 'text_input': padded_prefix}, next_word

# Describe one generator element: a dict of model inputs plus the one-hot target.
feature_spec = {
    'image_input': tf.TensorSpec(shape=(4096,), dtype=tf.float32),
    'text_input': tf.TensorSpec(shape=(max_length,), dtype=tf.int32),
}
target_spec = tf.TensorSpec(shape=(vocab_size,), dtype=tf.int32)

# Wrap the Python generator in a tf.data dataset
dataset = tf.data.Dataset.from_generator(
    generator,
    output_signature=(feature_spec, target_spec),
)

# Input pipeline, one stage per line: shuffle -> batch -> prefetch
batch_size = 64
dataset = dataset.shuffle(buffer_size=1000)
dataset = dataset.batch(batch_size)
dataset = dataset.prefetch(buffer_size=tf.data.AUTOTUNE)

In [None]:
# Training with more patience
# fit() receives no validation data, so the callback's default monitor
# ('val_loss') is never produced and early stopping / best-weight restoration
# would never run. Monitor the training loss instead.
early_stopping = EarlyStopping(monitor='loss', patience=4, restore_best_weights=True)
model.fit(dataset, epochs=10, callbacks=[early_stopping])

Epoch 1/10
  17902/Unknown - 2085s 116ms/step - loss: 3.7687 - accuracy: 0.3183



Epoch 2/10



Epoch 3/10



Epoch 4/10



Epoch 5/10



Epoch 6/10



Epoch 7/10



Epoch 8/10



Epoch 9/10



Epoch 10/10





<keras.src.callbacks.History at 0x7db18180c9d0>

In [None]:
# Save the Model
# Writes the trained captioning model to Drive as a single .h5 file, next to
# the tokenizer and features saved by earlier cells.
# NOTE(review): the recorded output of this cell shows a saving_api warning,
# presumably about the HDF5 format being legacy; confirm before changing the
# extension, since anything loading this path would need to change with it.
model.save('/content/drive/My Drive/Colab Notebooks/image_captioning_model.h5')

  saving_api.save_model(
