In [8]:
# Show the kernel's current working directory (IPython line magic).
%pwd

'/content'

In [9]:
# Switch to the project folder on Google Drive; later cells read "captions.txt" and
# write the saved model/pickles relative to this directory.
# NOTE(review): this cell is placed before the drive.mount() cell in the notebook; on a
# fresh runtime the mount presumably has to run first - confirm the cell order.
%cd /content/drive/MyDrive/icg

/content/drive/MyDrive/icg


In [3]:
# Mount Google Drive at /content/drive so the project folder under MyDrive is reachable.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
# Fetch the NLTK "punkt" / "punkt_tab" tokenizer resources; nltk.word_tokenize is
# called on every caption in the caption-loading cell.
import nltk
nltk.download("punkt")
nltk.download('punkt_tab')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [11]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from PIL import Image
import nltk

In [12]:
# Read the caption table: the file's own header row is discarded (header=0) and the
# two columns are named "image" and "caption".
df = pd.read_csv(
    "captions.txt",
    delimiter=",",
    header=0,
    names=["image", "caption"],
)
# Missing captions become empty strings; every caption is then lowercased and split
# into a list of NLTK word tokens.
df["caption"] = (
    df["caption"]
    .fillna("")
    .apply(lambda text: nltk.word_tokenize(str(text).lower()))
)

In [13]:
# Convert tokenized lists back to strings for the Keras Tokenizer.
# Every caption is wrapped in "startseq" / "endseq" markers so that both words end up
# in the tokenizer vocabulary: generate_caption() seeds decoding with
# tokenizer.word_index["startseq"] and stops on "endseq", which would raise KeyError /
# never terminate early if the markers were absent from the training captions.
df["caption_str"] = df["caption"].apply(
    lambda tokens: " ".join(["startseq", *tokens, "endseq"])
)

In [14]:
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.applications.resnet50 import preprocess_input
from tensorflow.keras.preprocessing import image
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tqdm import tqdm

In [15]:
# ImageNet-pretrained ResNet50 used purely as a feature extractor: the classification
# head is dropped (include_top=False) and the output is average-pooled (pooling="avg").
resnet = ResNet50(
    include_top=False,
    pooling="avg",
    weights="imagenet",
)

In [16]:
def extract_features(image_path):
    """Return the ResNet50 feature vector for a single image file.

    The image is opened with PIL, forced to 3-channel RGB, resized to 224x224,
    run through ``preprocess_input`` and then through the module-level ``resnet``
    model.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The squeezed ``resnet.predict`` output for the image, or a zero vector of
        length 2048 when the image cannot be processed (the error is printed and
        processing continues, so one bad file does not abort a long run).
    """
    try:
        # The context manager closes the file handle once the pixels are loaded.
        # convert("RGB") matters: grayscale, RGBA or palette images would otherwise
        # yield an array without exactly 3 channels and fall through to the
        # zero-vector fallback below.
        with Image.open(image_path) as img_file:
            img = img_file.convert("RGB").resize((224, 224))
        img = np.array(img, dtype=np.float32)  # Ensure correct type
        img = np.expand_dims(img, axis=0)  # Add batch dimension
        img = preprocess_input(img)  # Apply ResNet preprocessing
        # verbose=0: a progress bar per image floods the cell output.
        features = resnet.predict(img, verbose=0)
        return features.squeeze()  # Remove the batch dimension
    except Exception as e:
        print(f"Error processing {image_path}: {e}")
        # float32 keeps the fallback consistent with the float32 input fed to the model.
        return np.zeros((2048,), dtype=np.float32)

In [17]:
# Directory on Google Drive that holds the image files
image_path = "/content/drive/MyDrive/icg/Images"

# Prefix every filename in the "image" column with that directory
df["full_image_path"] = [
    os.path.join(image_path, file_name) for file_name in df["image"]
]

In [15]:
# Batch process image features
def batch_extract_features(image_paths, batch_size=32):
    """Extract one feature vector per entry of ``image_paths``.

    Paths are walked in chunks of ``batch_size`` (this only drives the progress bar;
    each image is still passed to ``extract_features`` individually). A path that
    occurs more than once is processed a single time and its vector reused, which
    avoids re-running the network when the caption table lists the same image on
    several rows.

    Args:
        image_paths: Sequence of (hashable) image paths, e.g. a list of strings.
        batch_size: Number of paths handled per progress-bar step.

    Returns:
        A 2-D array with one row per input path, in input order. For an empty
        input an array of shape (0, 2048) is returned instead of letting
        ``np.vstack`` raise; 2048 matches the fallback vector length used by
        ``extract_features``.
    """
    if len(image_paths) == 0:
        return np.zeros((0, 2048), dtype=np.float32)

    features_by_path = {}  # path -> feature vector, so duplicates are computed once
    image_features = []
    for i in tqdm(range(0, len(image_paths), batch_size), desc="Processing Batches"):
        batch = image_paths[i:i + batch_size]
        for img in batch:
            if img not in features_by_path:
                features_by_path[img] = extract_features(img)
        image_features.append(np.array([features_by_path[img] for img in batch]))
    return np.vstack(image_features)

# Extract a feature vector for every row of the caption table, then drop any
# length-1 axes from the stacked result.
all_image_paths = df["full_image_path"].tolist()
image_features = np.squeeze(
    batch_extract_features(all_image_paths, batch_size=32)
)

Output hidden; open in https://colab.research.google.com to view.

In [18]:
# Fit a Keras Tokenizer on the caption strings
tokenizer = Tokenizer()
tokenizer.fit_on_texts(df["caption_str"])
# +1 because the tokenizer's word indices start at 1 (index 0 is left for padding)
vocab_size = 1 + len(tokenizer.word_index)

In [19]:
# Map each caption string to its list of word indices
sequences = tokenizer.texts_to_sequences(df["caption_str"])
# Longest caption, in tokens; every caption is zero-padded on the right to this length
max_len = max(map(len, sequences))
padded_captions = pad_sequences(sequences, padding="post", maxlen=max_len)

In [20]:
# Materialise the padded captions as a (separate) NumPy array for the text input
X_captions = np.array(padded_captions, copy=True)

In [21]:
# One-hot encode labels: Y_labels[i, j, w] == 1 when token j of caption i is word w.
# float32 instead of NumPy's default float64 halves the footprint of this
# (captions, max_len, vocab_size) tensor without losing anything (values are 0/1).
# NOTE(review): the model defined later in this notebook emits a single
# (vocab_size,) distribution per sample, while this tensor has one distribution per
# time step - confirm the training cell consumes targets of a matching shape.
Y_labels = np.zeros((len(sequences), max_len, vocab_size), dtype=np.float32)
for i, seq in enumerate(sequences):
    word_ids = np.asarray(seq, dtype=np.int64)
    positions = np.flatnonzero(word_ids)  # Ignore padding (index 0)
    # One vectorised assignment per caption instead of a Python loop per token
    Y_labels[i, positions, word_ids[positions]] = 1

In [22]:
# Define the model
import tensorflow as tf
from tensorflow.keras.layers import Embedding, LSTM, Dense, Input, Dropout

In [23]:
def build_model():
    """Assemble the two-input captioning network.

    Reads the notebook globals ``max_len`` (caption input length) and
    ``vocab_size`` (embedding rows / softmax width).

    Returns:
        An uncompiled Keras ``Model`` taking ``[image_features, caption_tokens]``
        and producing one softmax distribution over the vocabulary per sample.
    """
    # Image branch: 2048-d feature vector -> 256-d projection
    image_in = Input(shape=(2048,))
    image_branch = Dense(256, activation="relu")(image_in)

    # Caption branch: token ids -> masked embedding -> LSTM output at every step
    caption_in = Input(shape=(max_len,))
    embedded = Embedding(vocab_size, 256, mask_zero=True)(caption_in)
    lstm_steps = LSTM(256, return_sequences=True)(embedded)

    # Fusion: average the LSTM outputs over time and add them to the image projection
    fuse = tf.keras.layers.Add()
    caption_branch = tf.keras.layers.GlobalAveragePooling1D()(lstm_steps)
    fused = fuse([image_branch, caption_branch])

    # Classification head over the vocabulary
    hidden = Dense(256, activation="relu")(fused)
    word_probs = Dense(vocab_size, activation="softmax")(hidden)

    return tf.keras.models.Model(inputs=[image_in, caption_in], outputs=word_probs)

# Instantiate the network, attach the Adam optimizer with a categorical
# cross-entropy loss, and print the layer summary
model = build_model()
model.compile(
    optimizer="adam",
    loss="categorical_crossentropy",
)
model.summary()

In [None]:
# Train the model
# The network emits ONE softmax over the vocabulary per sample and generate_caption()
# queries it with a partial caption to obtain the next word. It therefore has to be
# trained on (image, caption prefix) -> next word pairs. Feeding complete captions
# with a (captions, max_len, vocab_size) target cannot match the (batch, vocab_size)
# output. The pairs are one-hot encoded per batch so the full target tensor is never
# materialised.
def _caption_prefix_pairs(token_sequences):
    """Return (rows, cuts): for caption ``rows[k]`` predict token ``cuts[k]`` from the tokens before it."""
    rows, cuts = [], []
    for row, seq in enumerate(token_sequences):
        for cut in range(1, len(seq)):
            rows.append(row)
            cuts.append(cut)
    return np.asarray(rows, dtype=np.int64), np.asarray(cuts, dtype=np.int64)


def _next_word_batches(features, token_sequences, rows, cuts, seq_len, n_classes, batch_size):
    """Yield ((image_features, padded_prefixes), one_hot_next_word) batches forever, reshuffled every pass."""
    while True:
        order = np.random.permutation(len(rows))
        for start in range(0, len(order), batch_size):
            picked = order[start:start + batch_size]
            batch_rows, batch_cuts = rows[picked], cuts[picked]
            # "post" padding matches what generate_caption() feeds at inference time
            prefixes = pad_sequences(
                [token_sequences[r][:c] for r, c in zip(batch_rows, batch_cuts)],
                maxlen=seq_len,
                padding="post",
            )
            next_words = [token_sequences[r][c] for r, c in zip(batch_rows, batch_cuts)]
            # Inputs are yielded as a tuple (not a list): nested tuples are the
            # structure Keras generators are documented to accept.
            yield (features[batch_rows], prefixes), to_categorical(next_words, num_classes=n_classes)


TRAIN_BATCH_SIZE = 32
pair_rows, pair_cuts = _caption_prefix_pairs(sequences)
if len(pair_rows) == 0:
    raise ValueError("No caption has at least two tokens; there is nothing to train on.")

model.fit(
    _next_word_batches(
        image_features, sequences, pair_rows, pair_cuts,
        max_len, vocab_size, TRAIN_BATCH_SIZE,
    ),
    # The generator never stops, so one epoch is defined as one pass over all pairs
    steps_per_epoch=int(np.ceil(len(pair_rows) / TRAIN_BATCH_SIZE)),
    epochs=10,
)

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Function to generate caption
def generate_caption(model, tokenizer, image_path, max_len):
    """Greedily decode a caption for one image.

    Decoding starts from the "startseq" token and repeatedly asks ``model`` for the
    most probable next word given the image features and the words produced so far.

    Args:
        model: Model taking ``[image_features, padded_token_ids]`` and returning a
            distribution over the vocabulary for the next word.
        tokenizer: Fitted tokenizer exposing ``word_index`` (word -> index).
        image_path: Path of the image to caption.
        max_len: Padded caption length expected by ``model``; also the maximum
            number of decoding steps.

    Returns:
        The generated words joined by spaces, without the "startseq" marker.

    Raises:
        KeyError: If "startseq" is not part of the tokenizer vocabulary.
    """
    start_token, end_token = "startseq", "endseq"
    word_index = tokenizer.word_index
    if start_token not in word_index:
        raise KeyError(
            f"'{start_token}' is not in the tokenizer vocabulary; fit the tokenizer on "
            f"captions wrapped in '{start_token} ... {end_token}'"
        )
    # Loop-invariant lookups are resolved once, before decoding starts
    end_index = word_index.get(end_token, 0)
    reverse_word_map = {index: word for word, index in word_index.items()}

    # Extract features from the test image and add the batch dimension
    image_feature = np.expand_dims(extract_features(image_path), axis=0)

    caption_seq = [word_index[start_token]]
    for _ in range(max_len):
        # Pad sequence to match max_len
        sequence_padded = pad_sequences([caption_seq], maxlen=max_len, padding="post")

        # Predict next word; int() keeps plain Python ints in the running sequence
        preds = model.predict([image_feature, sequence_padded], verbose=0)
        next_word_index = int(np.argmax(preds[0]))

        # Stop on the end token, and also on index 0: that is the padding slot, not
        # a word, and appending it would put a masked hole inside the prefix.
        if next_word_index == end_index or next_word_index == 0:
            break

        caption_seq.append(next_word_index)

    # Convert word indices back to text, skipping the leading start marker
    return " ".join(
        reverse_word_map[idx] for idx in caption_seq[1:] if idx in reverse_word_map
    )

# Caption a single image as a smoke test
test_image_path = "/content/drive/MyDrive/icg/Images/sample.jpg"  # Change to an actual test image path
predicted_caption = generate_caption(
    model=model,
    tokenizer=tokenizer,
    image_path=test_image_path,
    max_len=max_len,
)
print("Predicted Caption:", predicted_caption)


In [None]:
import pickle

# Persist the trained network in HDF5 format
model.save("image_captioning_model.h5")

# Pickle the fitted tokenizer and max_len; both are needed again at prediction time
for pickle_name, payload in (("tokenizer.pkl", tokenizer), ("max_len.pkl", max_len)):
    with open(pickle_name, "wb") as f:
        pickle.dump(payload, f)

print("Model and tokenizer saved successfully! 🎉")
