In [18]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import string
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.applications.inception_v3 import InceptionV3, preprocess_input
from tensorflow.keras.models import Model
from tensorflow.keras.layers import LSTM, Embedding, Dense, Dropout, add
from tensorflow.keras.utils import to_categorical
from PIL import Image
import cv2
import re

In [22]:
# Dataset location. Override with the CAPTION_DATA_DIR environment variable;
# the default is the original local download folder, so existing runs are unchanged.
data_dir = os.environ.get("CAPTION_DATA_DIR", r"C:\Users\kokar\Downloads\archive (42)")
captions_path = os.path.join(data_dir, "captions.txt")
# The trailing separator is kept so that both os.path.join(images_path, name)
# and images_path + name resolve to the same file.
images_path = os.path.join(data_dir, "Images", "")

# captions.txt is parsed as CSV; its two columns are renamed explicitly.
df = pd.read_csv(captions_path)
df.columns = ["image", "caption"]
print(df.head())

                       image  \
0  1000268201_693b08cb0e.jpg   
1  1000268201_693b08cb0e.jpg   
2  1000268201_693b08cb0e.jpg   
3  1000268201_693b08cb0e.jpg   
4  1000268201_693b08cb0e.jpg   

                                             caption  
0  A child in a pink dress is climbing up a set o...  
1              A girl going into a wooden building .  
2   A little girl climbing into a wooden playhouse .  
3  A little girl climbing the stairs to her playh...  
4  A little girl in a pink dress going into a woo...  


In [23]:
def clean_text(text):
    """Normalise a raw caption string.

    Steps, in order: lowercase, drop every character that is neither a word
    character nor whitespace, drop digit runs, collapse any whitespace run
    (spaces, tabs, newlines) into a single space, and trim both ends.

    Args:
        text: The caption as a ``str``.

    Returns:
        The cleaned caption with single spaces between words.
    """
    text = text.lower()
    text = re.sub(r"[^\w\s]", "", text)  # remove punctuation
    text = re.sub(r"\d+", "", text)  # remove numbers
    # Removing a standalone punctuation mark or number leaves two adjacent
    # spaces behind; collapsing whitespace also covers newlines and tabs.
    text = re.sub(r"\s+", " ", text)
    return text.strip()

In [24]:
# Normalise every caption element-wise with clean_text (result overwrites the column).
df["caption"] = df["caption"].map(clean_text)

In [25]:
# Display the frame to inspect the captions after cleaning.
df

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,a child in a pink dress is climbing up a set o...
1,1000268201_693b08cb0e.jpg,a girl going into a wooden building
2,1000268201_693b08cb0e.jpg,a little girl climbing into a wooden playhouse
3,1000268201_693b08cb0e.jpg,a little girl climbing the stairs to her playh...
4,1000268201_693b08cb0e.jpg,a little girl in a pink dress going into a woo...
...,...,...
40450,997722733_0cb5439472.jpg,a man in a pink shirt climbs a rock face
40451,997722733_0cb5439472.jpg,a man is rock climbing high in the air
40452,997722733_0cb5439472.jpg,a person in a red shirt climbing up a rock fac...
40453,997722733_0cb5439472.jpg,a rock climber in a red shirt


In [26]:
def _add_sequence_markers(caption):
    """Wrap a caption as "<start> ... <end>" unless it is already wrapped.

    The guard makes this cell safe to re-run: without it, every extra run
    would prepend another "<start> " and append another " <end>".
    """
    if caption.startswith("<start> ") and caption.endswith(" <end>"):
        return caption
    return "<start> " + caption + " <end>"


df["caption"] = df["caption"].apply(_add_sequence_markers)

In [27]:
# Display the frame to inspect the captions after the <start>/<end> markers were added.
df

Unnamed: 0,image,caption
0,1000268201_693b08cb0e.jpg,<start> a child in a pink dress is climbing up...
1,1000268201_693b08cb0e.jpg,<start> a girl going into a wooden building <end>
2,1000268201_693b08cb0e.jpg,<start> a little girl climbing into a wooden p...
3,1000268201_693b08cb0e.jpg,<start> a little girl climbing the stairs to h...
4,1000268201_693b08cb0e.jpg,<start> a little girl in a pink dress going in...
...,...,...
40450,997722733_0cb5439472.jpg,<start> a man in a pink shirt climbs a rock fa...
40451,997722733_0cb5439472.jpg,<start> a man is rock climbing high in the air...
40452,997722733_0cb5439472.jpg,<start> a person in a red shirt climbing up a ...
40453,997722733_0cb5439472.jpg,<start> a rock climber in a red shirt <end>


In [47]:
# Map each image file name to the list of its captions, in row order.
# Iterating the two columns directly avoids building a Series per row, which
# is what DataFrame.iterrows() does; the resulting dict is the same.
image_captions = {}
for img, caption in zip(df["image"], df["caption"]):
    image_captions.setdefault(img, []).append(caption)
print(list(image_captions.items())[:2])

[('1000268201_693b08cb0e.jpg', ['<start> a child in a pink dress is climbing up a set of stairs in an entry way <end>', '<start> a girl going into a wooden building <end>', '<start> a little girl climbing into a wooden playhouse <end>', '<start> a little girl climbing the stairs to her playhouse <end>', '<start> a little girl in a pink dress going into a wooden cabin <end>']), ('1001773457_577c3a7d70.jpg', ['<start> a black dog and a spotted dog are fighting <end>', '<start> a black dog and a tricolored dog playing with each other on the road <end>', '<start> a black dog and a white dog with brown spots are staring at each other in the street <end>', '<start> two dogs of different breeds looking at each other on the road <end>', '<start> two dogs on pavement moving toward each other <end>'])]


Understanding InceptionV3:

InceptionV3 is a deep convolutional neural network (CNN) designed for image classification.
It was introduced by Google and is well-known for its efficiency and accuracy.
The pretrained weights were trained on the ImageNet ILSVRC-2012 subset, which has about 1.28 million training images across 1,000 classes (the full ImageNet database contains over 14 million images).

What is avg_pool in InceptionV3?

In InceptionV3, avg_pool is the name of the Global Average Pooling (GAP) layer. It is the second-to-last layer of the model, just before the final classification layer, and its 2048-dimensional output is what this notebook uses as the image feature vector.

Understanding Global Average Pooling (avg_pool)

The avg_pool layer reduces the spatial dimensions (height & width) of the feature maps to 1x1 per channel by computing the average of each feature map.
Because it has no trainable weights, Global Average Pooling needs far fewer parameters than flattening the feature maps into a fully connected layer, which helps prevent overfitting.

In [49]:
# Pretrained InceptionV3, truncated at the "avg_pool" layer so that predict()
# returns the pooled feature vector instead of class probabilities.
base_model = InceptionV3(weights="imagenet")
model = Model(inputs=base_model.input, outputs=base_model.get_layer("avg_pool").output)


def extract_features(img_path):
    """Return the InceptionV3 "avg_pool" features for one image file.

    Args:
        img_path: Path to an image file readable by PIL.

    Returns:
        The array produced by ``model.predict`` for a batch containing this
        single image.
    """
    # Force three channels so grayscale / RGBA / palette files still produce
    # a (299, 299, 3) array; the context manager closes the file handle.
    with Image.open(img_path) as raw:
        rgb = raw.convert("RGB").resize((299, 299))  # InceptionV3 input size
    # Keep the raw 0-255 pixel range: preprocess_input performs the scaling
    # to [-1, 1] itself, so dividing by 255 beforehand would scale twice.
    pixels = np.asarray(rgb, dtype=np.float32)
    batch = np.expand_dims(pixels, axis=0)  # (299, 299, 3) -> (1, 299, 299, 3)
    batch = preprocess_input(batch)
    return model.predict(batch)

In [56]:
# Run the feature extractor on the first image as a smoke test.
# The variable must be named `sample_image`, the name read on the next line;
# the previous `simple_image` assignment left it undefined on a fresh kernel.
sample_image = next(iter(image_captions))
image_features = extract_features(os.path.join(images_path, sample_image))

print(f"Feature Shape: {image_features.shape}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 112ms/step
Feature Shape: (1, 2048)


In [68]:
# tokenizer.fit_on_texts(all_caption) builds the word -> index vocabulary from every caption.
# tokenizer.texts_to_sequences(captions) then turns captions into lists of those indices.
# NOTE(review): Tokenizer's default `filters` strip "<" and ">", so the sequence
# markers end up in the vocabulary as "start" and "end" - confirm downstream lookups.
all_caption = [text for caption_list in image_captions.values() for text in caption_list]
tokenizer = Tokenizer()
tokenizer.fit_on_texts(all_caption)
all_caption[0]

'<start> a child in a pink dress is climbing up a set of stairs in an entry way <end>'

In [70]:
# Number of distinct tokens plus one: Keras Tokenizer word indices start at 1,
# and index 0 is reserved (never assigned to a word).
vocab_size = len(tokenizer.word_index) + 1

In [73]:
def encode_caption(captions):
    """Encode caption strings as integer sequences with the fitted `tokenizer`.

    Args:
        captions: The caption texts to encode.

    Returns:
        Whatever ``tokenizer.texts_to_sequences`` returns for ``captions``.
    """
    sequences = tokenizer.texts_to_sequences(captions)
    return sequences

In [75]:
# For every image, replace its caption strings with their encoded index sequences.
captions_sequences = {
    image_name: encode_caption(caption_list)
    for image_name, caption_list in image_captions.items()
}