<a href="https://colab.research.google.com/github/upputurirajkumar/CodeSoft-internship/blob/main/Image_Captioning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [19]:
pip install tensorflow keras numpy pillow



In [2]:
import numpy as np
import tensorflow as tf
from PIL import Image
from tensorflow.keras.applications import VGG16
from tensorflow.keras.applications.vgg16 import preprocess_input
from tensorflow.keras.layers import Input, Dense, LSTM, Embedding, Dropout, add
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

In [3]:
def preprocess_image(image_path):
    """Load an image file and prepare it as a one-image batch for VGG16.

    Args:
        image_path: Path of the image file to load.

    Returns:
        The image resized to 224x224, converted to an array, given a leading
        batch axis, and passed through VGG16's ``preprocess_input``.
    """
    # Resize on load to the 224x224 input size used by the VGG16 model.
    image = load_img(image_path, target_size=(224, 224))
    image = img_to_array(image)
    # Model.predict works on batches, so add a leading axis of length 1.
    image = np.expand_dims(image, axis=0)
    # The pretrained VGG16 weights expect the network's own preprocessing
    # applied to 0-255 pixel values. A plain 0-1 rescale feeds the network
    # inputs unlike those it was trained on and degrades the features.
    return preprocess_input(image)

In [4]:
def _get_feature_extractor():
    """Return the truncated VGG16 model, building it on first use only."""
    extractor = getattr(_get_feature_extractor, "_cached", None)
    if extractor is None:
        base = VGG16()
        # Cut the network at its second-to-last layer so that layer's output
        # becomes the image feature vector instead of the final predictions.
        extractor = Model(inputs=base.inputs, outputs=base.layers[-2].output)
        _get_feature_extractor._cached = extractor
    return extractor


def extract_features(image_path):
    """Compute the VGG16 feature array for one image file.

    The underlying network is constructed once and reused across calls;
    rebuilding it per image would reload all pretrained weights every time.

    Args:
        image_path: Path of the image file to encode.

    Returns:
        The array produced by ``predict`` on the truncated VGG16 model for the
        preprocessed image.
    """
    extractor = _get_feature_extractor()
    image = preprocess_image(image_path)
    features = extractor.predict(image)
    return features


In [7]:
def load_captions(file_path):
    """Read a captions file and return its lines.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one string per line of the file. Each string keeps its
        trailing newline, exactly as ``readlines`` returns it.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file content is not valid UTF-8.
    """
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as file:
        return file.readlines()

# Load the raw caption lines. The path below points into Colab's sample_data
# directory and must be changed to wherever captions.txt actually lives.
captions = load_captions('/content/sample_data/captions.txt')  # Replace with the actual path
# NOTE(review): preprocess_captions is not defined in any visible cell of this
# notebook, so this line raises NameError on a fresh kernel unless it is
# defined elsewhere — confirm and restore its definition above this cell.
# From the unpacking below it must return (tokenizer, sequences, max_length).
tokenizer, sequences, max_length = preprocess_captions(captions)
# One more than the number of known words; presumably this accounts for
# index 0 being reserved for padding by the Keras Tokenizer — confirm.
vocab_size = len(tokenizer.word_index) + 1

In [8]:
def define_model(vocab_size, max_length, feature_dim=4096, units=256,
                 embedding_dim=256, dropout_rate=0.5):
    """Build and compile the two-input captioning model.

    One branch projects the image feature vector, the other encodes the
    partial caption with an embedding followed by an LSTM. The two branches
    are summed and decoded into a softmax over the vocabulary.

    Args:
        vocab_size: Number of classes in the output softmax and number of rows
            in the embedding table.
        max_length: Length of the (padded) input word-index sequence.
        feature_dim: Length of the image feature vector. The default of 4096
            is the value the original code hard-coded.
        units: Width of the image projection, the LSTM and the decoder layer.
            Both branches share it because their outputs are summed.
        embedding_dim: Size of each word embedding.
        dropout_rate: Dropout rate applied on both input branches.

    Returns:
        A compiled Keras ``Model`` taking ``[image_features, sequence]`` and
        returning a probability distribution over the vocabulary.
    """
    # Feature extractor (CNN) branch
    inputs1 = Input(shape=(feature_dim,))
    fe1 = Dropout(dropout_rate)(inputs1)
    fe2 = Dense(units, activation='relu')(fe1)

    # Sequence processor (RNN) branch; mask_zero makes the LSTM skip index 0,
    # which is the padding value.
    inputs2 = Input(shape=(max_length,))
    se1 = Embedding(vocab_size, embedding_dim, mask_zero=True)(inputs2)
    se2 = Dropout(dropout_rate)(se1)
    se3 = LSTM(units)(se2)

    # Decoder: element-wise sum of the two branches, then classify.
    decoder1 = add([fe2, se3])
    decoder2 = Dense(units, activation='relu')(decoder1)
    outputs = Dense(vocab_size, activation='softmax')(decoder2)

    # Combine into one model
    model = Model(inputs=[inputs1, inputs2], outputs=outputs)
    model.compile(loss='categorical_crossentropy', optimizer='adam')
    return model

# Instantiate the captioning network from the vocabulary size and maximum
# caption length computed in the captions cell.
# NOTE(review): no training call (e.g. model.fit) is visible in this notebook,
# so the weights may still be at their random initialisation when captions are
# generated later — confirm.
model = define_model(vocab_size, max_length)


In [15]:
def data_generator(captions, images, tokenizer, max_length, vocab_size):
    """Yield single-sample training batches for the captioning model, forever.

    Each caption is expanded into one sample per word to predict: the input is
    the image features plus the words seen so far, and the target is the next
    word. The previous version one-hot encoded the entire padded input
    sequence as the target, which does not match the model's single softmax
    output per sample.

    Args:
        captions: Sequence of caption strings; ``captions[i]`` belongs to
            ``images[i]``.
        images: Indexable collection of image features. Assumes each entry
            already has a leading batch axis of 1 — TODO confirm against the
            code that builds it.
        tokenizer: Object exposing ``texts_to_sequences``.
        max_length: Length every input sequence is padded/truncated to.
        vocab_size: Number of classes in the one-hot target.

    Yields:
        ``([image_features, input_sequence], target)`` where ``input_sequence``
        is the padded prefix as returned by ``pad_sequences`` for one sequence
        and ``target`` is the one-hot encoding of the next word.

    Raises:
        ValueError: If a complete pass over ``captions`` produces no sample
            (no captions, or none encodes to at least two tokens). Without
            this check the outer loop would spin forever without yielding.
    """
    while True:
        produced = False
        for i, caption in enumerate(captions):
            # Encode the caption as a list of word indices.
            encoded = tokenizer.texts_to_sequences([caption])[0]
            # Split into (prefix -> next word) pairs; a caption of n tokens
            # gives n - 1 samples.
            for split in range(1, len(encoded)):
                in_seq = pad_sequences([encoded[:split]], maxlen=max_length)
                out_word = to_categorical([encoded[split]], num_classes=vocab_size)
                produced = True
                yield [images[i], in_seq], out_word
        if not produced:
            raise ValueError(
                "data_generator: no training samples could be built from the given captions"
            )

In [18]:
def generate_caption(model, tokenizer, image, max_length):
    """Greedily decode a caption for one image.

    Starting from ``'startseq'``, the model is repeatedly asked for the next
    word given the image features and the text so far. The highest-scoring
    word is appended each time.

    Args:
        model: Object whose ``predict([image, sequence], verbose=0)`` returns
            scores over the vocabulary.
        tokenizer: Object exposing ``texts_to_sequences`` and an
            ``index_word`` mapping from word index to word.
        image: Image features passed to the model unchanged.
        max_length: Padded sequence length, also the maximum number of words
            to generate.

    Returns:
        The generated text, beginning with ``'startseq'`` and ending with
        ``'endseq'`` if the model produced it. Decoding stops early at
        ``'endseq'`` or when the predicted index has no word.
    """
    in_text = 'startseq'
    for _ in range(max_length):
        sequence = tokenizer.texts_to_sequences([in_text])[0]
        sequence = pad_sequences([sequence], maxlen=max_length)
        yhat = model.predict([image, sequence], verbose=0)
        # Plain int so the dictionary lookup does not depend on NumPy scalar
        # hashing.
        word_id = int(np.argmax(yhat))
        # .get instead of [] so an index with no word (e.g. the padding index
        # 0) ends decoding. Direct indexing raised KeyError, which made the
        # None check below unreachable.
        word = tokenizer.index_word.get(word_id)
        if word is None:
            break
        in_text += ' ' + word
        if word == 'endseq':
            break
    return in_text

# Example usage
# Path of the image to caption; points into Colab's sample_data directory.
image_path = '/content/sample_data/image.jpg'
# Despite its name, `image` holds the feature array returned by
# extract_features, not pixel data.
image = extract_features(image_path)
caption = generate_caption(model, tokenizer, image, max_length)
# The printed string still begins with the 'startseq' marker, and ends with
# 'endseq' when the decoder emitted it.
print(caption)


startseq 1685990174 1685990174 1685990174 1685990174 1685990174 1685990174 1685990174 1685990174 1685990174 1685990174 1685990174 1685990174 1685990174 1685990174 1685990174 1685990174 1685990174 1685990174 1685990174 1685990174 1685990174 1685990174 1685990174 1685990174 1685990174 1685990174 1685990174 1685990174 1685990174 1685990174 1685990174 1685990174 1685990174 1685990174 1685990174 1685990174 1685990174 1685990174 1685990174 1685990174
