# Libraries

In [1]:
from PIL import Image
import numpy as np
import os
import pickle
from tqdm.notebook import tqdm
from tensorflow.keras.applications.vgg16 import VGG16, preprocess_input
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.models import Model
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical, plot_model
from tensorflow.keras.layers import Embedding, Dropout, Dense, Input, LSTM, add
import gradio as gr
import time




# Load Model

In [2]:
# Load the trained captioning model from its saved HDF5 checkpoint.
# NOTE(review): the path below is an absolute location on the author's machine;
# point it at your own copy of best_model.h5 before running elsewhere.
import tensorflow
new_model = tensorflow.keras.models.load_model(r"D:\Mechine Learning\Project\Image Captioning\best_model.h5")




In [3]:
# Print the layer-by-layer summary of the loaded captioning model.
new_model.summary()

Model: "model_1"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_3 (InputLayer)        [(None, 35)]                 0         []                            
                                                                                                  
 input_2 (InputLayer)        [(None, 4096)]               0         []                            
                                                                                                  
 embedding (Embedding)       (None, 35, 256)              2171648   ['input_3[0][0]']             
                                                                                                  
 dropout (Dropout)           (None, 4096)                 0         ['input_2[0][0]']             
                                                                                            

In [4]:
# Load the pickled image features and the fitted tokenizer.
# Context managers close each file as soon as it has been read; a bare
# open() passed straight to pickle.load leaves the handle open until it is
# garbage collected.
# NOTE(review): pickle.load can execute arbitrary code while unpickling --
# only load files you created yourself or otherwise trust.
# NOTE(review): paths are absolute locations on the author's machine.
with open(r"D:\Mechine Learning\Project\Image Captioning\features.pkl", "rb") as features_file:
    features = pickle.load(features_file)
with open(r"D:\Mechine Learning\Project\Image Captioning\tokenizer.pkl", "rb") as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)

In [5]:
# Number of distinct words known to the tokenizer, plus one extra index.
# NOTE(review): the extra slot is presumably index 0 used for padding -- confirm
# against the training notebook.
vocab_size = len(tokenizer.word_index)+1
# Length every token sequence is padded to before it is fed to the model;
# matches the 35-wide text input (input_3) shown in the model summary.
max_length = 35

## Generate Captions

In [6]:
def idx_to_word(integer, tokenizer):
    """Return the word the tokenizer maps to index ``integer``, or ``None``.

    Args:
        integer: Token index to look up (a plain or NumPy integer).
        tokenizer: Object exposing a ``word_index`` dict (word -> index) and,
            optionally, the reverse ``index_word`` dict (index -> word).

    Returns:
        The matching word, or ``None`` when no word has that index.
    """
    # Fast path: a direct dictionary lookup instead of scanning the whole
    # vocabulary for every generated token.
    reverse_lookup = getattr(tokenizer, "index_word", None)
    if reverse_lookup:
        try:
            found = reverse_lookup.get(integer)
        except TypeError:
            # Unhashable key; let the scan below decide.
            found = None
        if found is not None:
            return found
    # Fallback for tokenizers without a reverse map (or a miss in it):
    # same linear scan as before, so results stay identical.
    for word, index in tokenizer.word_index.items():
        if index == integer:
            return word
    return None

In [7]:
def predict_caption(model, image, tokenizer, max_length):
    """Greedily decode a caption, one word per model call.

    Decoding starts from the marker word 'start' and stops after
    ``max_length`` steps, when the predicted index has no word, or right
    after the marker word 'end' has been appended.

    Args:
        model: Model whose ``predict`` takes ``[image, padded_sequence]``.
        image: First model input (process_image passes the VGG16 feature here).
        tokenizer: Tokenizer used to encode the text generated so far.
        max_length: Maximum number of decoding steps and the padding length.

    Returns:
        The generated text, including the leading 'start' marker and, when
        it was predicted, the trailing 'end' marker.
    """
    words = ['start']
    for _ in range(max_length):
        # Encode everything generated so far and pad it to the model's input width.
        token_ids = tokenizer.texts_to_sequences([" ".join(words)])[0]
        padded_ids = pad_sequences([token_ids], max_length)
        # The most probable index becomes the next word.
        probabilities = model.predict([image, padded_ids], verbose=0)
        next_word = idx_to_word(np.argmax(probabilities), tokenizer)
        if next_word is None:
            # The predicted index has no word; stop with what we have.
            break
        words.append(next_word)
        if next_word == 'end':
            break

    return " ".join(words)

# VGG16 Model

In [8]:
# Build VGG16 with its default constructor arguments.
vgg_model = VGG16()
# Re-wire the network so it outputs the second-to-last layer instead of the
# final one; process_image feeds this output to the captioning model.
# NOTE(review): presumably this is the 4096-wide layer matching input_2 in the
# model summary -- confirm.
vgg_model = Model(inputs=vgg_model.inputs, outputs=vgg_model.layers[-2].output)




# Story Generation

In [9]:
import google.generativeai as genai
import os


# Read the Gemini API key from the environment; never paste a literal key into
# the notebook, because notebooks get shared and committed.
# SECURITY: any key that has ever been stored in this file should be treated as
# leaked -- revoke it and issue a new one.
gemini_api_key = os.environ.get("GEMINI_API_KEY")
if not gemini_api_key:
    raise RuntimeError(
        "GEMINI_API_KEY is not set. Export it in the environment that launches "
        "the notebook kernel before running this cell."
    )
genai.configure(api_key = gemini_api_key)

In [10]:
from IPython.display import Markdown

# Gemini text model; process_image calls its generate_content() to turn a
# caption into a short story.
model = genai.GenerativeModel('gemini-pro')


In [11]:
def _strip_caption_markers(raw_caption):
    """Remove the leading 'start' and trailing 'end' marker words, if present."""
    words = raw_caption.split()
    if words and words[0] == 'start':
        words = words[1:]
    if words and words[-1] == 'end':
        words = words[:-1]
    return " ".join(words)


def _stream_text(text, delay=0.03):
    """Yield ever-longer prefixes of ``text``, sleeping ``delay`` seconds before each."""
    for end in range(1, len(text) + 1):
        time.sleep(delay)
        yield text[:end]


def process_image(input_image):
    """Caption an image, expand the caption into a story, and stream both.

    This is a generator: it yields ``(caption_text, story_text)`` pairs so the
    two output boxes fill in character by character. The caption is streamed
    first, then the story is requested from Gemini and streamed.

    Args:
        input_image: Image as an array accepted by ``PIL.Image.fromarray``.

    Yields:
        Tuples ``(caption, story)`` holding the text to display so far.

    Raises:
        gr.Error: If no image was supplied.
    """
    if input_image is None:
        raise gr.Error("Please upload an image first.")

    import tempfile

    # A unique temp file per call: a fixed file name would be overwritten or
    # deleted by another request running at the same time.
    fd, temp_img_path = tempfile.mkstemp(suffix=".jpg")
    os.close(fd)
    try:
        # JPEG cannot store an alpha channel, so normalise to RGB before saving.
        Image.fromarray(input_image).convert("RGB").save(temp_img_path)
        image = load_img(temp_img_path, target_size=(224, 224))
    finally:
        # Remove the file even if saving or loading failed.
        os.remove(temp_img_path)

    image = np.array(image)
    # Add the leading batch dimension expected by predict().
    image = image.reshape((1, image.shape[0], image.shape[1], image.shape[2]))
    image = preprocess_input(image)
    feature = vgg_model.predict(image, verbose=0)

    # Strip the marker words by content, not by fixed character offsets:
    # decoding can stop without ever emitting 'end', and slicing would then
    # cut real characters off the caption.
    caption = _strip_caption_markers(
        predict_caption(new_model, feature, tokenizer, max_length)
    )

    # Show the caption first so the user is not staring at an empty box while
    # the story is being generated.
    for partial_caption in _stream_text(caption):
        yield partial_caption, ""

    response = model.generate_content(f'''Write a short story in 5 lines in a creative way with the caption
                                        generated by my Image captioning model.
                                        caption of my image :{caption}
                                        Now generate story in simple english.''')
    story = response.text

    for partial_story in _stream_text(story):
        yield caption, partial_story

    # Always end on the complete state, even when caption or story is empty.
    yield caption, story

# Voice 

In [12]:
import os
from gtts import gTTS
from pygame import mixer

# Sample text for trying listen_to_story by hand (see the commented-out call
# at the end of this cell).
text = '''In the quaint streets of a small town. 
            where they shared dreams and secrets. With every shared laugh and stolen glance. '''

def listen_to_story(text):
    """Synthesise ``text`` to speech and play it, blocking until playback ends.

    The audio is written to a temporary MP3 file, played through the pygame
    mixer, and the file is removed afterwards.

    Args:
        text: Text to read aloud. Empty or whitespace-only text is ignored.

    Returns:
        None.
    """
    if not text or not text.strip():
        # Nothing to read aloud (e.g. the Story box is still empty).
        return

    import tempfile
    import time

    # A unique temp file per call, so overlapping calls cannot clobber or
    # delete each other's audio.
    fd, audio_path = tempfile.mkstemp(suffix=".mp3")
    os.close(fd)
    try:
        gTTS(text=text, lang='en', slow=False, tld="ca").save(audio_path)
        mixer.init()
        try:
            mixer.music.load(audio_path)
            mixer.music.play()
            # Poll with a short sleep instead of spinning, which would keep a
            # CPU core fully busy for the whole playback.
            while mixer.music.get_busy():
                time.sleep(0.1)
        finally:
            # Shut the mixer down before the file is deleted below.
            mixer.quit()
    finally:
        os.remove(audio_path)
# listen_to_story(text)

pygame 2.5.2 (SDL 2.28.3, Python 3.11.5)
Hello from the pygame community. https://www.pygame.org/contribute.html


# Gradio

In [13]:
# Custom stylesheet passed to gr.Blocks(css=...): page background image plus
# styling for the title, the scrolling sub-title and secondary buttons.
# NOTE(review): `.svelte-13hsdno` looks like an auto-generated class name and
# may not survive a Gradio upgrade -- confirm.
css = '''
    .gradio-container {
       /*background-color:#9a8c98;*/
       background-image:url("https://www.csshero.org/wp-content/uploads/2016/07/hohenschwangau-532864_1920.jpg");
       background-size:cover;
       /*background: linear-gradient(#e66465, #9198e5);*/
       
       
    }
    .svelte-13hsdno{
        color:white;
    }
    .sub-title{
        display:flex;
        justify-content:center;
        margin:0px;
    }
    .scroll{
        width:30%;
        font-weight:800;
        
    }
    .title{
        font-size:30px;
        text-align: center;
        font-weight:900;
    }
    button>.secondary{
        width:50%;
        color:red;
        
    }
   
'''
# HTML for the tagline under the title: a marquee scrolling to the right,
# rendered through gr.Markdown in the Blocks layout.
description_html = """<div class='sub-title'><marquee class='scroll' behavior="scroll" direction="right">Let your picture speak..</marquee></div>"""


# Block

In [14]:
import gradio as gr

# Assemble the Gradio UI: header, image input on the left, caption/story
# outputs on the right, then a clear button and clickable example images.
with gr.Blocks(theme=gr.themes.Monochrome(), title="Caption Generator", css=css) as demo:
    # Header. Layout containers such as gr.Row only nest children created
    # inside a `with` block; passing components as constructor arguments does
    # not place them in the row. The title and tagline are therefore created
    # directly in the Blocks context, stacked one above the other.
    gr.Markdown("<b>Caption Generator</b>", elem_classes='title')
    gr.Markdown(description_html)

    with gr.Row():
        with gr.Column():
            inp1 = gr.Image()
            btn = gr.Button("Run")
        with gr.Column():
            out1 = gr.Textbox(label="Caption", text_align="center")
            out2 = gr.TextArea(label="Story", text_align="center", lines=5)
            btn1 = gr.Button("Listen Story")

    # process_image is a generator, so both boxes update as text streams in.
    btn.click(fn=process_image, inputs=inp1, outputs=[out1, out2])
    # Reads the current story text aloud; produces no UI output.
    btn1.click(fn=listen_to_story, inputs=out2)

    with gr.Row():
        gr.ClearButton([inp1, out1, out2])

    # Example images live in a machine-specific folder. Only offer the files
    # that actually exist so the app still starts on other machines.
    example_dir = r"D:\Mechine Learning\Project\Image Captioning\archive\Images"
    example_files = [
        "109202801_c6381eef15.jpg",
        "1002674143_1b742ab4b8.jpg",
        "44129946_9eeb385d77.jpg",
        "138705546_be7a6845dd.jpg",
        "128912885_8350d277a4.jpg",
    ]
    example_paths = [
        [os.path.join(example_dir, file_name)]
        for file_name in example_files
        if os.path.isfile(os.path.join(example_dir, file_name))
    ]
    if example_paths:
        gr.Examples(example_paths, inputs=[inp1])
demo.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.


