<a href="https://colab.research.google.com/github/vedsharma1/Image-Captioning-Model/blob/main/Image_Captioning_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Clear metadata and cell outputs from Image_Captioning_Model.ipynb, rewriting the file in place.
# NOTE(review): the recorded output of this cell is nbconvert's usage text, which suggests
# the notebook file was not found from the working directory — confirm before relying on it.
!jupyter nbconvert --ClearMetadataPreprocessor.enabled=True --clear-output --inplace Image_Captioning_Model.ipynb

This application is used to convert notebook files (*.ipynb)
        to various other formats.


Options
The options below are convenience aliases to configurable class-options,
as listed in the "Equivalent to" description-line of the aliases.
To see all configurable class-options for some <cmd>, use:
    <cmd> --help-all

--debug
    set log level to logging.DEBUG (maximize logging output)
    Equivalent to: [--Application.log_level=10]
--show-config
    Show the application's configuration (human-readable format)
    Equivalent to: [--Application.show_config=True]
--show-config-json
    Show the application's configuration (json format)
    Equivalent to: [--Application.show_config_json=True]
--generate-config
    generate default config file
    Equivalent to: [--JupyterApp.generate_config=True]
-y
    Answer yes to any questions instead of prompting.
    Equivalent to: [--JupyterApp.answer_yes=True]
--execute
    Execute the notebook prior to export.
    Equivalent to: [--ExecutePr

In [None]:
!!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
!pip install transformers
!pip install gradio
!pip install Pillow

Looking in indexes: https://download.pytorch.org/whl/cu124
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading https://download.pytorch.org/whl/cu124/nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (24.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m24.6/24.6 MB[0m [31m13.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading https://download.pytorch.org/whl/cu124/nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (883 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m883.7/883.7 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading https://download.pytorch.org/whl/cu124/nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl (13.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m13.8/13.8 MB[0m [31m21.1 MB/s[0m eta [36m0:00:00[0m
[?25

In [None]:
!pip install transformers accelerate

In [None]:
import torch

# Environment sanity check, printed one value per line:
#   1. the CUDA version this PyTorch build supports,
#   2. whether a GPU is detected (expected True on a GPU runtime),
#   3. whether cuDNN is enabled (expected True).
print(
    torch.version.cuda,
    torch.cuda.is_available(),
    torch.backends.cudnn.enabled,
    sep="\n",
)

In [None]:
!pip install gradio transformers torch pillow accelerate gtts

In [None]:
!pip install gradio transformers torch pillow accelerate gtts

In [None]:
!pip install SpeechRecognition

In [None]:
import gradio as gr
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import torch
import re
from gtts import gTTS
import os
import speech_recognition as sr


# Run on the GPU when PyTorch reports one, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# The processor and the model are loaded from the same pretrained checkpoint.
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-large"
processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT)
model = model.to(device)

# Single recognizer instance reused by speech_to_text.
recognizer = sr.Recognizer()

def clean_caption(text):
    """Collapse runs of consecutively repeated words and normalise spacing.

    Words are compared case-insensitively; the first spelling in each run is
    the one that is kept. The input is split on whitespace and re-joined with
    single spaces.

    Args:
        text: Caption string to tidy up.

    Returns:
        The caption with adjacent duplicate words removed.
    """
    kept = []
    previous_folded = None  # lower-cased form of the most recent word
    for token in text.split():
        folded = token.lower()
        if folded != previous_folded:
            kept.append(token)
        previous_folded = folded
    return " ".join(kept)

def speech_to_text(audio):
    """Convert an audio file to text.

    Args:
        audio: Audio source handed to ``sr.AudioFile``.

    Returns:
        The transcript returned by ``recognizer.recognize_google``, or the
        fixed message " Speech recognition failed! Try again." when any
        exception is raised while reading or recognising the audio.
    """
    failure_message = " Speech recognition failed! Try again."
    try:
        with sr.AudioFile(audio) as source:
            recording = recognizer.record(source)
            return recognizer.recognize_google(recording)
    except Exception:
        # Best-effort: every failure is reported as the same message.
        return failure_message

def generate_caption(image, user_prompt="", audio=None):
    """Generate a caption/answer for an image and a spoken version of it.

    Args:
        image: Path of the uploaded image, or None when nothing was uploaded.
        user_prompt: Optional typed prompt. When it is non-blank it is used
            and ``audio`` is ignored.
        audio: Optional audio file; transcribed with ``speech_to_text`` only
            when ``user_prompt`` is blank.

    Returns:
        A ``(text, audio_path)`` tuple. ``audio_path`` is the MP3 file written
        by gTTS. It is None when no image was given, when the spoken question
        could not be transcribed, or when the generated caption is empty; in
        the first two cases ``text`` is a message for the user.
    """
    if image is None:
        return " Please upload an image!", None

    # Close the file handle promptly; convert() returns an independent copy.
    with Image.open(image) as source_image:
        image = source_image.convert("RGB")

    # Tolerate None as well as blank text for the typed prompt.
    prompt = user_prompt or ""

    if not prompt.strip() and audio is not None:
        prompt = speech_to_text(audio)
        # speech_to_text reports failure by returning this exact message.
        # Show it to the user instead of feeding it to the model as a question.
        speech_failure = " Speech recognition failed! Try again."
        if prompt == speech_failure:
            return prompt, None

    prompt_text = prompt if prompt.strip() else "Describe the image in detail."
    inputs = processor(image, text=prompt_text, return_tensors="pt").to(device)

    # temperature / top_p only take effect in sampling mode; without
    # do_sample=True generate() decodes greedily and ignores both.
    output = model.generate(
        **inputs,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        max_length=100,
        repetition_penalty=1.2,
    )

    caption = processor.decode(output[0], skip_special_tokens=True)
    caption = clean_caption(caption)

    # gTTS rejects empty text, so skip speech synthesis when nothing was generated.
    if not caption:
        return caption, None

    # NOTE(review): speech is synthesised with lang='hi' although the default
    # prompt is English — confirm this is the intended voice.
    tts = gTTS(text=caption, lang='hi')
    audio_path = "caption_audio.mp3"
    tts.save(audio_path)

    return caption, audio_path


# Gradio UI: an image plus an optional typed or spoken question go in;
# the text answer and its spoken version come out.
app = gr.Interface(
    fn=generate_caption,
    inputs=[
        gr.Image(type="filepath", label=" Upload Image"),
        gr.Textbox(lines=2, placeholder=" Ask anything about the image...", label="Your Question"),
        gr.Audio(type="filepath", label=" Speak your question")
    ],
    outputs=[
        gr.Textbox(label=" AI's Answer"),
        gr.Audio(label=" AI Speech Output")
    ],
    # The loaded checkpoint is BLIP (blip-image-captioning-large), not BLIP-2.
    title=" AI Image Captioning with BLIP (Text & Audio Input)",
    description="Upload an image and ask anything about it via text or voice. AI will analyze and respond accordingly with a detailed answer and audio output.",
    # No `theme` argument: the previous value "Huggingface-" is not a built-in
    # theme name, so Gradio's default theme is used.
)


# share=True requests a public share link in addition to the local server.
app.launch(share=True)
