<a href="https://colab.research.google.com/github/vedsharma1/Image-Captioning-Model/blob/main/Image_Captioning_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu124
!pip install transformers
!pip install gradio
!pip install Pillow

Collecting gradio
  Downloading gradio-5.31.0-py3-none-any.whl.metadata (16 kB)
Collecting aiofiles<25.0,>=22.0 (from gradio)
  Downloading aiofiles-24.1.0-py3-none-any.whl.metadata (10 kB)
Collecting fastapi<1.0,>=0.115.2 (from gradio)
  Downloading fastapi-0.115.12-py3-none-any.whl.metadata (27 kB)
Collecting ffmpy (from gradio)
  Downloading ffmpy-0.5.0-py3-none-any.whl.metadata (3.0 kB)
Collecting gradio-client==1.10.1 (from gradio)
  Downloading gradio_client-1.10.1-py3-none-any.whl.metadata (7.1 kB)
Collecting groovy~=0.1 (from gradio)
  Downloading groovy-0.1.2-py3-none-any.whl.metadata (6.1 kB)
Collecting pydub (from gradio)
  Downloading pydub-0.25.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting python-multipart>=0.0.18 (from gradio)
  Downloading python_multipart-0.0.20-py3-none-any.whl.metadata (1.8 kB)
Collecting ruff>=0.9.3 (from gradio)
  Downloading ruff-0.11.11-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (25 kB)
Collecting safehttpx<0.2.0,>=0.1.

In [2]:
!pip install transformers accelerate



In [3]:
import torch

# Environment sanity check, one value per output line.
print(
    torch.version.cuda,            # CUDA version this PyTorch build supports
    torch.cuda.is_available(),     # should be True when a GPU is detected
    torch.backends.cudnn.enabled,  # should be True when cuDNN is usable
    sep="\n",
)

12.4
False
True


In [4]:
!pip install gradio transformers torch pillow accelerate gtts

Collecting gtts
  Downloading gTTS-2.5.4-py3-none-any.whl.metadata (4.1 kB)
Collecting click<8.2,>=7.1 (from gtts)
  Downloading click-8.1.8-py3-none-any.whl.metadata (2.3 kB)
Downloading gTTS-2.5.4-py3-none-any.whl (29 kB)
Downloading click-8.1.8-py3-none-any.whl (98 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.2/98.2 kB[0m [31m7.0 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: click, gtts
  Attempting uninstall: click
    Found existing installation: click 8.2.1
    Uninstalling click-8.2.1:
      Successfully uninstalled click-8.2.1
Successfully installed click-8.1.8 gtts-2.5.4


In [5]:
!pip install gradio transformers torch pillow accelerate gtts



In [6]:
!pip install SpeechRecognition

Collecting SpeechRecognition
  Downloading speechrecognition-3.14.3-py3-none-any.whl.metadata (30 kB)
Downloading speechrecognition-3.14.3-py3-none-any.whl (32.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m32.9/32.9 MB[0m [31m60.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: SpeechRecognition
Successfully installed SpeechRecognition-3.14.3


In [7]:
import gradio as gr
from transformers import BlipProcessor, BlipForConditionalGeneration
from PIL import Image
import torch
import re
from gtts import gTTS
import os
import speech_recognition as sr


# Single source of truth for the checkpoint, so the processor and the model
# cannot be loaded from two different repositories by accident.
BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-large"

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
processor = BlipProcessor.from_pretrained(BLIP_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(BLIP_CHECKPOINT).to(device)
# Shared recognizer instance used by speech_to_text().
recognizer = sr.Recognizer()

def clean_caption(text):
    """Collapse runs of consecutively repeated words and normalise spacing.

    Words are compared case-insensitively; the first spelling of each run is
    the one that is kept. Splitting and re-joining also squeezes any
    whitespace between words down to a single space.
    """
    kept = []
    previous_folded = None
    for token in text.split():
        folded = token.lower()
        if folded == previous_folded:
            # Same word as the last one kept: drop the repeat.
            continue
        kept.append(token)
        previous_folded = folded
    return " ".join(kept)

# Returned by speech_to_text() when transcription fails. generate_caption()
# checks for it so the error text is never used as a prompt for the model.
SPEECH_FAILURE_MESSAGE = " Speech recognition failed! Try again."
# Prompt used when the user supplies neither text nor a usable recording.
DEFAULT_PROMPT = "Describe the image in detail."


def speech_to_text(audio):
    """Convert an audio file to text.

    Args:
        audio: Path of the audio file, opened with ``sr.AudioFile``.

    Returns:
        The text returned by ``recognizer.recognize_google`` on success, or
        ``SPEECH_FAILURE_MESSAGE`` if reading the file or recognising the
        speech raises.
    """
    try:
        with sr.AudioFile(audio) as source:
            audio_data = recognizer.record(source)
            return recognizer.recognize_google(audio_data)
    except Exception as exc:
        # Deliberately best-effort: the UI gets a friendly message, while the
        # real cause is printed so a failure can still be diagnosed.
        print(f"Speech recognition failed: {exc!r}")
        return SPEECH_FAILURE_MESSAGE


def generate_caption(image, user_prompt="", audio=None):
    """Generate a caption for an image and a spoken version of that caption.

    Args:
        image: Path of the image file, or ``None`` when nothing was uploaded.
        user_prompt: Optional typed prompt; ``None`` or blank means "not given".
        audio: Optional path of a recorded prompt. It is transcribed only
            when no typed prompt was given.

    Returns:
        A ``(text, audio_path)`` tuple. ``audio_path`` is ``None`` when no
        image was uploaded, when speech recognition failed, or when the model
        produced an empty caption; in the first two cases ``text`` is the
        message to show to the user.
    """
    if image is None:
        return " Please upload an image!", None

    # Resolve the prompt before touching the image so a bad recording is
    # reported immediately. `or ""` tolerates a None textbox value.
    prompt = (user_prompt or "").strip()
    if not prompt and audio is not None:
        transcript = speech_to_text(audio)
        if transcript == SPEECH_FAILURE_MESSAGE:
            # Show the failure to the user instead of feeding the error
            # sentence to the model as if it were the question.
            return SPEECH_FAILURE_MESSAGE, None
        prompt = (transcript or "").strip()
    prompt_text = prompt or DEFAULT_PROMPT

    # The context manager closes the underlying file as soon as the RGB copy
    # has been made.
    with Image.open(image) as source_image:
        rgb_image = source_image.convert("RGB")

    # NOTE(review): the checkpoint is a captioning model, so the prompt
    # presumably acts as a caption prefix rather than a question - confirm.
    inputs = processor(rgb_image, text=prompt_text, return_tensors="pt").to(device)

    # temperature / top_p only take effect when sampling is enabled; without
    # do_sample=True they are ignored. Output is therefore no longer
    # deterministic between calls.
    output = model.generate(
        **inputs,
        do_sample=True,
        temperature=0.7,
        top_p=0.9,
        max_length=100,
        repetition_penalty=1.2,
    )

    caption = clean_caption(processor.decode(output[0], skip_special_tokens=True))
    if not caption:
        # Nothing to speak; skip text-to-speech rather than passing gTTS an
        # empty string.
        return caption, None

    # NOTE(review): the caption is spoken with lang='hi'; confirm that this
    # is the intended voice for the generated text.
    tts = gTTS(text=caption, lang='hi')
    audio_path = "caption_audio.mp3"
    tts.save(audio_path)

    return caption, audio_path


# Gradio UI: an image plus an optional typed or spoken prompt go in; the
# generated text and its spoken version come out.
app = gr.Interface(
    fn=generate_caption,
    inputs=[
        gr.Image(type="filepath", label=" Upload Image"),
        gr.Textbox(lines=2, placeholder=" Ask anything about the image...", label="Your Question"),
        gr.Audio(type="filepath", label=" Speak your question"),
    ],
    outputs=[
        gr.Textbox(label=" AI's Answer"),
        gr.Audio(label=" AI Speech Output"),
    ],
    # The checkpoint loaded in this notebook is BLIP
    # ("Salesforce/blip-image-captioning-large"), not BLIP-2.
    title=" AI Image Captioning with BLIP (Text & Audio Input)",
    description="Upload an image and ask anything about it via text or voice. AI will analyze and respond accordingly with a detailed answer and audio output.",
    # No `theme=` argument: the previous value "Huggingface-" is neither a
    # built-in theme name nor a valid Hub theme id, so the default theme is
    # used directly.
)


# share=True exposes the app through a temporary public URL.
app.launch(share=True)


Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/445 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/527 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/711k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/4.60k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.88G [00:00<?, ?B/s]



Colab notebook detected. To show errors in colab notebook, set debug=True in launch()
* Running on public URL: https://d95ebe428beb7e3439.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


