In [None]:

!pip install transformers torch pillow gtts gradio opencv-python


Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

In [None]:
# Step 1: Install required libraries
# Run this in your terminal or command prompt:
# pip install transformers torch pillow gtts gradio opencv-python

# Step 2: Import libraries
import os
import subprocess
import sys
import threading
import time

import cv2
import gradio as gr
from gtts import gTTS
from PIL import Image
from transformers import BlipProcessor, BlipForConditionalGeneration

# Step 3: Load the BLIP processor and captioning model.
# Both objects come from the same pretrained checkpoint, so the identifier is
# spelled once and reused for the two from_pretrained() calls.
_BLIP_CHECKPOINT = "Salesforce/blip-image-captioning-base"
processor = BlipProcessor.from_pretrained(_BLIP_CHECKPOINT)
model = BlipForConditionalGeneration.from_pretrained(_BLIP_CHECKPOINT)

# Step 4: Function to generate a caption and convert it to speech
def image_to_voice(image):
    """Caption ``image`` with the BLIP model and save the caption as speech.

    Args:
        image: The picture to describe; it is handed unchanged to the
            module-level ``processor``.

    Returns:
        A ``(caption, audio_path)`` tuple: the decoded caption text and the
        path of the MP3 file ("output.mp3") written by gTTS.
    """
    # Image -> model inputs -> generated token ids -> plain text.
    model_inputs = processor(image, return_tensors="pt")
    generated = model.generate(**model_inputs)
    text = processor.decode(generated[0], skip_special_tokens=True)

    # Text -> spoken English audio, written to a fixed file name.
    mp3_path = "output.mp3"
    gTTS(text, lang="en").save(mp3_path)

    return text, mp3_path

# Step 5: Function to capture images from the camera
def capture_frame(camera_index=0):
    """Grab a single frame from a camera and return it as a PIL image.

    Args:
        camera_index: Device index passed to ``cv2.VideoCapture``.
            ``0`` (the default) is the default camera.

    Returns:
        A ``(image, error)`` tuple. On success ``image`` is a PIL image built
        from the RGB-converted frame and ``error`` is ``None``. On failure
        ``image`` is ``None`` and ``error`` is a human-readable message.
    """
    cap = cv2.VideoCapture(camera_index)
    try:
        if not cap.isOpened():
            return None, "Error: Could not open camera."

        ret, frame = cap.read()
        if not ret:
            return None, "Error: Could not capture image."

        # The frame is converted from BGR to RGB channel order before it is
        # wrapped in a PIL image.
        image = Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
        return image, None
    finally:
        # Release on every path, including the early error returns; otherwise
        # a failed read would leave the capture device handle open.
        cap.release()

# Step 6: Function to periodically capture and process frames
def _play_audio(audio_file):
    """Play ``audio_file`` using a player appropriate for the current OS."""
    if sys.platform.startswith("win"):
        # Opens the file with its associated application, like the shell's `start`.
        os.startfile(audio_file)
    elif sys.platform == "darwin":
        subprocess.run(["afplay", audio_file], check=False)
    else:
        # Requires the `mpg321` command-line player to be installed.
        subprocess.run(["mpg321", audio_file], check=False)


def periodic_capture(interval=10):
    """Capture, caption and speak a camera frame forever, pausing between rounds.

    Intended to run in a background thread: the loop never returns.

    Args:
        interval: Seconds to sleep after each round (default 10).
    """
    while True:
        try:
            image, error = capture_frame()
            if error:
                # Surface camera problems instead of skipping them silently.
                print(error)
            else:
                caption, audio_file = image_to_voice(image)
                print("Generated Caption:", caption)
                _play_audio(audio_file)
        except Exception as exc:
            # A failure in one round (captioning, speech synthesis, missing
            # audio player, ...) must not end the whole background loop.
            print("Periodic capture failed:", exc)

        time.sleep(interval)

# Step 7: Gradio interface for real-time camera feed and voice output
def gradio_interface():
    """Build the Gradio page and launch it.

    The page shows a caption textbox and an audio player side by side, plus a
    button that captures one camera frame and fills both outputs.
    """

    def process_frame():
        """Capture a frame and return ``(caption, audio_path)``.

        When the capture fails, the error message takes the caption slot and
        the audio slot is ``None``.
        """
        frame_image, capture_error = capture_frame()
        if capture_error:
            return capture_error, None

        description, speech_path = image_to_voice(frame_image)
        return description, speech_path

    with gr.Blocks() as app:
        # Page heading and short description.
        gr.Markdown("# Image to Voice for Blind People")
        gr.Markdown("This app captures images from your camera every 10 seconds, describes them, and provides voice output.")

        # Output widgets, laid out on a single row.
        with gr.Row():
            caption_box = gr.Textbox(label="Generated Caption")
            audio_player = gr.Audio(label="Audio Output", type="filepath")

        # Manual trigger: one click runs one capture-and-describe round.
        describe_button = gr.Button("Capture and Describe")
        describe_button.click(
            fn=process_frame,
            outputs=[caption_box, audio_player],
        )

    # Start serving the page.
    app.launch()

# Step 8: Start the periodic capture thread
# daemon=True: the capture loop never returns, so it must not keep the
# interpreter alive on shutdown.
thread = threading.Thread(target=periodic_capture, daemon=True)
thread.start()

# Step 9: Run the Gradio interface
# The guard has to compare against "__main__" (double underscores). The
# previous "_main_" could never match, so the interface was never launched.
if __name__ == "__main__":
    gradio_interface()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.
