In [None]:
! pip install torch transformers peft accelerate bitsandbytes
! pip install gradio

In [7]:
from transformers import pipeline
import gradio as gr

# Build the speech-recognition pipeline around the fine-tuned Whisper checkpoint.
# Timestamps are requested because the SRT generation below reads per-chunk timing.
pipe = pipeline(
    task="automatic-speech-recognition",
    model="Maverickz1989/openai-whisper-small-canto-colab",
    return_timestamps=True,
)

# Function to convert seconds to SRT time format
def seconds_to_srt_time(seconds):
    """Format an offset in seconds as an SRT timestamp (``HH:MM:SS,mmm``).

    The offset is converted to whole milliseconds exactly once, with rounding,
    and then split using integer arithmetic. Deriving the millisecond field
    from ``seconds - int(seconds)`` and truncating loses a millisecond to
    floating-point error for values such as 2.3 (299 instead of 300).

    Args:
        seconds: Offset from the start of the audio, in seconds (int or float).
            Negative values are clamped to zero.

    Returns:
        str: The offset formatted as ``HH:MM:SS,mmm``.
    """
    # Clamp at zero so a slightly negative offset cannot yield negative fields.
    total_ms = max(0, int(round(float(seconds) * 1000)))
    total_secs, milliseconds = divmod(total_ms, 1000)
    total_minutes, secs = divmod(total_secs, 60)
    hours, minutes = divmod(total_minutes, 60)
    return f"{hours:02}:{minutes:02}:{secs:02},{milliseconds:03}"

# Generate SRT file from transcription
def generate_srt(transcription):
    """Build SRT subtitle text from a transcription result.

    Args:
        transcription: Mapping with an optional ``"chunks"`` list. Each chunk
            is a mapping with a ``"timestamp"`` pair ``(start, end)`` in
            seconds and a ``"text"`` string.

    Returns:
        str: One SRT cue per chunk, numbered from 1 and separated by a blank
        line. Returns an empty string when there are no chunks.
    """
    srt_entries = []
    previous_end = 0.0
    for index, chunk in enumerate(transcription.get("chunks", []), start=1):
        start = chunk["timestamp"][0]
        end = chunk["timestamp"][1]
        # Guard against a missing (None) boundary, which would otherwise raise
        # a TypeError in the formatter. Whisper pipelines are known to emit a
        # None end time when the audio is cut off mid-segment.
        if start is None:
            start = previous_end
        if end is None:
            end = start
        previous_end = end

        start_time = seconds_to_srt_time(start)
        end_time = seconds_to_srt_time(end)
        # Chunk text commonly carries padding whitespace; strip it so the cue
        # body starts at column 0.
        text = chunk["text"].strip()
        srt_entries.append(f"{index}\n{start_time} --> {end_time}\n{text}\n")
    return "\n".join(srt_entries)

# Transcription function
def transcribe(audio):
    """Transcribe an audio file and render the result as plain text and SRT.

    Args:
        audio: Path to the audio file to transcribe, or ``None`` when no audio
            was supplied.

    Returns:
        tuple[str, str]: ``(plain_text, srt_text)``. Both are empty strings
        when ``audio`` is ``None``.
    """
    # The UI can invoke this callback with no audio (submit pressed before
    # uploading or recording); return empty outputs instead of letting the
    # pipeline fail on a None input.
    if audio is None:
        return "", ""

    transcription = pipe(audio)
    # Render the timestamped chunks as SRT alongside the plain transcript.
    srt_text = generate_srt(transcription)
    return transcription["text"], srt_text

# Assemble the web UI: a single audio input feeding `transcribe`, and two text
# boxes receiving its (plain text, SRT) return values in order.
iface = gr.Interface(
    title="Whisper Small Canto",
    description="Realtime demo for Canto speech recognition using a fine-tuned Whisper small model.",
    fn=transcribe,
    # `source` is deliberately not set; the component default allows both
    # file upload and recording.
    inputs=gr.Audio(type="filepath"),
    outputs=[
        # Plain transcript.
        gr.Textbox(label="Transcription"),
        # The same transcript rendered as SRT cues.
        gr.Textbox(label="SRT File Content"),
    ],
)

# Start serving the interface.
iface.launch()

Device set to use mps:0


* Running on local URL:  http://127.0.0.1:7863

To create a public link, set `share=True` in `launch()`.






In [8]:
# Shut down the server started by `iface.launch()` so its port is released.
iface.close()

Closing server running on port: 7863
