In [None]:
# Whisper Quick Start Guide
Whisper is an open-source neural network model developed by OpenAI for Automatic Speech Recognition (ASR). It transcribes speech in multiple languages and can also perform language identification and speech translation. Whisper can handle audio with poor quality or significant background noise as well.

Follow these steps to convert audio/video into text using this tool:
1. In the "Runtime" menu, select "Change runtime type", then choose T4 GPU in the pop-up window.
2. Each time you open this document, click "Run all" under the "Runtime" menu or press Ctrl + F9.
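3. In Step 4, choose the type of media you want to transcribe (YouTube link / MP4 / MP3) and click the "Confirm" button. Note: with "Run all", Step 5 starts immediately and falls back to "YouTube Link" if nothing has been confirmed yet. To transcribe an MP4 or MP3 file, confirm your choice in Step 4 and then re-run Step 5 and Step 6.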
4. In Step 5, enter the media source (URL or file path) and press Enter.
5. Find the generated TXT file in the Colab file browser (the folder icon in the sidebar) and download it. It is saved in the same folder as the audio file.

In [None]:
"""
1. Download necessary packages and import them
"""
!pip install yt-dlp                     # For downloading YouTube videos
!pip install git+https://github.com/openai/whisper.git   # For downloading Whisper
!sudo apt update && sudo apt install ffmpeg      # Use Linux command to install ffmpeg, for converting "video files" to "audio files"
!pip install librosa                    # For extracting audio features

# Import packages
import whisper
import time
import librosa
import re
import yt_dlp as youtube_dl
import subprocess
from google.colab import widgets
import ipywidgets as widgets
from IPython.display import display
import os


In [None]:
"""
2. Pick the Whisper checkpoint used for speech recognition. 'base' is usually sufficient.

  - Audio that is certainly English-only: use a model name ending in ".en".

  - Unknown or mixed languages: use a model name without ".en".
"""
# Other checkpoints that can be substituted here:
#   "tiny.en", "small.en", "medium.en", "large"
MODEL_NAME = "base"
model = whisper.load_model(MODEL_NAME)

In [None]:
"""
3. Functions for handling different data types
"""

def youtube_filepath():
    """Download the audio of a YouTube video as MP3 and return the file path.

    Prompts for the URL on standard input, lets yt-dlp pick the best
    available audio stream and has its FFmpegExtractAudio post-processor
    convert it to MP3 (preferred quality '192'). The file is named after
    the video title, relative to the current working directory.

    Returns:
        str: Path of the resulting ``.mp3`` file.

    Raises:
        ValueError: If no URL was entered.
    """
    url = input("Enter a YouTube video URL: ").strip()
    if not url:
        raise ValueError("No YouTube URL was entered.")

    # Create youtube-dl options dictionary
    ydl_opts = {
        # Choose the best available audio format
        'format': 'bestaudio/best',
        # Use ffmpeg to extract audio and convert it to mp3
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '192',
        }],
        # Set the output filename to the video title
        'outtmpl': '%(title)s.%(ext)s',
        # A watch URL that also carries a playlist id should fetch only the
        # video itself, because exactly one path is returned.
        'noplaylist': True,
    }

    # Download and query the metadata in a single request, while the
    # YoutubeDL object is still open.
    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
        info = ydl.extract_info(url, download=True)
        downloaded_path = ydl.prepare_filename(info)

    # The post-processor replaces the container extension with .mp3 whatever
    # the source was (.webm, .m4a, .opus, ...). Swap only the final extension
    # so dots or ".webm" inside the title are left alone.
    return os.path.splitext(downloaded_path)[0] + '.mp3'


def mp4_filepath():
    """Extract the audio track of a local video file into an MP3 beside it.

    Prompts for the video path on standard input and runs ffmpeg with the
    video stream dropped (-vn), a 44100 Hz sample rate, 2 channels and a
    192k audio bitrate.

    Returns:
        str: The entered path with its final extension replaced by ``.mp3``.

    Raises:
        ValueError: If no path was entered, or the path already ends in
            ``.mp3`` (input and output would be the same file).
        RuntimeError: If ffmpeg exits with a non-zero status.
    """
    file_path = input("Enter the MP4 filepath: ").strip()
    if not file_path:
        raise ValueError("No MP4 file path was entered.")

    # Swap only the final extension, so ".MP4", ".mov" etc. also produce a
    # distinct output name and ".mp4" inside a folder name is left untouched.
    mp3_path = os.path.splitext(file_path)[0] + ".mp3"
    if mp3_path == file_path:
        raise ValueError("The given file is already an MP3: {}".format(file_path))

    # An argument list (no shell) keeps quotes and spaces in the path intact.
    # -y overwrites an MP3 left over from an earlier run instead of waiting
    # for a confirmation; -nostdin stops ffmpeg from reading the keyboard.
    command = [
        "ffmpeg", "-nostdin", "-y",
        "-i", file_path,
        "-vn", "-ar", "44100", "-ac", "2", "-b:a", "192k",
        mp3_path,
    ]
    completed = subprocess.run(command, capture_output=True, text=True)
    if completed.returncode != 0:
        # Fail here with ffmpeg's own message rather than returning the path
        # of a file that was never written.
        raise RuntimeError(
            "ffmpeg could not convert {!r}:\n{}".format(file_path, completed.stderr))
    return mp3_path


In [None]:
"""
4. Run this code block, select the type of data to convert, and click the confirm button.
"""
# Confirmed selection; stays None until the Confirm button has been clicked.
file_type = None

# Dropdown listing the supported media sources
file_type_dropdown = widgets.Dropdown(
    description='File Type: ',
    options=['YouTube Link', 'MP4', 'MP3'],
)

# Button that confirms the current dropdown selection
process_button = widgets.Button(description="Confirm")

# Area in which the click handler reports the confirmed selection
output = widgets.Output()

# Define a function to handle the processing
def process_file_type(b):
    """Click handler: store the dropdown's current value in ``file_type``.

    Args:
        b: The clicked button, passed in by ``on_click``; not used.

    The output widget is cleared and the selection is echoed into it, so
    only the latest confirmation is visible.
    """
    global file_type
    with output:
        output.clear_output()  # Drop the message from the previous click
        selection = file_type_dropdown.value
        print(f"Selected file type: {selection}")
        file_type = selection


# Register the click handler on the Confirm button
process_button.on_click(process_file_type)

# Render the controls top to bottom: dropdown, button, message area
controls = (file_type_dropdown, process_button, output)
display(*controls)


In [None]:
"""
5. Enter the URL or file path and press Enter to call the functions from Step 3.
"""
# Maps each confirmed file type to (message to print, callable returning the path).
source_handlers = {
    'YouTube Link': ("User selected YouTube Link.", youtube_filepath),
    'MP4': ("User selected MP4.", mp4_filepath),
    'MP3': ("User selected MP3.", lambda: input("Enter the MP3 filepath: ")),
}

# Used when nothing was confirmed in Step 4 (file_type is still None) or the
# value is not one of the keys above.
fallback_handler = (
    "No valid file type selected. Defaulting to YouTube Link.",
    youtube_filepath,
)

announcement, resolve_path = source_handlers.get(file_type, fallback_handler)
print(announcement)
file_path = resolve_path()
print(file_path)


In [None]:
"""
6. This section uses the Whisper model to transcribe audio/video into text.
   The resulting .txt file is saved next to the audio file (same name plus
   ".txt"), so it appears in the file browser and can be downloaded directly.
"""


def _split_sentences(text):
    """Split a transcript into sentences, keeping the closing punctuation.

    A break is made at whitespace that follows '.', '!' or '?', and directly
    after the full-width marks '。', '！' and '？'. Because a break needs
    whitespace after an ASCII mark, decimals such as "3.14" stay intact.
    Text without any closing mark, including an unterminated final sentence,
    is kept rather than dropped.
    """
    pieces = re.split(r"(?<=[!?.])\s+|(?<=[。！？])", text)
    return [piece.strip() for piece in pieces if piece.strip()]


# Get the duration of the audio
duration = librosa.get_duration(path=file_path)

# Time the transcription itself
start = time.time()
result = model.transcribe(file_path)
end = time.time()
seconds = end - start

print("Video length:", duration, "seconds")
print("Transcription time:", seconds)

# One sentence per paragraph, both on screen and in the saved file
sentences = _split_sentences(result["text"])
text = "\n\n".join(sentences)
for s in sentences:
    print(s)

# Save the transcript as a .txt file. The explicit UTF-8 encoding keeps
# non-English transcripts writable regardless of the platform default.
name = file_path + ".txt"
with open(name, "w", encoding="utf-8") as f:
    f.write(text)

print("\n\n", "-" * 100, "\n\nYour transcript is here:", name)
