<a href="https://colab.research.google.com/github/viniciusdutra314/faster-whisper-youtube-jellyfin/blob/main/faster_whisper_youtube.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Youtube Videos Transcription with Faster Whisper + Jellyfin support**


[![repository shield](https://img.shields.io/static/v1?label=&message=Repository&color=blue&style=for-the-badge&logo=github&link=https://github.com/lewangdev/faster_whisper_youtube)](https://github.com/viniciusdutra314/faster-whisper-youtube-jellyfin)

This is a notebook where you can download videos/playlists from YouTube and transcribe them quickly using Faster Whisper. The videos and subtitles will be saved on your Google Drive in a Jellyfin-friendly format, ready to be used with the [Youtube-Metadata-Plugin](https://github.com/ankenyr/jellyfin-youtube-metadata-plugin).

Credits:
This is a fork of [lewangdev](https://github.com/lewangdev/faster-whisper-youtube)'s notebook, which is itself a fork of [ArthurFDLR](https://github.com/ArthurFDLR/whisper-youtube)'s. I just added a few lines to make it compatible with Jellyfin. Thank you both for making such a simple yet extremely useful Google Colab notebook!


In [None]:
#@markdown # **Jellyfin** <img src="https://jellyfin.org/images/logo.svg"> { display-mode: "form" }

from pathlib import Path
from google.colab import drive

# Form fields: `user` names the sub-folder that receives the downloads,
# `URL` is the video/playlist to fetch, `drive_path` is the folder inside
# Google Drive under which everything is stored.
user="" #@param {type:"string"}
#@markdown Video or playlist link:
URL="" #@param {type:"string"}
drive_path = "Faster-Whisper-YT-Jellyfin" #@param {type:"string"}

# Mount Google Drive and make sure the target folder exists.
drive_mount_path = Path("/") / "content" / "drive"
drive.mount(str(drive_mount_path))
# Use the space-free "MyDrive" spelling for every path so that save_dir and
# drive_whisper_path refer to the same location with the same text.
drive_mount_path /= "MyDrive"
drive_whisper_path = drive_mount_path / Path(drive_path.lstrip("/"))
drive_whisper_path.mkdir(parents=True, exist_ok=True)

# Per-user output folder. It is derived from `drive_path` so the form field is
# honoured instead of a hard-coded folder name. Slashes are stripped from
# `user` so that it cannot turn into an absolute path.
user_dir = drive_whisper_path / user.strip("/")
user_dir.mkdir(parents=True, exist_ok=True)
# Later cells build file names with f"{save_dir}...", so keep save_dir a
# string that ends with a slash.
save_dir = f"{user_dir}/"

In [None]:
#@markdown # **Download Video/Playlist** 📺 { display-mode: "form" }


def seconds_to_time_format(s):
    """Format a duration in seconds as an SRT timestamp ``HH:MM:SS,mmm``.

    Args:
        s: Non-negative number of seconds (int or float).

    Returns:
        The timestamp string, e.g. ``3661.5`` -> ``"01:01:01,500"``.
    """
    # Round once, on the total number of milliseconds, and only then split it
    # into fields. Rounding the fractional part separately can produce 1000 ms
    # (e.g. 1.9996 s -> "00:00:01,1000"), which is not a valid SRT timestamp.
    total_ms = int(round(float(s) * 1000))
    hours, total_ms = divmod(total_ms, 3_600_000)
    minutes, total_ms = divmod(total_ms, 60_000)
    seconds, milliseconds = divmod(total_ms, 1000)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d},{milliseconds:03d}"

print("Downloading yt-dlp")

!pip install yt-dlp --quiet


import sys
import warnings
import yt_dlp
import subprocess
import torch
import shutil
import numpy as np
from IPython.display import display, Markdown, YouTubeVideo





import subprocess
# Fail early with a clear message instead of a yt-dlp error on an empty link.
if not URL.strip():
    raise ValueError("URL is empty: fill in the video or playlist link in the Jellyfin cell and run it again.")

# Step 1: download the video itself (plus its .info.json metadata) into
# save_dir, naming each file "[<video id>].<ext>".
command = ["yt-dlp", "-o", f"{save_dir}[%(id)s].%(ext)s", URL, "--write-info-json","--quiet"]
# The context manager closes the pipe and waits for yt-dlp to exit, so the
# child process is reaped and its return code is available afterwards.
with subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, text=True) as process:
    for line in process.stdout:
        print(line, end='')
if process.returncode != 0:
    print(f"yt-dlp exited with code {process.returncode}; the video may not have been saved to {save_dir}")

# Step 2: download the audio only into the current working directory and
# convert it to "<video id>.wav" for the transcription step.
ydl_opts = {
    'format': 'm4a/bestaudio/best',
    'outtmpl': '%(id)s.%(ext)s',
    'postprocessors': [{
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'wav',
    }]
}

with yt_dlp.YoutubeDL(ydl_opts) as ydl:
    error_code = ydl.download([URL])
    # Metadata only (no second download); wrapped in a list so that a single
    # video and a playlist can be handled the same way by the next cell.
    download_info = [ydl.extract_info(URL, download=False)]

if error_code:
    print(f"yt-dlp reported error code {error_code} while downloading the audio")

In [None]:
# @title **Run the model** 🚀 { display-mode: "form" }
#@markdown Run this cell to execute the transcription of the video. This can take a while and very based on the length of the video and the number of parameters of the model selected above.
#@markdown **The default options should be fine for most users**

language = "auto" #@param {type:"string"}
model_size = 'large-v2' #@param {type:"string"} ['tiny', 'tiny.en', 'base', 'base.en', 'small', 'small.en', 'medium', 'medium.en', 'large-v1', 'large-v2']
device_type = "cuda" #@param {type:"string"} ["cuda","cpu"]
compute_type = "float16"
word_level_timestamps = False #@param {type:"boolean"}
#@markdown ---

vad_filter = True
vad_filter_min_silence_duration_ms = 50

!pip install faster-whisper --quiet
print(f"Downloading {model_size} model")
from faster_whisper import WhisperModel
model = WhisperModel(model_size,device=device_type,compute_type=compute_type)

if "entries" in download_info[0]:
  videos=download_info[0]["entries"]
else:
  videos=download_info


for index,video_info in enumerate(videos):
  print(f"{index} of {len(videos)}")
  video_path_local=Path(f"{video_info['id']}.wav")

  segments, info = model.transcribe(f"{str(video_path_local)}", beam_size=5,
                                    language=None if language == "auto" else language,
                                    word_timestamps=word_level_timestamps,
                                    vad_filter=vad_filter,
                                    vad_parameters=dict(min_silence_duration_ms=vad_filter_min_silence_duration_ms))

  display(Markdown(f"Detected language '{info.language}' with probability {info.language_probability}"))

  ext_name =".srt"
  transcript_file_name = f"{save_dir}[{video_path_local.stem}].srt"
  sentence_idx = 1
  with open(transcript_file_name, 'w') as f:
    for segment in segments:
      if word_level_timestamps:
        for word in segment.words:
          ts_start = seconds_to_time_format(word.start)
          ts_end = seconds_to_time_format(word.end)
          print(f"[{ts_start} --> {ts_end}] {word.word}")
          f.write(f"{sentence_idx}\n")
          f.write(f"{ts_start} --> {ts_end}\n")
          f.write(f"{word.word}\n\n")
          f.write("\n")
          sentence_idx = sentence_idx + 1
      else:
        ts_start = seconds_to_time_format(segment.start)
        ts_end = seconds_to_time_format(segment.end)
        print(f"[{ts_start} --> {ts_end}] {segment.text}")
        f.write(f"{sentence_idx}\n")
        f.write(f"{ts_start} --> {ts_end}\n")
        f.write(f"{segment.text.strip()}\n\n")
        sentence_idx = sentence_idx + 1

  try:
    shutil.copy(video_path_local.parent / transcript_file_name,
              drive_whisper_path / transcript_file_name
    )
    display(Markdown(f"**Transcript file created: {drive_whisper_path / transcript_file_name}**"))
  except:
    display(Markdown(f"**Transcript file created: {video_path_local.parent / transcript_file_name}**"))
