In [None]:
!pip install yt-dlp
!pip install faster-whisper
!apt-get install ffmpeg -y

In [None]:
import os
import re
import warnings
import logging
from typing import List, Union
import yt_dlp
from faster_whisper import WhisperModel
import sys

In [None]:
# Send log records to stdout so they show up inline in the cell output.
# force=True replaces handlers already attached to the root logger, so
# re-running this cell does not stack duplicate handlers.
logging.basicConfig(
    force=True,
    handlers=[logging.StreamHandler(sys.stdout)],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Logger shared by the downloader and transcriber classes defined below.
logger = logging.getLogger(__name__)

In [None]:
def sanitize_filename(filename: str) -> str:
    """Return ``filename`` with every disallowed character replaced by ``_``.

    Only ASCII letters, digits, ``_`` and ``-`` are kept. Dots are replaced
    as well, so the name should be passed without its extension.
    """
    disallowed = r'[^a-zA-Z0-9_-]'
    return re.sub(disallowed, '_', filename)

In [None]:
class YouTubeDownloader:
  """Download the audio track of YouTube videos as MP3 files using yt-dlp.

  Each file ends up in ``download_dir`` under the video title passed through
  ``sanitize_filename``.
  """

  def __init__(self, download_dir: str, cookies_file: Union[str, None] = None):
    """Store the settings and create ``download_dir`` if it does not exist.

    Args:
        download_dir: Directory that receives the MP3 files.
        cookies_file: Optional path passed to yt-dlp as ``cookiefile``.
    """
    self.download_dir = download_dir
    self.cookies_file = cookies_file
    os.makedirs(self.download_dir, exist_ok=True)

  @staticmethod
  def _extracted_audio_path(prepared_path: str, codec: str = "mp3") -> str:
    """Swap the extension of ``prepared_path`` for the extracted audio codec."""
    return os.path.splitext(prepared_path)[0] + "." + codec

  def download_audio(self, urls: List[str]) -> List[str]:
    """Download every URL as MP3 and rename it to a sanitized file name.

    Each URL is handled as a single video; playlist results are not unpacked.
    A failure on one URL is logged and does not stop the remaining ones.

    Args:
        urls: Video URLs to download.

    Returns:
        Paths of the MP3 files that exist on disk after download and rename,
        in the order of ``urls``. URLs that failed are left out.
    """
    downloaded_files = []
    ydl_opts = {
        'format': 'bestaudio/best',
        'postprocessors': [{
            'key': 'FFmpegExtractAudio',
            'preferredcodec': 'mp3',
            'preferredquality': '64',
        }],
        'outtmpl': os.path.join(self.download_dir, '%(title)s.%(ext)s'),
        'quiet': True,
        'noprogress': True,
        'cookiefile': self.cookies_file if self.cookies_file else None,
        'retries': 5,
        # The YoutubeDL option for network timeouts is `socket_timeout`;
        # a plain `timeout` key is not a recognised option.
        'socket_timeout': 60,
    }

    for url in urls:
        try:
            with yt_dlp.YoutubeDL(ydl_opts) as ydl:
                logger.info(f"Baixando: {url}")
                info = ydl.extract_info(url, download=True)
                # Ask yt-dlp for the name it actually used: it may alter the
                # title when expanding `outtmpl`, so `<title>.mp3` is not a
                # reliable guess for the file on disk.
                old_path = self._extracted_audio_path(ydl.prepare_filename(info))
                video_title = info.get("title") or "unknown"
                filename = sanitize_filename(video_title) + ".mp3"
                new_path = os.path.join(self.download_dir, filename)
                if not os.path.exists(old_path):
                    # Never report a path that was not produced.
                    logger.error(f"Arquivo de áudio não encontrado após o download: {old_path}")
                    continue
                if old_path != new_path:
                    # os.replace overwrites an existing target on every platform.
                    os.replace(old_path, new_path)
                downloaded_files.append(new_path)
                logger.info(f"Download concluído: {filename}")
        except Exception as e:
            logger.error(f"Erro ao baixar {url}: {e}")

    return downloaded_files

In [None]:
class Transcriber:
  """Transcribe the media files of a folder with a faster-whisper model.

  Transcripts are written as UTF-8 ``.txt`` files into a ``transcripts``
  sub-directory of the folder.
  """

  # File extensions (lower case) that process_files() picks up.
  MEDIA_EXTENSIONS = ('.mp3', '.mp4', '.mkv', '.avi')

  def __init__(self, folder_path: str, model_size: str = "small", device: str = "cpu",
               compute_type: str = "int8", language: Union[str, None] = "pt"):
    """Load the Whisper model and prepare the output directory.

    Args:
        folder_path: Folder containing the media files to transcribe.
        model_size: Model name passed to ``WhisperModel``.
        device: Device passed to ``WhisperModel``. With a GPU, ``"cuda"``
            together with ``compute_type="float16"`` gives better performance.
        compute_type: Compute type passed to ``WhisperModel``.
        language: Language code passed to ``WhisperModel.transcribe``.
    """
    # NOTE: this silences warnings for the whole process, not only this class.
    warnings.filterwarnings("ignore")
    self.model = WhisperModel(model_size, device=device, compute_type=compute_type)
    self.language = language
    self.folder_path = folder_path
    self.transcripts_dir = os.path.join(folder_path, "transcripts")
    os.makedirs(self.transcripts_dir, exist_ok=True)

  def transcribe_audio(self, file_path: str) -> Union[str, None]:
    """Transcribe one file and return its text, one segment per line.

    Returns:
        The transcript (possibly an empty string when no speech is found),
        or ``None`` when transcription failed; the error is logged.
    """
    try:
        segments, _info = self.model.transcribe(file_path, beam_size=5, language=self.language, condition_on_previous_text=False)
        # faster-whisper yields segments lazily, so decoding errors surface
        # while joining; the join therefore has to stay inside the try.
        return "\n".join(segment.text for segment in segments)
    except Exception as e:
        # Any failure on one file is reported as None so a batch can continue.
        logger.error(f"Erro ao transcrever {file_path}: {e}")
        return None

  def process_files(self):
    """Transcribe every supported media file directly inside ``folder_path``.

    Files are visited in sorted order and matched by extension regardless of
    case. A failed file is logged and skipped; an empty transcript is still
    written, since it is a valid result rather than a failure.
    """
    for filename in sorted(os.listdir(self.folder_path)):
      file_path = os.path.join(self.folder_path, filename)
      if not os.path.isfile(file_path):
          continue
      if not filename.lower().endswith(self.MEDIA_EXTENSIONS):
          continue
      logger.info(f"Transcrevendo: {filename}")
      transcription = self.transcribe_audio(file_path)
      if transcription is None:
          logger.error(f"Falha ao transcrever: {filename}")
          continue
      transcript_file = os.path.join(self.transcripts_dir, os.path.splitext(filename)[0] + ".txt")
      with open(transcript_file, 'w', encoding='utf-8') as f:
          f.write(transcription)
      logger.info(f"Transcrição salva: {transcript_file}")


In [None]:
# Sample video URLs used by the download cell below.
example = [
    "https://www.youtube.com/watch?v=auXfAHHNSFo",
    "https://www.youtube.com/watch?v=mAJIO-0p0iM",
]

In [None]:
# With a cookies file, pass its path through `cookies_file`:
# downloader = YouTubeDownloader("destinationDirectory", cookies_file="cookies.txt")

# Without cookies, the download directory alone is enough:
downloader = YouTubeDownloader(download_dir="destinationDirectory")
downloader.download_audio(urls=example)

In [None]:
# Transcribe the media files found in the download directory.
transcriber = Transcriber(folder_path="destinationDirectory")
transcriber.process_files()