# Installs & Imports

In [None]:
# !pip install yt-dlp openai-whisper pydub

# Dependencies (system-wide)
#sudo apt install ffmpeg  # Linux
#brew install ffmpeg      # macOS

In [2]:
import yt_dlp
import re
import time
import os
import whisper

# Scrape Channel and/or Video Metadata

In [159]:
def scrape_yt_meta(url, limits):
    """Fetch yt-dlp metadata for ``url`` without downloading any media.

    Args:
        url: Address handed to ``YoutubeDL.extract_info``.
        limits: Value passed to yt-dlp as ``playlistend``.

    Returns:
        Whatever ``extract_info(url, download=False)`` returns.
    """
    ydl_settings = dict(
        ignoreerrors=True,
        playlistend=limits,
        # Sleep-interval bounds handed to yt-dlp.
        sleep_interval=20,
        max_sleep_interval=40,
        cookiesfrombrowser=('chromium',),
    )

    with yt_dlp.YoutubeDL(ydl_settings) as client:
        return client.extract_info(url, download=False)

# Extracting Video URLs

In [160]:
def get_video_urls(db):
    """Collect the ``webpage_url`` of every usable entry in a metadata dict.

    The metadata is produced with ``ignoreerrors`` enabled, so the result as a
    whole, or individual items of ``db["entries"]``, may be ``None`` when
    extraction failed. Those are skipped instead of raising.

    Args:
        db: Metadata dict with an ``"entries"`` list, or ``None``.

    Returns:
        A list of URL strings, in entry order; empty when nothing is usable.
    """
    if not isinstance(db, dict):
        return []

    return [
        entry["webpage_url"]
        for entry in (db.get("entries") or [])
        if isinstance(entry, dict) and entry.get("webpage_url")
    ]

# Scrape the Data and Download Specific Videos

In [161]:
def _placeholder_info(video_url, start_count):
    """Build the stand-in metadata record used when no real metadata is available."""
    return {
        "entry_no": start_count,
        "title": f"no_title_{int(time.time())}",
        "duration": 0,
        "uploader": "unknown",
        "view_count": 0,
        "upload_date": "unknown",
        "url": video_url,
    }


def _title_to_stem(title, max_len=200):
    """Turn a title into a lowercase, ASCII-only file-name stem of at most ``max_len`` chars."""
    cleaned = re.sub(r'[\\/*?:"<>|!]', "", title.lower())
    cleaned = re.sub(r"[^\x00-\x7F]", "", cleaned)
    stem = re.sub(r'\s+', "_", cleaned.strip())
    return stem[:max_len].strip()


def _uploader_to_dirname(uploader, width=7):
    """Reduce an uploader name to exactly ``width`` word characters, right-padded with 'x'."""
    compact = re.sub(r'[^\w\d]', "", uploader.lower().strip())
    return compact[:width].ljust(width, "x")


def _find_downloaded_file(directory, stem):
    """Return the name of the file in ``directory`` whose base name equals ``stem``, or None."""
    try:
        candidates = sorted(os.listdir(directory))
    except OSError:
        return None
    for candidate in candidates:
        base, ext = os.path.splitext(candidate)
        # Skip extension-less names and yt-dlp's temporary/partial files.
        if base == stem and ext and ext not in (".part", ".ytdl"):
            return candidate
    return None


def scrape_yt(video_url, start_count):
    """Fetch metadata for one video, download its audio, and describe the result.

    Args:
        video_url: URL of the video to process.
        start_count: Entry number for this video; used in the file name and
            in the returned record.

    Returns:
        A ``(result, next_count)`` tuple. ``result`` is a dict with the keys
        ``entry_no``, ``title``, ``directory``, ``filename``, ``duration``,
        ``uploader``, ``view_count``, ``upload_date``, ``url`` and ``errors``
        (``None`` when nothing failed, otherwise the last error message).
        ``next_count`` is ``start_count + 1``.
    """
    base_options = {
        'format': 'bestaudio/best',
        'sleep_interval': 20,
        'max_sleep_interval': 40,
        'cookiesfrombrowser': ('chromium',),
    }
    errors = None
    info = None

    with yt_dlp.YoutubeDL(base_options) as ydl:
        try:
            info = ydl.extract_info(video_url, download=False)
        except Exception as e:
            errors = str(e)

    # Check whether any metadata came back; if not, fill in a placeholder
    # record that marks the information as missing.
    if not isinstance(info, dict):
        if errors is None:
            errors = "no metadata"
        info = _placeholder_info(video_url, start_count)

    # Real metadata may lack these fields or carry None for them.
    title = info.get("title") or f"no_title_{int(time.time())}"
    uploader = info.get("uploader") or "unknown"

    # Derive the directory and file-name stem from the uploader and title.
    directory = _uploader_to_dirname(uploader)
    stem = f"{start_count}_{directory}_{_title_to_stem(title)}"

    # Set up the output location.
    os.makedirs(directory, exist_ok=True)

    download_options = dict(base_options)
    # A literal '%' coming from the title must be doubled so the output
    # template does not treat it as the start of a field.
    download_options['outtmpl'] = os.path.join(
        directory, stem.replace("%", "%%") + ".%(ext)s"
    )

    # Download the audio with a separate, properly closed downloader.
    try:
        with yt_dlp.YoutubeDL(download_options) as downloader:
            downloader.download([video_url])
    except Exception as e:
        errors = str(e)

    # The chosen format decides the extension, so look at what was actually
    # written; fall back to the historical ".webm" guess when nothing is found.
    filename = _find_downloaded_file(directory, stem) or f"{stem}.webm"

    result = {
        "entry_no": start_count,
        "title": title,
        "directory": directory,
        "filename": filename,
        "duration": info.get("duration", 0),
        "uploader": uploader,
        "view_count": info.get("view_count", 0),
        "upload_date": info.get("upload_date", "unknown"),
        "url": video_url,
        "errors": errors,
    }

    return result, start_count + 1

In [162]:
def batch_dl_and_make_db(video_urls, start_count):
    """Run ``scrape_yt`` over every URL and collect the resulting records.

    Args:
        video_urls: Iterable of video URLs.
        start_count: Entry number given to the first URL; each call to
            ``scrape_yt`` supplies the number used for the next one.

    Returns:
        A list with one record per URL, in input order.
    """
    records = []
    next_entry_no = start_count
    for url in video_urls:
        record, next_entry_no = scrape_yt(url, next_entry_no)
        records.append(record)
    return records

# Transcribing Audio

In [163]:
def transcribe_audio(dl_db):
    """Add a ``"transcript"`` entry to every record in ``dl_db``.

    Each record's audio file is looked up at
    ``os.path.join(record["directory"], record["filename"])``. When the
    directory or the file is missing, or the model fails, the transcript is
    set to an explanatory message instead of raising.

    Args:
        dl_db: List of record dicts with ``"directory"`` and ``"filename"``
            keys. The records are modified in place.

    Returns:
        The same ``dl_db`` list.
    """
    # Loaded on first use and then reused for every file; loading the model
    # once per file is needlessly expensive.
    model = None

    for item in dl_db:
        directory = item["directory"]
        path = os.path.join(directory, item["filename"])

        if not os.path.isdir(directory):
            item["transcript"] = "Something went wrong. Maybe, the directory is empty."
            continue

        if not os.path.isfile(path):
            item["transcript"] = "Something went wrong. It looks like the entry exists, but the file to transcribe is missing."
            continue

        try:
            if model is None:
                model = whisper.load_model("small")
            result = model.transcribe(path, language="en", task="transcribe", temperature=0, best_of=3, fp16=False)
            item["transcript"] = result["text"]
        except Exception as e:
            item["transcript"] = f"Something went wrong (with the model): {e}"

    return dl_db

# Testing the Full Pipeline

In [None]:
# End-to-end run for one channel: metadata -> video URLs -> download ->
# transcription. Entry numbering starts at 0 and at most 50 items are requested.
alphamx_full_db = transcribe_audio(
    batch_dl_and_make_db(
        get_video_urls(
            scrape_yt_meta("https://www.youtube.com/@alpham/videos", 50)
        ),
        0,
    )
)

In [None]:
# Spot-check the last record and the container type. Index -1 equals index 49
# when all 50 requested records exist, and does not raise when fewer were
# produced.
if alphamx_full_db:
    print(alphamx_full_db[-1])
print(type(alphamx_full_db))

In [164]:
import json

def export_to_json(db, name_of_file):
    """Write ``db`` to ``<name_of_file>.json`` with 4-space indentation.

    Nothing is written unless ``db`` is a list. Any exception raised during
    the export is reported with ``print`` and not re-raised.

    Args:
        db: The list of records to serialise.
        name_of_file: Output path without the ``.json`` extension.
    """
    try:
        if not isinstance(db, list):
            print("Something's wrong with your DB; skipping this step")
            return

        target = f"{name_of_file}.json"
        with open(target, "w") as handle:
            json.dump(db, handle, indent=4)
    except Exception as exc:
        print(f"Something's wrong with the JSON export process: {exc}; skipping this step")

In [165]:
def export_transcripts(db, name_of_file):
    """Append every record's transcript to ``<name_of_file>.txt``.

    Each transcript is followed by three newlines. The file is opened in
    append mode, so repeated calls add to the existing content. It is written
    as UTF-8 so that non-ASCII characters in a transcript do not depend on the
    platform's default encoding.

    Nothing is written unless ``db`` is a list. Any exception raised during
    the export is reported with ``print`` and not re-raised.

    Args:
        db: List of record dicts, each with a ``"transcript"`` string.
        name_of_file: Output path without the ``.txt`` extension.
    """
    try:
        if not isinstance(db, list):
            print("Something's wrong with your DB; skipping this step")
            return

        txt_file = f"{name_of_file}.txt"
        with open(txt_file, "a", encoding="utf-8") as file:
            # A generator keeps the write incremental, record by record.
            file.writelines(item["transcript"] + "\n\n\n" for item in db)
    except Exception as e:
        print(f"Something's wrong with the export: {e}; skipping this step")

In [79]:
# Persist the alphamx results: the full records as JSON, then the transcripts
# alone as plain text.
name = "alphamx_full_db"
export_to_json(db=alphamx_full_db, name_of_file=name)

name = "alphamx_transcrips"
export_transcripts(db=alphamx_full_db, name_of_file=name)

In [None]:
def _process_channel(channel_url, first_entry_no, db_name, transcripts_name, limit=50):
    """Scrape, download, transcribe and export one channel; return its records.

    Args:
        channel_url: Address passed to ``scrape_yt_meta``.
        first_entry_no: Entry number given to the channel's first video.
        db_name: Output name (without extension) for the JSON export.
        transcripts_name: Output name (without extension) for the text export.
        limit: Maximum number of items requested from the channel.

    Returns:
        The list of records returned by ``transcribe_audio``.
    """
    channel_meta = scrape_yt_meta(channel_url, limit)
    records = batch_dl_and_make_db(get_video_urls(channel_meta), first_entry_no)
    db = transcribe_audio(records)
    export_to_json(db, db_name)
    export_transcripts(db, transcripts_name)
    return db


# Each channel is fully processed and exported before the next one starts.
# Entry numbers are offset by 50 per channel.

# TMF
teachin_full_db = _process_channel(
    "https://www.youtube.com/@JosecZuniga/videos", 50,
    "teachin_full_db", "teachin_transcrips",
)

# The Style OG
thestyl_full_db = _process_channel(
    "https://www.youtube.com/@TheStyleOG/videos", 100,
    "thestyl_full_db", "thestyl_transcrips",
)

# RMRS
realmen_full_db = _process_channel(
    "https://www.youtube.com/@RealMenRealStyle/videos", 150,
    "realmen_full_db", "realmen_transcrips",
)

# 40 Over
fourtyo_db = _process_channel(
    "https://www.youtube.com/@40OverFashion/videos", 200,
    "fourtyo_db", "fourtyo_transcrips",
)

# Brock
brockmc_db = _process_channel(
    "https://www.youtube.com/@BrockMcGoff/videos", 250,
    "brockmc_db", "brockmc",
)

In [90]:
def transcribe_one(db):
    """Transcribe only the first record of ``db`` and return that record.

    The audio file is read from
    ``os.path.join(db[0]["directory"], db[0]["filename"])`` and the resulting
    text is stored under the record's ``"transcript"`` key.

    Args:
        db: Sequence of record dicts; only element 0 is used.

    Returns:
        The first record, updated in place.
    """
    first = db[0]
    audio_path = os.path.join(first["directory"], first["filename"])
    outcome = whisper.load_model("small").transcribe(
        audio_path,
        language="en",
        task="transcribe",
        temperature=0,
        best_of=3,
        fp16=False,
    )
    first["transcript"] = outcome["text"]
    return first


# "YouTube Register"

In [None]:
# Space-separated channel "videos" pages, written as adjacent string literals
# (one URL per line) that Python joins into a single string.
everything = (
    "https://www.youtube.com/@clarkkegley/videos "
    "https://www.youtube.com/@mattdavella/videos "
    "https://www.youtube.com/@howtobeast/videos "
    "https://www.youtube.com/@mkbhd/videos "
    "https://www.youtube.com/@AndrewPaul1/videos "
    "https://www.youtube.com/@JeffSu/videos "
    "https://www.youtube.com/@NicholasGarofola/videos "
    "https://www.youtube.com/@betterideas/videos "
    "https://www.youtube.com/@TeddyBaldassarre/videos "
    "https://www.youtube.com/@ManTalks/videos "
    "https://www.youtube.com/@Christinamychas/videos "
    "https://www.youtube.com/@JimmyTriesWorld/videos "
    "https://www.youtube.com/@spoonfedstudy/videos "
    "https://www.youtube.com/@GabeBult/videos "
    "https://www.youtube.com/@JesseJamesWest/videos "
    "https://www.youtube.com/@danmartell/videos "
    "https://www.youtube.com/@IAmMarkManson/videos "
    "https://www.youtube.com/@LeviHildebrandYT/videos "
    "https://www.youtube.com/@CarlMurawski/videos "
    "https://www.youtube.com/@FlyWithJohnnyThai/videos"
)
# One list element per URL.
channel_urls = everything.split()
print(channel_urls)

In [None]:
# Build one combined database across every channel in channel_urls:
# 15 items are requested per channel and the entry numbering advances by 15
# for each channel.
yt_register_full_db = []
start_count = 0

for item in channel_urls:
    yt_register_full_db += transcribe_audio(
        batch_dl_and_make_db(
            get_video_urls(scrape_yt_meta(item, 15)),
            start_count,
        )
    )
    start_count = start_count + 15

# Export the combined records as JSON, then the transcripts as plain text.
name = "yt_register_full_db"
export_to_json(db=yt_register_full_db, name_of_file=name)

name = "yt_register_transcrips"
export_transcripts(db=yt_register_full_db, name_of_file=name)