<a href="https://colab.research.google.com/github/ykitaguchi77/YouTube_DL/blob/main/Faster_Whisper.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install (or upgrade to the latest release of) the two tools this notebook uses:
# yt-dlp for downloading and faster_whisper for transcription.
!pip install -U yt-dlp
!pip install -U faster_whisper

#**一つの動画を文字起こし**

In [None]:
import subprocess
import re
from faster_whisper import WhisperModel

# ID of the single YouTube video to transcribe (the part after "youtu.be/").
YOUTUBE_ID = "TOVAc1YCgag"
# Relative path of the extracted audio file, named after the video ID.
AUDIO_FILE_NAME = YOUTUBE_ID + ".mp3"

# Download audio and metadata from Youtube
def dl_yt(yt_url):
    """Download a video's audio track as MP3 and return its metadata as JSON text.

    The audio is written to the module-level ``AUDIO_FILE_NAME``. Both yt-dlp
    invocations receive their arguments as a list (no shell), so characters
    such as ``&`` or ``?`` in the URL are never interpreted by a shell.

    Args:
        yt_url: URL of the video to download.

    Returns:
        The raw stdout of ``yt-dlp -j``: the video's metadata as a JSON string.

    Raises:
        subprocess.CalledProcessError: If either yt-dlp call exits with a
            non-zero status (instead of failing later on empty output).
    """
    # Download the video and extract its audio track as MP3.
    subprocess.run(
        ["yt-dlp", "-x", "--audio-format", "mp3", "-o", AUDIO_FILE_NAME, yt_url],
        check=True,
    )
    # Second call only dumps the metadata as JSON on stdout.
    result = subprocess.run(
        ["yt-dlp", "-j", yt_url],
        check=True,
        capture_output=True,
        text=True,
    )
    return result.stdout

# Download the audio file and fetch the video's metadata (raw JSON text).
metadata = dl_yt(f"https://youtu.be/{YOUTUBE_ID}")

# Extract title from metadata; the video ID is used when no title is present
import json
metadata_json = json.loads(metadata)
title = metadata_json.get("title", YOUTUBE_ID)

# Sanitize the title to be used as a filename
# (removes the characters \ / * ? : " < > |)
safe_title = re.sub(r'[\\/*?:"<>|]', "", title)

TRANSCRIPTION_FILE_NAME = f"{safe_title}_transcription.txt"

# Load the "large-v2" Whisper model on the GPU with float16 computation and
# start transcribing the downloaded audio file.
model_size = "large-v2"
model = WhisperModel(model_size, device="cuda", compute_type="float16")
segments, info = model.transcribe(AUDIO_FILE_NAME, beam_size=5)

# Print detected language
print("Detected language '%s' with probability %f" % (info.language, info.language_probability))

# Write the transcription to a file (one segment per line) and echo every
# segment with its start/end timestamps to the notebook output
with open(TRANSCRIPTION_FILE_NAME, "w", encoding="utf-8") as file:
    for segment in segments:
        file.write(f"{segment.text}\n")
        print("[%.2fs -> %.2fs] %s" % (segment.start, segment.end, segment.text))

print(f"Transcription saved to {TRANSCRIPTION_FILE_NAME}")


#**動画ID --> CHANNEL_URL --> すべての動画をDL**

In [2]:
import subprocess
import re
import json
import os
from faster_whisper import WhisperModel

# Seed video: its channel is looked up and that channel's videos are listed.
VIDEO_ID = "YaK2pwtavi4"

# Output folders, relative to the current working directory.
AUDIO_DIR = "downloaded_audio"
TRANSCRIPTION_DIR = "transcriptions"

# Make sure both output folders exist; existing folders are left untouched.
for _output_dir in (AUDIO_DIR, TRANSCRIPTION_DIR):
    os.makedirs(_output_dir, exist_ok=True)

# Look up the channel URL of a YouTube video
def get_channel_url_from_video(video_id):
    """Return the URL of the channel that published the given video.

    Runs ``yt-dlp -j`` for the video and reads ``channel_url`` from the JSON
    it prints. The command is passed as an argument list (no shell), so the
    video ID is never interpreted by a shell.

    Args:
        video_id: YouTube video ID (appended to ``https://youtu.be/``).

    Returns:
        The value of the ``channel_url`` field of the video's metadata.

    Raises:
        subprocess.CalledProcessError: If yt-dlp exits with a non-zero status.
        json.JSONDecodeError: If yt-dlp's output is not valid JSON.
        KeyError: If the metadata has no ``channel_url`` field.
    """
    result = subprocess.run(
        ["yt-dlp", "-j", f"https://youtu.be/{video_id}"],
        check=True,
        capture_output=True,
        text=True,
    )
    video_data = json.loads(result.stdout)
    return video_data["channel_url"]

def list_videos(channel_url):
    """List the videos of a channel as ``(id, title)`` tuples.

    Runs ``yt-dlp -j --flat-playlist`` on the channel URL and parses its
    stdout as one JSON object per line. The command is passed as an argument
    list (no shell), so special characters in the URL are harmless.

    Args:
        channel_url: URL of the channel (or playlist) to enumerate.

    Returns:
        A list of ``(video_id, title)`` tuples in the order yt-dlp printed
        them. Lines that are not valid JSON are reported and skipped.

    Raises:
        KeyError: If a parsed entry lacks an ``id`` or ``title`` field.
    """
    result = subprocess.run(
        ["yt-dlp", "-j", "--flat-playlist", channel_url],
        capture_output=True,
        text=True,
    )
    # Listing is best-effort: keep whatever was printed, but do not hide a failure.
    if result.returncode != 0:
        print(f"yt-dlp exited with status {result.returncode}: {result.stderr.strip()}")

    videos = []
    for line in result.stdout.split('\n'):  # one JSON document per output line
        if not line:  # ignore empty lines
            continue
        try:
            video = json.loads(line)
        except json.JSONDecodeError as e:
            print(f"JSON decoding error: {e} for line: {line}")
            continue
        videos.append((video['id'], video['title']))
    return videos


# Main processing: resolve the channel of VIDEO_ID, then list that channel's videos
CHANNEL_URL = get_channel_url_from_video(VIDEO_ID)
videos = list_videos(CHANNEL_URL)

# From here on, use the existing dl_yt / transcribe_and_save functions and the video-processing loop
print(CHANNEL_URL)

https://www.youtube.com/channel/UCH9YK4B72yMjRPsVgx-EXxw


In [3]:
# Mount Google Drive at /content/drive (requires the Colab runtime).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
##############
# Create file list #
##############

import subprocess
import os
import json
import pandas as pd

VIDEO_ID = "YaK2pwtavi4"  # Set the video ID; the channel of this video is the one that gets listed
BASE_PATH = "/content/drive/MyDrive/YouTube"  # Base path; a per-channel folder is created beneath it

def get_channel_info_from_video(video_id):
    """Return the URL and the name of the channel that published a video.

    Runs ``yt-dlp -j`` for the video and reads the ``channel_url`` and
    ``channel`` fields from the JSON it prints. The command is passed as an
    argument list (no shell), so the video ID is never interpreted by a shell.

    Args:
        video_id: YouTube video ID (appended to ``https://youtu.be/``).

    Returns:
        A ``(channel_url, channel_name)`` tuple.

    Raises:
        subprocess.CalledProcessError: If yt-dlp exits with a non-zero status.
        json.JSONDecodeError: If yt-dlp's output is not valid JSON.
        KeyError: If the metadata lacks ``channel_url`` or ``channel``.
    """
    result = subprocess.run(
        ["yt-dlp", "-j", f"https://youtu.be/{video_id}"],
        check=True,
        capture_output=True,
        text=True,
    )
    video_data = json.loads(result.stdout)
    return video_data["channel_url"], video_data["channel"]

def create_directories(channel_name, base_path=None):
    """Create the folder layout for one channel and return the channel folder.

    The layout is ``<base>/<channel_name>/audio`` and
    ``<base>/<channel_name>/transcriptions``. Folders that already exist are
    left untouched.

    Args:
        channel_name: Name of the channel; used as the folder name.
        base_path: Folder under which the channel folder is created. When
            ``None`` (the default) the module-level ``BASE_PATH`` is used.

    Returns:
        The path of the channel folder.
    """
    root = BASE_PATH if base_path is None else base_path
    path = os.path.join(root, channel_name)
    # makedirs also creates missing parents, so the channel folder itself
    # comes into existence together with its sub-folders.
    for sub_folder in ("audio", "transcriptions"):
        os.makedirs(os.path.join(path, sub_folder), exist_ok=True)
    return path

def list_videos(channel_url):
    """List the videos of a channel as ``(id, title, view_count)`` tuples.

    Runs ``yt-dlp -j --flat-playlist`` on the channel URL and parses its
    stdout as one JSON object per line. The command is passed as an argument
    list (no shell), so special characters in the URL are harmless.

    Args:
        channel_url: URL of the channel (or playlist) to enumerate.

    Returns:
        A list of ``(video_id, title, view_count)`` tuples in the order
        yt-dlp printed them. ``view_count`` is ``0`` when the entry has no
        view count or the field is null. Lines that are not valid JSON are
        reported and skipped.

    Raises:
        KeyError: If a parsed entry lacks an ``id`` or ``title`` field.
    """
    result = subprocess.run(
        ["yt-dlp", "-j", "--flat-playlist", channel_url],
        capture_output=True,
        text=True,
    )
    videos = []
    for line in result.stdout.strip().split('\n'):
        if not line:
            continue
        try:
            video_data = json.loads(line)
        except json.JSONDecodeError as e:
            print(f"Error decoding JSON for line: {line}\nError: {e}")
            continue
        # dict.get's default only applies to a missing key; an explicit JSON
        # null would come back as None and break numeric sorting later, so
        # coerce it to 0 as well.
        view_count = video_data.get('view_count') or 0
        videos.append((video_data['id'], video_data['title'], view_count))
    return videos

def list_videos_sorted_by_view_count(channel_url):
    """Return the channel's videos as a DataFrame, most-viewed first.

    Args:
        channel_url: URL of the channel to enumerate via ``list_videos``.

    Returns:
        A pandas DataFrame with the columns ``ID``, ``Title`` and
        ``View Count``, ordered by view count in descending order. Entries
        with equal counts keep the order in which they were listed.
    """
    videos = list_videos(channel_url)
    # A view count of None is ranked as 0; comparing None with an int would
    # otherwise raise TypeError and abort the whole listing.
    sorted_videos = sorted(videos, key=lambda video: video[2] or 0, reverse=True)
    return pd.DataFrame(sorted_videos, columns=['ID', 'Title', 'View Count'])

# Get channel information (URL and name) from the seed video
CHANNEL_URL, channel_name = get_channel_info_from_video(VIDEO_ID)

# Create necessary directories (<BASE_PATH>/<channel_name>/audio and .../transcriptions)
channel_path = create_directories(channel_name)

# Sort and list videos: a DataFrame with columns ID, Title, View Count, highest view count first
videos_df = list_videos_sorted_by_view_count(CHANNEL_URL)

# Print results
print(f"Channel Name: {channel_name}")
print(videos_df)


In [None]:
##################
# Download mp3 files #
##################

import re

def download_video(video_id, title, base_path):
    """Download one video's audio track to ``<base_path>/audio/<title>.mp3``.

    The title is stripped of characters that are invalid in file names before
    it is used. yt-dlp is invoked with an argument list (no shell), so titles
    containing ``$``, backticks or other shell metacharacters are passed
    through literally.

    Args:
        video_id: YouTube video ID (appended to ``https://youtu.be/``).
        title: Video title; used (sanitized) as the file name.
        base_path: Channel folder that contains the ``audio`` sub-folder.
    """
    safe_title = re.sub(r'[\\/*?:"<>|]', "", title)  # Remove any characters that are invalid in filenames
    audio_file_path = os.path.join(base_path, "audio", f"{safe_title}.mp3")
    # "%" starts a field in a yt-dlp output template; doubling it makes a
    # literal percent sign in the title or path end up unchanged in the name.
    output_template = audio_file_path.replace("%", "%%")
    subprocess.run(
        [
            "yt-dlp",
            "-x",
            "--audio-format",
            "mp3",
            "-o",
            output_template,
            f"https://youtu.be/{video_id}",
        ]
    )

# Ask at which row of the video table the download should begin.
start_index = int(input("Enter the starting index (0 for beginning): "))

# Walk the table and fetch every row at or after the chosen index.
for index, row in videos_df.iterrows():
    if index < start_index:
        continue  # rows before the requested start are skipped
    video_id = row['ID']
    title = row['Title']
    view_count = row['View Count']
    print(f"Downloading video {index + 1}/{len(videos_df)}: {title}")
    download_video(video_id, title, channel_path)

print("All videos downloaded.")



In [None]:
#################
# Transcribe to text #
#################

from faster_whisper import WhisperModel
import os
from tqdm import tqdm
import re

# Loaded models keyed by size, so the weights are loaded once and then reused
# for every file instead of being reloaded on each call.
_WHISPER_MODEL_CACHE = {}


def _get_whisper_model(model_size):
    """Return the WhisperModel for *model_size*, creating it on first use."""
    if model_size not in _WHISPER_MODEL_CACHE:
        _WHISPER_MODEL_CACHE[model_size] = WhisperModel(
            model_size, device="cuda", compute_type="float16"
        )
    return _WHISPER_MODEL_CACHE[model_size]


def transcribe_and_save(audio_file_path, transcription_file_path, video_title):
    """Transcribe one audio file and write the text to a UTF-8 file.

    The output starts with a ``title: <video_title>`` line, followed by one
    line per transcribed segment. A progress bar is shown while segments are
    consumed.

    Args:
        audio_file_path: Path of the audio file to transcribe.
        transcription_file_path: Path of the text file to (over)write.
        video_title: Title written on the first line of the output.
    """
    model = _get_whisper_model("large-v2")
    segments, _info = model.transcribe(audio_file_path, beam_size=5)

    with open(transcription_file_path, "w", encoding="utf-8") as file:
        file.write(f"title: {video_title}\n")
        for segment in tqdm(segments, desc="Transcribing Audio"):
            file.write(f"{segment.text}\n")

# Characters that are not allowed in file names: \ / * ? : " < > |
_INVALID_FILENAME_CHARS = re.compile(r'[\\/*?:"<>|]')


def safe_file_name(title):
    """Return *title* with every character that is invalid in a file name removed."""
    return _INVALID_FILENAME_CHARS.sub("", title)

def create_transcription_file_lists(videos_df, channel_path):
    """Build the audio/transcription file paths for every listed video.

    Args:
        videos_df: DataFrame with a ``Title`` column, one row per video.
        channel_path: Channel folder containing the ``audio`` and
            ``transcriptions`` sub-folders.

    Returns:
        Three parallel lists ``(audio_files, transcription_files, titles)``:
        ``<channel_path>/audio/<safe title>.mp3``,
        ``<channel_path>/transcriptions/<safe title>.txt`` and the original
        (unsanitized) titles, in row order.
    """
    audio_dir = os.path.join(channel_path, "audio")
    transcription_dir = os.path.join(channel_path, "transcriptions")

    audio_files = []
    transcription_files = []
    titles = []
    # Only the title is needed, so iterate that column directly rather than
    # materializing every row.
    for raw_title in tqdm(videos_df['Title'], total=videos_df.shape[0], desc="Processing Videos"):
        file_stem = safe_file_name(raw_title)
        audio_files.append(os.path.join(audio_dir, f"{file_stem}.mp3"))
        transcription_files.append(os.path.join(transcription_dir, f"{file_stem}.txt"))
        titles.append(raw_title)
    return audio_files, transcription_files, titles

# Generate file lists for transcription
audio_files, transcription_files, titles = create_transcription_file_lists(videos_df, channel_path)

# Transcribe and save. The download step can start from an arbitrary index,
# so some audio files may be absent; skip those instead of aborting the whole
# run on the first missing file.
for audio_file, transcription_file, title in zip(audio_files, transcription_files, titles):
    print(title)
    if not os.path.isfile(audio_file):
        print(f"Skipping (audio file not found): {audio_file}")
        continue
    transcribe_and_save(audio_file, transcription_file, title)


In [None]:
from faster_whisper import WhisperModel
import os

# Loaded models keyed by size, so the weights are loaded once and then reused
# for every file instead of being reloaded on each call.
_WHISPER_MODEL_CACHE = {}


def _get_whisper_model(model_size):
    """Return the WhisperModel for *model_size*, creating it on first use."""
    if model_size not in _WHISPER_MODEL_CACHE:
        _WHISPER_MODEL_CACHE[model_size] = WhisperModel(
            model_size, device="cuda", compute_type="float16"
        )
    return _WHISPER_MODEL_CACHE[model_size]


def transcribe_and_save(audio_file_path, transcription_file_path):
    """Transcribe one audio file and write the text to a UTF-8 file.

    The output contains one line per transcribed segment.

    Args:
        audio_file_path: Path of the audio file to transcribe.
        transcription_file_path: Path of the text file to (over)write.
    """
    model = _get_whisper_model("large-v2")
    segments, _info = model.transcribe(audio_file_path, beam_size=5)

    with open(transcription_file_path, "w", encoding="utf-8") as file:
        for segment in segments:
            file.write(f"{segment.text}\n")

# Incorporate the call below into the loop that processes the list of audio files.
# NOTE(review): audio_file_path and transcription_file_path are not assigned at
# module level in the visible cells — confirm they are defined before running this.
transcribe_and_save(audio_file_path, transcription_file_path)
