In [None]:
import os
import pandas as pd
import yt_dlp
import whisper
from pathlib import Path
import multiprocessing as mp
from functools import partial

# Load the Whisper model once, at module import time. process_video() reads
# this module-level `model` when it transcribes a downloaded file.
# NOTE(review): process_videos_in_parallel() runs process_video in a
# multiprocessing pool; under a start method that re-imports this module in
# each worker, this load presumably runs once per worker -- confirm.
model = whisper.load_model("base")  # can be switched to "tiny" or "small"

def download_audio(url, output_dir="temp_audio"):
    """Download the audio track of a YouTube video and return the MP3 path.

    The download is configured to pick the best available audio stream and
    to run yt-dlp's ``FFmpegExtractAudio`` post-processor with an ``mp3``
    codec at quality ``192``. Files are named after the video id inside
    ``output_dir``.

    Args:
        url: Video URL handed to ``yt_dlp.YoutubeDL.extract_info``.
        output_dir: Directory used in the output template.

    Returns:
        The string ``"<output_dir>/<video id>.mp3"``.
    """
    mp3_extractor = {
        'key': 'FFmpegExtractAudio',
        'preferredcodec': 'mp3',
        'preferredquality': '192',
    }
    downloader_options = dict(
        format='bestaudio/best',
        outtmpl=f'{output_dir}/%(id)s.%(ext)s',
        postprocessors=[mp3_extractor],
        quiet=True,
        nooverwrites=True,
    )
    with yt_dlp.YoutubeDL(downloader_options) as downloader:
        video_info = downloader.extract_info(url, download=True)
        video_id = video_info['id']
        # The post-processor output carries the .mp3 extension, so the path
        # is rebuilt from the id rather than taken from the template.
        return f"{output_dir}/{video_id}.mp3"

def transcribe_audio(audio_path, model):
    """Transcribe an audio file to text with the given Whisper model.

    Args:
        audio_path: Path of the audio file passed to ``model.transcribe``.
        model: Object exposing ``transcribe(path, language=...)`` that
            returns a mapping with a ``"text"`` entry.

    Returns:
        The value stored under ``"text"`` in the transcription result.
    """
    # The language is pinned to English rather than auto-detected.
    transcription = model.transcribe(audio_path, language="en")
    transcript_text = transcription["text"]
    return transcript_text

def process_video(url, output_dir="temp_audio"):
    """Process one video: download its audio, transcribe it, delete the file.

    Uses the module-level Whisper ``model`` for transcription.

    Args:
        url: Video URL to download.
        output_dir: Directory where the temporary audio file is written.

    Returns:
        The transcript text, or ``None`` if downloading or transcribing
        raised an exception (the error is printed, not re-raised).
    """
    audio_path = None
    try:
        audio_path = download_audio(url, output_dir)
        return transcribe_audio(audio_path, model)
    except Exception as e:
        # Best-effort boundary: one bad URL must not abort the whole batch.
        print(f"Error processing {url}: {e}")
        return None
    finally:
        # Always remove the temporary file, including when transcription
        # failed; previously it was only deleted on success.
        if audio_path is not None:
            try:
                os.remove(audio_path)
            except FileNotFoundError:
                # Nothing was written (or it is already gone); nothing to do.
                pass
            except OSError as cleanup_error:
                # A cleanup failure should not discard a good transcript.
                print(f"Warning: could not remove {audio_path}: {cleanup_error}")

def process_videos_in_parallel(url_list, output_dir="temp_audio"):
    """Transcribe several videos concurrently using a process pool.

    Args:
        url_list: Iterable of video URLs.
        output_dir: Directory for temporary audio files. It is created if
            missing. Afterwards every ``*.mp3`` file in it is deleted and
            the directory itself is removed if it ends up empty.

    Returns:
        A list with one entry per URL, in input order: the value returned
        by ``process_video`` for that URL. An empty input returns ``[]``
        without creating the directory or starting any worker.
    """
    urls = list(url_list)
    if not urls:
        return []

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)

    process_func = partial(process_video, output_dir=output_dir)
    # Never start more workers than there are URLs to process.
    worker_count = min(mp.cpu_count(), len(urls))

    try:
        # The context manager releases the pool even if map() raises.
        with mp.Pool(processes=worker_count) as pool:
            results = pool.map(process_func, urls)
    finally:
        # Sweep any audio files still present, then drop the directory if
        # nothing else is stored in it.
        for leftover in Path(output_dir).glob("*.mp3"):
            leftover.unlink()
        if not os.listdir(output_dir):
            os.rmdir(output_dir)
    return results

# Read the URL list from CSV files (only a small sample is returned).
def load_sample_urls_from_csv(csv_files, sample_size=3):
    """Collect unique URLs from the ``url`` column of several CSV files.

    Args:
        csv_files: Iterable of CSV paths readable by ``pandas.read_csv``.
        sample_size: Maximum number of URLs to return.

    Returns:
        The first ``sample_size`` distinct URLs, in the order they appear
        across the files. Missing values are skipped. Files that have no
        ``url`` column are skipped with a printed warning.
    """
    url_list = []
    for file in csv_files:
        df = pd.read_csv(file)
        if 'url' in df.columns:
            url_list.extend(df['url'].dropna().tolist())
        else:
            print(f"Warning: 'url' column not found in {file}")

    # De-duplicate while keeping first-seen order. A set would make the
    # "first N" selection arbitrary and different from run to run.
    unique_urls = list(dict.fromkeys(url_list))
    return unique_urls[:sample_size]


if __name__ == "__main__":

    # CSV files that hold the candidate video URLs.
    csv_files = [
        "fox_with_labels_01_over60per.csv",
        "cnn_with_labels_01_over60per.csv"
    ]

    # Load only a small sample (3 URLs) for this run.
    url_list = load_sample_urls_from_csv(csv_files, sample_size=3)
    print(f"Loaded {len(url_list)} sample URLs from CSV files:")
    for url in url_list:
        print(url)

    # Run speech-to-text over the sample in parallel.
    transcriptions = process_videos_in_parallel(url_list)

    # Print a short preview of each transcript.
    for url, text in zip(url_list, transcriptions):
        # Failure is signalled by None; an empty transcript (e.g. no speech
        # in the audio) is still a successful result.
        if text is not None:
            print(f"\nURL: {url}\nText: {text[:100]}...\n")
        else:
            print(f"\nURL: {url}\nText: Failed to transcribe\n")

  from .autonotebook import tqdm as notebook_tqdm


Loaded 3 sample URLs from CSV files:
https://www.youtube.com/watch?v=1km_uxAUD9k
https://www.youtube.com/watch?v=AJmzz3YA-A0
https://www.youtube.com/watch?v=tWHt8Dmvmpk
