# In [None]:
# -*- coding: utf-8 -*-
"""
YouTube VOD音声を Whisper(base, ja) で逐次書き起こし。
- 166件のうち、rank昇順→view_count降順→video_id昇順で決定的に並べた先頭50件のみ処理
- 進捗は video_id 単位の tqdm
- 中断再開：既存の transcripts/{video_id}.json が有効ならスキップ
- 出力は .part → .json/.srt へ安全にリネーム（疑似アトミック）。壊れた成果物は自動検知して再処理。
"""

import os
import io
import json
import tempfile
from datetime import datetime
from typing import List, Dict, Optional

import pandas as pd
import fsspec
from tqdm.auto import tqdm

import whisper
import torch

# =========================
# Configuration section (edit as needed)
# =========================
BATCH_ID = "20250910_104532"
BASE_PREFIX = f"gs://dena-ai-intern-yoshihara-data/Archive/{BATCH_ID}"
MANIFEST_USED = f"{BASE_PREFIX}/manifests/vods_with_chat_and_audio_{BATCH_ID}.csv"

# Pinned list of the 50 selected VODs; saved once and reused for reproducibility
SELECTED50_PATH = f"{BASE_PREFIX}/manifests/selected_50_vods_{BATCH_ID}.csv"

AUDIO_DIR = f"{BASE_PREFIX}/audio"
TRANS_DIR = f"{BASE_PREFIX}/transcripts"  # output location (json, srt)
LOG_CSV_PATH = f"{TRANS_DIR}/transcription_status_{BATCH_ID}.csv"  # optional: progress log

# Whisper model
model_size = "base"
audio_language = "ja"

# GCS authentication
STORAGE_OPTIONS = {"token": "cloud"}

# No concurrent execution: videos are processed sequentially
# =========================


def ensure_gs_uri(p: str) -> str:
    """Return ``p`` unchanged if it starts with 'gs://', otherwise with 'gs://' prepended."""
    if p.startswith("gs://"):
        return p
    return "gs://" + p

def gcs_fs():
    """Return the fsspec filesystem for the "gcs" protocol, built with STORAGE_OPTIONS."""
    options = dict(STORAGE_OPTIONS)
    return fsspec.filesystem("gcs", **options)

def safe_exists(fs, path: str) -> bool:
    """Return ``fs.exists(path)``, or False when that call raises any Exception."""
    try:
        found = fs.exists(path)
    except Exception:
        # Best effort: a failed existence check is reported as "does not exist".
        found = False
    return found

def list_transcripts_json_ids(fs) -> set:
    """Collect the video_ids whose transcript JSON under TRANS_DIR is valid.

    A file counts as valid when it parses as JSON into a dict that has a
    "text" key. Unreadable or malformed files are left out, so they remain
    candidates for reprocessing. If the listing itself fails, whatever was
    collected so far (possibly nothing) is returned.
    """
    valid_ids = set()
    try:
        json_paths = fs.glob(f"{TRANS_DIR}/*.json")
        for json_path in json_paths:
            # The file name without its extension is the video_id.
            stem, _ext = os.path.splitext(os.path.basename(json_path))
            try:
                with fs.open(json_path, "r") as handle:
                    payload = json.load(handle)
            except Exception:
                # Broken JSON: skip it, the video stays eligible for reprocessing.
                continue
            if isinstance(payload, dict) and "text" in payload:
                valid_ids.add(stem)
    except Exception:
        pass
    return valid_ids

def load_or_build_selected50(fs) -> pd.DataFrame:
    """Return the pinned selection of (up to) 50 VODs to transcribe.

    If SELECTED50_PATH already exists it is loaded and returned as-is, so
    every run works on the same videos. Otherwise MANIFEST_USED is sorted by
    rank ascending, view_count descending, video_id ascending (missing
    values last), the first 50 rows are written to SELECTED50_PATH, and that
    frame is returned.

    Args:
        fs: fsspec filesystem used for the existence check.

    Returns:
        DataFrame containing at least the ``video_id`` and ``audio_path`` columns.

    Raises:
        ValueError: if a required column is missing from the file that was read.
    """
    # Read video_id as text. With dtype inference, an ID that looks numeric
    # would be parsed as a number (dropping leading zeros), and the resulting
    # string would no longer match the {video_id}.json transcript names.
    id_dtype = {"video_id": str}

    if safe_exists(fs, SELECTED50_PATH):
        df = pd.read_csv(SELECTED50_PATH, storage_options=STORAGE_OPTIONS, dtype=id_dtype)
        # Backward compatibility: only video_id and audio_path are required,
        # but fail here with a clear message rather than later with a KeyError.
        missing = [c for c in ("video_id", "audio_path") if c not in df.columns]
        if missing:
            raise ValueError(f"{SELECTED50_PATH} に必要列 {missing} がありません。")
        return df

    df_all = pd.read_csv(MANIFEST_USED, storage_options=STORAGE_OPTIONS, dtype=id_dtype)
    for c in ("rank", "video_id", "view_count", "audio_path"):
        if c not in df_all.columns:
            raise ValueError(f"{MANIFEST_USED} に必要列 {c} がありません。")

    # Non-numeric rank/view_count become NaN and are sorted to the end.
    df_all["rank"] = pd.to_numeric(df_all["rank"], errors="coerce")
    df_all["view_count"] = pd.to_numeric(df_all["view_count"], errors="coerce")
    df_all["video_id"] = df_all["video_id"].astype(str)

    # Sort order: rank ascending, view_count descending, video_id ascending.
    df_sorted = df_all.sort_values(
        by=["rank", "view_count", "video_id"],
        ascending=[True, False, True],
        na_position="last",
    )

    df_sel = df_sorted.head(50).copy()
    # Keep only the known columns that are present, to tolerate manifest changes.
    keep_cols = ["rank", "video_id", "audio_path", "chat_path", "channel", "title", "view_count", "start_utc", "url"]
    df_sel = df_sel[[c for c in keep_cols if c in df_sel.columns]]

    # Persist the selection so later runs reuse exactly the same videos.
    df_sel.to_csv(SELECTED50_PATH, index=False, storage_options=STORAGE_OPTIONS)
    return df_sel

def copy_gcs_to_local(fs, gcs_path: str) -> str:
    """Stream a GCS object into a local temporary file and return its path.

    The temporary file keeps the extension of ``gcs_path`` (".m4a" when there
    is none). On success the caller owns the file and must delete it. If the
    copy fails, the temporary file is removed here before the error is
    re-raised, since the caller never receives its path.

    Args:
        fs: fsspec filesystem used to open ``gcs_path``.
        gcs_path: object location, with or without the "gs://" prefix.

    Returns:
        Path of the local temporary file holding the downloaded bytes.
    """
    gcs_path = ensure_gs_uri(gcs_path)
    suffix = os.path.splitext(gcs_path)[1] or ".m4a"
    # delete=False: the file must outlive this function; only its name is needed now.
    tmp = tempfile.NamedTemporaryFile(delete=False, suffix=suffix)
    tmp_path = tmp.name
    tmp.close()
    # Copy in 8 MB chunks so the whole object is never held in memory.
    bufsize = 8 * 1024 * 1024
    try:
        with fs.open(gcs_path, "rb") as fin, open(tmp_path, "wb") as fout:
            while True:
                chunk = fin.read(bufsize)
                if not chunk:
                    break
                fout.write(chunk)
    except BaseException:
        # Do not leave a partial download behind (also on KeyboardInterrupt).
        try:
            os.remove(tmp_path)
        except OSError:
            pass
        raise
    return tmp_path

def format_timestamp_srt(t: float) -> str:
    """Convert seconds to an SRT timestamp of the form ``HH:MM:SS,mmm``.

    ``None`` is treated as 0. Negative values are clamped to 0, because the
    divmod arithmetic below would otherwise produce a malformed timestamp
    such as "-1:59:59,500". Hours are not capped and may exceed two digits.
    """
    if t is None:
        t = 0.0
    ms = max(0, int(round(t * 1000)))
    h, rem = divmod(ms, 3600 * 1000)
    m, rem = divmod(rem, 60 * 1000)
    s, ms = divmod(rem, 1000)
    return f"{h:02d}:{m:02d}:{s:02d},{ms:03d}"

def _segment_seconds(seg: Dict, key: str) -> float:
    """Return ``seg[key]`` as float seconds, treating a missing or None value as 0.0."""
    value = seg.get(key)
    return 0.0 if value is None else float(value)

def _publish_via_part(fs, part_path: str, final_path: str, text: str) -> None:
    """Write ``text`` to ``part_path`` and then move it onto ``final_path``."""
    # NOTE(review): text mode is used without an explicit encoding, matching how
    # the transcript JSON is read back elsewhere in this file; confirm the
    # runtime's default text encoding is UTF-8.
    with fs.open(part_path, "w") as f:
        f.write(text)
    try:
        fs.rename(part_path, final_path)
    except Exception:
        try:
            fs.move(part_path, final_path)
        except Exception:
            # Last resort: write the final file directly and drop the .part file.
            with fs.open(final_path, "w") as f:
                f.write(text)
            if safe_exists(fs, part_path):
                fs.rm(part_path)

def write_result_files(fs, video_id: str, result: Dict):
    """Save a Whisper result to TRANS_DIR as ``{video_id}.srt`` and ``{video_id}.json``.

    Each file is first written as ``*.part`` and then renamed. The SRT is
    published first and the JSON last: the resume logic in this file treats a
    valid ``{video_id}.json`` as "done", so the JSON must not appear before
    the SRT exists, otherwise an interruption between the two writes would
    leave the video permanently without an SRT.

    Args:
        fs: fsspec filesystem used for all writes.
        video_id: identifier used as the output file stem.
        result: Whisper result dict; "segments" (optional) feeds the SRT.
    """
    json_final = f"{TRANS_DIR}/{video_id}.json"
    json_part = f"{TRANS_DIR}/{video_id}.json.part"
    srt_final = f"{TRANS_DIR}/{video_id}.srt"
    srt_part = f"{TRANS_DIR}/{video_id}.srt.part"

    # Serialize up front (ensure_ascii=False keeps Japanese text readable) so
    # that a serialization error surfaces before anything is written.
    json_str = json.dumps(result, ensure_ascii=False, indent=2)

    # Build the SRT body: index line, time range line, text line, blank line.
    lines = []
    for i, seg in enumerate(result.get("segments") or [], start=1):
        start = format_timestamp_srt(_segment_seconds(seg, "start"))
        end = format_timestamp_srt(_segment_seconds(seg, "end"))
        text = (seg.get("text") or "").strip()
        lines.append(str(i))
        lines.append(f"{start} --> {end}")
        lines.append(text)
        lines.append("")
    srt_text = "\n".join(lines)

    _publish_via_part(fs, srt_part, srt_final, srt_text)
    # JSON goes last: it is the completion marker checked on resume.
    _publish_via_part(fs, json_part, json_final, json_str)

def is_valid_transcript(fs, video_id: str) -> bool:
    """Tell whether ``{TRANS_DIR}/{video_id}.json`` exists and is a usable transcript.

    Usable means the file parses as JSON into a dict containing a "text" key.
    A missing file, a read error, or a parse error all yield False.
    """
    json_path = f"{TRANS_DIR}/{video_id}.json"
    if safe_exists(fs, json_path):
        try:
            with fs.open(json_path, "r") as handle:
                payload = json.load(handle)
        except Exception:
            return False
        return isinstance(payload, dict) and "text" in payload
    return False

def append_status(fs, rows: List[Dict]):
    """Append status rows to the CSV log at LOG_CSV_PATH; do nothing for no rows.

    The log is not appended in place: when it already exists it is read,
    the new rows are concatenated after it, and the whole file is rewritten.
    """
    if not rows:
        return
    fresh = pd.DataFrame(rows)
    if safe_exists(fs, LOG_CSV_PATH):
        previous = pd.read_csv(LOG_CSV_PATH, storage_options=STORAGE_OPTIONS)
        combined = pd.concat([previous, fresh], ignore_index=True)
    else:
        combined = fresh
    combined.to_csv(LOG_CSV_PATH, index=False, storage_options=STORAGE_OPTIONS)

def main():
    """Transcribe the selected VODs one by one and record a status row for each.

    Steps:
      1. Load (or build) the pinned selection of videos.
      2. Detect transcripts that already exist and are valid, to skip them.
      3. Load the Whisper model on CUDA when available, otherwise on CPU.
      4. For each video: download the audio to a temp file, transcribe it,
         write the .json/.srt outputs, and log the outcome. Per-video errors
         are logged as status "error" and do not stop the loop.
      5. Print a summary.
    """
    # Local import: the file-level import only brings in `datetime`.
    from datetime import timezone

    def utc_stamp() -> str:
        # Same "...Z" text as datetime.utcnow().isoformat() + "Z", without
        # relying on utcnow(), which is deprecated since Python 3.12.
        return datetime.now(timezone.utc).replace(tzinfo=None).isoformat() + "Z"

    fs = gcs_fs()

    # GCS has no real directories (prefixes only), so the output location is
    # not created up front.

    # 1) Fix the selection of videos.
    df_sel = load_or_build_selected50(fs)
    selected_ids = df_sel["video_id"].astype(str).tolist()

    # Map video_id -> audio URI (adding "gs://" when absent). Rows whose
    # audio_path is missing or blank are left out so they are reported as
    # "missing_audio_path" below instead of becoming the bogus URI "gs://nan".
    id_to_audio = {}
    for vid, raw_path in zip(selected_ids, df_sel["audio_path"]):
        if pd.isna(raw_path) or not str(raw_path).strip():
            continue
        id_to_audio[vid] = ensure_gs_uri(str(raw_path))

    # 2) Existing valid outputs (used for skipping).
    already_ok = list_transcripts_json_ids(fs)

    # 3) Prepare Whisper.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model = whisper.load_model(model_size, device=device)

    # 4) Sequential processing, one tqdm step per video_id.
    status_rows = []
    pbar = tqdm(selected_ids, desc=f"Transcribing 50 VODs [{model_size}, lang=ja] on {device}")
    for vid in pbar:
        pbar.set_postfix_str(vid)

        # Skip when a valid transcript is already present.
        if vid in already_ok or is_valid_transcript(fs, vid):
            status_rows.append({
                "video_id": vid,
                "status": "skip_exists",
                "timestamp": utc_stamp(),
                "device": device,
                "model": model_size
            })
            continue

        gcs_audio = id_to_audio.get(vid)
        if not gcs_audio:
            status_rows.append({
                "video_id": vid,
                "status": "missing_audio_path",
                "timestamp": utc_stamp(),
            })
            continue

        # Download to a local temp file and hand that path to Whisper.
        local_audio = None
        try:
            local_audio = copy_gcs_to_local(fs, gcs_audio)

            # fp16 is enabled only when running on the GPU.
            result = model.transcribe(
                local_audio,
                language=audio_language,
                task="transcribe",
                verbose=False,
                fp16=(device == "cuda")
            )

            write_result_files(fs, vid, result)

            segments = result.get("segments") or []
            status_rows.append({
                "video_id": vid,
                "status": "done",
                "timestamp": utc_stamp(),
                "device": device,
                "model": model_size,
                # End time of the last segment, used as the duration value.
                "duration_sec": segments[-1].get("end") if segments else None,
                "text_chars": len(result.get("text", "") or "")
            })

        except Exception as e:
            status_rows.append({
                "video_id": vid,
                "status": "error",
                "error": repr(e),
                "timestamp": utc_stamp(),
            })
        finally:
            # Remove the temp audio after every video.
            if local_audio and os.path.exists(local_audio):
                try:
                    os.remove(local_audio)
                except Exception:
                    pass

        # Write the log back frequently so little is lost if the kernel dies.
        append_status(fs, status_rows)
        status_rows = []

    # Rows queued by the `continue` branches above are only written at the next
    # processed video; flush what is left when the loop ends on skipped videos.
    append_status(fs, status_rows)

    # 5) Final summary: recount valid transcripts, including those made in this run.
    all_ok = list_transcripts_json_ids(fs)
    print(f"Selected 50 vids: {len(selected_ids)}")
    print(f"Completed (valid transcripts found): {len(all_ok & set(selected_ids))}")
    print(f"Logs: {LOG_CSV_PATH}")
    print(f"Transcripts dir: {TRANS_DIR}  (files: {{video_id}}.json / .srt)")
    print(f"Selection file: {SELECTED50_PATH}")

# Run the pipeline only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
