# In [None]:
# -*- coding: utf-8 -*-
"""
Whisper書き起こしの進捗を video_id 単位で集計するスキャナ
- まず selected_50_vods_<BATCH_ID>.csv があれば、それを対象に進捗を集計
- 無ければ、chat と audio が両方揃っている VOD を走査して対象集合を自動生成
- transcripts/<video_id>.json が「有効なJSONで text キーあり」なら "done"
- 無効JSONは "broken_json"、.part があれば "partial"、どれも無ければ "pending"
- サマリを表示し、transcription_progress_<BATCH_ID>.csv に保存
"""

import os
import json
from typing import Dict, List, Set
import pandas as pd
import fsspec

# ===== Settings =====
BATCH_ID = "20250910_104532"
BASE_PREFIX = f"gs://dena-ai-intern-yoshihara-data/Archive/{BATCH_ID}"

# Per-batch data directories.
CHAT_DIR = f"{BASE_PREFIX}/chat"
AUDIO_DIR = f"{BASE_PREFIX}/audio"
TRANS_DIR = f"{BASE_PREFIX}/transcripts"

# Original manifest.
MANIFEST_SRC = f"{BASE_PREFIX}/manifests/manifest_{BATCH_ID}.csv"
# Manifest after extraction (used only if it exists).
MANIFEST_USED = f"{BASE_PREFIX}/manifests/vods_with_chat_and_audio_{BATCH_ID}.csv"
# The 50 selected target videos (used only if it exists).
SELECTED50 = f"{BASE_PREFIX}/manifests/selected_50_vods_{BATCH_ID}.csv"

# Output: the progress table written by the scan.
PROGRESS_CSV = f"{TRANS_DIR}/transcription_progress_{BATCH_ID}.csv"

# Options handed to fsspec and to pandas when reading/writing CSVs.
STORAGE_OPTIONS = {"token": "cloud"}


# ===== Utilities =====
def fs() -> fsspec.AbstractFileSystem:
    """Return the fsspec filesystem for the "gcs" protocol, built with STORAGE_OPTIONS."""
    gcs = fsspec.filesystem("gcs", **STORAGE_OPTIONS)
    return gcs

def safe_exists(fsys, path: str) -> bool:
    """Report whether ``path`` exists on ``fsys``, treating any error as "no".

    Args:
        fsys: object exposing an ``exists(path)`` method.
        path: path to probe.

    Returns:
        Whatever ``fsys.exists(path)`` returns, or ``False`` if that call raises.
    """
    try:
        found = fsys.exists(path)
    except Exception:
        # Best effort: a failed lookup is reported as "does not exist".
        return False
    else:
        return found

def list_valid_transcripts(fsys) -> Set[str]:
    """Return the video_ids whose transcript JSON under TRANS_DIR is valid.

    A transcript counts as valid when ``<video_id>.json`` parses as JSON and
    the top-level value is a dict containing a ``"text"`` key.

    Args:
        fsys: fsspec-style filesystem; ``glob`` and ``open`` are used.

    Returns:
        Set of video_id strings (file name without the ``.json`` extension).
    """
    done: Set[str] = set()
    for p in fsys.glob(f"{TRANS_DIR}/*.json"):
        vid = os.path.splitext(os.path.basename(p))[0]
        try:
            # Binary mode on purpose: json.load detects UTF-8/16/32 from the
            # bytes, so the result does not depend on the locale's default
            # text encoding.
            with fsys.open(p, "rb") as f:
                obj = json.load(f)
        except Exception:
            # Best effort: an unreadable or unparsable file is just not "done".
            continue
        if isinstance(obj, dict) and "text" in obj:
            done.add(vid)
    return done

def list_broken_json(fsys) -> Set[str]:
    """Return the video_ids whose transcript JSON exists but is not valid.

    "Not valid" means the file cannot be read or parsed, or its top-level
    value is not a dict with a ``"text"`` key. The set is computed as every
    ``*.json`` file present under TRANS_DIR minus the ids accepted by
    ``list_valid_transcripts``, so both functions share one validity rule.

    Args:
        fsys: fsspec-style filesystem; ``glob`` is used here and the
            filesystem is passed on to ``list_valid_transcripts``.

    Returns:
        Set of video_id strings (file name without the ``.json`` extension).
    """
    present = {
        os.path.splitext(os.path.basename(p))[0]
        for p in fsys.glob(f"{TRANS_DIR}/*.json")
    }
    return present - list_valid_transcripts(fsys)

def list_partials(fsys) -> Set[str]:
    """Return the video_ids that still have a leftover partial output file.

    Only files named ``<video_id>.json.part`` or ``<video_id>.srt.part``
    directly under TRANS_DIR count. The suffix is stripped from the end of the
    name only, and any other ``*.part`` file is ignored instead of being
    recorded under its whole file name.

    Args:
        fsys: fsspec-style filesystem; only ``glob`` is used.

    Returns:
        Set of video_id strings. Files whose name is just the suffix
        (e.g. ``.json.part``) are skipped.
    """
    suffixes = (".json.part", ".srt.part")
    partial: Set[str] = set()
    for p in fsys.glob(f"{TRANS_DIR}/*.part"):
        name = os.path.basename(p)
        for suffix in suffixes:
            if name.endswith(suffix):
                vid = name[: -len(suffix)]
                if vid:
                    partial.add(vid)
                break
    return partial

def load_candidates(fsys) -> pd.DataFrame:
    """Return the target video_ids as a one-column DataFrame (``video_id``).

    Sources are tried in this order and the first usable one wins:
      1) SELECTED50 (selected_50_vods_*.csv), if it exists
      2) MANIFEST_USED (vods_with_chat_and_audio_*.csv), if it exists
      3) otherwise, every id that has both ``<id>.live_chat.json`` under
         CHAT_DIR and ``<id>.m4a`` under AUDIO_DIR (sorted)

    A CSV that exists but has no ``video_id`` column is skipped. For CSV
    sources, rows with a missing id and duplicate ids are dropped.

    Args:
        fsys: fsspec-style filesystem used for existence checks and globbing.

    Returns:
        DataFrame with a single ``video_id`` column.
    """
    # 1) and 2): ready-made CSVs, in priority order.
    for csv_path in (SELECTED50, MANIFEST_USED):
        if not safe_exists(fsys, csv_path):
            continue
        # Read ids as text: with an inferred dtype, all-digit ids lose leading
        # zeros and turn into "123.0" as soon as the column contains a blank.
        df = pd.read_csv(
            csv_path,
            storage_options=STORAGE_OPTIONS,
            dtype={"video_id": str},
        )
        if "video_id" in df.columns:
            return df[["video_id"]].dropna().drop_duplicates().reset_index(drop=True)

    # 3) Auto-generate: ids that have both a chat file and an audio file.
    def vid_from(path: str, suffix: str) -> str:
        name = os.path.basename(path)
        if name.endswith(suffix):
            return name[: -len(suffix)]
        return os.path.splitext(name)[0]

    chat_ids = {
        vid_from(p, ".live_chat.json")
        for p in fsys.glob(f"{CHAT_DIR}/*.live_chat.json")
    }
    audio_ids = {vid_from(p, ".m4a") for p in fsys.glob(f"{AUDIO_DIR}/*.m4a")}
    both_ids = sorted(chat_ids & audio_ids)

    return pd.DataFrame({"video_id": both_ids})

def scan_progress() -> pd.DataFrame:
    """Scan transcription progress, print a summary and save it as a CSV.

    Each candidate video gets one status, checked in this order:
      - "done":        its transcript JSON is valid
      - "broken_json": a transcript JSON exists but is not valid
      - "partial":     a ``.json.part`` / ``.srt.part`` file is left over
      - "pending":     none of the above

    The table is written to PROGRESS_CSV and also returned.

    Returns:
        DataFrame with columns ``video_id``, ``status``, ``json_exists`` and
        ``srt_exists``; it has these columns even when there are no candidates.
    """
    fsys = fs()

    # Build the target set.
    df_cand = load_candidates(fsys)
    candidates = set(df_cand["video_id"].astype(str))

    def ids_with_suffix(suffix: str) -> set:
        # One listing per file type instead of one exists() call per video.
        return {
            os.path.basename(p)[: -len(suffix)]
            for p in fsys.glob(f"{TRANS_DIR}/*{suffix}")
        }

    # Current state of the transcripts directory. Every JSON is parsed once;
    # "broken" is then simply "present but not valid".
    done = list_valid_transcripts(fsys)
    partial = list_partials(fsys)
    json_ids = ids_with_suffix(".json")
    srt_ids = ids_with_suffix(".srt")

    columns = ["video_id", "status", "json_exists", "srt_exists"]
    rows = []
    for vid in sorted(candidates):
        has_json = vid in json_ids

        if vid in done:
            status = "done"
        elif has_json:
            status = "broken_json"
        elif vid in partial:
            status = "partial"
        else:
            status = "pending"

        rows.append({
            "video_id": vid,
            "status": status,
            "json_exists": has_json,
            "srt_exists": vid in srt_ids,
        })

    # Explicit columns keep the summary below working when rows is empty.
    df = pd.DataFrame(rows, columns=columns)

    # Summary.
    total   = len(df)
    n_done  = int((df["status"] == "done").sum())
    n_pend  = int((df["status"] == "pending").sum())
    n_part  = int((df["status"] == "partial").sum())
    n_brok  = int((df["status"] == "broken_json").sum())
    pct     = (n_done / total * 100.0) if total else 0.0

    print("=== Transcription Progress ===")
    print(f"Target videos : {total}")
    print(f"Done          : {n_done} ({pct:.1f}%)")
    print(f"Pending       : {n_pend}")
    print(f"Partial(.part): {n_part}")
    print(f"Broken JSON   : {n_brok}")
    print()
    print(df["status"].value_counts().to_string())
    print()
    # Preview of the first rows.
    print(df.sort_values(["status","video_id"]).head(20))

    # Save the table.
    df.to_csv(PROGRESS_CSV, index=False, storage_options=STORAGE_OPTIONS)
    print(f"\nWrote progress table -> {PROGRESS_CSV}")

    # Convenience: list the completed ids (at most 30 are shown).
    done_ids = df.loc[df["status"]=="done","video_id"].tolist()
    print(f"\nCompleted video_ids ({len(done_ids)}):")
    print(done_ids[:30], "..." if len(done_ids) > 30 else "")

    return df


# Run the scan when executed as a script. The returned DataFrame is bound to
# `_` because the printed summary and the saved CSV are the outputs used here.
if __name__ == "__main__":
    _ = scan_progress()
