In [None]:
# ============================================
# YouTube 10分チャンク出力（フィルタ保存版 + サンプル表示）
# - 対象: transcription_progress の status=="done"（= chat+audio揃い）の video_id（7件想定）
# - フィルタ: 各チャンクで chat_only の "content" 文字数合計 >= MIN_CHATONLY_CHARS（100）
# - 保存: 上記を満たすチャンクのみ chat-only / chat+audio の両方を保存
# - サンプル: 採用されたチャンクから最大 SAMPLE_TAKE 件抜き、両ファイルの先頭10行を表示
# ============================================

from __future__ import annotations
import os, re, json, sys, subprocess, random
from typing import List, Dict, Optional, Iterable, Tuple

# 依存パッケージ
def _ensure(p):
    try:
        __import__(p)
    except Exception:
        subprocess.check_call([sys.executable, "-m", "pip", "install", "-q", p])

# Install any missing third-party dependency before the imports that follow.
for pkg in ("pandas","gcsfs","tqdm"):
    _ensure(pkg)

import pandas as pd
import gcsfs
from tqdm.auto import tqdm

# -----------------------
# Settings
# -----------------------
BATCH_ID = "20250910_104532"
BUCKET   = "dena-ai-intern-yoshihara-data"
RUN_TAG  = "9_17"            # Tag embedded in the output path

# Output behaviour
CHUNK_SECONDS        = 600    # Chunk length in seconds (10 minutes)
MIN_CHATONLY_CHARS   = 100    # Acceptance threshold (total chat-only "content" characters in a chunk)
OVERWRITE            = False  # Set to True to overwrite files that already exist

# The 7 known "done" video IDs (fallback when the progress CSV is missing or unusable)
KNOWN_DONE_IDS = ['KEU1fA03VyE','MKiq-UOgNwc','QVJAcQI0ihM',
                  'X-p9zwW_UFA','Zw4gWI1cnlk','q9h92Oyhs8Q','vQ3IiAKlRJs']

# Sample display
SAMPLE_TAKE          = 10     # Max number of accepted chunks pulled out for inspection
SAMPLE_MAX_LINES     = 10     # Max number of lines shown from each file

# -----------------------
# Paths
# -----------------------
# Inputs: everything for one batch lives under BASE_PREFIX.
BASE_PREFIX   = f"gs://{BUCKET}/Archive/{BATCH_ID}"
CHAT_DIR      = f"{BASE_PREFIX}/chat"
TRANS_DIR     = f"{BASE_PREFIX}/transcripts"
PROGRESS_CSV  = f"{TRANS_DIR}/transcription_progress_{BATCH_ID}.csv"

# Outputs: one sub-directory per output flavour, namespaced by RUN_TAG.
OUT_BASE      = f"{BASE_PREFIX}/chunks/{RUN_TAG}"
OUT_COMBINED  = f"{OUT_BASE}/yt_comment_speech_combined"
OUT_CHATONLY  = f"{OUT_BASE}/yt_comment_only"
OUT_EXT       = ".txt"

# -----------------------
# ユーティリティ
# -----------------------
def safe_exists(fs, path: str) -> bool:
    """Return ``fs.exists(path)``, treating any raised exception as "absent"."""
    try:
        found = fs.exists(path)
    except Exception:
        found = False
    return found

def to_int_seconds_safe(x) -> int:
    """Coerce ``x`` to a non-negative whole number of seconds.

    Values pandas cannot parse as a number (and NaN) become 0, fractions are
    truncated by ``int()``, and negative results are clamped to 0.
    """
    numeric = pd.to_numeric(x, errors="coerce")
    if pd.isna(numeric):
        return 0
    seconds = int(numeric)
    return seconds if seconds > 0 else 0

def ensure_no_trailing_slash(prefix: str) -> str:
    """Drop one trailing ``/`` from ``prefix`` if present (only a single slash is removed)."""
    if prefix.endswith("/"):
        return prefix[:-1]
    return prefix

def output_path(prefix: str, video_id: str, chunk_idx: int, ext: str = OUT_EXT) -> str:
    """Build the object path ``<prefix>/<video_id>_<chunk_idx><ext>``.

    One trailing slash on ``prefix`` is tolerated and removed before joining.
    """
    base = ensure_no_trailing_slash(prefix)
    filename = f"{video_id}_{chunk_idx}{ext}"
    return "/".join((base, filename))

def iter_chunk_indices(max_t: int, chunk_seconds: int = CHUNK_SECONDS) -> Iterable[int]:
    """Return the 1-based chunk indices covering offsets ``0..max_t`` seconds.

    A negative ``max_t`` yields an empty list; otherwise the result is the
    range ``1 .. (max_t // chunk_seconds) + 1`` inclusive.
    """
    if max_t >= 0:
        final_idx = max_t // chunk_seconds + 1
        return range(1, final_idx + 1)
    return []

def build_text_lines(df: pd.DataFrame) -> List[str]:
    """Render each row as a ``[SOURCE] text`` line.

    Newlines inside ``content`` are replaced by spaces and the result is
    stripped; rows that end up empty are dropped. ``None`` or an empty frame
    gives an empty list.
    """
    if df is None or df.empty:
        return []
    rendered = []
    for record in df.itertuples(index=False):
        body = str(record.content).replace("\n", " ").strip()
        if body:
            rendered.append(f"[{record.source}] {body}")
    return rendered

def chars_sum_of_contents(df: pd.DataFrame) -> int:
    """Return the total character count of the ``content`` column.

    Values are stringified first. Newlines are swapped for spaces rather than
    removed, so each one still counts as a single character. ``None`` or an
    empty frame gives 0.
    """
    if df is None or df.empty:
        return 0
    as_text = df["content"].astype(str)
    flattened = as_text.str.replace("\n", " ", regex=False)
    return int(flattened.str.len().sum())

def write_text_gcs(fs: gcsfs.GCSFileSystem, gcs_path: str, text: str, overwrite: bool = False) -> bool:
    """Write ``text`` to ``gcs_path`` as UTF-8, ensuring it ends with a newline.

    Returns True when the file was written, or False when it already existed
    and ``overwrite`` is off (nothing is written in that case). The existence
    check is skipped entirely when ``overwrite`` is True.
    """
    if overwrite or not fs.exists(gcs_path):
        with fs.open(gcs_path, "w", encoding="utf-8") as handle:
            needs_newline = not text.endswith("\n")
            handle.write(text + "\n" if needs_newline else text)
        return True
    return False  # skipped: already present

# -----------------------
# YouTube Chat パーサ
# -----------------------
def _extract_text_from_runs(message_obj: dict | None) -> str:
    if not message_obj:
        return ""
    runs = message_obj.get("runs", [])
    parts = []
    for r in runs:
        if "text" in r:
            parts.append(r["text"])
        elif "emoji" in r:
            emo = r["emoji"]
            alt = emo.get("shortcuts") or emo.get("searchTerms") or []
            parts.append(alt[0] if alt else "")
    return "".join(parts)

def read_youtube_chat_jsonl_to_df(fs: gcsfs.GCSFileSystem, chat_jsonl_gcs: str, video_id: str) -> pd.DataFrame:
    """Parse a JSON-lines live-chat replay dump into a DataFrame of CHAT rows.

    Each non-empty line is expected to be a JSON object with a
    ``replayChatItemAction`` entry holding ``videoOffsetTimeMsec`` and a list
    of ``actions``. Only ``addChatItemAction`` items that carry a
    ``liveChatTextMessageRenderer`` with non-blank text are kept.

    Lines that are not valid JSON, are not objects, or have no usable
    ``videoOffsetTimeMsec`` are skipped, as are malformed action entries.

    Args:
        fs: Filesystem used to check for and open the dump.
        chat_jsonl_gcs: Path of the dump.
        video_id: Value stored in the ``video_id`` column of every row.

    Returns:
        A frame with columns ``video_id, source, t, content, orig_idx`` sorted
        by ``(t, orig_idx)``, where ``t`` is the offset in whole seconds
        (clamped to >= 0 by ``to_int_seconds_safe``) and ``orig_idx`` is the
        order in which messages were encountered. Empty (same columns) when
        the file is missing or holds no usable messages.
    """
    columns = ["video_id","source","t","content","orig_idx"]
    if not safe_exists(fs, chat_jsonl_gcs):
        return pd.DataFrame(columns=columns)

    rows = []
    # Decode explicitly as UTF-8: without it the text wrapper falls back to the
    # locale's preferred encoding, which is not guaranteed to be UTF-8.
    with fs.open(chat_jsonl_gcs, "r", encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line:
                continue
            try:
                obj = json.loads(line)
            except ValueError:  # json.JSONDecodeError is a ValueError
                continue
            if not isinstance(obj, dict):
                continue

            rc = obj.get("replayChatItemAction") or {}
            if not isinstance(rc, dict):
                continue

            # Elapsed milliseconds (offset within the video); required.
            try:
                elapsed_ms = int(rc["videoOffsetTimeMsec"])
            except (KeyError, TypeError, ValueError):
                continue

            t = to_int_seconds_safe(elapsed_ms // 1000)

            for act in rc.get("actions") or []:
                if not isinstance(act, dict):
                    continue
                add = act.get("addChatItemAction")
                if not isinstance(add, dict):
                    continue
                item = add.get("item") or {}
                if not isinstance(item, dict):
                    continue
                msg = item.get("liveChatTextMessageRenderer")
                if not isinstance(msg, dict):
                    continue
                text = _extract_text_from_runs(msg.get("message")).strip()
                if not text:
                    continue
                rows.append({
                    "video_id": video_id,
                    "source": "CHAT",
                    "t": t,
                    "content": text,
                    "orig_idx": len(rows),  # running index over accepted messages
                })

    if not rows:
        return pd.DataFrame(columns=columns)
    df = pd.DataFrame(rows)
    # Stable sort keeps encounter order among messages with the same second.
    return df.sort_values(["t","orig_idx"], kind="mergesort").reset_index(drop=True)

# -----------------------
# Whisper transcript ローダ（JSON優先→SRT）
# -----------------------
def _parse_srt_timestamp(ts: str) -> float:
    h, m, rest = ts.split(":")
    s, ms = rest.split(",")
    return int(h)*3600 + int(m)*60 + int(s) + int(ms)/1000.0

def load_youtube_transcript_to_df(fs: gcsfs.GCSFileSystem, video_id: str) -> pd.DataFrame:
    """Load the speech transcript of ``video_id`` as a DataFrame of SPEECH rows.

    ``<TRANS_DIR>/<video_id>.json`` is tried first: every entry of its
    ``segments`` list with non-blank ``text`` becomes a row timed by the
    segment's ``start``. If that produces no rows, ``<TRANS_DIR>/<video_id>.srt``
    is parsed instead, one row per cue, timed by the cue's start timestamp.

    Loading is deliberately best-effort: any error while reading a format
    discards whatever rows that format had produced so far, rather than
    propagating.

    Args:
        fs: Filesystem used to check for and open the transcript files.
        video_id: ID used to build the file names and stored in every row.

    Returns:
        A frame with columns ``video_id, source, t, content, orig_idx`` sorted
        by ``(t, orig_idx)``, where ``t`` is the start offset in whole seconds.
        Empty (same columns) when neither file yields any row.
    """
    columns = ["video_id","source","t","content","orig_idx"]
    json_path = f"{TRANS_DIR}/{video_id}.json"
    srt_path  = f"{TRANS_DIR}/{video_id}.srt"
    segs: List[Dict] = []

    # Preferred source: JSON with a "segments" list.
    if safe_exists(fs, json_path):
        try:
            # Explicit UTF-8 so decoding does not depend on the locale default.
            with fs.open(json_path, "r", encoding="utf-8") as f:
                obj = json.load(f)
            for i, seg in enumerate(obj.get("segments", []) or []):
                text = (seg.get("text") or "").strip()
                if not text:
                    continue
                start = float(seg.get("start", 0.0))
                t = to_int_seconds_safe(int(start))
                segs.append({"video_id": video_id, "source": "SPEECH", "t": t, "content": text, "orig_idx": i})
        except Exception:
            # Best-effort: treat an unreadable/malformed JSON as "no JSON" so
            # the SRT fallback below gets its chance.
            segs = []

    # Fallback source: SRT cues.
    if (not segs) and safe_exists(fs, srt_path):
        try:
            with fs.open(srt_path, "r", encoding="utf-8") as f:
                content = f.read()
            timing_re = re.compile(r"(\d{2}:\d{2}:\d{2},\d{3})\s*-->\s*(\d{2}:\d{2}:\d{2},\d{3})")
            # Cues are separated by one or more blank lines.
            for b in re.split(r"\n\s*\n", content.strip()):
                lines = [ln.strip("\r") for ln in b.splitlines() if ln.strip()]
                if lines and lines[0].isdigit():
                    lines = lines[1:]  # drop the optional cue counter line
                if not lines:
                    continue
                m = timing_re.match(lines[0])
                if not m:
                    continue
                text = "\n".join(lines[1:]).strip()
                if not text:
                    continue
                start = _parse_srt_timestamp(m.group(1))
                t = to_int_seconds_safe(int(start))
                # segs is empty when this branch starts, so len(segs) is the
                # running index over accepted cues.
                segs.append({"video_id": video_id, "source": "SPEECH", "t": t, "content": text, "orig_idx": len(segs)})
        except Exception:
            segs = []

    if not segs:
        return pd.DataFrame(columns=columns)
    df = pd.DataFrame(segs)
    return df.sort_values(["t","orig_idx"], kind="mergesort").reset_index(drop=True)

# -----------------------
# 対象 video_id（done）を特定
# -----------------------
def find_done_video_ids(fs: gcsfs.GCSFileSystem) -> List[str]:
    """Return the video IDs whose transcription status is ``"done"``.

    The IDs come from the ``video_id`` column of ``PROGRESS_CSV`` for rows
    whose ``status`` equals ``"done"`` (missing IDs dropped, duplicates
    removed, first-seen order kept). When the CSV does not exist, cannot be
    read, lacks the expected columns, or lists no done rows, a copy of
    ``KNOWN_DONE_IDS`` is returned instead.

    Args:
        fs: Filesystem used to check whether the progress CSV exists.

    Returns:
        A fresh list of video ID strings.
    """
    if safe_exists(fs, PROGRESS_CSV):
        try:
            df = pd.read_csv(PROGRESS_CSV)
            if {"video_id","status"} <= set(df.columns):
                done = df.loc[df["status"]=="done","video_id"]
                # dropna() must come before astype(str): stringifying first
                # turns NaN into the literal "nan", which dropna() then keeps.
                vids = done.dropna().astype(str).unique().tolist()
                if vids:
                    return vids
        except Exception as exc:
            # Best-effort: report the problem and use the hard-coded list.
            print(f"[warn] could not use {PROGRESS_CSV}: {exc}; falling back to KNOWN_DONE_IDS")
    # Return a copy so callers cannot mutate the module-level constant.
    return list(KNOWN_DONE_IDS)

# -----------------------
# マージ & チャンク付与
# -----------------------
def merge_chat_speech(chat_df: pd.DataFrame, speech_df: pd.DataFrame) -> pd.DataFrame:
    """Interleave chat and speech rows on the time axis and tag each with its chunk.

    Rows whose stringified ``content`` is empty are dropped. The remaining rows
    are ordered by ``t``, then source (CHAT before SPEECH before anything
    else), then ``orig_idx``, and receive a 1-based
    ``chunk_idx = t // CHUNK_SECONDS + 1``.

    Only non-empty frames are handed to ``pd.concat``. Concatenating empty
    placeholder frames is deprecated in pandas and can change the result
    dtypes, so they are left out to keep ``t`` / ``orig_idx`` in the dtype of
    the real data.

    Args:
        chat_df: CHAT rows (``video_id, source, t, content, orig_idx``), or
            ``None`` / empty.
        speech_df: SPEECH rows with the same columns, or ``None`` / empty.

    Returns:
        The merged frame with an added ``chunk_idx`` column; an empty frame
        with those columns when there is nothing to merge.
    """
    base_cols = ["video_id","source","t","content","orig_idx"]

    frames = []
    for frame in (chat_df, speech_df):
        if frame is None or frame.empty:
            continue
        kept = frame[frame["content"].astype(str).str.len() > 0]
        if not kept.empty:
            frames.append(kept)

    if not frames:
        return pd.DataFrame(columns=base_cols + ["chunk_idx"])

    merged = pd.concat(frames, ignore_index=True)

    # Tie-break rows sharing the same second: chat first, then speech.
    source_rank = merged["source"].map({"CHAT":0, "SPEECH":1}).fillna(2).astype(int)
    merged = (
        merged.assign(_source_order=source_rank)
        .sort_values(["t","_source_order","orig_idx"], kind="mergesort")
        .drop(columns=["_source_order"])
        .reset_index(drop=True)
    )
    merged["chunk_idx"] = (merged["t"] // CHUNK_SECONDS) + 1
    return merged

def add_chunk_idx(df: pd.DataFrame) -> pd.DataFrame:
    """Return a sorted copy of ``df`` with a 1-based ``chunk_idx`` column.

    Rows are stably ordered by ``(t, orig_idx)`` and
    ``chunk_idx = t // CHUNK_SECONDS + 1``. ``None`` or an empty frame gives an
    empty frame that already has the ``chunk_idx`` column.
    """
    if df is not None and not df.empty:
        ordered = df.sort_values(["t","orig_idx"], kind="mergesort")
        ordered = ordered.reset_index(drop=True).copy()
        ordered["chunk_idx"] = (ordered["t"] // CHUNK_SECONDS) + 1
        return ordered
    return pd.DataFrame(columns=["video_id","source","t","content","orig_idx","chunk_idx"])

# -----------------------
# メイン（保存 + サンプル表示）
# -----------------------
def _print_file_head(fs: gcsfs.GCSFileSystem, gcs_path: str, max_lines: int) -> None:
    """Print up to ``max_lines`` leading lines of a text file, best-effort.

    Prints ``... (truncated)`` when the file has more lines, and a
    ``(read error)`` note instead of raising when it cannot be read.
    """
    try:
        # Chunk files are written as UTF-8, so decode them the same way rather
        # than relying on the locale default.
        with fs.open(gcs_path, "r", encoding="utf-8") as f:
            for i, line in enumerate(f):
                if i >= max_lines:
                    print("... (truncated)")
                    break
                print(line.rstrip("\n"))
    except Exception as e:
        print(f"(read error) {e}")


def main():
    """Chunk every "done" video, save the chunks that pass the filter, and show samples.

    For each target video the chat log and the speech transcript are loaded,
    merged, and split into ``CHUNK_SECONDS``-long chunks. A chunk is accepted
    when its chat-only content totals at least ``MIN_CHATONLY_CHARS``
    characters; accepted chunks are written twice (chat-only and chat+speech).
    Existing files are kept unless ``OVERWRITE`` is set. A failure in one video
    is recorded and does not stop the others.

    Afterwards a summary is printed, followed by the first ``SAMPLE_MAX_LINES``
    lines of both files for up to ``SAMPLE_TAKE`` accepted chunks.
    """
    fs = gcsfs.GCSFileSystem()

    # Resolve the target video IDs.
    done_ids = find_done_video_ids(fs)

    created, skipped_exist, skipped_filter, empty_chunks = 0, 0, 0, 0
    errors: List[Tuple[str,str]] = []
    # (vid, chunk_idx, chat_path, comb_path, chat_chars)
    accepted_records: List[Tuple[str,int,str,str,int]] = []

    print(f"Output root: {OUT_BASE}")
    print(f"Targets ({len(done_ids)}): {done_ids}")

    pbar = tqdm(done_ids, desc="YouTube VODs (done only)", unit="video")
    for vid in pbar:
        inner = None
        try:
            pbar.set_postfix_str(f"video_id={vid}")

            chat_path_file = f"{CHAT_DIR}/{vid}.live_chat.json"
            chat_df   = read_youtube_chat_jsonl_to_df(fs, chat_path_file, vid)
            speech_df = load_youtube_transcript_to_df(fs, vid)

            merged = merge_chat_speech(chat_df, speech_df)
            chat_df = add_chunk_idx(chat_df)

            # Latest timestamp across both views decides how many chunks exist.
            max_t_candidates = [
                int(frame["t"].max()) for frame in (merged, chat_df) if not frame.empty
            ]
            max_t = max(max_t_candidates) if max_t_candidates else -1
            if max_t < 0:
                continue

            live_chunks = list(iter_chunk_indices(max_t, CHUNK_SECONDS))
            inner = tqdm(live_chunks, desc=f"write {vid}", leave=False)
            for ci in inner:
                # Slice this chunk out of both views.
                part_c = chat_df[chat_df["chunk_idx"] == ci] if not chat_df.empty else pd.DataFrame(columns=chat_df.columns)
                part_m = merged[merged["chunk_idx"] == ci] if not merged.empty else pd.DataFrame(columns=merged.columns)

                # Filter on the total chat-only content length.
                chat_chars = chars_sum_of_contents(part_c)
                if chat_chars < MIN_CHATONLY_CHARS:
                    skipped_filter += 1
                    inner.set_postfix_str(f"filter< {MIN_CHATONLY_CHARS} chars (chat_only) at chunk {ci}")
                    continue

                # Render the output lines.
                lines_c = build_text_lines(part_c.assign(source="CHAT") if not part_c.empty else part_c)
                lines_m = build_text_lines(part_m)

                if not lines_c:
                    empty_chunks += 1
                    inner.set_postfix_str(f"empty chat_only lines at chunk {ci}")
                    continue
                if not lines_m:
                    # Should not normally happen; checked to stay on the safe side.
                    empty_chunks += 1
                    inner.set_postfix_str(f"empty merged lines at chunk {ci}")
                    continue

                # Save both flavours.
                out_chat = output_path(OUT_CHATONLY, vid, ci, OUT_EXT)
                out_comb = output_path(OUT_COMBINED, vid, ci, OUT_EXT)

                ok_c = write_text_gcs(fs, out_chat, "\n".join(lines_c), overwrite=OVERWRITE)
                ok_m = write_text_gcs(fs, out_comb, "\n".join(lines_m), overwrite=OVERWRITE)

                created += int(ok_c) + int(ok_m)
                # Files skipped because they already exist (overwrite off).
                skipped_exist += int((not ok_c)) + int((not ok_m))

                accepted_records.append((vid, ci, out_chat, out_comb, chat_chars))
                inner.set_postfix_str(f"wrote chunk {ci}")

        except Exception as e:
            # One broken video must not abort the whole run.
            errors.append((vid, str(e)))
        finally:
            # Close the per-video bar even when the video failed midway.
            if inner is not None:
                inner.close()

    pbar.close()
    print("\n=== 完了レポート（done=7件 & フィルタ適用） ===")
    print(f"作成: {created} / 既存スキップ: {skipped_exist} / フィルタ不採用: {skipped_filter} / 空チャンク: {empty_chunks}")
    if errors:
        print(f"\nエラー {len(errors)} 件（一部抜粋）:")
        for vid, msg in errors[:20]:
            print(f"  video_id={vid}: {msg}")

    # ---- Overview of accepted chunks ----
    if not accepted_records:
        print("\n[採用チャンクなし] フィルタ閾値を下げるか、対象を見直してください。")
        return

    # Accepted chunk count per video.
    df_acc = pd.DataFrame(accepted_records, columns=["video_id","chunk_idx","chat_path","combined_path","chat_chars"])
    agg = df_acc.groupby("video_id").size().rename("accepted_chunks").reset_index()
    print("\n=== 採用チャンク件数（video_id別） ===")
    print(agg.to_string(index=False))

    # ---- Sample display (reads the saved files back) ----
    # Take at most SAMPLE_TAKE entries from the top of the sorted list.
    df_acc_sorted = df_acc.sort_values(["video_id","chunk_idx"]).head(SAMPLE_TAKE)

    print(f"\n=== サンプル表示: 採用チャンク 最大 {SAMPLE_TAKE} 件 / 各ファイル 先頭 {SAMPLE_MAX_LINES} 行 ===")
    for row in df_acc_sorted.itertuples(index=False):
        vid, ci, chat_path, comb_path, chars_ = row
        print("\n" + "="*120)
        print(f"video_id={vid}  chunk={ci}  chat_chars={chars_}")

        # chat_only
        print(f"\n[chat_only] {chat_path}")
        _print_file_head(fs, chat_path, SAMPLE_MAX_LINES)

        # chat+audio
        print(f"\n[chat+audio] {comb_path}")
        _print_file_head(fs, comb_path, SAMPLE_MAX_LINES)

    print("\n--- 完了 ---")

# Entry point: run the pipeline when executed as a script
if __name__ == "__main__":
    main()
