In [None]:
# ============================================================
# Reset & Rebuild: Twitch -> 10min Chunks (CHAT+SPEECH / CHAT-only)
# - 既存成果物(出力)を削除してから、ゼロから再作成
# - 入力(raw/chat, processed/whisper_tiny_csv)は読み込み専用
# ============================================================

from __future__ import annotations
import os, io, re, json, math
from typing import List, Dict, Optional, Iterable, Tuple

import pandas as pd
from tqdm.auto import tqdm
from google.cloud import storage
import gcsfs

# -----------------------
# Settings
# -----------------------
GCS_BUCKET      = "dena-ai-intern-yoshihara-data"
GCS_ROOT_PREFIX = "twitch_v2"
RUN_ID          = "20250910_190657"     # match the run_id of the run that failed
CHUNK_SECONDS   = 10 * 60               # 10 minutes

# Output destinations (same naming as Pococha).
# NOTE(review): "speeech" below is spelled with three e's; it is part of the
# storage path, so it is intentionally left as-is here.
OUT_COMBINED_PREFIX = "gs://dena-ai-intern-yoshihara-data/yoshi_LLMQA_twitch_comment_speeech_combined"
OUT_CHATONLY_PREFIX = "gs://dena-ai-intern-yoshihara-data/yoshi_LLMQA_twitch_comment_only"
OUT_EXT             = ".txt"

# (Optional) list VOD IDs here to restrict processing to them
VOD_WHITELIST: Optional[List[str]] = None   # e.g. ["2561964528"]

# Whether to actually run the deletion step (True = delete)
PURGE_OUTPUTS = True

# -----------------------
# GCS utilities
# -----------------------
# Module-level handles shared by the helpers below: the google-cloud-storage
# client/bucket are used for listing, existence checks, downloads and deletes;
# the gcsfs filesystem is used for existence checks and writes of "gs://" paths.
gcs_client = storage.Client()
bucket = gcs_client.bucket(GCS_BUCKET)
fs = gcsfs.GCSFileSystem()

def parse_gcs_path(gcs_path: str) -> Tuple[str, str]:
    """Split a ``gs://bucket/prefix`` URL into ``(bucket, prefix)``.

    The returned prefix is either empty (bucket root) or ends with exactly
    one "/", whether or not the input carried trailing slashes, so it can be
    concatenated directly with an object name.

    Raises:
        ValueError: if ``gcs_path`` does not start with ``gs://``.
    """
    scheme = "gs://"
    # Explicit raise instead of ``assert``: asserts are stripped under ``-O``.
    if not gcs_path.startswith(scheme):
        raise ValueError(f"not a GCS path (expected gs://...): {gcs_path!r}")
    bkt, _, prefix = gcs_path[len(scheme):].partition("/")
    # Normalise first, then re-append a single slash; previously an input that
    # already ended with "/" came back with the slash removed.
    prefix = prefix.rstrip("/")
    return bkt, (prefix + "/" if prefix else "")

def blob_exists(key: str) -> bool:
    """Return whether an object named ``key`` exists in the configured bucket."""
    blob = bucket.blob(key)
    return blob.exists()

def download_bytes(key: str) -> bytes:
    """Download the object ``key`` from the configured bucket as raw bytes.

    Raises:
        FileNotFoundError: if the existence check reports no such object.
    """
    blob = bucket.blob(key)
    if blob.exists():
        return blob.download_as_bytes()
    raise FileNotFoundError(f"not found: gs://{GCS_BUCKET}/{key}")

def download_csv_df(key: str) -> pd.DataFrame:
    """Download the object ``key`` and parse it as CSV with pandas defaults."""
    buffer = io.BytesIO(download_bytes(key))
    return pd.read_csv(buffer)

def whisper_csv_key(vod_id: str) -> str:
    """Return the object key of the Whisper CSV for ``vod_id`` under the current run."""
    run_root = f"{GCS_ROOT_PREFIX}/{RUN_ID}"
    return f"{run_root}/processed/whisper_tiny_csv/{vod_id}.csv"

def chat_json_key(vod_id: str) -> str:
    """Return the object key of the raw chat JSON for ``vod_id`` under the current run."""
    run_root = f"{GCS_ROOT_PREFIX}/{RUN_ID}"
    return f"{run_root}/raw/chat/{vod_id}_chat.json"

def list_whisper_csv_vods(run_id: str) -> List[str]:
    """List VOD IDs that have a non-empty Whisper CSV under ``run_id``.

    The VOD ID is the CSV file name without its ".csv" extension. The result
    is de-duplicated and sorted.
    """
    prefix = f"{GCS_ROOT_PREFIX}/{run_id}/processed/whisper_tiny_csv/"
    vod_ids = set()
    for blob in gcs_client.list_blobs(GCS_BUCKET, prefix=prefix):
        filename = os.path.basename(blob.name)
        if not filename.endswith(".csv"):
            continue
        # Zero-byte (or size-less) objects are not usable transcripts.
        if (blob.size or 0) <= 0:
            continue
        vod_ids.add(filename[:-4])
    return sorted(vod_ids)

# -----------------------
# 時刻・テキスト整形
# -----------------------
_HMS_RE = re.compile(r"^(?P<h>\d{1,2}):(?P<m>\d{2}):(?P<s>\d{2})(?:\.(?P<ms>\d{1,3}))?$")

def hms_to_seconds(hms: str) -> int:
    """Convert an ``H:MM:SS`` / ``HH:MM:SS[.fff]`` string to whole seconds.

    Returns 0 for non-string input and for text that does not match the
    pattern. The result is truncated to an int, so the optional fractional
    part does not change the returned value.
    """
    matched = _HMS_RE.match(hms.strip()) if isinstance(hms, str) else None
    if matched is None:
        return 0
    parts = matched.groupdict(default="0")
    hours, minutes, seconds, millis = (int(parts[k]) for k in ("h", "m", "s", "ms"))
    total = hours * 3600 + minutes * 60 + seconds + (millis / 1000.0)
    return int(total)

def to_int_seconds_safe(x) -> int:
    """Coerce ``x`` to a non-negative whole number of seconds.

    Anything that cannot be converted (including NaN and infinities) yields 0;
    negative values are clamped to 0 and fractions are truncated.
    """
    try:
        value = float(x)
        return 0 if math.isnan(value) else max(0, int(value))
    except Exception:
        return 0

def _join_fragment_texts(frags) -> str:
    """Concatenate the "text" of every dict fragment; anything else is ignored."""
    if not isinstance(frags, list):
        return ""
    return "".join(str(f.get("text") or "") for f in frags if isinstance(f, dict))


def extract_chat_text(m: Dict) -> str:
    """Extract the message body from one chat record.

    Lookup order:
      1. ``m["message"]`` when it is a dict: its ``fragments`` texts joined,
         then its ``body``, then its ``text``.
      2. ``m["message"]`` when it is a plain string.
      3. Top-level ``text`` / ``body`` / ``content``, then top-level
         ``fragments``.

    Fragments that are not dicts are skipped instead of raising, and an empty
    ``fragments`` list no longer hides a populated ``body``/``text``.

    Returns:
        The stripped text, or "" when nothing usable is found.
    """
    msg = m.get("message")
    text = ""
    if isinstance(msg, dict):
        text = (
            _join_fragment_texts(msg.get("fragments"))
            or msg.get("body")
            or msg.get("text")
            or ""
        )
    elif isinstance(msg, str):
        text = msg
    if not text:
        text = (
            m.get("text")
            or m.get("body")
            or m.get("content")
            or _join_fragment_texts(m.get("fragments"))
        )
    return str(text).strip()

# -----------------------
# ローダ
# -----------------------
def load_chat_df(vod_id: str) -> pd.DataFrame:
    """Load the chat JSON of one VOD into a time-sorted DataFrame.

    The message list is taken from the JSON root when it is a list; when the
    root is a dict, from ``"messages"``, then ``"data"``, then the first value
    that is a list. Records that are not dicts or have no text are dropped.

    The timestamp is the first non-None of ``offset_seconds``, ``offsetSecs``,
    ``offset``, ``elapsed_seconds``, ``t``. When that is missing or 0, an
    ``H:MM:SS``-style string from ``relative_time`` / ``time`` /
    ``display_time`` is tried. The final fallback is 0.

    Returns:
        DataFrame with columns ``source`` ("CHAT"), ``t`` (seconds),
        ``content`` and ``orig_idx`` (position in the message list), sorted
        by ``t`` then ``orig_idx``.

    Raises:
        ValueError: if no message list can be located in the JSON.
    """
    columns = ["source", "t", "content", "orig_idx"]

    payload = download_bytes(chat_json_key(vod_id))
    try:
        parsed = json.loads(payload.decode("utf-8"))
    except Exception:
        # Second attempt drops undecodable bytes rather than failing on them.
        parsed = json.loads(payload.decode("utf-8", errors="ignore"))

    # Locate the list of message records inside the decoded JSON.
    if isinstance(parsed, list):
        messages = parsed
    elif isinstance(parsed, dict):
        messages = next(
            (parsed[k] for k in ("messages", "data") if isinstance(parsed.get(k), list)),
            None,
        )
        if messages is None:
            messages = next((v for v in parsed.values() if isinstance(v, list)), None)
        if messages is None:
            raise ValueError("CHAT JSON のリストが見つかりません")
    else:
        raise ValueError("CHAT JSON の形式が不正です")

    offset_keys = ("offset_seconds", "offsetSecs", "offset", "elapsed_seconds", "t")
    records = []
    for idx, entry in enumerate(messages):
        if not isinstance(entry, dict):
            continue
        body = extract_chat_text(entry)
        if not body:
            continue

        raw_offset = next(
            (entry[k] for k in offset_keys if entry.get(k) is not None), None
        )
        seconds = None if raw_offset is None else to_int_seconds_safe(raw_offset)
        if not seconds:
            # No numeric offset (or it is 0): try a clock-style string instead.
            clock = entry.get("relative_time") or entry.get("time") or entry.get("display_time")
            if isinstance(clock, str) and ":" in clock:
                seconds = hms_to_seconds(clock)

        records.append(
            {"source": "CHAT", "t": seconds or 0, "content": body, "orig_idx": idx}
        )

    frame = pd.DataFrame(records, columns=columns)
    if frame.empty:
        return pd.DataFrame(columns=columns)
    # Stable sort keeps the original order among equal keys.
    return frame.sort_values(["t", "orig_idx"], kind="mergesort").reset_index(drop=True)

def load_speech_df(vod_id: str) -> pd.DataFrame:
    """Load the Whisper transcript CSV of one VOD into a time-sorted DataFrame.

    The CSV must contain ``timestamp_start`` and ``transcription`` columns.
    Rows whose transcription is missing (NaN) or blank are dropped; missing
    cells used to be stringified and emitted as the literal text "nan".
    ``timestamp_start`` is parsed with ``hms_to_seconds`` (unparseable → 0).

    Returns:
        DataFrame with columns ``source`` ("SPEECH"), ``t`` (seconds),
        ``content`` and ``orig_idx`` (row position in the CSV), sorted by
        ``t`` then ``orig_idx``.

    Raises:
        ValueError: if a required column is absent.
    """
    columns = ["source", "t", "content", "orig_idx"]

    df = download_csv_df(whisper_csv_key(vod_id))
    # Sorted so the error message is deterministic.
    missing = sorted({"timestamp_start", "transcription"} - set(df.columns))
    if missing:
        raise ValueError(f"Whisper CSV 列不足: {missing}")

    rows = []
    pairs = zip(df["timestamp_start"].tolist(), df["transcription"].tolist())
    for i, (start, transcription) in enumerate(pairs):
        # Empty CSV cells are read as NaN by pandas; skip them explicitly.
        if pd.isna(transcription):
            continue
        text = str(transcription).strip()
        if not text:
            continue
        rows.append(
            {"source": "SPEECH", "t": hms_to_seconds(str(start)), "content": text, "orig_idx": i}
        )

    out = pd.DataFrame(rows, columns=columns)
    if out.empty:
        return pd.DataFrame(columns=columns)
    return out.sort_values(["t", "orig_idx"], kind="mergesort").reset_index(drop=True)

# -----------------------
# チャンク & 書き出し
# -----------------------
def ensure_prefix_no_trailing_slash(prefix: str) -> str:
    """Drop a single trailing "/" from ``prefix`` if there is one."""
    if prefix.endswith("/"):
        return prefix[:-1]
    return prefix

def out_path(prefix: str, vod_id: str, chunk_idx: int, ext: str = OUT_EXT) -> str:
    """Build the output path ``<prefix>/<vod_id>_<chunk_idx><ext>`` for one chunk."""
    base = ensure_prefix_no_trailing_slash(prefix)
    filename = f"{vod_id}_{chunk_idx}{ext}"
    return f"{base}/{filename}"

def build_lines(df: pd.DataFrame) -> List[str]:
    """Render each row as ``"[<source>] <content>"`` on a single line.

    Newlines inside the content are replaced by spaces and the result is
    stripped. An empty frame yields an empty list.
    """
    if df.empty:
        return []

    def flatten(text) -> str:
        return str(text).replace("\n", " ").strip()

    return [f"[{row.source}] {flatten(row.content)}" for row in df.itertuples(index=False)]

def iter_chunk_indices(max_t: int, chunk_seconds: int = CHUNK_SECONDS) -> Iterable[int]:
    """Return the 1-based chunk indices needed to cover times ``0..max_t``.

    A negative ``max_t`` means there is nothing to cover and yields no indices.
    """
    if max_t >= 0:
        chunk_count = max_t // chunk_seconds + 1
        return range(1, chunk_count + 1)
    return []

def _write_chunk_if_absent(path: str, part: pd.DataFrame) -> Tuple[int, int]:
    """Write one chunk file unless it already exists; return (created, skipped) deltas."""
    if fs.exists(path):
        return 0, 1
    lines = build_lines(part)
    if not lines:
        # Nothing falls into this chunk: no file is written, nothing is counted.
        return 0, 0
    with fs.open(path, "w", encoding="utf-8") as f:
        # build_lines() removes newlines from every line, so the joined text
        # never ends with "\n" already; append exactly one terminator.
        f.write("\n".join(lines) + "\n")
    return 1, 0


def export_vod(vod_id: str) -> Tuple[int,int]:
    """Process one VOD and write its 10-minute chunk files to GCS.

    For every chunk index two files may be written: a combined CHAT+SPEECH
    file and a chat-only file. A file that already exists is skipped, and a
    chunk with no lines for a given output is not written.

    A VOD that has speech but no usable chat messages is handled: the
    chat-only frame always gets a ``chunk_idx`` column, so selecting a chunk
    from it no longer raises ``KeyError``.

    Returns:
        ``(created, skipped)``: number of files written and number of files
        skipped because they already existed.
    """
    created = 0; skipped = 0

    if not blob_exists(chat_json_key(vod_id)):
        tqdm.write(f"⚠️ chat なし: {vod_id}"); return created, skipped
    if not blob_exists(whisper_csv_key(vod_id)):
        tqdm.write(f"⚠️ whisper CSV なし: {vod_id}"); return created, skipped

    chat_df   = load_chat_df(vod_id)
    speech_df = load_speech_df(vod_id)

    chat_df   = chat_df[chat_df["content"].astype(str).str.len() > 0]
    speech_df = speech_df[speech_df["content"].astype(str).str.len() > 0]

    # Concatenate only non-empty frames so an empty (object-dtype) frame
    # cannot change the dtype of the time column.
    frames = [df for df in (chat_df, speech_df) if not df.empty]
    if not frames:
        tqdm.write(f"ℹ️ 有効テキストなし: {vod_id}")
        return created, skipped

    merged = pd.concat(frames, ignore_index=True)
    # At equal timestamps CHAT lines come before SPEECH lines.
    source_order = merged["source"].map({"CHAT":0, "SPEECH":1}).fillna(2).astype(int)
    merged = (
        merged.assign(_o=source_order)
        .sort_values(["t","_o","orig_idx"], kind="mergesort")
        .drop(columns=["_o"])
    )
    merged = merged.assign(
        chunk_idx=(merged["t"].astype("int64") // CHUNK_SECONDS) + 1
    )

    # Always add chunk_idx, even when chat_df is empty, so the per-chunk
    # selection below is valid for speech-only VODs.
    chat_df = chat_df.sort_values(["t","orig_idx"], kind="mergesort").reset_index(drop=True)
    chat_df = chat_df.assign(
        chunk_idx=(chat_df["t"].astype("int64") // CHUNK_SECONDS) + 1
    )

    # merged contains every chat row, so its maximum covers both outputs.
    max_t = int(merged["t"].max())

    for idx in iter_chunk_indices(max_t, CHUNK_SECONDS):
        targets = (
            (out_path(OUT_COMBINED_PREFIX, vod_id, idx, OUT_EXT), merged),   # Combined
            (out_path(OUT_CHATONLY_PREFIX, vod_id, idx, OUT_EXT), chat_df),  # Chat-only
        )
        for path, frame in targets:
            c, s = _write_chunk_if_absent(path, frame[frame["chunk_idx"] == idx])
            created += c
            skipped += s

    return created, skipped

# -----------------------
# 成果物の全削除（Purge）
# -----------------------
def purge_outputs(out_prefix_gs: str, vod_whitelist: Optional[List[str]] = None) -> int:
    """Delete the ``.txt`` artifacts under a ``gs://bucket/prefix`` location.

    When ``vod_whitelist`` is given, only objects whose key starts with
    ``<prefix>/<vod_id>_`` for one of the listed IDs are deleted; otherwise
    every ``.txt`` object under the prefix is deleted.

    Returns:
        The number of objects actually deleted (failed deletions are reported
        on stdout and not counted).

    Raises:
        ValueError: if the path points at a bucket other than ``GCS_BUCKET``.
    """
    bkt, pref = parse_gcs_path(out_prefix_gs)
    # Explicit check instead of ``assert`` (asserts vanish under ``-O``):
    # deletions below go through the module-level ``bucket`` handle, so a
    # mismatching bucket must never get this far.
    if bkt != GCS_BUCKET:
        raise ValueError(f"想定外のバケット: {bkt}")
    # Make sure the prefix is directory-like so that listing "foo" cannot
    # also match sibling prefixes such as "foo_backup/".
    if pref and not pref.endswith("/"):
        pref += "/"

    if vod_whitelist:
        search_prefixes = [f"{pref}{vid}_" for vid in set(vod_whitelist)]
    else:
        search_prefixes = [pref]

    to_delete = [
        bl.name
        for search_prefix in search_prefixes
        for bl in gcs_client.list_blobs(bkt, prefix=search_prefix)
        if bl.name.endswith(".txt")
    ]

    if not to_delete:
        print(f"🧹 削除対象なし: {out_prefix_gs}")
        return 0

    print(f"🧹 削除対象 {len(to_delete)} 件: {out_prefix_gs}")
    deleted = 0
    for name in tqdm(to_delete, desc=f"delete {os.path.basename(out_prefix_gs)}", unit="file"):
        try:
            bucket.blob(name).delete()
        except Exception as e:
            # Best effort: report and continue with the remaining objects.
            print(f"  ⚠️ delete失敗: gs://{GCS_BUCKET}/{name} ({e})")
        else:
            deleted += 1
    return deleted

# -----------------------
# メイン
# -----------------------
def main():
    """Purge previous outputs (if enabled) and rebuild all chunk files.

    Steps:
      1. Collect VODs that have a Whisper CSV, optionally restricted to
         ``VOD_WHITELIST``, and keep only those that also have chat JSON.
      2. When ``PURGE_OUTPUTS`` is set, delete existing output files (only
         the selected VODs' files when a whitelist is in use).
      3. Export every selected VOD and print a summary.
    """
    # 1) Candidate VODs (those with a Whisper CSV)
    candidates = list_whisper_csv_vods(RUN_ID)
    if VOD_WHITELIST:
        allowed = set(VOD_WHITELIST)
        candidates = [v for v in candidates if v in allowed]

    # Keep only VODs for which both inputs exist.
    vods_ok = [
        vid
        for vid in candidates
        if blob_exists(chat_json_key(vid)) and blob_exists(whisper_csv_key(vid))
    ]
    if not vods_ok:
        print("処理対象 VOD なし（chat と whisper CSV の両方が必要）")
        return

    # 2) Remove previous outputs so the rebuild starts from scratch
    if PURGE_OUTPUTS:
        print("=== 成果物の全削除を実行します（入力は変更しません）===")
        purge_scope = vods_ok if VOD_WHITELIST else None
        n1 = purge_outputs(OUT_COMBINED_PREFIX, purge_scope)
        n2 = purge_outputs(OUT_CHATONLY_PREFIX, purge_scope)
        print(f"削除サマリ: combined={n1}, chatonly={n2}")

    # 3) Rebuild
    total_created = 0
    total_skipped = 0
    pbar = tqdm(vods_ok, desc="rebuild VODs", unit="vod")
    for vid in pbar:
        made, kept = export_vod(vid)
        total_created += made
        total_skipped += kept
        pbar.set_postfix_str(f"created={total_created} skipped={total_skipped}")

    print("\n=== 完了レポート ===")
    print(f"作成ファイル数: {total_created}")
    print(f"スキップ数    : {total_skipped}（存在チェックにより）")
    print(f"出力先 Combined: {OUT_COMBINED_PREFIX}")
    print(f"出力先 ChatOnly: {OUT_CHATONLY_PREFIX}")

# Run: main() is invoked as soon as this code is evaluated. With
# PURGE_OUTPUTS enabled, that includes deleting existing output files.
main()
