# In [None]:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-

"""
01_planner_daily.py  (start-only 7-day filter + dedupe ledger + daily cap)
- YouTube Data API v3 を使って、キーワード=「雑談」× 過去DAYS日「開始」の completedライブを収集
- 抽出条件:
    1) eventType=completed（終了済みライブ）
    2) actualStartTime と actualEndTime の両方が存在
    3) actualStartTime が「今(UTC)から DAYS 日以内」
- その後、viewCount 降順で上位 DAILY_TARGET 件（新規のみ）を選抜
- 重複防止: BASE_DIR/_ledger/collected_video_ids.csv で過去収集済みの video_id を除外
- 出力:
    BASE_DIR/run_id/
      ├─ manifests/
      │    ├─ manifest_<run_id>.csv
      │    └─ assignments_audio_<run_id>.csv  … shard_id(0..SHARDS-1) にラウンドロビン割当
      └─ （ワーカーが audio/chat/meta/logs/tmp を後で自動生成）
"""

from __future__ import annotations
import os, sys, time, csv, datetime as dt, requests, re
from typing import List, Dict, Optional, Set

# tqdm is an optional dependency: assume it is available, and clear the flag
# if the import fails so callers can fall back to plain iteration.
HAS_TQDM = True
try:
    from tqdm import tqdm
except Exception:
    HAS_TQDM = False

# ====== Output location (absolute path) ======
# Placeholder text: replace with the absolute path of the directory to use.
BASE_DIR = "ここに使う予定のディレクトリの絶対パスを入れてください"
# =================================

# ====== Settings ======
# Placeholder text: replace with the real key. Note that the placeholder is a
# non-empty string, so an emptiness check alone will not detect it.
API_KEY = "ここに機密コードを入れてください"  # <- YouTube Data API key goes here
QUERY = "雑談"              # search keyword sent as the `q` parameter
REGION_CODE = "JP"          # sent as the `regionCode` parameter
DAYS = 7                    # keep only streams whose actual start is within this many days
ORDER_MODE = "viewCount"   # 'viewCount' or 'date'
SEARCH_PAGE_LIMIT = 40     # max number of search pages to request (up to 50 results per page)
DAILY_TARGET = 500        # cap on newly selected videos per run

SLEEP = 0.10               # pause between API calls, in seconds
SHARDS = 2                 # number of shards; videos are assigned shard_id 0..SHARDS-1
# ===================

# Base URL of the YouTube Data API v3 endpoints.
YAPI = "https://www.googleapis.com/youtube/v3"

# Pattern for RFC 3339 timestamps that end in "Z", with or without fractional seconds.
_RFC3339_Z = re.compile(r"^\d{4}-\d{2}-\d{2}T\d{2}:\d{2}:\d{2}(?:\.\d+)?Z$")

def ensure_dir(p: str):
    """Create directory *p* (including missing parents); do nothing if it exists."""
    os.makedirs(name=p, exist_ok=True)

def parse_rfc3339_to_utc_naive(s: str) -> Optional[dt.datetime]:
    """Parse an RFC 3339 timestamp into a naive datetime expressed in UTC.

    Accepts a trailing ``Z``/``z`` or a numeric offset, with or without
    fractional seconds of any length. A timestamp that carries no offset at
    all is treated as UTC rather than local time, so the result does not
    depend on the host's time zone.

    Args:
        s: timestamp text such as ``"2024-01-02T03:04:05Z"``.

    Returns:
        The instant as a ``datetime`` with ``tzinfo=None`` in UTC, or ``None``
        when *s* is empty, not a string, or cannot be parsed.
    """
    if not s or not isinstance(s, str):
        return None

    text = s
    if text[-1] in ("Z", "z"):
        text = text[:-1] + "+00:00"

    # Older datetime.fromisoformat() only accepts exactly 3 or 6 fractional
    # digits, so normalise whatever was given to 6 (microsecond precision).
    text = re.sub(
        r"(?<=:\d{2})\.(\d+)",
        lambda m: "." + m.group(1)[:6].ljust(6, "0"),
        text,
        count=1,
    )

    try:
        parsed = dt.datetime.fromisoformat(text)
        if parsed.tzinfo is None:
            parsed = parsed.replace(tzinfo=dt.timezone.utc)
        return parsed.astimezone(dt.timezone.utc).replace(tzinfo=None)
    except (ValueError, OverflowError):
        # ValueError: malformed text; OverflowError: offset pushes the value
        # outside the range datetime can represent.
        return None

def now_utc() -> dt.datetime:
    """Return the current time in UTC as a naive datetime (``tzinfo`` is None).

    Built from an aware "now" and then stripped of its tzinfo, which yields the
    same value as ``datetime.utcnow()`` without relying on that deprecated call.
    """
    return dt.datetime.now(dt.timezone.utc).replace(tzinfo=None)

def within_days_from_start_only(start_iso: str, days: int) -> bool:
    """Tell whether a start timestamp lies no more than *days* days before now (UTC).

    Args:
        start_iso: start time as timestamp text, parsed with
            parse_rfc3339_to_utc_naive().
        days: size of the look-back window in days.

    Returns:
        True when (now - start) <= days; False when the timestamp cannot be
        parsed or is older than the window.
    """
    started_at = parse_rfc3339_to_utc_naive(start_iso)
    if started_at is not None:
        age = now_utc() - started_at
        return age <= dt.timedelta(days=days)
    return False

def http_get(url, params):
    """Send a GET request and return the response body decoded from JSON.

    Args:
        url: endpoint to request.
        params: query-string parameters passed to ``requests.get``.

    The request uses a 30-second timeout, and ``raise_for_status()`` is called
    before the body is decoded, so error statuses surface as exceptions.
    """
    response = requests.get(url, params=params, timeout=30)
    response.raise_for_status()
    payload = response.json()
    return payload

def url_of(vid: str) -> str:
    """Return the YouTube watch-page URL for video ID *vid*."""
    return "https://www.youtube.com/watch?v={}".format(vid)

# ---------- ledger ----------
def ledger_path() -> str:
    """Return the path of the dedupe ledger CSV, creating its directory if needed.

    The ledger lives at BASE_DIR/_ledger/collected_video_ids.csv.
    """
    ledger_dir = os.path.join(BASE_DIR, "_ledger")
    os.makedirs(ledger_dir, exist_ok=True)
    return os.path.join(ledger_dir, "collected_video_ids.csv")

def load_collected_ids() -> Set[str]:
    """Read the ledger and return the set of video IDs recorded in it.

    Returns an empty set when the ledger file does not exist yet. Values in
    the ``video_id`` column are stripped, and only those consisting of exactly
    11 word/hyphen characters are kept.
    """
    path = ledger_path()
    if not os.path.exists(path):
        return set()

    id_pattern = re.compile(r"^[\w-]{11}$")
    with open(path, "r", encoding="utf-8") as f:
        candidates = (
            (record.get("video_id") or "").strip()
            for record in csv.DictReader(f)
        )
        return {vid for vid in candidates if id_pattern.fullmatch(vid)}

def append_to_ledger(run_id: str, rows: List[Dict]):
    """Append the selected videos to the ledger CSV.

    A header row is written first when the ledger file did not exist before
    this call. Every appended row is stamped with today's UTC date and the
    given run ID.

    Args:
        run_id: identifier of the current run.
        rows: dicts providing ``video_id``, ``channel``, ``title``,
            ``start_utc``, ``view_count`` and ``url``.
    """
    path = ledger_path()
    # Must be evaluated before open(): append mode creates the file.
    needs_header = not os.path.exists(path)

    columns = ["date_utc", "run_id", "video_id", "channel", "title", "start_utc", "view_count", "url"]
    copied_keys = ("video_id", "channel", "title", "start_utc", "view_count", "url")

    with open(path, "a", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=columns)
        if needs_header:
            writer.writeheader()
        stamp = now_utc().strftime("%Y-%m-%d")
        for entry in rows:
            record = {"date_utc": stamp, "run_id": run_id}
            record.update((key, entry[key]) for key in copied_keys)
            writer.writerow(record)

# ---------- YouTube API ----------
def search_completed_ids(order_mode: str) -> List[str]:
    """Collect video IDs of completed live broadcasts via the search endpoint.

    Requests at most SEARCH_PAGE_LIMIT pages of 50 results, following
    ``nextPageToken`` and sleeping SLEEP seconds between pages. QUERY and
    REGION_CODE are sent only when they are non-empty.

    Args:
        order_mode: value for the ``order`` parameter; 'viewCount' or 'date'.

    Returns:
        Video IDs in first-seen order with duplicates removed.

    Raises:
        ValueError: if *order_mode* is neither 'viewCount' nor 'date'.
    """
    if order_mode not in ("viewCount", "date"):
        raise ValueError("ORDER_MODE must be 'viewCount' or 'date'.")

    print(f"[search.list] q={QUERY or '-'} | region={REGION_CODE or '-'} | eventType=completed | order={order_mode}")

    raw_ids = []
    next_token = None
    page_no = 0
    while page_no < SEARCH_PAGE_LIMIT:
        query = {
            "key": API_KEY,
            "part": "snippet",
            "type": "video",
            "eventType": "completed",
            "order": order_mode,
            "maxResults": 50,
        }
        # Optional parameters are added only when they carry a value.
        optional = (("q", QUERY), ("regionCode", REGION_CODE), ("pageToken", next_token))
        query.update((name, value) for name, value in optional if value)

        payload = http_get(f"{YAPI}/search", query)
        candidates = ((entry.get("id") or {}).get("videoId") for entry in payload.get("items", []))
        found = [vid for vid in candidates if vid]
        raw_ids += found

        page_no += 1
        print(f"  - page {page_no:02d}: {len(found):2d} 件 / 累計(raw) {len(raw_ids):3d} 件")

        next_token = payload.get("nextPageToken")
        if not next_token:
            break
        time.sleep(SLEEP)

    # dict.fromkeys drops duplicates while keeping first-seen order.
    unique = list(dict.fromkeys(raw_ids))
    print(f"[search.list] 終了: ユニーク {len(unique)} 件（raw {len(raw_ids)} 件）")
    return unique

def fetch_videos_detail(video_ids: List[str]) -> List[Dict]:
    """Fetch snippet, statistics and liveStreamingDetails for the given videos.

    IDs are sent to the videos endpoint in batches of 50, with a pause of
    SLEEP seconds after each request. A tqdm progress bar is shown when tqdm
    is available.

    Args:
        video_ids: video IDs to look up.

    Returns:
        The ``items`` entries of all responses, concatenated in request order.
        An empty input yields an empty list without any request.
    """
    out: List[Dict] = []
    batch_starts = range(0, len(video_ids), 50)
    if HAS_TQDM:
        batch_starts = tqdm(batch_starts, desc="fetch videos detail (batches of 50)", unit="batch", leave=False)
    for i in batch_starts:
        # `maxResults` is intentionally not sent: the lookup is by explicit
        # `id` list, which already bounds the response to this batch.
        params = {
            "key": API_KEY,
            "part": "snippet,statistics,liveStreamingDetails",
            "id": ",".join(video_ids[i:i + 50]),
        }
        data = http_get(f"{YAPI}/videos", params)
        out.extend(data.get("items", []))
        time.sleep(SLEEP)
    if HAS_TQDM:
        # Move past the transient (leave=False) progress bar line.
        print()
    return out

def run_id_str() -> str:
    """Return a run identifier built from the current local time (YYYYMMDD_HHMMSS)."""
    return f"{dt.datetime.now():%Y%m%d_%H%M%S}"

def main():
    """Plan one collection run and write its manifest, assignments and ledger rows.

    Steps:
      1. Validate the configuration and create BASE_DIR/<run_id>/manifests.
      2. Load the ledger of video IDs selected by earlier runs.
      3. Search for completed live broadcasts, fetch their details, and keep
         those that have both actualStartTime and actualEndTime, started
         within DAYS days, and are not in the ledger.
      4. Sort by view count (descending) and keep the top DAILY_TARGET.
      5. Write the manifest CSV and the round-robin shard assignment CSV,
         append the selection to the ledger, and print a summary.

    Exits the process with status 2 on invalid configuration and with
    status 0 when no video passes the filters.
    """
    if not API_KEY:
        print("ERROR: API_KEY を設定してください。", file=sys.stderr)
        sys.exit(2)
    if SHARDS < 1:
        # Shard assignment computes i % SHARDS; reject a bad value before any
        # file is written instead of failing halfway through the output step.
        print("ERROR: SHARDS は 1 以上に設定してください。", file=sys.stderr)
        sys.exit(2)

    ensure_dir(BASE_DIR)

    rid = run_id_str()
    base = os.path.join(BASE_DIR, rid)
    mani_dir = os.path.join(base, "manifests")
    ensure_dir(mani_dir)

    print(f"🗓️ run_id: {rid}")
    print(f"📁 RUN_DIR: {base}")

    # Video IDs already selected by previous runs.
    collected = load_collected_ids()
    print(f"[ledger] 既存の収集済み video_id: {len(collected)} 件")

    ids = search_completed_ids(ORDER_MODE)
    videos = fetch_videos_detail(ids)

    rows = []
    it = videos
    if HAS_TQDM:
        it = tqdm(videos, desc="filter & collect", unit="vid", leave=False)

    skipped_no_time = 0
    skipped_old = 0
    skipped_dup = 0

    for v in it:
        sn = v.get("snippet") or {}
        st = v.get("statistics") or {}
        lsd = v.get("liveStreamingDetails") or {}

        start = lsd.get("actualStartTime") or ""
        end = lsd.get("actualEndTime") or ""

        # Both timestamps must be present.
        if not (start and end):
            skipped_no_time += 1
            continue
        # Only the start time decides whether the stream is recent enough.
        if not within_days_from_start_only(start, DAYS):
            skipped_old += 1
            continue

        vid = v.get("id")
        if vid in collected:
            skipped_dup += 1
            continue

        views = int(st.get("viewCount", "0") or 0)

        rows.append({
            "video_id": vid,
            "title": sn.get("title", ""),
            "channel": sn.get("channelTitle", ""),
            # `start` is non-empty and parseable here: both checks above passed.
            "start_utc": parse_rfc3339_to_utc_naive(start).strftime("%Y-%m-%dT%H:%M:%SZ"),
            "view_count": views,
            "url": url_of(vid),
        })
    if HAS_TQDM:
        print()

    if not rows:
        print(f"→ フィルタ後 0 件でした（条件: start & end が存在 かつ start が直近 {DAYS} 日 かつ 過去未収集）")
        # Show the exclusion counters so an empty result can be diagnosed.
        print(f"   - 除外: 時刻欠落 {skipped_no_time} / 古い {skipped_old} / 過去収集 {skipped_dup}")
        sys.exit(0)

    # Highest view count first, then cap the selection at DAILY_TARGET.
    rows.sort(key=lambda r: r["view_count"], reverse=True)
    rows = rows[:DAILY_TARGET]

    # Write the manifest.
    manifest_csv = os.path.join(mani_dir, f"manifest_{rid}.csv")
    with open(manifest_csv, "w", encoding="utf-8", newline="") as f:
        w = csv.DictWriter(f, fieldnames=["rank", "video_id", "channel", "title", "view_count", "start_utc", "url"])
        w.writeheader()
        for i, r in enumerate(rows, 1):
            w.writerow({"rank": i, **r})

    # Write the shard assignments (round-robin over 0..SHARDS-1).
    assign_csv = os.path.join(mani_dir, f"assignments_audio_{rid}.csv")
    with open(assign_csv, "w", encoding="utf-8", newline="") as f:
        w = csv.DictWriter(f, fieldnames=["video_id", "shard_id"])
        w.writeheader()
        for i, r in enumerate(rows):
            w.writerow({"video_id": r["video_id"], "shard_id": i % SHARDS})

    # Record only the videos selected in this run.
    append_to_ledger(rid, rows)

    print(f"🔎 search.list({ORDER_MODE}) ヒット（ユニーク）: {len(ids)} 件")
    print(f"   - 除外: 時刻欠落 {skipped_no_time} / 古い {skipped_old} / 過去収集 {skipped_dup}")
    print(f"📝 manifest 保存: {manifest_csv} | 今日の新規 {len(rows)} 件（上限 {DAILY_TARGET}）")
    print(f"🗂️ assignments 保存: {assign_csv} | shard=0..{SHARDS-1}")

    top_preview = min(10, len(rows))
    print(f"\n=== プレビュー（上位 {top_preview} 件 / view_count 降順, start基準, 新規のみ）=== ")
    print(f"{'Rank':>4}  {'Views':>10}  {'Channel':<28}  {'Title':<42}  URL")
    print("-"*120)
    for i, r in enumerate(rows[:top_preview], 1):
        ch = (r["channel"][:28]).ljust(28)
        ti = (r["title"][:42]).ljust(42)
        print(f"{i:>4}  {r['view_count']:>10,}  {ch}  {ti}  {r['url']}")

    # List the shard IDs that were actually assigned instead of assuming two.
    shard_list = ",".join(str(i) for i in range(SHARDS))
    print(f"\n✅ プランナー完了。次は **ワーカー (shard={shard_list})** で音声＋チャットを取得してください。")
    print(f"   - manifest : {manifest_csv}")
    print(f"   - assign   : {assign_csv}")
    print(f"   - 保存ルート: {base}")
    print(f"run id:{rid}")


if __name__ == "__main__":
    # Script entry point: run the planner and map failures to exit codes
    # (2 for HTTP error responses, 3 for anything else).
    try:
        main()
    except requests.HTTPError as e:
        print("HTTPError:", e.response.status_code, e.response.text[:400], file=sys.stderr)
        sys.exit(2)
    except Exception as e:
        # Every API request passes key=API_KEY in its query string, and the
        # text of a network-level exception can include the request URL.
        # Mask the key so it does not end up in logs or notebook output.
        detail = repr(e)
        if API_KEY:
            detail = detail.replace(API_KEY, "***")
        print("Error:", detail, file=sys.stderr)
        sys.exit(3)
