In [None]:
# cell 0

import os, shutil, subprocess, json, pathlib, re, time, hashlib
from datetime import datetime
from typing import Optional, Tuple
import pytz, requests, pandas as pd
from tqdm.auto import tqdm
from google.cloud import storage

def _run(cmd):
    """Run ``cmd`` as a subprocess and return the ``CompletedProcess``.

    stderr is merged into stdout and the output is decoded as text, so callers
    read everything the command printed from ``.stdout``. A non-zero exit code
    does not raise; inspect ``.returncode`` instead.
    """
    completed = subprocess.run(
        cmd,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
    )
    return completed

def ffmpeg_sanity():
    """Check that ffmpeg and ffprobe are on PATH and that ffmpeg can encode audio.

    Prints the resolved executable paths and their version banners, synthesises a
    2-second 440 Hz mono WAV, converts it to AAC/M4A, and finally removes both
    test files.

    Raises:
        RuntimeError: if either executable is missing, or if the WAV generation
            or the M4A conversion fails.
    """
    work_dir = pathlib.Path("temp_downloads_v2/ffmpeg_sanity")
    work_dir.mkdir(parents=True, exist_ok=True)
    wav = work_dir / "tone_440hz_2s.wav"
    m4a = work_dir / "tone_440hz_2s.m4a"

    ff = shutil.which("ffmpeg")
    fp = shutil.which("ffprobe")
    print("ffmpeg  :", ff or "NOT FOUND")
    print("ffprobe :", fp or "NOT FOUND")
    if not (ff and fp):
        raise RuntimeError("ffmpeg/ffprobe が見つかりません。PATH を確認してください。")

    # First line of each tool's "-version" output.
    for tool in (ff, fp):
        print(_run([tool, "-version"]).stdout.splitlines()[0])

    base_args = [ff, "-hide_banner", "-nostdin", "-y"]

    # Step 1: synthesise a sine tone with the lavfi source into 16 kHz mono PCM.
    synth = _run(base_args + [
        "-f", "lavfi", "-i", "sine=frequency=440:duration=2",
        "-ac", "1", "-ar", "16000", "-c:a", "pcm_s16le", str(wav),
    ])
    if synth.returncode != 0 or not wav.exists():
        print(synth.stdout)
        raise RuntimeError("WAV生成失敗")

    # Step 2: transcode the WAV to AAC in an M4A container.
    encode = _run(base_args + [
        "-i", str(wav), "-c:a", "aac", "-b:a", "128k", "-movflags", "faststart", str(m4a),
    ])
    if encode.returncode != 0 or not m4a.exists():
        print(encode.stdout)
        raise RuntimeError("M4A変換失敗")

    print("✅ ffmpeg/ffprobe OK")

    # Best-effort cleanup of the generated test files (saves local disk space).
    try:
        for artifact in (wav, m4a):
            artifact.unlink(missing_ok=True)
    except Exception:
        pass

# Run the ffmpeg/ffprobe self-check when this cell executes; it raises RuntimeError on failure.
ffmpeg_sanity()


In [None]:
# cell 1

# Twitch API credentials: register an application with Twitch to obtain them.
# They are read from the TWITCH_CLIENT_ID / TWITCH_CLIENT_SECRET environment
# variables so the secrets do not have to be saved inside the notebook. When a
# variable is not set, the original placeholder text is used and must be replaced.
CLIENT_ID = os.environ.get("TWITCH_CLIENT_ID", 'ここに機密コードを入れます')
CLIENT_SECRET = os.environ.get("TWITCH_CLIENT_SECRET", 'ここに機密コードを入れます')

GCS_BUCKET = "dena-ai-intern-yoshihara-data"
GCS_ROOT_PREFIX = "twitch_v2"   # v2 root prefix inside the bucket
GLOBAL_SEEN_KEY = f"{GCS_ROOT_PREFIX}/global/seen_vod_ids.csv"  # used for duplicate detection across runs

TARGET_GAME_NAME = "Just Chatting"
TARGET_LANGUAGE  = "ja"
TARGET_PERIOD    = "week"
TARGET_SORT      = "views"
MAX_RESULTS      = 500      # number of items requested from the API per daily run (exactly 500)
TYPE_FILTER      = "archive"

# Path to the TwitchDownloaderCLI binary; override with TWITCHDL_PATH on other machines.
TWITCHDL = os.environ.get("TWITCHDL_PATH", "/home/jupyter/Twitch_data_collection/TwitchDownloaderCLI")

API_BASE_URL = "https://api.twitch.tv/helix"
AUTH_URL     = "https://id.twitch.tv/oauth2/token"
HEADERS = {}  # filled in by authenticate()

# Local working directories, created next to the notebook's working directory.
BASE_DIR  = os.path.abspath(".")
TMP_DIR   = os.path.join(BASE_DIR, "temp_downloads_v2")
STATE_DIR = os.path.join(BASE_DIR, "state_v2")
os.makedirs(TMP_DIR, exist_ok=True); os.makedirs(STATE_DIR, exist_ok=True)

# run_id is a JST timestamp; it names this run's files and its GCS sub-prefix.
JST = pytz.timezone("Asia/Tokyo")
run_id = datetime.now(JST).strftime("%Y%m%d_%H%M%S")
print("✅ 設定読込OK | run_id:", run_id)


In [None]:
# cell 2
def authenticate() -> bool:
    """Obtain a Twitch app access token and store the request headers in ``HEADERS``.

    Uses the OAuth client-credentials grant with ``CLIENT_ID`` / ``CLIENT_SECRET``.
    The credentials are sent in the form-encoded request body rather than in the
    URL query string, so the secret does not end up in the request URL.

    Returns:
        True when a token was obtained and ``HEADERS`` was updated; False when the
        credentials are missing or still placeholders, or when the request fails
        for any reason (network error, timeout, HTTP error, malformed response).
        ``HEADERS`` is left unchanged on failure.
    """
    global HEADERS
    # Placeholder values that mean "not configured yet".
    placeholders = {"YOUR_TWITCH_CLIENT_ID", "ここに機密コードを入れます"}
    if (not CLIENT_ID or not CLIENT_SECRET
            or CLIENT_ID in placeholders or CLIENT_SECRET in placeholders):
        print("❌ CLIENT_ID/SECRET を設定してください"); return False
    r = None  # stays None if the request itself raises, so the except branch can still report
    try:
        r = requests.post(AUTH_URL, data={
            "client_id": CLIENT_ID, "client_secret": CLIENT_SECRET, "grant_type": "client_credentials"
        }, timeout=30)
        r.raise_for_status()
        token = r.json()["access_token"]
        HEADERS = {"Client-ID": CLIENT_ID, "Authorization": f"Bearer {token}"}
        print("✅ Twitch 認証OK")
        return True
    except Exception as e:
        print("❌ 認証失敗:", e, "| resp:", getattr(r, "text", ""))
        return False

# Authenticate when this cell runs. The boolean result is not checked here, so a
# failed login does not raise at this point; it is only reported by the printed message.
authenticate()

def get_game_id_by_name(name: str) -> Optional[str]:
    """Look up the Twitch game/category id for ``name`` via the ``/games`` endpoint.

    Any error during the lookup is printed and swallowed. If the lookup fails or
    returns no match, "Just Chatting" (case-insensitive) falls back to the
    hard-coded id "509658"; every other name yields None.
    """
    try:
        resp = requests.get(
            f"{API_BASE_URL}/games",
            headers=HEADERS,
            params={"name": name},
            timeout=30,
        )
        resp.raise_for_status()
        matches = resp.json().get("data", [])
        if matches:
            gid = matches[0]["id"]
            print(f"🎮 {name} -> game_id={gid}")
            return gid
    except Exception as err:
        print("⚠️ get_game_id_by_name失敗:", err)

    # Lookup failed or found nothing: only the well-known category has a fallback.
    if name.lower() != "just chatting":
        return None
    print("↩️ fallback: Just Chatting = 509658")
    return "509658"

# Matches duration strings such as "2h13m5s", "3m21s" or "45s"; every part is optional.
DUR_RE = re.compile(r"(?:(\d+)h)?(?:(\d+)m)?(?:(\d+)s)?")
def twitch_duration_to_seconds(s: str) -> int:
    """Convert a duration string like "2h13m5s" to a number of seconds.

    Returns 0 for anything that cannot be parsed: None, an empty string, a
    non-string value (for example the NaN pandas uses for missing cells), or text
    that does not match the ``<h>h<m>m<s>s`` pattern. Surrounding whitespace is
    ignored.
    """
    # Non-string input (None, NaN, numbers) must not reach the regex, which would raise TypeError.
    if not isinstance(s, str):
        return 0
    text = s.strip()
    if not text:
        return 0
    m = DUR_RE.fullmatch(text)
    if not m:
        return 0
    h, mn, se = (int(x) if x else 0 for x in m.groups())
    return h*3600 + mn*60 + se

def seconds_to_hms(sec: int) -> str:
    """Format a whole number of seconds as ``HH:MM:SS``.

    Hours are not wrapped at 24, and each field is zero-padded to at least two digits.
    """
    hours, remainder = divmod(sec, 3600)
    minutes, seconds = divmod(remainder, 60)
    return f"{hours:02d}:{minutes:02d}:{seconds:02d}"


In [None]:
# cell 3
def fetch_top_vods(game_id, period, sort, language, limit, type_filter) -> pd.DataFrame:
    """Fetch up to ``limit`` VODs for a game from the Helix ``/videos`` endpoint.

    Results are requested in pages of at most 100 items, following the pagination
    cursor until ``limit`` items were received, the API returns no more data, or a
    non-200 response is received (the error body is printed and paging stops).

    Args:
        game_id: Twitch game/category id.
        period: "day", "week" or "month"; not sent when ``sort`` is "trending".
        sort: "time", "trending" or "views".
        language: language filter; omitted from the request when falsy.
        limit: maximum number of items to collect.
        type_filter: value for the ``type`` query parameter.

    Returns:
        A DataFrame with one row per VOD. The VOD field columns and the derived
        integer ``duration_seconds`` column are always present, even when no rows
        were returned, so downstream cells can rely on the schema.
    """
    assert HEADERS, "authenticate() を先に実行してください"
    assert sort in ("time","trending","views")
    assert period in ("day","week","month")
    # Fields copied from each API item; also the fixed column order of the result.
    fields = (
        "id", "stream_id", "user_id", "user_name", "user_login", "title",
        "description", "created_at", "published_at", "url", "thumbnail_url",
        "viewable", "view_count", "language", "type", "duration",
    )
    rows, cursor, got = [], None, 0
    pbar = tqdm(total=limit, desc="VOD取得(固定件数)", unit="item")
    try:
        while got < limit:
            first = min(100, limit-got)
            params = {"game_id": game_id, "first": first, "sort": sort, "type": type_filter}
            if language: params["language"] = language
            if sort != "trending": params["period"] = period
            if cursor: params["after"] = cursor
            r = requests.get(f"{API_BASE_URL}/videos", headers=HEADERS, params=params, timeout=60)
            if r.status_code != 200:
                print("❌ /videos エラー:", r.text); break
            payload = r.json(); data = payload.get("data", [])
            if not data: break
            rows.extend({name: v.get(name) for name in fields} for v in data)

            got += len(data); pbar.update(len(data))
            cursor = payload.get("pagination", {}).get("cursor")
            if not cursor: break
    finally:
        # Close the progress bar even if a request raises.
        pbar.close()
    # Explicit columns keep the schema stable when zero rows were collected.
    df = pd.DataFrame(rows, columns=list(fields))
    df["duration_seconds"] = df["duration"].map(twitch_duration_to_seconds).astype("int64")
    if not df.empty:
        total_sec = int(df["duration_seconds"].sum())
        print(f"🧮 API取得: {len(df)}件 / 総時間: {seconds_to_hms(total_sec)}")
    else:
        print("⚠️ 0件（API）")
    return df

# Resolve the target game and pull this run's VOD list from the API.
game_id = get_game_id_by_name(TARGET_GAME_NAME)
assert game_id, "game_id解決に失敗"
manifest_api_df = fetch_top_vods(
    game_id=game_id,
    period=TARGET_PERIOD,
    sort=TARGET_SORT,
    language=TARGET_LANGUAGE,
    limit=MAX_RESULTS,
    type_filter=TYPE_FILTER,
)


# === Print a summary of the first five VODs ===
if manifest_api_df.empty:
    print("⚠️ データなし（上位5件を表示できません）")
else:
    print("\n=== 上位5件のサマリ ===")
    top5 = manifest_api_df.head(5)
    for idx, vod in top5.iterrows():
        # "duration" is the raw API string, e.g. "2h13m5s".
        summary_lines = [
            f"#{idx+1}",
            f"  VOD ID   : {vod['id']}",
            f"  Title    : {vod.get('title', '(タイトルなし)')}",
            f"  Duration : {vod['duration']}",
            f"  URL      : {vod['url']}",
            "-"*40,
        ]
        print("\n".join(summary_lines))


In [None]:
# cell 4
_GCS_CLIENT = None  # created on first use and then shared by every GCS helper

def gcs_client() -> storage.Client:
    """Return a shared ``storage.Client``, creating it on first use.

    The client is built once and reused, so each blob operation does not have to
    construct a new client.
    """
    global _GCS_CLIENT
    if _GCS_CLIENT is None:
        _GCS_CLIENT = storage.Client()
    return _GCS_CLIENT

def gcs_blob(key: str):
    """Return the blob handle for ``key`` inside the ``GCS_BUCKET`` bucket."""
    return gcs_client().bucket(GCS_BUCKET).blob(key)

def download_if_exists(key: str, local_path: str) -> bool:
    """Download the blob ``key`` to ``local_path`` if the blob exists.

    Returns:
        True if the blob existed and was downloaded, False if it does not exist.
    """
    bl = gcs_blob(key)
    if not bl.exists():
        return False
    # A bare filename has an empty dirname, and os.makedirs("") would raise.
    parent = os.path.dirname(local_path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    bl.download_to_filename(local_path)
    return True

# Load the global "already seen" VOD id list from GCS (empty on the very first run).
SEEN_LOCAL = os.path.join(STATE_DIR, "seen_vod_ids.csv")
seen_exists = download_if_exists(GLOBAL_SEEN_KEY, SEEN_LOCAL)
if seen_exists:
    seen_df = pd.read_csv(SEEN_LOCAL, dtype={"vod_id": str})
    print(f"📚 既存 seen_vod_ids: {len(seen_df)} 件")
else:
    seen_df = pd.DataFrame(columns=["vod_id","first_seen_run_id","published_at","user_name"])
    print("📚 既存 seen_vod_ids: 0 件（初回）")

if manifest_api_df.empty:
    print("⚠️ APIから0件。以降はスキップ可能です。")
    # An empty API result may have no columns at all; add "id" so the filtering
    # below yields an empty result instead of raising KeyError.
    if "id" not in manifest_api_df.columns:
        manifest_api_df["id"] = pd.Series(dtype="object")

# Drop VODs seen in earlier runs from the API result, keeping only new ones.
manifest_api_df["id"] = manifest_api_df["id"].astype(str)
new_mask = ~manifest_api_df["id"].isin(seen_df["vod_id"].astype(str))
manifest_new_df = manifest_api_df.loc[new_mask].copy()

print(f"🧪 新規VOD候補: {len(manifest_new_df)} / API取得合計 {len(manifest_api_df)}")
# Note: the API fetch size stays fixed at 500; it is intentionally not increased
# to reach 500 new items (this matches the requirement).


In [None]:
# cell 5
def jst_now():
    """Return the current time in the ``JST`` timezone as an ISO-8601 string."""
    now = datetime.now(tz=JST)
    return now.isoformat()

def gcs_run_key(prefix: str, filename: str) -> str:
    """Build the GCS object key ``<root>/<run_id>/<prefix>/<filename>`` for this run."""
    return "{}/{}/{}/{}".format(GCS_ROOT_PREFIX, run_id, prefix, filename)

def atomic_write_csv(df: pd.DataFrame, path: str):
    """Write ``df`` to ``path`` as UTF-8 CSV (no index) via a temporary file.

    The data is first written to ``path + ".tmp"`` and then moved over ``path``
    with ``os.replace``, so a reader never sees a half-written file. The parent
    directory is created when ``path`` has one. If writing or replacing fails, the
    temporary file is removed and the original exception is re-raised.
    """
    # A bare filename has an empty dirname, and os.makedirs("") would raise.
    parent = os.path.dirname(path)
    if parent:
        os.makedirs(parent, exist_ok=True)
    tmp = path + ".tmp"
    try:
        df.to_csv(tmp, index=False, encoding="utf-8")
        os.replace(tmp, path)
    except BaseException:
        # Do not leave a stale temp file behind on failure or interruption.
        if os.path.exists(tmp):
            os.remove(tmp)
        raise

def upload_file(local_path: str, key: str):
    """Upload ``local_path`` to the object ``key`` in ``GCS_BUCKET`` and print the destination URI."""
    gcs_blob(key).upload_from_filename(local_path)
    print(f"☁️ Uploaded: gs://{GCS_BUCKET}/{key}")

# Persist this run's manifest of new VODs (written atomically, then uploaded).
MANIFEST_LOCAL = os.path.join(STATE_DIR, f"manifest_{run_id}.csv")
atomic_write_csv(manifest_new_df, MANIFEST_LOCAL)
upload_file(MANIFEST_LOCAL, gcs_run_key("manifests", f"manifest_{run_id}.csv"))

# Ledger: one row per new VOD, recording the state of each processing step.
LEDGER_LOCAL = os.path.join(STATE_DIR, f"ledger_{run_id}.csv")
LEDGER_BACKUP_LOCAL = os.path.join(STATE_DIR, f"ledger_{run_id}.prev.csv")
cols = ["run_id","vod_id","user_name","duration_seconds",
        "step_manifest","step_chat","step_audio","step_mp4","step_whisper",
        "last_update","note","shard_id"]

# Collect all rows first and build the frame once, instead of growing it row by row.
ledger_rows = []
for _, row in manifest_new_df.iterrows():
    dur = row.get("duration_seconds", 0)
    ledger_rows.append([
        run_id, str(row["id"]), row.get("user_name",""),
        int(dur) if pd.notna(dur) else 0,   # a missing duration counts as 0 seconds
        "done","pending","pending","pending","pending", jst_now(), "", -1   # shard_id -1 = not assigned yet
    ])
# With zero new VODs this yields an empty ledger that still has every column,
# which keeps downstream operations simple.
ledger = pd.DataFrame(ledger_rows, columns=cols)

atomic_write_csv(ledger, LEDGER_LOCAL)
shutil.copy2(LEDGER_LOCAL, LEDGER_BACKUP_LOCAL)
upload_file(LEDGER_LOCAL, gcs_run_key("manifests", "ledger.csv"))
upload_file(LEDGER_BACKUP_LOCAL, gcs_run_key("manifests", "ledger.prev.csv"))


In [None]:
# cell 6
NUM_SHARDS = 10  # number of parallel shards
def assign_shard(vod_id: str, num_shards: int) -> int:
    """Map ``vod_id`` to a shard index in ``[0, num_shards)``.

    The index is the SHA-256 digest of the id's UTF-8 text, read as a big-endian
    integer, modulo ``num_shards``; the same id therefore always maps to the same shard.
    """
    digest = hashlib.sha256(str(vod_id).encode("utf-8")).digest()
    return int.from_bytes(digest, "big") % num_shards

# Assign every ledger row to a shard and re-publish the ledger (skipped when empty).
if not ledger.empty:
    ledger["shard_id"] = ledger["vod_id"].astype(str).apply(assign_shard, args=(NUM_SHARDS,))
    atomic_write_csv(ledger, LEDGER_LOCAL)
    upload_file(LEDGER_LOCAL, gcs_run_key("manifests", "ledger.csv"))

# Export the per-VOD shard assignment as its own CSV for this run.
assignments = ledger.loc[:, ["vod_id","user_name","duration_seconds","shard_id"]].copy()
ASSIGN_LOCAL = os.path.join(STATE_DIR, f"assignments_{run_id}.csv")
assignments.to_csv(ASSIGN_LOCAL, index=False, encoding="utf-8")
upload_file(ASSIGN_LOCAL, gcs_run_key("manifests", f"assignments_{run_id}.csv"))


In [None]:
# cell 7: global/seen_vod_ids.csv を更新（新規のみ追記）
if len(manifest_new_df) == 0:
    print("🗂️ 新規VODなし → seen_vod_ids は更新不要")
else:
    # Rows to append: this run's new VODs, tagged with the current run_id.
    add = (
        manifest_new_df[["id","published_at","user_name"]]
        .rename(columns={"id": "vod_id"})
        .assign(vod_id=lambda d: d["vod_id"].astype(str), first_seen_run_id=run_id)
    )
    # Append to the existing list, then keep one row per vod_id
    # (the first one after sorting by vod_id and first_seen_run_id).
    merged = pd.concat(
        [seen_df, add[["vod_id","first_seen_run_id","published_at","user_name"]]],
        ignore_index=True,
    )
    merged = merged.sort_values(["vod_id","first_seen_run_id"])
    merged = merged.drop_duplicates(subset=["vod_id"], keep="first")
    atomic_write_csv(merged, SEEN_LOCAL)
    upload_file(SEEN_LOCAL, GLOBAL_SEEN_KEY)
    print(f"🗂️ seen_vod_ids 更新: 旧 {len(seen_df)} → 新 {len(merged)}")


In [None]:
# cell 8: メトリクス（任意）
# One-row summary of this run: how many VODs the API returned and how many were new.
metrics = pd.DataFrame([dict(
    run_id=run_id,
    api_count=len(manifest_api_df),
    new_count=len(manifest_new_df),
    seen_before=len(manifest_api_df)-len(manifest_new_df),
    ts=jst_now(),
)])
METRICS_LOCAL = os.path.join(STATE_DIR, f"metrics_{run_id}.csv")
metrics.to_csv(METRICS_LOCAL, index=False, encoding="utf-8")
upload_file(METRICS_LOCAL, gcs_run_key("manifests", f"metrics_{run_id}.csv"))

# Operator guidance for the next step.
for message in (
    "\n=== 実行案内 (daily v2) ===",
    f"RUN_ID = {run_id}",
    "次に、ワーカーを起動（各 SHARD_ID に対して1本）してください。",
):
    print(message)
