In [3]:
import os
import time
import math
import json
from typing import Dict, List, Optional
from datetime import datetime
import pandas as pd
from googleapiclient.discovery import build
from googleapiclient.errors import HttpError


In [4]:
# ---------------- CONFIG ----------------
# Directory that receives every CSV produced by this notebook (created if missing).
OUT_DIR = "./data_rebuild"
os.makedirs(OUT_DIR, exist_ok=True)

# YouTube Data API key, read from the environment; None when the variable is unset.
API_KEY = os.environ.get("YOUTUBE_API_KEY")

# Output files: discovered videos per channel, and the raw comment dump.
F_VIDEOS, F_COMMENTS = (
    os.path.join(OUT_DIR, file_name)
    for file_name in ("videos_por_canal.csv", "0_comments_raw.csv")
)

In [5]:
# Channels of interest: YouTube channel ID -> "bloque" label attached to every
# video and comment collected from that channel. The trailing comment on each
# entry is the channel's display name.
canales_interes: Dict[str, str] = {
    'UCPH3Oz99Y_jrVBCQMjQZNSg': 'pro-ucraniano',       # Memorias de Pez
    'UC3tNpTOHsTnkmbX1M2sS4xg': 'pro-ucraniano',       # VisualPolitik
    'UCnsvJeZO4RigQ898WdDNoBw': 'noticiero',           # El País
    'UC7QZIf0dta-XPXsp9Hv4dTw': 'noticiero',           # RTVE Noticias
    'UClLLRs_mFTsNT5U-DqTYAGg': 'noticiero',           # La Vanguardia
    'UCwd8Byi93KbnsYmCcKLExvQ': 'pro-ruso',            # Negocios TV
    'UCgms7r9SaeYhuIBaPGOjnhw': 'pro-ruso',            # Miguel Ruiz Calvo
    'UCNKomgId0-uTA-vVLM9v1pw': 'pro-ruso',            # Intereconomía
    'UCGXbLrVe8vnkiFv7q2vYv3w': 'noticiero',           # El Mundo
    'UCCJs5mITIqxqJGeFjt9N1Mg': 'noticiero',           # laSexta Noticias
    'UCcgqSM4YEo5vVQpqwN-MaNw': 'pro-ruso',            # teleSUR
}

In [6]:
# ---------------- UTILS ----------------
def yt():
    """Build a YouTube Data API v3 client authenticated with ``API_KEY``.

    Raises:
        RuntimeError: if ``API_KEY`` is empty or unset.
    """
    if API_KEY:
        return build("youtube", "v3", developerKey=API_KEY)
    raise RuntimeError("Falta YOUTUBE_API_KEY en el entorno.")


In [7]:
def iso8601_duration_to_seconds(d: str) -> float:
    """Convert an ISO 8601 duration string into a number of seconds.

    Supported designators are weeks and days before the ``T`` separator and
    hours, minutes and seconds after it, e.g. ``PT1H2M3S``, ``PT15M``,
    ``PT45S``, ``P1DT2H``. Fractional values such as ``PT1.5S`` are accepted.

    Args:
        d: Duration string. Anything that is not a string starting with ``P``
            is treated as invalid.

    Returns:
        The duration in seconds as a float, or NaN when the input is missing,
        malformed, or uses a designator with no fixed length (calendar years
        or months).
    """
    nan = float("nan")
    if not isinstance(d, str) or not d.startswith("P"):
        return nan

    # Seconds per unit. The same letter means different things on each side of
    # "T" ("M" is months before it, minutes after it), hence two tables.
    date_units = {"W": 604800.0, "D": 86400.0}
    time_units = {"H": 3600.0, "M": 60.0, "S": 1.0}

    date_part, _, time_part = d[1:].partition("T")
    total = 0.0
    for part, units in ((date_part, date_units), (time_part, time_units)):
        num = ""
        for ch in part:
            if ch.isdigit() or ch == ".":
                num += ch
                continue
            factor = units.get(ch)
            if factor is None:
                # Unknown or variable-length designator: refuse to guess.
                return nan
            try:
                total += float(num or 0) * factor
            except ValueError:
                # e.g. "1.2.3" or a non-ASCII digit that float() rejects
                return nan
            num = ""
        if num:
            # Trailing digits without a unit designator
            return nan
    return total

def safe_int(x, default=0):
    """Convert ``x`` to ``int``, returning ``default`` when it cannot be converted.

    Only conversion failures are absorbed: ``TypeError`` (e.g. ``None``),
    ``ValueError`` (e.g. non-numeric text, NaN) and ``OverflowError``
    (infinity). Anything else, including ``KeyboardInterrupt``, propagates.
    """
    try:
        return int(x)
    except (TypeError, ValueError, OverflowError):
        return default

def append_csv(path: str, rows: List[dict]):
    """Append ``rows`` to the CSV file at ``path``, creating it if needed.

    Args:
        path: Destination CSV file.
        rows: Records to write, one dict per row. Nothing is written when the
            list is empty.

    The header row is written only when the file does not exist yet or exists
    but is empty, so a pre-created zero-byte file still ends up with a header.
    """
    if not rows:
        return
    needs_header = not os.path.exists(path) or os.path.getsize(path) == 0
    pd.DataFrame(rows).to_csv(path, index=False, mode="a", header=needs_header)

In [8]:
# ---------------- DISCOVERY ----------------
def get_channel_meta(y, channel_id: str) -> dict:
    """Fetch title, subscriber count and uploads playlist ID for one channel.

    Args:
        y: YouTube API client exposing ``channels().list(...).execute()``.
        channel_id: ID of the channel to look up.

    Returns:
        A dict with keys ``channel_id``, ``channel_title``,
        ``subscriber_count`` and ``uploads_playlist``, or an empty dict when
        the response contains no items.
    """
    request = y.channels().list(
        part="snippet,statistics,contentDetails",
        id=channel_id,
    )
    found = request.execute().get("items", [])
    if not found:
        return {}

    channel = found[0]
    uploads_id = channel["contentDetails"]["relatedPlaylists"]["uploads"]
    stats = channel.get("statistics", {})
    return {
        "channel_id": channel_id,
        "channel_title": channel["snippet"]["title"],
        "subscriber_count": safe_int(stats.get("subscriberCount")),
        "uploads_playlist": uploads_id,
    }

In [9]:
def list_videos_from_uploads(y, uploads_playlist: str, max_items: Optional[int] = None) -> List[dict]:
    """List the videos of a playlist, following pagination.

    Args:
        y: YouTube API client exposing ``playlistItems().list(...).execute()``.
        uploads_playlist: ID of the playlist to walk.
        max_items: Maximum number of videos to return. ``None`` means no
            limit; zero or a negative value returns an empty list without
            calling the API.

    Returns:
        One dict per video with keys ``video_id``, ``video_title_guess`` and
        ``video_published_at_guess``, in the order the API returned them.
    """
    out: List[dict] = []
    # Compare against None explicitly: a plain truthiness test would make
    # max_items=0 behave like "no limit".
    limited = max_items is not None
    if limited and max_items <= 0:
        return out

    page = None
    while True:
        r = y.playlistItems().list(
            part="snippet,contentDetails",
            playlistId=uploads_playlist,
            maxResults=50,
            pageToken=page
        ).execute()
        for it in r.get("items", []):
            sn = it["snippet"]
            cd = it["contentDetails"]
            out.append({
                "video_id": cd.get("videoId"),
                "video_title_guess": sn.get("title"),
                # Fall back to the playlist item's own date when the video's
                # publication date is absent.
                "video_published_at_guess": cd.get("videoPublishedAt", sn.get("publishedAt")),
            })
            if limited and len(out) >= max_items:
                return out
        page = r.get("nextPageToken")
        if not page:
            break
        time.sleep(0.1)  # brief pause between pages
    return out


In [10]:
def enrich_video_meta(y, video_ids: List[str]) -> pd.DataFrame:
    """Fetch snippet, statistics and duration for the given video IDs.

    Args:
        y: YouTube API client exposing ``videos().list(...).execute()``.
        video_ids: Video IDs to look up.

    Returns:
        A DataFrame with one row per item returned by the API and columns
        ``video_id``, ``video_title``, ``channel_title``,
        ``video_published_at``, ``video_views``, ``video_likes``,
        ``video_duration`` (seconds), ``video_tags`` (``|``-joined) and
        ``video_category_id``. Empty, with no columns, when nothing is returned.
    """
    batch_size = 50  # IDs are sent 50 per videos().list call
    records = []
    for start in range(0, len(video_ids), batch_size):
        id_batch = video_ids[start:start + batch_size]
        response = y.videos().list(
            part="snippet,statistics,contentDetails",
            id=",".join(id_batch),
        ).execute()

        for item in response.get("items", []):
            snippet = item["snippet"]
            stats = item.get("statistics", {})
            details = item.get("contentDetails", {})
            tags = snippet.get("tags")
            record = {
                "video_id": item["id"],
                "video_title": snippet.get("title"),
                "channel_title": snippet.get("channelTitle"),
                "video_published_at": snippet.get("publishedAt"),
                "video_views": safe_int(stats.get("viewCount")),
                "video_likes": safe_int(stats.get("likeCount")),
                "video_duration": iso8601_duration_to_seconds(details.get("duration", "")),
                "video_tags": "|".join(tags) if tags else "",
                "video_category_id": safe_int(snippet.get("categoryId"), default=0),
            }
            records.append(record)

        time.sleep(0.2)  # pause between batches
    return pd.DataFrame(records)

In [11]:
def discover_videos_for_channels(canales: Dict[str, str], max_per_channel: Optional[int] = None) -> pd.DataFrame:
    """Discover the uploaded videos of every channel and save them to ``F_VIDEOS``.

    Args:
        canales: Mapping of channel ID to its "bloque" label.
        max_per_channel: Optional cap on videos listed per channel.

    Returns:
        A DataFrame of video metadata de-duplicated by ``video_id``, with the
        channel ID, resolved channel title, subscriber count and bloque
        attached. Returns an empty DataFrame, and writes no file, when no
        channel yielded any video.

    A channel whose API calls raise ``HttpError`` is reported and skipped.
    """
    client = yt()
    frames = []
    for channel_id, bloque in canales.items():
        try:
            meta = get_channel_meta(client, channel_id)
            if not meta:
                print(f"[WARN] Channel sin meta: {channel_id}")
                continue

            uploads = list_videos_from_uploads(
                client, meta["uploads_playlist"], max_items=max_per_channel
            )
            if not uploads:
                print(f"[INFO] Sin videos en uploads: {channel_id}")
                continue

            video_ids = [entry["video_id"] for entry in uploads]
            channel_df = enrich_video_meta(client, video_ids)
            if channel_df.empty:
                continue

            # Channel identifiers go first; the remaining channel-level
            # attributes are appended as trailing columns.
            channel_df.insert(0, "channel_id", meta["channel_id"])
            channel_df.insert(1, "channel_title_resolved", meta["channel_title"])
            channel_df["subscriber_count"] = meta["subscriber_count"]
            channel_df["bloque"] = bloque
            frames.append(channel_df)
        except HttpError as err:
            print(f"[ERROR] {channel_id}: {err}")
            time.sleep(1.0)

    if frames:
        combined = pd.concat(frames, ignore_index=True)
        combined = combined.drop_duplicates(subset=["video_id"])
        combined.to_csv(F_VIDEOS, index=False)
        return combined
    return pd.DataFrame()

In [12]:
# ---------------- COMMENTS ----------------
def fetch_comments_for_video(y, video_id: str, video_meta: dict,
                             subscriber_count: Optional[int],
                             channel_id: str,
                             bloque: str,
                             max_comments: int = 800,
                             sleep_s: float = 0.3,
                             max_retries: int = 3) -> List[dict]:
    """Download the top-level comments of one video as flat row dicts.

    Args:
        y: YouTube API client exposing ``commentThreads().list(...).execute()``.
        video_id: ID of the video whose comment threads are fetched.
        video_meta: Video-level fields copied into every row (``video_title``,
            ``channel_title``, ``video_published_at``, ``video_views``,
            ``video_likes``, ``video_duration``, ``video_tags``,
            ``video_category_id``); missing keys become ``None``.
        subscriber_count: Subscriber count of the owning channel.
        channel_id: ID of the owning channel.
        bloque: Block label of the owning channel.
        max_comments: Maximum number of rows to return; zero or negative
            returns an empty list without calling the API.
        sleep_s: Pause between successive result pages, in seconds.
        max_retries: How many times a page request is retried after a
            transient HTTP error before giving up on the video.

    Returns:
        A list of row dicts, one per top-level comment, in API order. If a
        request ultimately fails, the rows gathered so far are returned.
    """
    out: List[dict] = []
    if max_comments <= 0:
        return out

    # Statuses retried with exponential backoff. Every other status is treated
    # as permanent for this video.
    # NOTE(review): 403 presumably covers disabled comments and exhausted
    # quota, neither of which a retry would fix - confirm against the API docs.
    transient_statuses = {429, 500, 502, 503, 504}
    attempts = max(0, max_retries) + 1

    page = None
    while True:
        r = None
        for attempt in range(attempts):
            try:
                r = y.commentThreads().list(
                    part="snippet",
                    videoId=video_id,
                    maxResults=100,
                    textFormat="plainText",
                    pageToken=page
                ).execute()
                break
            except HttpError as e:
                status = getattr(getattr(e, "resp", None), "status", None)
                if status in transient_statuses and attempt < attempts - 1:
                    time.sleep(1.5 * (2 ** attempt))
                    continue
                # Report the failure instead of dropping it silently, then
                # keep the short pause before giving up on this video.
                print(f"[WARN] {video_id}: se abandona la descarga de comentarios (HTTP {status}): {e}")
                time.sleep(1.5)
                break
        if r is None:
            break

        for it in r.get("items", []):
            sn = it["snippet"]
            top = sn["topLevelComment"]["snippet"]
            text = top.get("textDisplay") or ""
            out.append({
                # --- requested schema ---
                "comment_id": sn["topLevelComment"]["id"],
                "comment": text,
                "comment_text_length": len(text),
                "user_id": (top.get("authorChannelId") or {}).get("value"),
                "user_name": top.get("authorDisplayName"),
                "comment_time": top.get("publishedAt"),
                "comment_likes": safe_int(top.get("likeCount")),
                "total_reply_count": safe_int(sn.get("totalReplyCount")),
                "is_top_level_comment": True,
                "video_title": video_meta.get("video_title"),
                "channel_title": video_meta.get("channel_title"),
                "video_published_at": video_meta.get("video_published_at"),
                "video_views": video_meta.get("video_views"),
                "video_likes": video_meta.get("video_likes"),
                "video_duration": video_meta.get("video_duration"),
                "video_tags": video_meta.get("video_tags"),
                "video_category_id": video_meta.get("video_category_id"),
                # Placeholders, left empty here.
                "relacion_evento": "",
                "evento": "",
                "tipo_evento": "",
                "condiciones_cuenta": "",
                "account_created_at": None,  # not available
                "channel_id": channel_id,
                "subscriber_count": subscriber_count,
                # useful extras
                "video_id": video_id,
                "bloque": bloque,
            })
            if len(out) >= max_comments:
                return out

        page = r.get("nextPageToken")
        if not page:
            break
        time.sleep(sleep_s)
    return out

In [13]:
def build_video_meta_row(r) -> dict:
    """Extract the video-level fields of a row as a plain dict.

    Args:
        r: Mapping-like row supporting ``r[key]`` for each field below.

    Returns:
        A dict with the eight video fields, in the order listed.

    Raises:
        KeyError: if ``r`` lacks any of the fields.
    """
    fields = (
        "video_title",
        "channel_title",
        "video_published_at",
        "video_views",
        "video_likes",
        "video_duration",
        "video_tags",
        "video_category_id",
    )
    return {field: r[field] for field in fields}


In [14]:
def collect_comments_incremental(
    max_comments_per_video: int = 800,
    save_every: int = 10000
):
    """Fetch comments for every video in ``F_VIDEOS`` and append new ones to ``F_COMMENTS``.

    Args:
        max_comments_per_video: Cap on comments fetched per video.
        save_every: Flush the in-memory buffer to disk once it holds at least
            this many rows.

    Comment IDs already present in ``F_COMMENTS`` are loaded first, so only
    comments not stored yet are appended. Rows still buffered when the loop
    stops, including on an error or a manual interrupt, are written before
    the exception propagates.

    Raises:
        RuntimeError: if ``F_VIDEOS`` does not exist.
    """
    y = yt()
    if not os.path.exists(F_VIDEOS):
        raise RuntimeError("No existe videos_por_canal.csv. Ejecuta primero la fase de descubrimiento.")
    df_vids = pd.read_csv(F_VIDEOS)

    # Previous progress: IDs of comments already on disk.
    seen = set()
    if os.path.exists(F_COMMENTS):
        try:
            prev = pd.read_csv(F_COMMENTS, usecols=["comment_id"])
            seen = set(prev["comment_id"].astype(str).tolist())
        except Exception as exc:
            # Best effort: carry on without resume data, but say so, because
            # already-stored comments may then be appended a second time.
            print(f"[WARN] No se pudo leer el progreso previo de {F_COMMENTS}: {exc}")

    buffer = []
    total_new = 0
    try:
        for _, r in df_vids.iterrows():
            video_id = str(r["video_id"])
            channel_id = str(r["channel_id"])
            bloque = r["bloque"]
            # safe_int maps a missing or NaN cell to 0; int(NaN) would raise.
            subs = safe_int(r.get("subscriber_count"))
            video_meta = build_video_meta_row(r)

            rows = fetch_comments_for_video(
                y,
                video_id=video_id,
                video_meta=video_meta,
                subscriber_count=subs,
                channel_id=channel_id,
                bloque=bloque,
                max_comments=max_comments_per_video
            )
            # De-duplicate by comment_id
            new_rows = [x for x in rows if str(x["comment_id"]) not in seen]
            seen.update(str(x["comment_id"]) for x in new_rows)
            buffer.extend(new_rows)
            total_new += len(new_rows)

            if len(buffer) >= save_every:
                append_csv(F_COMMENTS, buffer)
                buffer = []
                print(f"💾 Guardado incremental. Nuevos: {total_new}")

            time.sleep(0.5)  # quota courtesy pause between videos
    finally:
        # Also runs on errors and interrupts, so buffered rows are not lost.
        if buffer:
            append_csv(F_COMMENTS, buffer)

    print(f"✅ Listo. Comentarios totales nuevos guardados: {total_new}")
    print(f"Archivo: {F_COMMENTS}")

In [15]:
# ---------------- RUN HELPERS ----------------
def fase_1_descubrimiento(max_per_channel: Optional[int] = None):
    """Phase 1: discover videos for ``canales_interes`` and report how many were found.

    Args:
        max_per_channel: Optional cap on videos listed per channel.
    """
    discovered = discover_videos_for_channels(
        canales_interes,
        max_per_channel=max_per_channel,
    )
    n_videos = len(discovered)
    print(f"Descubiertos {n_videos} videos. CSV: {F_VIDEOS}")

def fase_2_comentarios(max_comments_per_video: int = 800, save_every: int = 10000):
    """Phase 2: collect comments for the discovered videos.

    Args:
        max_comments_per_video: Cap on comments fetched per video.
        save_every: Buffered row count that triggers a write to disk.
    """
    options = {
        "max_comments_per_video": max_comments_per_video,
        "save_every": save_every,
    }
    collect_comments_incremental(**options)


In [16]:
if __name__ == "__main__":
    # Usage examples (both calls are left commented out, so running this cell does nothing):
    # 1) Discovery only (does not touch comments). Cap videos per channel for quick test runs.
    # fase_1_descubrimiento(max_per_channel=100)

    # 2) Comment extraction (requires phase 1 to have been run first)
    # fase_2_comentarios(max_comments_per_video=800, save_every=10000)
    pass