In [None]:
import os
import csv
import json
import time
from googleapiclient.discovery import build
from tqdm import tqdm
import pandas as pd

In [None]:
# youtube_scraper.py


# ========== CONFIGURATION ==========
# The API key is taken from the YOUTUBE_API_KEY environment variable so the
# secret does not have to live in the source file; the placeholder below is
# only the fallback used when that variable is not set.
API_KEY = os.environ.get("YOUTUBE_API_KEY", "YOUR_YOUTUBE_API_KEY")
YOUTUBE = build("youtube", "v3", developerKey=API_KEY)

QUERY = "iPhone 15 review"        # keyword / campaign / product used to search for videos
MAX_VIDEOS = 200                  # maximum number of videos to collect
MAX_COMMENTS_PER_VIDEO = 200      # maximum comments per video (the API returns at most 100 per page)
OUTPUT_CSV = "youtube_comments.csv"
OUTPUT_JSON = "youtube_comments.json"
SLEEP_BETWEEN_REQUESTS = 0.1      # pause between requests to avoid hitting quota/rate limits too fast

# ========== HELPER FUNCTIONS ==========
def search_videos(query, max_results=100):
    """Search for videos matching ``query``.

    Pages through ``search.list`` until ``max_results`` videos have been
    gathered or the API stops returning a next-page token.

    Returns a list of dicts with the keys ``videoId``, ``title`` and
    ``publishedAt``.
    """
    collected = []
    page_token = None
    while len(collected) < max_results:
        remaining = max_results - len(collected)
        request = YOUTUBE.search().list(
            q=query,
            part="id,snippet",
            type="video",
            maxResults=min(50, remaining),  # search.list allows at most 50 per page
            pageToken=page_token,
        )
        response = request.execute()
        collected.extend(
            {
                "videoId": entry["id"]["videoId"],
                "title": entry["snippet"]["title"],
                "publishedAt": entry["snippet"]["publishedAt"],
            }
            for entry in response.get("items", [])
        )
        page_token = response.get("nextPageToken")
        if page_token:
            # More pages are available: pause briefly before the next request.
            time.sleep(SLEEP_BETWEEN_REQUESTS)
        else:
            break
    return collected

def fetch_comments_for_video(video_id, max_comments=200):
    """Fetch the top-level comments of one video via ``commentThreads.list``.

    Only the top-level comment of each thread is recorded. Replies themselves
    are not collected; just their count is kept (``replyCount``), which is
    read from the thread snippet's ``totalReplyCount``.

    Args:
        video_id: ID of the video whose comment threads are requested.
        max_comments: Upper bound on the number of comments returned.

    Returns:
        A list of dicts with the keys ``videoId``, ``commentId``,
        ``authorDisplayName``, ``authorChannelId``, ``textOriginal``,
        ``likeCount``, ``publishedAt``, ``updatedAt`` and ``replyCount``.
        An empty list is returned when ``max_comments`` is zero or negative.

    Errors raised while executing the request are not caught here; they
    propagate to the caller.
    """
    comments = []
    page_token = None
    while len(comments) < max_comments:
        # commentThreads.list allows at most 100 results per page.
        page_size = min(100, max_comments - len(comments))
        res = YOUTUBE.commentThreads().list(
            # The "replies" part is intentionally not requested: no reply body
            # is read below, and the reply count lives in the snippet part.
            part="id,snippet",
            videoId=video_id,
            maxResults=page_size,
            pageToken=page_token,
            textFormat="plainText",  # request plain-text comment bodies
        ).execute()
        for item in res.get("items", []):
            thread = item["snippet"]
            top = thread["topLevelComment"]["snippet"]
            comments.append({
                "videoId": video_id,
                "commentId": item["id"],
                "authorDisplayName": top.get("authorDisplayName"),
                "authorChannelId": top.get("authorChannelId", {}).get("value"),
                "textOriginal": top.get("textOriginal"),
                "likeCount": top.get("likeCount"),
                "publishedAt": top.get("publishedAt"),
                "updatedAt": top.get("updatedAt"),
                "replyCount": thread.get("totalReplyCount", 0),
            })
        page_token = res.get("nextPageToken")
        if not page_token:
            break
        time.sleep(SLEEP_BETWEEN_REQUESTS)
    return comments

# ========== RUN COLLECTION ==========
def main():
    """Search videos for ``QUERY``, collect their comments and save them.

    Each collected comment is extended with the title and publish date of
    its video. When at least one comment was collected, the result is
    written to ``OUTPUT_CSV`` and ``OUTPUT_JSON``; otherwise only a message
    is printed.
    """
    print("Searching videos for query:", QUERY)
    videos = search_videos(QUERY, max_results=MAX_VIDEOS)
    print(f"Found {len(videos)} videos. Start fetching comments...")

    collected = []
    for video in tqdm(videos, desc="Videos"):
        video_id = video["videoId"]
        video_meta = {
            "videoTitle": video["title"],
            "videoPublishedAt": video["publishedAt"],
        }
        # Best effort: a failure for one video is reported and that video
        # simply contributes no comments.
        try:
            video_comments = fetch_comments_for_video(
                video_id, max_comments=MAX_COMMENTS_PER_VIDEO
            )
        except Exception as e:
            print(f"Error fetching comments for video {video_id}: {e}")
            video_comments = []
        # Attach the video metadata to every comment of this video.
        for comment in video_comments:
            comment.update(video_meta)
        collected += video_comments
        time.sleep(SLEEP_BETWEEN_REQUESTS)

    if not collected:
        print("No comments collected.")
        return

    # Save as CSV.
    pd.DataFrame(collected).to_csv(OUTPUT_CSV, index=False, encoding="utf-8-sig")
    # Save as JSON.
    with open(OUTPUT_JSON, "w", encoding="utf-8") as f:
        json.dump(collected, f, ensure_ascii=False, indent=2)
    print(f"Saved {len(collected)} comments to {OUTPUT_CSV} and {OUTPUT_JSON}")

# Run the collection only when this file is executed as a script.
if __name__ == "__main__":
    main()
