In [4]:
from googleapiclient.discovery import build
import pandas as pd
import time
from tqdm import tqdm
import sys
import os

# ========= Configuration ========= #
# The API key is a credential: read it from the environment instead of
# embedding it in the source, so it is never committed or shared with the
# notebook. Set YOUTUBE_API_KEY before running this script.
API_KEY = os.environ.get('YOUTUBE_API_KEY', '')
if not API_KEY:
    # Warn early; without a key every API request below will be rejected.
    print("⚠️ 未设置环境变量 YOUTUBE_API_KEY，API 请求将会失败", file=sys.stderr)
SEARCH_QUERY = 'artificial intelligence'  # search keyword
MAX_RESULTS = 500  # maximum number of videos to fetch
API_UNITS_LIMIT = 10000       # YouTube Data API daily quota
SAFETY_THRESHOLD = 9000       # collection stops once the estimate reaches this
SAVE_PATH = 'D:/COMM5007/Data/'  # directory the CSV files are written to

# ========= Initialisation ========= #
# API client used by every request function below, authenticated with the
# developer key from the configuration section.
youtube = build('youtube', 'v3', developerKey=API_KEY)

# Accumulators filled by the collection loop: one dict per video / comment.
video_data, comment_data = [], []

# Running estimate of the quota units consumed so far.
estimated_units_used = 0

# ========= Search for videos ========= #
def search_videos(query, max_results=500):
    """Return up to ``max_results`` video IDs matching ``query``.

    Pages through ``search().list`` on the module-level ``youtube`` client,
    50 results per request, pausing one second between pages.

    Args:
        query: Search keywords passed as the ``q`` parameter.
        max_results: Upper bound on the number of IDs returned.

    Returns:
        A list of video ID strings, in the order the API returned them,
        never longer than ``max_results``.

    Raises:
        Any exception raised by ``request.execute()`` propagates unchanged.
    """
    videos = []
    next_page_token = None

    while len(videos) < max_results:
        request = youtube.search().list(
            q=query,
            part='id',
            type='video',
            maxResults=50,
            pageToken=next_page_token
        )
        response = request.execute()
        # A response without 'items' is treated as an empty page.
        videos.extend(item['id']['videoId'] for item in response.get('items', []))

        next_page_token = response.get('nextPageToken')
        if not next_page_token:
            break
        time.sleep(1)

    # The last page may overshoot the limit (pages always hold up to 50),
    # so trim to exactly what the caller asked for.
    return videos[:max_results]

# ========= Fetch video details ========= #
def get_video_details(video_ids):
    """Look up snippet and statistics data for the given video IDs.

    The IDs are sent to ``videos().list`` on the module-level ``youtube``
    client in batches of 50, with a one-second pause after each batch.

    Args:
        video_ids: Sequence of video ID strings.

    Returns:
        A list with one dict per item the API returned, holding the keys
        ``video_id``, ``title``, ``description``, ``publishedAt``,
        ``channelTitle``, ``tags`` (comma-joined), ``categoryId``,
        ``viewCount``, ``likeCount`` and ``commentCount`` (ints, 0 when the
        statistic is absent).
    """
    batch_size = 50
    collected = []

    for start in range(0, len(video_ids), batch_size):
        id_csv = ','.join(video_ids[start:start + batch_size])
        response = youtube.videos().list(
            part='snippet,statistics',
            id=id_csv
        ).execute()

        for entry in response['items']:
            video_id = entry['id']
            snippet = entry['snippet']
            record = {
                'video_id': video_id,
                'title': snippet['title'],
                'description': snippet['description'],
                'publishedAt': snippet['publishedAt'],
                'channelTitle': snippet['channelTitle'],
                'tags': ','.join(snippet.get('tags', [])),
                'categoryId': snippet['categoryId'],
            }
            # Counters may be missing from the statistics; default them to 0.
            stats = entry['statistics']
            for counter in ('viewCount', 'likeCount', 'commentCount'):
                record[counter] = int(stats.get(counter, 0))
            collected.append(record)

        time.sleep(1)

    return collected

# ========= Fetch all comments ========= #
def get_comments(video_id, max_comments=None):
    """Collect the top-level comments of one video.

    Pages through ``commentThreads().list`` on the module-level ``youtube``
    client, 100 threads per request, pausing half a second between pages.
    Collection is best-effort: if a request or the parsing of a response
    fails, the error is reported on stderr and the comments gathered so far
    are returned.

    Args:
        video_id: ID of the video whose comments are fetched.
        max_comments: Optional cap on the number of comments; a falsy value
            (``None`` or ``0``) means no cap.

    Returns:
        A list of dicts with the keys ``video_id``, ``author``, ``comment``,
        ``likeCount`` and ``publishedAt``.
    """
    comments = []
    next_page_token = None

    while True:
        try:
            request = youtube.commentThreads().list(
                part='snippet',
                videoId=video_id,
                maxResults=100,
                pageToken=next_page_token,
                textFormat='plainText'
            )
            response = request.execute()

            for item in response.get('items', []):
                top_comment = item['snippet']['topLevelComment']['snippet']
                comments.append({
                    'video_id': video_id,
                    'author': top_comment['authorDisplayName'],
                    'comment': top_comment['textDisplay'],
                    'likeCount': top_comment['likeCount'],
                    'publishedAt': top_comment['publishedAt']
                })

                if max_comments and len(comments) >= max_comments:
                    return comments

            next_page_token = response.get('nextPageToken')
            if not next_page_token:
                break
            time.sleep(0.5)

        except Exception as exc:
            # Keep the best-effort contract (one failing video must not
            # abort the whole run) but make the failure visible instead of
            # silently returning a truncated result.
            print(
                f"⚠️ 视频 {video_id} 的评论抓取中断（已获取 {len(comments)} 条）：{exc}",
                file=sys.stderr,
            )
            break

    return comments

# ========= Save data ========= #
def save_data(video_data, comment_data, video_file='videos.csv', comment_file='comments_partial.csv'):
    """Write the video and comment records to CSV files under ``SAVE_PATH``.

    The target directory is created when it does not exist, existing files
    are overwritten, and the two output paths are printed afterwards.

    Args:
        video_data: Records for the video CSV (anything ``pd.DataFrame`` accepts).
        comment_data: Records for the comment CSV.
        video_file: File name of the video CSV inside ``SAVE_PATH``.
        comment_file: File name of the comment CSV inside ``SAVE_PATH``.
    """
    # Make sure the output directory exists before writing into it.
    os.makedirs(SAVE_PATH, exist_ok=True)

    # Resolve both file names against the output directory.
    video_path, comment_path = (
        os.path.join(SAVE_PATH, file_name)
        for file_name in (video_file, comment_file)
    )

    # Videos first, then comments; neither file gets an index column.
    for rows, target in ((video_data, video_path), (comment_data, comment_path)):
        pd.DataFrame(rows).to_csv(target, index=False)

    print(f"💾 已保存至：\n📁 视频文件：{video_path}\n📁 评论文件：{comment_path}")

# ========= Run the collection ========= #
try:
    print("🔍 正在搜索视频...")
    video_ids = search_videos(SEARCH_QUERY, MAX_RESULTS)
    print(f"📺 共找到 {len(video_ids)} 个视频，正在获取详情...")
    video_data = get_video_details(video_ids)

    # Account for the quota already spent before any comment is fetched.
    # Per the YouTube Data API quota table a search.list call costs 100
    # units and a videos.list call costs 1 unit; both are issued once per
    # page of 50 IDs (the search always makes at least one call).
    id_pages = (len(video_ids) + 49) // 50
    estimated_units_used += max(1, id_pages) * 100 + id_pages * 1

    print("💬 开始抓取每个视频的评论...")
    for idx, video in enumerate(tqdm(video_data, desc="Fetching comments")):
        video_id = video['video_id']
        comments = get_comments(video_id)
        comment_data.extend(comments)

        # Save after every video so an interruption loses at most one video.
        save_data(video_data[:idx+1], comment_data)

        # Estimate usage conservatively: one unit per page of 100 comments,
        # and at least one unit per video even when nothing was returned.
        units_used_this_video = max(1, len(comments) // 100 + 1)
        estimated_units_used += units_used_this_video
        print(f"📊 估算 API 使用量：{estimated_units_used} / {API_UNITS_LIMIT}")

        if estimated_units_used >= SAFETY_THRESHOLD:
            print(f"🚨 达到安全阈值 {SAFETY_THRESHOLD}，自动停止抓取")
            break

except KeyboardInterrupt:
    # Manual interruption: persist whatever has been collected, then exit.
    print("\n🛑 手动中断，保存当前抓取数据中...")
    save_data(video_data, comment_data)
    sys.exit()

# ========= Final save on normal completion ========= #
save_data(video_data, comment_data, 'videos.csv', 'comments.csv')
print("✅ 全部完成！数据已保存至 videos.csv 和 comments.csv")


Fetching comments:   9%|▉         | 45/500 [13:40<2:18:17, 18.24s/it]  


KeyboardInterrupt: 