In [14]:
import os
import sys
import time

import pandas as pd
from dotenv import load_dotenv
from googleapiclient.discovery import build
from tqdm import tqdm

#tests after changing to ASUS ROG
#Works!

# ========= Configuration ========= #
SEARCH_QUERY = 'artificial intelligence'  # Search keyword
MAX_RESULTS = 500  # Max number of videos to fetch
API_UNITS_LIMIT = 10000  # YouTube API daily quota
SAFETY_THRESHOLD = 9000  # Safety threshold to stop fetching
SAVE_PATH = 'D:/COMM5007/Data/'

# ========= Load API Key ========= #
load_dotenv()
API_KEY = os.getenv("YOUTUBE_API_KEY")
print("API Key 加载成功（仅展示部分）:", API_KEY[:5] + "****")

# ========= Initialize YouTube API Client ========= #
youtube = build('youtube', 'v3', developerKey=API_KEY)

# ========= Load Previously Saved Data ========= #
video_csv_path = os.path.join(SAVE_PATH, 'videos.csv')
comment_csv_path = os.path.join(SAVE_PATH, 'comments_partial.csv')

if os.path.exists(video_csv_path):
    print("📁 Loading previously saved video data...")
    video_data = pd.read_csv(video_csv_path).to_dict('records')
else:
    video_data = []

if os.path.exists(comment_csv_path):
    print("📁 Loading previously saved comment data...")
    comment_data = pd.read_csv(comment_csv_path).to_dict('records')
else:
    comment_data = []

processed_video_ids = set(v['video_id'] for v in video_data)
estimated_units_used = 0


# ========= Search Videos ========= #
def search_videos(query, max_results=500):
    videos = []
    next_page_token = None
    total_fetched = 0

    while total_fetched < max_results:
        request = youtube.search().list(
            q=query,
            part='id',
            type='video',
            maxResults=50,
            pageToken=next_page_token
        )
        response = request.execute()
        for item in response['items']:
            videos.append(item['id']['videoId'])
        total_fetched += len(response['items'])
        next_page_token = response.get('nextPageToken')
        if not next_page_token:
            break
        time.sleep(1)
    return videos


# ========= Get Video Details ========= #
def get_video_details(video_ids):
    details = []
    for i in range(0, len(video_ids), 50):
        request = youtube.videos().list(
            part='snippet,statistics',
            id=','.join(video_ids[i:i + 50])
        )
        response = request.execute()
        for item in response['items']:
            details.append({
                'video_id': item['id'],
                'title': item['snippet']['title'],
                'description': item['snippet']['description'],
                'publishedAt': item['snippet']['publishedAt'],
                'channelTitle': item['snippet']['channelTitle'],
                'tags': ','.join(item['snippet'].get('tags', [])),
                'categoryId': item['snippet']['categoryId'],
                'viewCount': int(item['statistics'].get('viewCount', 0)),
                'likeCount': int(item['statistics'].get('likeCount', 0)),
                'commentCount': int(item['statistics'].get('commentCount', 0)),
            })
        time.sleep(1)
    return details


# ========= Get Comments ========= #
def get_comments(video_id, max_comments=None):
    comments = []
    next_page_token = None

    while True:
        try:
            request = youtube.commentThreads().list(
                part='snippet',
                videoId=video_id,
                maxResults=100,
                pageToken=next_page_token,
                textFormat='plainText'
            )
            response = request.execute()

            for item in response['items']:
                top_comment = item['snippet']['topLevelComment']['snippet']
                comments.append({
                    'video_id': video_id,
                    'author': top_comment['authorDisplayName'],
                    'comment': top_comment['textDisplay'],
                    'likeCount': top_comment['likeCount'],
                    'publishedAt': top_comment['publishedAt']
                })

                if max_comments and len(comments) >= max_comments:
                    return comments

            next_page_token = response.get('nextPageToken')
            if not next_page_token:
                break
            time.sleep(0.5)

        except Exception as e:
            break

    return comments


# ========= Save Data ========= #
def save_data(video_data, comment_data, video_file='videos.csv', comment_file='comments_partial.csv'):
    os.makedirs(SAVE_PATH, exist_ok=True)

    video_path = os.path.join(SAVE_PATH, video_file)
    comment_path = os.path.join(SAVE_PATH, comment_file)

    pd.DataFrame(video_data).to_csv(video_path, index=False)
    pd.DataFrame(comment_data).to_csv(comment_path, index=False)

    print(f"💾 Saved to:\n📁 Video file: {video_path}\n📁 Comment file: {comment_path}")


# ========= Main Execution ========= #
try:
    print("🔍 Searching for videos...")
    all_video_ids = search_videos(SEARCH_QUERY, MAX_RESULTS)
    new_video_ids = [vid for vid in all_video_ids if vid not in processed_video_ids]

    if not new_video_ids:
        print("✅ No new videos to process.")
        sys.exit()

    print(f"📺 Found {len(new_video_ids)} new videos to process.")
    video_details_today = get_video_details(new_video_ids)

    # ✅ 跳过已有评论的视频，避免重复抓取
    existing_commented_video_ids = set(c['video_id'] for c in comment_data)
    print(f"🧠 已有评论数据的视频数量: {len(existing_commented_video_ids)}")

    video_details_today = [v for v in video_details_today if v['video_id'] not in existing_commented_video_ids]
    print(f"📺 还需要抓评论的视频数量: {len(video_details_today)}")

    print("💬 Fetching comments for new videos...")
    for idx, video in enumerate(tqdm(video_details_today, desc="Fetching comments")):
        video_id = video['video_id']
        comments = get_comments(video_id)
        comment_data.extend(comments)
        video_data.append(video)

        # ✅ 实时保存（可选）
        save_data(video_data, comment_data)

        units_used_this_video = max(1, len(comments) // 100 + 1)
        estimated_units_used += units_used_this_video
        print(f"📊 Estimated API usage: {estimated_units_used} / {API_UNITS_LIMIT}")

        if estimated_units_used >= SAFETY_THRESHOLD:
            print(f"🚨 Safety threshold of {SAFETY_THRESHOLD} reached, stopping...")
            break

except KeyboardInterrupt:
    print("\n🛑 Interrupted manually, saving current data...")

# ✅ 保存前对评论去重
seen = set()
unique_comments = []
for c in comment_data:
    key = (c['video_id'], c['comment'])
    if key not in seen:
        seen.add(key)
        unique_comments.append(c)
comment_data = unique_comments

# ========= Final Save ========= #
save_data(video_data, comment_data, 'videos.csv', 'comments.csv')
print("✅ Completed! Data saved to videos.csv and comments.csv")

#获取原始video数据和comment数据到此结束
######################################################################################

API Key 加载成功（仅展示部分）: AIzaS****
📁 Loading previously saved video data...
📁 Loading previously saved comment data...
🔍 Searching for videos...


HttpError: <HttpError 403 when requesting https://youtube.googleapis.com/youtube/v3/search?q=artificial+intelligence&part=id&type=video&maxResults=50&key=AIzaSyAPFSYKG9lnXD2pBYqLd0RUweQwMpoQfVo&alt=json returned "The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.". Details: "[{'message': 'The request cannot be completed because you have exceeded your <a href="/youtube/v3/getting-started#quota">quota</a>.', 'domain': 'youtube.quota', 'reason': 'quotaExceeded'}]">