In [7]:
from googleapiclient.discovery import build
import pandas as pd
import time
from tqdm import tqdm
import sys
import os

# ========= Configuration ========= #
API_KEY = 'AIzaSyAPFSYKG9lnXD2pBYqLd0RUweQwMpoQfVo'  # Replace with your API Key
SEARCH_QUERY = 'artificial intelligence'  # Search keyword
MAX_RESULTS = 500  # Max number of videos to fetch
API_UNITS_LIMIT = 10000  # YouTube API daily quota
SAFETY_THRESHOLD = 9000  # Safety threshold to stop fetching
SAVE_PATH = 'D:/COMM5007/Data/'

# ========= Initialize YouTube API Client ========= #
youtube = build('youtube', 'v3', developerKey=API_KEY)

# ========= Load Previously Saved Data ========= #
video_csv_path = os.path.join(SAVE_PATH, 'videos.csv')
comment_csv_path = os.path.join(SAVE_PATH, 'comments_partial.csv')

if os.path.exists(video_csv_path):
    print("📁 Loading previously saved video data...")
    video_data = pd.read_csv(video_csv_path).to_dict('records')
else:
    video_data = []

if os.path.exists(comment_csv_path):
    print("📁 Loading previously saved comment data...")
    comment_data = pd.read_csv(comment_csv_path).to_dict('records')
else:
    comment_data = []

processed_video_ids = set(v['video_id'] for v in video_data)
estimated_units_used = 0

# ========= Search Videos ========= #
def search_videos(query, max_results=500):
    videos = []
    next_page_token = None
    total_fetched = 0

    while total_fetched < max_results:
        request = youtube.search().list(
            q=query,
            part='id',
            type='video',
            maxResults=50,
            pageToken=next_page_token
        )
        response = request.execute()
        for item in response['items']:
            videos.append(item['id']['videoId'])
        total_fetched += len(response['items'])
        next_page_token = response.get('nextPageToken')
        if not next_page_token:
            break
        time.sleep(1)
    return videos

# ========= Get Video Details ========= #
def get_video_details(video_ids):
    details = []
    for i in range(0, len(video_ids), 50):
        request = youtube.videos().list(
            part='snippet,statistics',
            id=','.join(video_ids[i:i+50])
        )
        response = request.execute()
        for item in response['items']:
            details.append({
                'video_id': item['id'],
                'title': item['snippet']['title'],
                'description': item['snippet']['description'],
                'publishedAt': item['snippet']['publishedAt'],
                'channelTitle': item['snippet']['channelTitle'],
                'tags': ','.join(item['snippet'].get('tags', [])),
                'categoryId': item['snippet']['categoryId'],
                'viewCount': int(item['statistics'].get('viewCount', 0)),
                'likeCount': int(item['statistics'].get('likeCount', 0)),
                'commentCount': int(item['statistics'].get('commentCount', 0)),
            })
        time.sleep(1)
    return details

# ========= Get Comments ========= #
def get_comments(video_id, max_comments=None):
    comments = []
    next_page_token = None

    while True:
        try:
            request = youtube.commentThreads().list(
                part='snippet',
                videoId=video_id,
                maxResults=100,
                pageToken=next_page_token,
                textFormat='plainText'
            )
            response = request.execute()

            for item in response['items']:
                top_comment = item['snippet']['topLevelComment']['snippet']
                comments.append({
                    'video_id': video_id,
                    'author': top_comment['authorDisplayName'],
                    'comment': top_comment['textDisplay'],
                    'likeCount': top_comment['likeCount'],
                    'publishedAt': top_comment['publishedAt']
                })

                if max_comments and len(comments) >= max_comments:
                    return comments

            next_page_token = response.get('nextPageToken')
            if not next_page_token:
                break
            time.sleep(0.5)

        except Exception as e:
            break

    return comments

# ========= Save Data ========= #
def save_data(video_data, comment_data, video_file='videos.csv', comment_file='comments_partial.csv'):
    os.makedirs(SAVE_PATH, exist_ok=True)

    video_path = os.path.join(SAVE_PATH, video_file)
    comment_path = os.path.join(SAVE_PATH, comment_file)

    pd.DataFrame(video_data).to_csv(video_path, index=False)
    pd.DataFrame(comment_data).to_csv(comment_path, index=False)

    print(f"💾 Saved to:\n📁 Video file: {video_path}\n📁 Comment file: {comment_path}")

# ========= Main Execution ========= #
try:
    print("🔍 Searching for videos...")
    all_video_ids = search_videos(SEARCH_QUERY, MAX_RESULTS)
    new_video_ids = [vid for vid in all_video_ids if vid not in processed_video_ids]

    if not new_video_ids:
        print("✅ No new videos to process.")
        sys.exit()

    print(f"📺 Found {len(new_video_ids)} new videos to process.")
    video_details_today = get_video_details(new_video_ids)

    print("💬 Fetching comments for new videos...")
    for idx, video in enumerate(tqdm(video_details_today, desc="Fetching comments")):
        video_id = video['video_id']
        comments = get_comments(video_id)
        comment_data.extend(comments)
        video_data.append(video)

        save_data(video_data, comment_data)

        units_used_this_video = max(1, len(comments) // 100 + 1)
        estimated_units_used += units_used_this_video
        print(f"📊 Estimated API usage: {estimated_units_used} / {API_UNITS_LIMIT}")

        if estimated_units_used >= SAFETY_THRESHOLD:
            print(f"🚨 Safety threshold of {SAFETY_THRESHOLD} reached, stopping...")
            break

except KeyboardInterrupt:
    print("\n🛑 Interrupted manually, saving current data...")
    save_data(video_data, comment_data)
    sys.exit()

# ========= Final Save ========= #
save_data(video_data, comment_data, 'videos.csv', 'comments.csv')
print("✅ Completed! Data saved to videos.csv and comments.csv")


📁 Loading previously saved video data...
📁 Loading previously saved comment data...
🔍 Searching for videos...
📺 Found 352 new videos to process.
💬 Fetching comments for new videos...


Fetching comments:   0%|          | 1/352 [00:01<06:54,  1.18s/it]

💾 Saved to:
📁 Video file: D:/COMM5007/Data/videos.csv
📁 Comment file: D:/COMM5007/Data/comments_partial.csv
📊 Estimated API usage: 1 / 10000


Fetching comments:   1%|          | 2/352 [00:05<17:09,  2.94s/it]

💾 Saved to:
📁 Video file: D:/COMM5007/Data/videos.csv
📁 Comment file: D:/COMM5007/Data/comments_partial.csv
📊 Estimated API usage: 6 / 10000


Fetching comments:   1%|          | 3/352 [00:11<26:56,  4.63s/it]

💾 Saved to:
📁 Video file: D:/COMM5007/Data/videos.csv
📁 Comment file: D:/COMM5007/Data/comments_partial.csv
📊 Estimated API usage: 15 / 10000


Fetching comments:   1%|          | 4/352 [00:12<18:21,  3.17s/it]

💾 Saved to:
📁 Video file: D:/COMM5007/Data/videos.csv
📁 Comment file: D:/COMM5007/Data/comments_partial.csv
📊 Estimated API usage: 16 / 10000


Fetching comments:   1%|▏         | 5/352 [00:15<17:54,  3.10s/it]

💾 Saved to:
📁 Video file: D:/COMM5007/Data/videos.csv
📁 Comment file: D:/COMM5007/Data/comments_partial.csv
📊 Estimated API usage: 20 / 10000


Fetching comments:   2%|▏         | 6/352 [00:21<23:43,  4.11s/it]

💾 Saved to:
📁 Video file: D:/COMM5007/Data/videos.csv
📁 Comment file: D:/COMM5007/Data/comments_partial.csv
📊 Estimated API usage: 28 / 10000


Fetching comments:   2%|▏         | 7/352 [00:23<19:00,  3.30s/it]

💾 Saved to:
📁 Video file: D:/COMM5007/Data/videos.csv
📁 Comment file: D:/COMM5007/Data/comments_partial.csv
📊 Estimated API usage: 30 / 10000


Fetching comments:   2%|▏         | 8/352 [00:30<26:03,  4.54s/it]

💾 Saved to:
📁 Video file: D:/COMM5007/Data/videos.csv
📁 Comment file: D:/COMM5007/Data/comments_partial.csv
📊 Estimated API usage: 39 / 10000


Fetching comments:   3%|▎         | 9/352 [00:36<27:25,  4.80s/it]

💾 Saved to:
📁 Video file: D:/COMM5007/Data/videos.csv
📁 Comment file: D:/COMM5007/Data/comments_partial.csv
📊 Estimated API usage: 47 / 10000


Fetching comments:   3%|▎         | 10/352 [00:37<21:47,  3.82s/it]

💾 Saved to:
📁 Video file: D:/COMM5007/Data/videos.csv
📁 Comment file: D:/COMM5007/Data/comments_partial.csv
📊 Estimated API usage: 49 / 10000


Fetching comments:   3%|▎         | 11/352 [00:39<17:47,  3.13s/it]

💾 Saved to:
📁 Video file: D:/COMM5007/Data/videos.csv
📁 Comment file: D:/COMM5007/Data/comments_partial.csv
📊 Estimated API usage: 51 / 10000


Fetching comments:   3%|▎         | 12/352 [00:40<13:39,  2.41s/it]

💾 Saved to:
📁 Video file: D:/COMM5007/Data/videos.csv
📁 Comment file: D:/COMM5007/Data/comments_partial.csv
📊 Estimated API usage: 52 / 10000


Fetching comments:   4%|▎         | 13/352 [00:46<19:47,  3.50s/it]

💾 Saved to:
📁 Video file: D:/COMM5007/Data/videos.csv
📁 Comment file: D:/COMM5007/Data/comments_partial.csv
📊 Estimated API usage: 61 / 10000


Fetching comments:   4%|▍         | 14/352 [00:47<16:09,  2.87s/it]

💾 Saved to:
📁 Video file: D:/COMM5007/Data/videos.csv
📁 Comment file: D:/COMM5007/Data/comments_partial.csv
📊 Estimated API usage: 63 / 10000


Fetching comments:   4%|▍         | 15/352 [00:52<20:07,  3.58s/it]

💾 Saved to:
📁 Video file: D:/COMM5007/Data/videos.csv
📁 Comment file: D:/COMM5007/Data/comments_partial.csv
📊 Estimated API usage: 70 / 10000


Fetching comments:   5%|▍         | 16/352 [03:42<5:00:00, 53.57s/it]

💾 Saved to:
📁 Video file: D:/COMM5007/Data/videos.csv
📁 Comment file: D:/COMM5007/Data/comments_partial.csv
📊 Estimated API usage: 294 / 10000


Fetching comments:   5%|▍         | 17/352 [03:43<3:30:24, 37.69s/it]

💾 Saved to:
📁 Video file: D:/COMM5007/Data/videos.csv
📁 Comment file: D:/COMM5007/Data/comments_partial.csv
📊 Estimated API usage: 295 / 10000


Fetching comments:   5%|▌         | 18/352 [03:48<2:35:50, 27.99s/it]

💾 Saved to:
📁 Video file: D:/COMM5007/Data/videos.csv
📁 Comment file: D:/COMM5007/Data/comments_partial.csv
📊 Estimated API usage: 303 / 10000


Fetching comments:   5%|▌         | 19/352 [03:49<1:49:58, 19.82s/it]

💾 Saved to:
📁 Video file: D:/COMM5007/Data/videos.csv
📁 Comment file: D:/COMM5007/Data/comments_partial.csv
📊 Estimated API usage: 304 / 10000


Fetching comments:   6%|▌         | 20/352 [03:53<1:22:49, 14.97s/it]

💾 Saved to:
📁 Video file: D:/COMM5007/Data/videos.csv
📁 Comment file: D:/COMM5007/Data/comments_partial.csv
📊 Estimated API usage: 309 / 10000


Fetching comments:   6%|▌         | 21/352 [04:08<1:23:12, 15.08s/it]

💾 Saved to:
📁 Video file: D:/COMM5007/Data/videos.csv
📁 Comment file: D:/COMM5007/Data/comments_partial.csv
📊 Estimated API usage: 329 / 10000


Fetching comments:   6%|▋         | 22/352 [04:10<1:01:47, 11.23s/it]

💾 Saved to:
📁 Video file: D:/COMM5007/Data/videos.csv
📁 Comment file: D:/COMM5007/Data/comments_partial.csv
📊 Estimated API usage: 332 / 10000


Fetching comments:   7%|▋         | 23/352 [04:11<44:25,  8.10s/it]  

💾 Saved to:
📁 Video file: D:/COMM5007/Data/videos.csv
📁 Comment file: D:/COMM5007/Data/comments_partial.csv
📊 Estimated API usage: 333 / 10000


Fetching comments:   7%|▋         | 24/352 [04:23<51:04,  9.34s/it]

💾 Saved to:
📁 Video file: D:/COMM5007/Data/videos.csv
📁 Comment file: D:/COMM5007/Data/comments_partial.csv
📊 Estimated API usage: 349 / 10000


Fetching comments:   7%|▋         | 25/352 [04:34<52:47,  9.69s/it]

💾 Saved to:
📁 Video file: D:/COMM5007/Data/videos.csv
📁 Comment file: D:/COMM5007/Data/comments_partial.csv
📊 Estimated API usage: 363 / 10000


Fetching comments:   7%|▋         | 26/352 [04:50<1:02:58, 11.59s/it]

💾 Saved to:
📁 Video file: D:/COMM5007/Data/videos.csv
📁 Comment file: D:/COMM5007/Data/comments_partial.csv
📊 Estimated API usage: 384 / 10000


Fetching comments:   8%|▊         | 27/352 [04:51<45:16,  8.36s/it]  

💾 Saved to:
📁 Video file: D:/COMM5007/Data/videos.csv
📁 Comment file: D:/COMM5007/Data/comments_partial.csv
📊 Estimated API usage: 385 / 10000


Fetching comments:   8%|▊         | 28/352 [04:56<40:29,  7.50s/it]

💾 Saved to:
📁 Video file: D:/COMM5007/Data/videos.csv
📁 Comment file: D:/COMM5007/Data/comments_partial.csv
📊 Estimated API usage: 392 / 10000


Fetching comments:   8%|▊         | 29/352 [04:57<29:33,  5.49s/it]

💾 Saved to:
📁 Video file: D:/COMM5007/Data/videos.csv
📁 Comment file: D:/COMM5007/Data/comments_partial.csv
📊 Estimated API usage: 393 / 10000


Fetching comments:   9%|▊         | 30/352 [05:04<31:32,  5.88s/it]

💾 Saved to:
📁 Video file: D:/COMM5007/Data/videos.csv
📁 Comment file: D:/COMM5007/Data/comments_partial.csv
📊 Estimated API usage: 402 / 10000


Fetching comments:   9%|▊         | 30/352 [06:39<1:11:26, 13.31s/it]



🛑 Interrupted manually, saving current data...
💾 Saved to:
📁 Video file: D:/COMM5007/Data/videos.csv
📁 Comment file: D:/COMM5007/Data/comments_partial.csv


SystemExit: 

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
