In [5]:
from googleapiclient.discovery import build
import pandas as pd
import time
from tqdm import tqdm
import sys
import os

# ========= Configuration ========= #
API_KEY = 'AIzaSyAPFSYKG9lnXD2pBYqLd0RUweQwMpoQfVo'  # Replace with your API Key
SEARCH_QUERY = 'artificial intelligence'  # Search keyword
MAX_RESULTS = 500  # Maximum number of videos to fetch
API_UNITS_LIMIT = 10000       # YouTube API daily quota
SAFETY_THRESHOLD = 9000       # Safety threshold (auto-stop when reached)
SAVE_PATH = 'D:/COMM5007/Data/'  # Output path

# ========= Initialize ========= #
youtube = build('youtube', 'v3', developerKey=API_KEY)
video_data = []
comment_data = []
estimated_units_used = 0  # Estimated API usage

# ========= Load Previously Saved Data ========= #
video_csv_path = os.path.join(SAVE_PATH, 'videos.csv')
comment_csv_path = os.path.join(SAVE_PATH, 'comments_partial.csv')

if os.path.exists(video_csv_path):
    print("📁 Loading previously saved video data...")
    video_data = pd.read_csv(video_csv_path).to_dict('records')
else:
    video_data = []

if os.path.exists(comment_csv_path):
    print("📁 Loading previously saved comment data...")
    comment_data = pd.read_csv(comment_csv_path).to_dict('records')
else:
    comment_data = []

# Get already processed video_ids to skip
processed_video_ids = set(v['video_id'] for v in video_data)


# ========= Search Videos ========= #
def search_videos(query, max_results=500):
    videos = []
    next_page_token = None
    total_fetched = 0

    while total_fetched < max_results:
        request = youtube.search().list(
            q=query,
            part='id',
            type='video',
            maxResults=50,
            pageToken=next_page_token
        )
        response = request.execute()
        for item in response['items']:
            videos.append(item['id']['videoId'])
        total_fetched += len(response['items'])
        next_page_token = response.get('nextPageToken')
        if not next_page_token:
            break
        time.sleep(1)
    return videos

# ========= Get Video Details ========= #
def get_video_details(video_ids):
    details = []
    for i in range(0, len(video_ids), 50):
        request = youtube.videos().list(
            part='snippet,statistics',
            id=','.join(video_ids[i:i+50])
        )
        response = request.execute()
        for item in response['items']:
            details.append({
                'video_id': item['id'],
                'title': item['snippet']['title'],
                'description': item['snippet']['description'],
                'publishedAt': item['snippet']['publishedAt'],
                'channelTitle': item['snippet']['channelTitle'],
                'tags': ','.join(item['snippet'].get('tags', [])),
                'categoryId': item['snippet']['categoryId'],
                'viewCount': int(item['statistics'].get('viewCount', 0)),
                'likeCount': int(item['statistics'].get('likeCount', 0)),
                'commentCount': int(item['statistics'].get('commentCount', 0)),
            })
        time.sleep(1)
    return details

# ========= Get Comments ========= #
def get_comments(video_id, max_comments=None):
    comments = []
    next_page_token = None

    while True:
        try:
            request = youtube.commentThreads().list(
                part='snippet',
                videoId=video_id,
                maxResults=100,
                pageToken=next_page_token,
                textFormat='plainText'
            )
            response = request.execute()

            for item in response['items']:
                top_comment = item['snippet']['topLevelComment']['snippet']
                comments.append({
                    'video_id': video_id,
                    'author': top_comment['authorDisplayName'],
                    'comment': top_comment['textDisplay'],
                    'likeCount': top_comment['likeCount'],
                    'publishedAt': top_comment['publishedAt']
                })

                if max_comments and len(comments) >= max_comments:
                    return comments

            next_page_token = response.get('nextPageToken')
            if not next_page_token:
                break
            time.sleep(0.5)

        except Exception as e:
            break

    return comments

# ========= Save Data ========= #
def save_data(video_data, comment_data, video_file='videos.csv', comment_file='comments_partial.csv'):
    os.makedirs(SAVE_PATH, exist_ok=True)

    video_path = os.path.join(SAVE_PATH, video_file)
    comment_path = os.path.join(SAVE_PATH, comment_file)

    pd.DataFrame(video_data).to_csv(video_path, index=False)
    pd.DataFrame(comment_data).to_csv(comment_path, index=False)

    print(f"💾 Saved to:\n📁 Video file: {video_path}\n📁 Comment file: {comment_path}")

# ========= Main Script ========= #
try:
    print("🔍 Searching for videos...")
    video_ids = search_videos(SEARCH_QUERY, MAX_RESULTS)
    print(f"📺 Found {len(video_ids)} videos, fetching details...")
    video_data = get_video_details(video_ids)

    print("💬 Fetching comments for each video...")
    for idx, video in enumerate(tqdm(video_data, desc="Fetching comments")):
        video_id = video['video_id']
        comments = get_comments(video_id)
        comment_data.extend(comments)

        # Save incrementally
        save_data(video_data[:idx+1], comment_data)

        # Estimate API usage (1 page ≈ 1 unit conservatively)
        units_used_this_video = max(1, len(comments) // 100 + 1)
        estimated_units_used += units_used_this_video
        print(f"📊 Estimated API usage: {estimated_units_used} / {API_UNITS_LIMIT}")

        if estimated_units_used >= SAFETY_THRESHOLD:
            print(f"🚨 Safety threshold of {SAFETY_THRESHOLD} reached, stopping...")
            break

except KeyboardInterrupt:
    print("\n🛑 Interrupted manually, saving current data...")
    save_data(video_data, comment_data)
    sys.exit()

# ========= Final Save ========= #
save_data(video_data, comment_data, 'videos.csv', 'comments.csv')
print("✅ Completed! Data saved to videos.csv and comments.csv")



🔍 Searching for videos...
📺 Found 500 videos, fetching details...
💬 Fetching comments for each video...


Fetching comments:   0%|          | 1/500 [00:10<1:23:37, 10.06s/it]

💾 Saved to:
📁 Video file: D:/COMM5007/Data/videos.csv
📁 Comment file: D:/COMM5007/Data/comments_partial.csv
📊 Estimated API usage: 15 / 10000


Fetching comments:   0%|          | 2/500 [00:10<38:43,  4.66s/it]  

💾 Saved to:
📁 Video file: D:/COMM5007/Data/videos.csv
📁 Comment file: D:/COMM5007/Data/comments_partial.csv
📊 Estimated API usage: 17 / 10000


Fetching comments:   1%|          | 3/500 [00:14<35:03,  4.23s/it]

💾 Saved to:
📁 Video file: D:/COMM5007/Data/videos.csv
📁 Comment file: D:/COMM5007/Data/comments_partial.csv
📊 Estimated API usage: 23 / 10000


Fetching comments:   1%|          | 5/500 [00:15<15:48,  1.92s/it]

💾 Saved to:
📁 Video file: D:/COMM5007/Data/videos.csv
📁 Comment file: D:/COMM5007/Data/comments_partial.csv
📊 Estimated API usage: 25 / 10000
💾 Saved to:
📁 Video file: D:/COMM5007/Data/videos.csv
📁 Comment file: D:/COMM5007/Data/comments_partial.csv
📊 Estimated API usage: 26 / 10000


Fetching comments:   1%|          | 6/500 [01:16<3:01:38, 22.06s/it]

💾 Saved to:
📁 Video file: D:/COMM5007/Data/videos.csv
📁 Comment file: D:/COMM5007/Data/comments_partial.csv
📊 Estimated API usage: 113 / 10000


Fetching comments:   1%|▏         | 7/500 [01:34<2:48:15, 20.48s/it]

💾 Saved to:
📁 Video file: D:/COMM5007/Data/videos.csv
📁 Comment file: D:/COMM5007/Data/comments_partial.csv
📊 Estimated API usage: 138 / 10000


Fetching comments:   2%|▏         | 8/500 [01:45<2:24:50, 17.66s/it]

💾 Saved to:
📁 Video file: D:/COMM5007/Data/videos.csv
📁 Comment file: D:/COMM5007/Data/comments_partial.csv
📊 Estimated API usage: 155 / 10000


Fetching comments:   2%|▏         | 9/500 [01:46<1:40:04, 12.23s/it]

💾 Saved to:
📁 Video file: D:/COMM5007/Data/videos.csv
📁 Comment file: D:/COMM5007/Data/comments_partial.csv
📊 Estimated API usage: 156 / 10000


Fetching comments:   2%|▏         | 10/500 [01:49<1:18:25,  9.60s/it]

💾 Saved to:
📁 Video file: D:/COMM5007/Data/videos.csv
📁 Comment file: D:/COMM5007/Data/comments_partial.csv
📊 Estimated API usage: 162 / 10000


Fetching comments:   2%|▏         | 11/500 [02:46<3:15:10, 23.95s/it]

💾 Saved to:
📁 Video file: D:/COMM5007/Data/videos.csv
📁 Comment file: D:/COMM5007/Data/comments_partial.csv
📊 Estimated API usage: 242 / 10000


Fetching comments:   2%|▏         | 11/500 [03:41<2:44:23, 20.17s/it]


PermissionError: [Errno 13] Permission denied: 'D:/COMM5007/Data/videos.csv'