In [None]:
!pip install google-api-python-client

In [15]:
import os
import googleapiclient.discovery
from datetime import datetime
import pandas as pd
import json
import csv

# Set up API key and YouTube API client.
# NOTE(security): a credential committed to source should be treated as
# exposed. Supply the key through the YOUTUBE_API_KEY environment variable;
# the embedded literal is kept only as a fallback so existing runs keep
# working, and it should be rotated and then removed from this file.
API_KEY = os.environ.get("YOUTUBE_API_KEY") or 'AIzaSyChka1rkrphR25Gw03l-X05GqC7I8gfI64'
youtube = googleapiclient.discovery.build("youtube", "v3", developerKey=API_KEY)

# YouTube channel IDs for the publications to extract, keyed by the short
# name that main() uses in log messages and output file names. The trailing
# comment on each entry is the publication's display name.
channels = {
    "TamilMurasu": "UCs0xZ60FSNxFxHPVFFsXNTA",            # TamilMurasu
    "BeritaHarianSingapura": "UC_WgSFSkn7112rmJQcHSUIQ",  # Berita Harian Singapura
    "Zaobaosg": "UCrbQxu0YkoVWu2dw5b1MzNg",               # zaobaosg
    "TheBusinessTimes": "UC0GP1HDhGZTLih7B89z_cTg",       # The Business Times
    "TheStraitsTimes": "UC4p_I9eiRewn2KoU-nawrDg",        # The Straits Times
}

# Cut-off for the incremental load: main() passes this to get_videos() as the
# lower bound on publish time. Hardcoded for now; edit it to move the window.
LAST_PROCESSED_DATE = datetime(year=2024, month=8, day=1)

def get_channel_statistics(channel_id):
    """Fetch the ``statistics`` part of a channel from the YouTube API.

    The raw API response is echoed to stdout before it is inspected.

    Args:
        channel_id: ID of the channel to look up.

    Returns:
        The ``statistics`` dict of the first returned item, or ``None`` when
        the response has no items or any exception is raised along the way
        (the exception is printed, not propagated).
    """
    try:
        response = youtube.channels().list(part="statistics", id=channel_id).execute()
        print(response)
        # Happy path first: at least one item came back.
        if 'items' in response and len(response['items']) > 0:
            return response['items'][0]['statistics']
        print(f"No data found for channel ID: {channel_id}")
        return None
    except Exception as e:
        # Best-effort lookup: report the failure and let the caller decide.
        print(f"An error occurred: {e}")
        return None

def get_videos(channel_id, last_processed_date):
    """Collect all video search results for a channel.

    Pages through the YouTube ``search.list`` endpoint (``order="date"``,
    50 results per page) until no further page is offered, keeping only the
    items whose id kind is ``youtube#video``.

    Args:
        channel_id: ID of the channel to search.
        last_processed_date: ``datetime`` used as the ``publishedAfter`` lower
            bound, or a falsy value to search without one. A naive value is
            sent as-is with a ``Z`` suffix (i.e. it is treated as UTC); a
            timezone-aware value is converted to UTC first.

    Returns:
        A list of the raw search result items (dicts) returned by the API.
    """
    # Local import: the file's top-level import only brings in ``datetime``.
    from datetime import timezone

    published_after = None
    if last_processed_date:
        cutoff = last_processed_date
        if cutoff.utcoffset() is not None:
            # isoformat() on an aware datetime already carries an offset such
            # as "+08:00"; appending "Z" to that yields an invalid timestamp,
            # so normalise to naive UTC before formatting.
            cutoff = cutoff.astimezone(timezone.utc).replace(tzinfo=None)
        published_after = cutoff.isoformat() + "Z"

    request = youtube.search().list(
        part="snippet",
        channelId=channel_id,
        maxResults=50,
        order="date",
        publishedAfter=published_after,
    )

    videos = []
    # list_next() returns None once there is no further page, ending the loop.
    while request:
        response = request.execute()
        videos.extend(
            item
            for item in response.get('items', [])
            if item['id']['kind'] == 'youtube#video'
        )
        request = youtube.search().list_next(request, response)
    return videos

def get_video_statistics(video_id):
    """Fetch the ``statistics`` and ``snippet`` parts for a single video.

    Args:
        video_id: ID of the video to look up.

    Returns:
        The first item (dict) of the ``videos.list`` response, or ``None``
        when the response contains no items. The caller in ``main`` already
        skips ``None`` results, so a missing video no longer aborts the run
        with an ``IndexError``/``KeyError``.
    """
    response = youtube.videos().list(
        part="statistics,snippet",
        id=video_id,
    ).execute()

    items = response.get('items')
    if not items:
        print(f"No data found for video ID: {video_id}")
        return None
    return items[0]

def save_to_bronze(channel_name, videos):
    """Write the raw video items for one channel to the Bronze layer as JSON.

    The file is created under
    ``data_lake/Bronze/YoutubeChannelData/<YYYY>/<MM>/`` (relative to the
    current working directory) and is named
    ``<channel_name>_videos_<YYYYMMDD_HHMMSS>.json``.

    Args:
        channel_name: Short channel name used as the file-name prefix.
        videos: JSON-serialisable object (a list of raw API items in ``main``).
    """
    # Read the clock once so the partition directory and the file name can
    # never disagree (e.g. when a run straddles a month boundary).
    now = datetime.now()
    directory = f"data_lake/Bronze/YoutubeChannelData/{now.strftime('%Y/%m')}"
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(directory, exist_ok=True)

    file_name = f"{channel_name}_videos_{now.strftime('%Y%m%d_%H%M%S')}.json"
    file_path = os.path.join(directory, file_name)

    # ensure_ascii=False keeps non-ASCII titles readable in the dump.
    with open(file_path, 'w', encoding='utf-8') as json_file:
        json.dump(videos, json_file, indent=4, ensure_ascii=False)
    print(f"Raw JSON data saved to {file_path}")

def save_to_silver(channel_name, data):
    """Write the flattened video rows for one channel to the Silver layer as CSV.

    The file is created under
    ``data_lake/Silver/YoutubeChannelData/<YYYY>/<MM>/`` (relative to the
    current working directory) and is named
    ``<channel_name>_youtube_data_<YYYYMMDD_HHMMSS>.csv``. Four warehouse
    metadata columns are appended to the rows before writing.

    Args:
        channel_name: Short channel name used as the file-name prefix.
        data: Anything ``pandas.DataFrame`` accepts (a list of row dicts in
            ``main``).
    """
    # Read the clock once so the directory, the file name and both metadata
    # timestamps are guaranteed to be consistent with each other.
    now = datetime.now()
    directory = f"data_lake/Silver/YoutubeChannelData/{now.strftime('%Y/%m')}"
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(directory, exist_ok=True)

    file_name = f"{channel_name}_youtube_data_{now.strftime('%Y%m%d_%H%M%S')}.csv"
    file_path = os.path.join(directory, file_name)

    df = pd.DataFrame(data)

    # Data-warehouse metadata columns.
    stamp = now.strftime('%Y-%m-%d %H:%M:%S')
    df['dwh_create_timestamp'] = stamp
    df['dwh_created_by'] = 'dwh_admin_user'
    df['dwh_updated_by'] = 'dwh_admin_user'
    df['dwh_update_timestamp'] = stamp

    # Append mode: write the header only when the file does not exist yet.
    # QUOTE_ALL quotes every field (header and values), not only those that
    # contain spaces or delimiters.
    df.to_csv(
        file_path,
        mode='a',
        header=not os.path.exists(file_path),
        index=False,
        quoting=csv.QUOTE_ALL,
    )
    print(f"Data saved to {file_path}")

def _parse_publish_date(raw):
    """Parse a ``publishedAt`` string, with or without fractional seconds."""
    try:
        return datetime.strptime(raw, "%Y-%m-%dT%H:%M:%SZ")
    except ValueError:
        # NOTE(review): the responses seen so far use whole seconds; this
        # fallback only guards against a fractional-seconds variant.
        return datetime.strptime(raw, "%Y-%m-%dT%H:%M:%S.%fZ")


def _build_video_row(channel_name, video_id, video_stats, subscribers):
    """Flatten one ``videos.list`` item into a row dict for the Silver CSV."""
    snippet = video_stats['snippet']
    statistics = video_stats['statistics']
    # Fall back to None instead of raising KeyError if the "high" thumbnail
    # entry is absent from the snippet.
    high_thumbnail_url = snippet.get('thumbnails', {}).get('high', {}).get('url')

    return {
        'channel': channel_name,
        'video_id': video_id,
        'title': snippet['title'],
        'publish_date': _parse_publish_date(snippet['publishedAt']),
        # Counters may be missing from the statistics dict; treat as 0.
        'view_count': int(statistics.get('viewCount', 0)),
        'like_count': int(statistics.get('likeCount', 0)),
        'comment_count': int(statistics.get('commentCount', 0)),
        'subscribers': subscribers,
        'thumbnail_high_url': high_thumbnail_url,
        'description': snippet.get('description', ''),
    }


def main():
    """Extract video data for every configured channel.

    For each entry in ``channels``: fetch the channel statistics, fetch the
    videos published after ``LAST_PROCESSED_DATE``, dump the raw items to the
    Bronze layer, then fetch per-video statistics and write the flattened
    rows to the Silver layer. A channel whose statistics cannot be fetched is
    skipped instead of aborting the whole run.
    """
    for name, channel_id in channels.items():
        print(f"Processing channel: {name}")

        # get_channel_statistics() returns None on failure or when the
        # channel is not found; indexing that would raise TypeError.
        stats = get_channel_statistics(channel_id)
        if stats is None:
            print(f"Skipping channel {name}: channel statistics unavailable.")
            continue
        # NOTE(review): subscriberCount is presumably absent when the channel
        # hides its subscriber count - .get() keeps the run going with None.
        subscribers = stats.get('subscriberCount')
        print(f"Subscribers: {subscribers}")

        # Get video details (incremental data)
        videos = get_videos(channel_id, LAST_PROCESSED_DATE)

        # Save the raw JSON data to the Bronze layer
        save_to_bronze(name, videos)
        print("Raw json files has been saved to bronze layer")
        print(f"Number of videos: {len(videos)}")

        # Rows are collected per channel, so each channel gets its own CSV.
        rows = []
        for video in videos:
            video_id = video['id']['videoId']
            video_stats = get_video_statistics(video_id)
            if video_stats is None:
                continue
            rows.append(_build_video_row(name, video_id, video_stats, subscribers))

        if rows:
            save_to_silver(name, rows)
            print("CSV files has been saved to Silver layer")
        else:
            print("No new videos to process.")


if __name__ == "__main__":
    main()

Processing channel: TamilMurasu
{'kind': 'youtube#channelListResponse', 'etag': 'QhUvFDJi3AKgV1j6vqeuf6ayuDY', 'pageInfo': {'totalResults': 1, 'resultsPerPage': 5}, 'items': [{'kind': 'youtube#channel', 'etag': 'D6byA-d5fyX-FSMKpZ270h8N9bk', 'id': 'UCs0xZ60FSNxFxHPVFFsXNTA', 'statistics': {'viewCount': '1342449', 'subscriberCount': '4860', 'hiddenSubscriberCount': False, 'videoCount': '461'}}]}
Subscribers: 4860
Raw JSON data saved to data_lake/Bronze/YoutubeChannelData/2024/08/TamilMurasu_videos_20240811_144235.json
Raw json files has been saved to bronze layer
Number of videos: 33
Data saved to data_lake/Silver/YoutubeChannelData/2024/08/TamilMurasu_youtube_data_20240811_144240.csv
CSV files has been saved to Silver layer
Processing channel: BeritaHarianSingapura
{'kind': 'youtube#channelListResponse', 'etag': 'Zn7rKhITn6yiiVv4AVZoLffHZ28', 'pageInfo': {'totalResults': 1, 'resultsPerPage': 5}, 'items': [{'kind': 'youtube#channel', 'etag': '52i6iikkL0PTy0yOaZ_dk54lSMM', 'id': 'UC_WgSF