In [1]:
import os
from datetime import datetime, timezone

import isodate
import pandas as pd
from dotenv import dotenv_values
from google.cloud import bigquery
from google.oauth2 import service_account
from googleapiclient.discovery import build

In [None]:
# Load settings from the local .env file; fall back to the process environment
# so the notebook also runs where no .env file is present (CI, containers).
config = dotenv_values()
api_key = config.get('api_key') or os.environ.get('api_key')
if not api_key:
    # Fail early with an actionable message instead of a bare KeyError or an
    # opaque authentication failure on the first API request.
    raise KeyError(
        "api_key is not set: add `api_key=<YouTube Data API key>` to .env "
        "or export it as an environment variable"
    )

# YouTube Data API v3 client authenticated with the developer key.
youtube = build("youtube", "v3", developerKey=api_key)


# Region codes whose "most popular" chart is fetched, in request order.
region_codes = """
    US CA GB AU IN JP KR BR MX FR DE RU IT ES
    AR CO CL NL TR SA AE EG ID MY TH VN SG NG
    KE ZA PK BD UA PL SE CH BE NO DK FI IE NZ
    PH HK TW IL RO HU CZ GR PT SK AT
""".split()


def _to_naive_utc(timestamp):
    """Parse an ISO 8601 timestamp string into a naive UTC ``datetime``.

    Uses ``isodate`` (already used for durations) instead of a single fixed
    ``strptime`` format, so variants such as fractional seconds are accepted.

    Args:
        timestamp: ISO 8601 date-time string, or ``None``/empty.

    Returns:
        A timezone-naive ``datetime`` expressed in UTC, or ``None`` when
        ``timestamp`` is missing.
    """
    if not timestamp:
        return None
    parsed = isodate.parse_datetime(timestamp)
    if parsed.tzinfo is not None:
        # Normalise to UTC, then drop tzinfo so the column stays naive.
        parsed = parsed.astimezone(timezone.utc).replace(tzinfo=None)
    return parsed


def _to_video_record(item):
    """Flatten one ``videos.list`` item into a row dictionary.

    Args:
        item: A single entry of the API response's ``items`` list, requested
            with ``part="snippet,contentDetails,statistics"``.

    Returns:
        dict with the keys ``video_id``, ``title``, ``description``,
        ``channel_title``, ``published_at``, ``category_id``,
        ``default_language``, ``tags``, ``duration_seconds``, ``view_count``,
        ``like_count`` and ``comment_count``.
    """
    snippet = item['snippet']
    stats = item.get('statistics', {})
    content = item.get('contentDetails', {})

    # A missing duration becomes None (NULL) instead of aborting the whole run.
    raw_duration = content.get('duration')
    duration_seconds = (
        isodate.parse_duration(raw_duration).total_seconds() if raw_duration else None
    )

    return {
        'video_id': item['id'],
        'title': snippet.get('title'),
        'description': snippet.get('description'),
        'channel_title': snippet.get('channelTitle'),
        'published_at': _to_naive_utc(snippet.get('publishedAt')),
        'category_id': snippet.get('categoryId'),
        'default_language': snippet.get('defaultLanguage'),
        # Tags are flattened into one comma-separated string.
        'tags': ', '.join(snippet.get('tags', [])),
        'duration_seconds': duration_seconds,
        # Counters absent from the response default to 0.
        'view_count': int(stats.get('viewCount', 0)),
        'like_count': int(stats.get('likeCount', 0)),
        'comment_count': int(stats.get('commentCount', 0)),
    }


# One row per (region, video); the same video may appear for several regions.
video_data = []

for region in region_codes:
    # Only the first page (up to 50 videos) of each regional chart is requested.
    response = youtube.videos().list(
        part="snippet,contentDetails,statistics",
        chart="mostPopular",
        regionCode=region,
        maxResults=50
    ).execute()

    video_data.extend(_to_video_record(item) for item in response.get('items', []))




# Collapse rows that share a video_id: the mapping keeps one entry per id
# (the last record seen wins, positioned where the id first appeared).
unique_data = dict((video['video_id'], video) for video in video_data)
video_data = [*unique_data.values()]



# Stamp every row with a single load timestamp for this run.
# `datetime.utcnow()` is deprecated since Python 3.12; take the current time
# in UTC explicitly and drop tzinfo so the value stays a naive UTC datetime,
# exactly as before.
load_date = datetime.now(timezone.utc).replace(tzinfo=None)
for video in video_data:
    video["load_date"] = load_date


# One DataFrame row per unique video.
df = pd.DataFrame(video_data)




# Resolve the service-account key file and project id from .env or the
# environment, so the notebook is not tied to one machine's absolute path.
# The previous hard-coded values remain as fallbacks for backward compatibility.
service_account_file = (
    config.get('GOOGLE_APPLICATION_CREDENTIALS')
    or os.environ.get('GOOGLE_APPLICATION_CREDENTIALS')
    or r"C:\Users\tanju\Desktop\upheld-momentum-463013-v7-a9926786a277.json"
)
project_id = (
    config.get('GOOGLE_CLOUD_PROJECT')
    or os.environ.get('GOOGLE_CLOUD_PROJECT')
    or 'upheld-momentum-463013-v7'
)

credentials = service_account.Credentials.from_service_account_file(
    service_account_file
)

# BigQuery client bound to the resolved project and credentials.
client = bigquery.Client(credentials=credentials, project=project_id)



# Destination dataset and table inside the client's project.
dataset_id = 'dbt_tdereli'
table_id = 'youtube_trending_videos'

# `Client.dataset()` is deprecated; build the reference explicitly instead.
table_ref = bigquery.DatasetReference(client.project, dataset_id).table(table_id)

# Explicit schema so BigQuery column types do not depend on dtype inference.
schema = [
    bigquery.SchemaField("load_date", "TIMESTAMP"),
    bigquery.SchemaField("video_id", "STRING"),
    bigquery.SchemaField("title", "STRING"),
    bigquery.SchemaField("description", "STRING"),
    bigquery.SchemaField("channel_title", "STRING"),
    bigquery.SchemaField("published_at", "TIMESTAMP"),
    bigquery.SchemaField("category_id", "STRING"),
    bigquery.SchemaField("default_language", "STRING"),
    bigquery.SchemaField("tags", "STRING"),
    bigquery.SchemaField("duration_seconds", "FLOAT"),
    bigquery.SchemaField("view_count", "INTEGER"),
    bigquery.SchemaField("like_count", "INTEGER"),
    bigquery.SchemaField("comment_count", "INTEGER"),
]

# WRITE_TRUNCATE replaces the table contents on every run, so only the most
# recent load is kept.
job_config = bigquery.LoadJobConfig(
    schema=schema,
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
)

# Start the load job and block until it finishes; result() raises on failure.
job = client.load_table_from_dataframe(df, table_ref, job_config=job_config)
job.result()

print(f"Loaded {job.output_rows} rows into {dataset_id}.{table_id}")

  load_date = datetime.utcnow()


In [3]:
# Preview the first three rows of the loaded frame.
df.head(3)

Unnamed: 0,video_id,title,description,channel_title,published_at,category_id,default_language,tags,duration_seconds,view_count,like_count,comment_count,load_date
0,wSa-sjaLVOM,Five Nights at Freddy's: The Secret of the Mimic,Five Nights at Freddy's is BACK with a game th...,Markiplier,2025-06-17 00:09:51,20,en,"markiplier, five nights at freddy's, fnaf, sec...",7456.0,2102171,145667,4631,2025-06-17 22:28:19.249546
1,URlPXepBZdo,Clipse - So Be It (Official Music Video),New Album 'Let God Sort Em Out' July 11\nPre-O...,clipseVEVO,2025-06-17 16:00:46,10,,"Clipse, Roc Nation Distribution, Hip Hop/Rap, ...",199.0,394704,61060,7269,2025-06-17 22:28:19.249546
2,uLguU7WLreA,The Naked Gun | Official Trailer (2025 Movie) ...,Justice has a new Daddy. Watch the new trailer...,Paramount Pictures,2025-06-16 13:00:13,1,,"The Naked Gun, Liam Neeson, Pamela Anderson, N...",150.0,6013889,50934,5541,2025-06-17 22:28:19.249546


In [4]:
# Column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1479 entries, 0 to 1478
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   video_id          1479 non-null   object        
 1   title             1479 non-null   object        
 2   description       1479 non-null   object        
 3   channel_title     1479 non-null   object        
 4   published_at      1479 non-null   datetime64[ns]
 5   category_id       1479 non-null   object        
 6   default_language  520 non-null    object        
 7   tags              1479 non-null   object        
 8   duration_seconds  1479 non-null   float64       
 9   view_count        1479 non-null   int64         
 10  like_count        1479 non-null   int64         
 11  comment_count     1479 non-null   int64         
 12  load_date         1479 non-null   datetime64[ns]
dtypes: datetime64[ns](2), float64(1), int64(3), object(7)
memory usage: 150.3+ KB


In [5]:
# Column labels of the frame.
df.columns

Index(['video_id', 'title', 'description', 'channel_title', 'published_at',
       'category_id', 'default_language', 'tags', 'duration_seconds',
       'view_count', 'like_count', 'comment_count', 'load_date'],
      dtype='object')

In [6]:
# Summary statistics for the numeric and datetime columns.
df.describe()

Unnamed: 0,published_at,duration_seconds,view_count,like_count,comment_count,load_date
count,1479,1479.0,1479.0,1479.0,1479.0,1479
mean,2025-06-11 21:34:38.235293696,1549.943881,5894477.0,149170.4,2579.780933,2025-06-17 22:28:19.249545472
min,2025-05-15 09:05:49,0.0,20860.0,0.0,0.0,2025-06-17 22:28:19.249546
25%,2025-06-09 16:17:16.500000,48.0,279750.0,7397.0,212.0,2025-06-17 22:28:19.249545984
50%,2025-06-13 13:25:49,210.0,764605.0,21560.0,673.0,2025-06-17 22:28:19.249545984
75%,2025-06-15 17:58:44,1232.5,3442881.0,91855.0,1867.5,2025-06-17 22:28:19.249545984
max,2025-06-17 18:01:38,42097.0,251862200.0,5489090.0,185036.0,2025-06-17 22:28:19.249546
std,,4284.180317,16459430.0,400090.7,9570.563888,
