## Loading File from Local to Google Cloud Storage (GCS)

In [2]:
import pandas as pd
import os
import json

In [3]:
# For processing to parquet file type
!pip install pyarrow



In [None]:
# Read the raw trending-videos CSV export into a DataFrame
csv_file_path = "kaggle_data/US_youtube_trending_data.csv"
df = pd.read_csv(csv_file_path)

# Preview the first rows
df.head()

Unnamed: 0,video_id,title,publishedAt,channelId,channelTitle,categoryId,trending_date,tags,view_count,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,description
0,3C66w5Z0ixs,I ASKED HER TO BE MY GIRLFRIEND...,2020-08-11T19:20:14Z,UCvtRTOMP2TqYqu51xNrqAzg,Brawadis,22,2020-08-12T00:00:00Z,brawadis|prank|basketball|skits|ghost|funny vi...,1514614,156908,5855,35313,https://i.ytimg.com/vi/3C66w5Z0ixs/default.jpg,False,False,SUBSCRIBE to BRAWADIS ▶ http://bit.ly/Subscrib...
1,M9Pmf9AB4Mo,Apex Legends | Stories from the Outlands – “Th...,2020-08-11T17:00:10Z,UC0ZV6M2THA81QT9hrVWJG3A,Apex Legends,20,2020-08-12T00:00:00Z,Apex Legends|Apex Legends characters|new Apex ...,2381688,146739,2794,16549,https://i.ytimg.com/vi/M9Pmf9AB4Mo/default.jpg,False,False,"While running her own modding shop, Ramya Pare..."
2,J78aPJ3VyNs,I left youtube for a month and THIS is what ha...,2020-08-11T16:34:06Z,UCYzPXprvl5Y-Sf0g4vX-m6g,jacksepticeye,24,2020-08-12T00:00:00Z,jacksepticeye|funny|funny meme|memes|jacksepti...,2038853,353787,2628,40221,https://i.ytimg.com/vi/J78aPJ3VyNs/default.jpg,False,False,I left youtube for a month and this is what ha...
3,kXLn3HkpjaA,XXL 2020 Freshman Class Revealed - Official An...,2020-08-11T16:38:55Z,UCbg_UMjlHJg_19SZckaKajg,XXL,10,2020-08-12T00:00:00Z,xxl freshman|xxl freshmen|2020 xxl freshman|20...,496771,23251,1856,7647,https://i.ytimg.com/vi/kXLn3HkpjaA/default.jpg,False,False,Subscribe to XXL → http://bit.ly/subscribe-xxl...
4,VIUo6yapDbc,Ultimate DIY Home Movie Theater for The LaBran...,2020-08-11T15:10:05Z,UCDVPcEbVLQgLZX0Rt6jo34A,Mr. Kate,26,2020-08-12T00:00:00Z,The LaBrant Family|DIY|Interior Design|Makeove...,1123889,45802,964,2196,https://i.ytimg.com/vi/VIUo6yapDbc/default.jpg,False,False,Transforming The LaBrant Family's empty white ...


In [5]:
# Get data shape as a (rows, columns) tuple
df.shape

(268787, 16)

### Pre-processing

In [6]:
# Count the missing (NA) values in each column
df.isna().sum()

video_id                0
title                   0
publishedAt             0
channelId               0
channelTitle            0
categoryId              0
trending_date           0
tags                    0
view_count              0
likes                   0
dislikes                0
comment_count           0
thumbnail_link          0
comments_disabled       0
ratings_disabled        0
description          4549
dtype: int64

We will keep the rows with missing `description` values for now and handle them later, during the model-building phase.

In [7]:
# Check the data type pandas assigned to each column
df.dtypes

video_id             object
title                object
publishedAt          object
channelId            object
channelTitle         object
categoryId            int64
trending_date        object
tags                 object
view_count            int64
likes                 int64
dislikes              int64
comment_count         int64
thumbnail_link       object
comments_disabled      bool
ratings_disabled       bool
description          object
dtype: object

In [8]:
# Parse both timestamp columns (`publishedAt`, `trending_date`) into datetimes
timestamp_cols = ['publishedAt', 'trending_date']
df[timestamp_cols] = df[timestamp_cols].apply(pd.to_datetime)

# Show the resulting column types
df.dtypes

video_id                          object
title                             object
publishedAt          datetime64[ns, UTC]
channelId                         object
channelTitle                      object
categoryId                         int64
trending_date        datetime64[ns, UTC]
tags                              object
view_count                         int64
likes                              int64
dislikes                           int64
comment_count                      int64
thumbnail_link                    object
comments_disabled                   bool
ratings_disabled                    bool
description                       object
dtype: object

### Map `categoryId`

In [None]:
# Category lookup file: a JSON document whose `items` entries carry an `id`
# and a `snippet.title`.
cat_path = "kaggle_data/US_category_id.json"

# Explicit UTF-8 so decoding does not depend on the platform's default encoding.
with open(cat_path, 'r', encoding='utf-8') as file:
    json_data = json.load(file)

# Numeric category id -> human-readable category title
cat_dict = {int(item['id']): item['snippet']['title'] for item in json_data['items']}

# Skip the mapping when the column already holds category titles, so re-running
# this cell is a no-op instead of a KeyError on the already-mapped names.
already_mapped = df['categoryId'].isin(set(cat_dict.values())).all()
if not already_mapped:
    # Fail loudly, with the offending values listed, rather than letting
    # `Series.map` silently write NaN for ids absent from the lookup file.
    unknown_ids = set(df['categoryId'].unique().tolist()).difference(cat_dict)
    if unknown_ids:
        raise KeyError(
            f"categoryId values missing from {cat_path}: {sorted(unknown_ids, key=str)}"
        )
    df['categoryId'] = df['categoryId'].map(cat_dict)

In [10]:
# Preview the first rows again after the `categoryId` mapping
df.head()

Unnamed: 0,video_id,title,publishedAt,channelId,channelTitle,categoryId,trending_date,tags,view_count,likes,dislikes,comment_count,thumbnail_link,comments_disabled,ratings_disabled,description
0,3C66w5Z0ixs,I ASKED HER TO BE MY GIRLFRIEND...,2020-08-11 19:20:14+00:00,UCvtRTOMP2TqYqu51xNrqAzg,Brawadis,People & Blogs,2020-08-12 00:00:00+00:00,brawadis|prank|basketball|skits|ghost|funny vi...,1514614,156908,5855,35313,https://i.ytimg.com/vi/3C66w5Z0ixs/default.jpg,False,False,SUBSCRIBE to BRAWADIS ▶ http://bit.ly/Subscrib...
1,M9Pmf9AB4Mo,Apex Legends | Stories from the Outlands – “Th...,2020-08-11 17:00:10+00:00,UC0ZV6M2THA81QT9hrVWJG3A,Apex Legends,Gaming,2020-08-12 00:00:00+00:00,Apex Legends|Apex Legends characters|new Apex ...,2381688,146739,2794,16549,https://i.ytimg.com/vi/M9Pmf9AB4Mo/default.jpg,False,False,"While running her own modding shop, Ramya Pare..."
2,J78aPJ3VyNs,I left youtube for a month and THIS is what ha...,2020-08-11 16:34:06+00:00,UCYzPXprvl5Y-Sf0g4vX-m6g,jacksepticeye,Entertainment,2020-08-12 00:00:00+00:00,jacksepticeye|funny|funny meme|memes|jacksepti...,2038853,353787,2628,40221,https://i.ytimg.com/vi/J78aPJ3VyNs/default.jpg,False,False,I left youtube for a month and this is what ha...
3,kXLn3HkpjaA,XXL 2020 Freshman Class Revealed - Official An...,2020-08-11 16:38:55+00:00,UCbg_UMjlHJg_19SZckaKajg,XXL,Music,2020-08-12 00:00:00+00:00,xxl freshman|xxl freshmen|2020 xxl freshman|20...,496771,23251,1856,7647,https://i.ytimg.com/vi/kXLn3HkpjaA/default.jpg,False,False,Subscribe to XXL → http://bit.ly/subscribe-xxl...
4,VIUo6yapDbc,Ultimate DIY Home Movie Theater for The LaBran...,2020-08-11 15:10:05+00:00,UCDVPcEbVLQgLZX0Rt6jo34A,Mr. Kate,Howto & Style,2020-08-12 00:00:00+00:00,The LaBrant Family|DIY|Interior Design|Makeove...,1123889,45802,964,2196,https://i.ytimg.com/vi/VIUo6yapDbc/default.jpg,False,False,Transforming The LaBrant Family's empty white ...


### Convert File to Parquet 

In [11]:
# Same location and base name as the CSV, with the extension swapped for .parquet
csv_root = os.path.splitext(csv_file_path)[0]
parquet_file_path = f"{csv_root}.parquet"

# Write the processed frame without the pandas index
df.to_parquet(parquet_file_path, index=False)

print(f"CSV converted to Parquet: {parquet_file_path}")

CSV converted to Parquet: US_youtube_trending_data.parquet


### Load Parquet to GCS Bucket

In [12]:
import logging
from google.cloud import storage

In [15]:
# Point the Google client libraries at the service-account key file, unless the
# environment already provides credentials (setdefault never overwrites an
# existing value).
os.environ.setdefault(
    "GOOGLE_APPLICATION_CREDENTIALS",
    "aesthetic-nova-454803-r7-94e7eb0af61c.json",
)

In [16]:
# Root logging setup: INFO and above, formatted as "timestamp - level - message"
LOG_FORMAT = '%(asctime)s - %(levelname)s - %(message)s'
logging.basicConfig(format=LOG_FORMAT, level=logging.INFO)

# Logger used by the upload code below
logger = logging.getLogger(__name__)

class ParquetToGCPLoader:
    """Uploads one local Parquet file to a Google Cloud Storage bucket."""

    def __init__(self, parquet_path, bucket_name, gcp_project_id):
        """
        Store the upload settings and create the storage client and bucket handle.

        Args:
            parquet_path: Path of the local Parquet file to upload.
            bucket_name: Name of the destination GCS bucket.
            gcp_project_id: GCP project ID passed to the storage client.
        """
        self.gcp_project_id = gcp_project_id
        self.bucket_name = bucket_name
        self.parquet_path = parquet_path

        # The bucket handle is obtained from the client created for this project
        gcs_client = storage.Client(project=gcp_project_id)
        self.storage_client = gcs_client
        self.bucket = gcs_client.bucket(bucket_name)

    def upload_to_gcs(self, destination_blob_name):
        """
        Copy the local Parquet file into the bucket under the given blob name.

        Args:
            destination_blob_name: Name of the blob to create in the bucket.

        Raises:
            Exception: Any error raised during the upload is logged and re-raised.
        """
        try:
            self.bucket.blob(destination_blob_name).upload_from_filename(self.parquet_path)
            logger.info(f"Parquet file uploaded to {destination_blob_name}")
        except Exception as exc:
            # Record the failure, then let the caller see the original exception
            logger.error(f"Upload failed: {exc}")
            raise

if __name__ == "__main__":
    # Upload the file written by the Parquet conversion step. Reusing
    # `parquet_file_path` keeps this cell in sync with that step: a hard-coded
    # "US_youtube_trending_data.parquet" points at the working directory, not
    # at the location derived from `csv_file_path`.
    PARQUET_PATH = parquet_file_path
    BUCKET_NAME = "youtube-trending-videos-dataset"
    GCP_PROJECT_ID = "aesthetic-nova-454803-r7"
    # Blob name (path inside the bucket) the file is stored under
    GCS_DEST_PATH = "youtube_trending_data/US_youtube_trending_data.parquet"

    uploader = ParquetToGCPLoader(PARQUET_PATH, BUCKET_NAME, GCP_PROJECT_ID)
    uploader.upload_to_gcs(GCS_DEST_PATH)

2025-04-08 22:05:20,273 - INFO - Parquet file uploaded to youtube_trending_data/US_youtube_trending_data.parquet


### 