#### 02 – GCP Connectivity & Data Ingestion
> **Phase:** Research & Development (R&D)  
> **Source:** Local Staging (../data/)   
> **Destination:** Google Cloud Storage (GCS) & BigQuery
---
**Goal:**  
> Validate cloud infrastructure and automate the ingestion of raw data into the Google Cloud ecosystem.
* **Auth:** Verify Application Default Credentials (ADC) and Service Account permissions.
* **GCS (Bronze):** Upload the local raw Parquet file to the cloud storage bucket.
* **BigQuery (Silver):** Create an External Table to query GCS Parquet files directly.

---

#### Setup & Authentication

In [None]:
import os
import pandas as pd
from google.cloud import storage, bigquery
from pathlib import Path
from dotenv import load_dotenv
from tqdm import tqdm

In [None]:
# Find the project root so the shared .env file is picked up whether the
# kernel was started from the repository root or from a notebooks folder.
# Only the last component of the working directory is inspected: testing for
# 'notebooks' anywhere in the absolute path would wrongly step up one level
# when an ancestor directory merely contains that word
# (e.g. /home/me/notebooks/project).
current_dir = Path.cwd()
root_path = current_dir.parent if 'notebooks' in current_dir.name else current_dir
load_dotenv(dotenv_path=root_path / ".env")

In [None]:
# Read the GCP settings from the process environment; each value is None
# when the corresponding variable is not set.
project_id = os.environ.get('GCP_PROJECT_ID')
bucket_name = os.environ.get('GCP_GCS_BUCKET')

In [None]:
# Connectivity test: build one client per service for the configured project
# and report whether both constructors succeeded. Failures are printed rather
# than raised, so later cells still run (and will fail if a client is missing).
client_kwargs = {"project": project_id}
try:
    storage_client = storage.Client(**client_kwargs)
    bq_client = bigquery.Client(**client_kwargs)
    print("Auth Success: Using Application Default Credentials (ADC)")
except Exception as err:
    print(f"Auth Failed: {err}")

#### GCS Ingestion (Bronze Layer)

In [None]:
def upload_to_gcs(bucket_name, source_file_path, destination_blob_name):
    """Upload a local file to a GCS bucket with a live progress bar.

    Uses the module-level ``storage_client``.

    Args:
        bucket_name: Name of the destination bucket.
        source_file_path: Path of the local file to upload.
        destination_blob_name: Object name to write inside the bucket.

    Raises:
        OSError: If the local file cannot be inspected or opened.
    """
    import mimetypes

    bucket = storage_client.bucket(bucket_name)
    blob = bucket.blob(destination_blob_name)
    file_size = os.path.getsize(source_file_path)
    # NOTE(review): upload_from_filename is understood to guess the content
    # type from the file name; mirror that here so switching to
    # upload_from_file does not change the object's metadata - confirm
    # against the google-cloud-storage docs.
    content_type, _ = mimetypes.guess_type(source_file_path)

    print(f"Uploading to gs://{bucket_name}/{destination_blob_name}...")
    # Wrap the file's read() so the bar advances as bytes are actually consumed
    # by the upload, instead of jumping from 0% to 100% once it has finished.
    with open(source_file_path, "rb") as source_file:
        with tqdm.wrapattr(
            source_file, "read", total=file_size, desc="Progress"
        ) as tracked_file:
            blob.upload_from_file(
                tracked_file, size=file_size, content_type=content_type
            )
    print("Upload Complete.")

# Local staging file and the object name it is stored under in the bucket.
gcs_destination = 'bronze/green_taxi_2025_11.parquet'
local_file = root_path.joinpath('data', 'green_tripdata_2025-11.parquet')

upload_to_gcs(
    bucket_name=bucket_name,
    source_file_path=str(local_file),
    destination_blob_name=gcs_destination,
)

#### BigQuery Staging (Silver Layer)
Register the external data and load the lookup dimensions.

In [None]:
# Register an external table for the taxi trips, pointing at the Parquet
# object uploaded to the bucket.
table_id = f"{project_id}.trips_data_silver.ext_green_taxi"

parquet_uris = [f"gs://{bucket_name}/{gcs_destination}"]
external_config = bigquery.ExternalConfig("PARQUET")
external_config.source_uris = parquet_uris

table = bigquery.Table(table_id)
table.external_data_configuration = external_config

# exists_ok=True: an already-registered table is not treated as an error.
try:
    bq_client.create_table(table, exists_ok=True)
    print(f"External Table registered: {table_id}")
except Exception as err:
    print(f"BigQuery Table creation failed: {err}")

# Load the taxi-zone lookup CSV into a native BigQuery table.
zones_path = root_path.joinpath('data', 'taxi_zone_lookup.csv')
df_zones = pd.read_csv(zones_path)
table_fq = f"{project_id}.trips_data_silver.zones"

try:
    # WRITE_TRUNCATE disposition: the load job replaces any rows already there.
    job_config = bigquery.LoadJobConfig(
        write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
    )
    job = bq_client.load_table_from_dataframe(
        df_zones,
        table_fq,
        job_config=job_config,
    )
    # Block until the load job finishes so failures surface in this cell.
    job.result()
    print(f"Dimension Table 'zones' loaded to BigQuery: {table_fq}")
except Exception as err:
    print(f"Zones load failed: {err}")

#### Infrastructure Validation
Run a quick SQL query to ensure the schema was mapped correctly.

In [None]:
# Final check: join the external trips table to the zones dimension and show
# a handful of rows to prove both tables are queryable together.
sql = f"""
    SELECT 
        t.vendorid, 
        t.lpep_pickup_datetime, 
        z.Zone 
    FROM `{table_id}` t
    JOIN `{table_fq}` z ON t.PULocationID = z.LocationID
    LIMIT 5
"""
try:
    query_job = bq_client.query(sql)
    df_val = query_job.to_dataframe()
    print("Sample Joined Data:")
    display(df_val)
    print("Infrastructure Validated. Staging Layer is ready for Analytics.")
except Exception as err:
    print(f"Validation Query failed: {err}")