In [16]:
! uv add pandas pyarrow google-cloud-storage
import os
import sys
import urllib.request
from concurrent.futures import ThreadPoolExecutor
from google.cloud import storage
from google.api_core.exceptions import NotFound, Forbidden
import time

Resolved 136 packages in 4ms
Audited 123 packages in 13ms


In [17]:
# Directory the parquet files are downloaded into. It is defined here, right
# before its first use, so this cell also works on a fresh kernel
# (Restart & Run All); previously the name was only assigned in a later cell.
# NOTE: the configuration cell further down assigns DOWNLOAD_DIR as well --
# keep the two values identical.
DOWNLOAD_DIR = "."
os.makedirs(DOWNLOAD_DIR, exist_ok=True)

In [2]:
# Location of the service-account key file (relative to the notebook's working
# directory). The client cell reads this variable back when it authenticates.
os.environ.update(GOOGLE_APPLICATION_CREDENTIALS="extras/gcp.json")

In [4]:
# Name of the target GCS bucket, kept both as a Python constant and as the
# GCP_GCS_BUCKET environment variable.
BUCKET_NAME = "tyler_taxi_bucket"
os.environ["GCP_GCS_BUCKET"] = BUCKET_NAME

In [8]:
# Build the shared Storage client from the service-account key whose path is
# stored in GOOGLE_APPLICATION_CREDENTIALS, then take a handle on the target
# bucket. Both objects are module-level and reused by the helpers below.
_key_file = os.environ["GOOGLE_APPLICATION_CREDENTIALS"]
client = storage.Client.from_service_account_json(_key_file)
bucket = client.bucket(BUCKET_NAME)

In [5]:
# URL prefix of the monthly yellow-taxi parquet files; download_file appends
# "<MM>.parquet" to it.
BASE_URL = "https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-"
# Zero-padded month numbers "01" .. "06".
MONTHS = [str(month_no).zfill(2) for month_no in range(1, 7)]
# Local directory the downloaded files are written to.
DOWNLOAD_DIR = "."

In [6]:
# Upload chunk size in bytes (8 MiB); upload_to_gcs assigns it to blob.chunk_size.
# NOTE(review): google-cloud-storage expects chunk_size to be a multiple of
# 256 KiB -- 8 MiB satisfies that; re-check if this value is changed.
CHUNK_SIZE = 8 * 2**20

In [9]:
def download_file(month):
    """Download one month of yellow-taxi trip data into DOWNLOAD_DIR.

    The file is first written to ``<target>.part`` and only renamed to its
    final name once the transfer has finished, so a failed or interrupted
    download never leaves a truncated ``.parquet`` file at the final path.

    Args:
        month: Month suffix appended to BASE_URL, e.g. ``"01"``.

    Returns:
        The local path of the downloaded file, or ``None`` if the download
        failed (the error is printed, not raised).
    """
    url = f"{BASE_URL}{month}.parquet"
    file_path = os.path.join(DOWNLOAD_DIR, f"yellow_tripdata_2024-{month}.parquet")
    partial_path = f"{file_path}.part"

    try:
        print(f"Downloading {url}...")
        urllib.request.urlretrieve(url, partial_path)
        # Publish the file under its real name only after a complete transfer.
        os.replace(partial_path, file_path)
        print(f"Downloaded: {file_path}")
        return file_path
    except Exception as e:
        print(f"Failed to download {url}: {e}")
        # Best-effort cleanup of whatever was written before the failure; the
        # partial file may not exist at all if the request failed early.
        try:
            os.remove(partial_path)
        except OSError:
            pass
        return None

In [11]:
def create_bucket(bucket_name):
    """Make sure ``bucket_name`` exists and is listed under the client's project.

    Uses the module-level ``client``. Outcomes:
      * the bucket is found and appears in ``client.list_buckets()``: a
        confirmation is printed;
      * the bucket is found but is not in that listing: a message is printed
        and ``sys.exit(1)`` is called;
      * the lookup raises ``NotFound``: the bucket is created;
      * ``Forbidden`` is raised: a message is printed and ``sys.exit(1)`` is
        called.

    Args:
        bucket_name: Name of the bucket to check or create.

    Raises:
        SystemExit: When the bucket name is taken by, or inaccessible to,
            the current project.
    """
    try:
        # Existence probe; the returned object itself is not needed.
        client.get_bucket(bucket_name)

        # Ownership check: the name must be among the project's own buckets.
        owned_ids = {owned.id for owned in client.list_buckets()}
        if bucket_name not in owned_ids:
            print(
                f"A bucket with the name '{bucket_name}' already exists, but it does not belong to your project."
            )
            sys.exit(1)

        print(
            f"Bucket '{bucket_name}' exists and belongs to your project. Proceeding..."
        )

    except NotFound:
        # No such bucket yet, so create it.
        client.create_bucket(bucket_name)
        print(f"Created bucket '{bucket_name}'")
    except Forbidden:
        # The name is in use, but this account may not inspect the bucket.
        print(
            f"A bucket with the name '{bucket_name}' exists, but it is not accessible. Bucket name is taken. Please try a different bucket name."
        )
        sys.exit(1)

In [12]:
def verify_gcs_upload(blob_name):
    """Report whether an object called ``blob_name`` exists in the module-level bucket.

    Args:
        blob_name: Object name inside ``bucket``.

    Returns:
        The result of ``Blob.exists`` queried through the module-level ``client``.
    """
    candidate = storage.Blob(name=blob_name, bucket=bucket)
    return candidate.exists(client)

In [13]:
def upload_to_gcs(file_path, max_retries=3, retry_delay=5):
    """Upload a local file to the module-level bucket and verify it arrived.

    The object is named after the file's basename. Each attempt uploads the
    file and then checks for the object with ``verify_gcs_upload``; a failed
    upload or failed verification is retried after ``retry_delay`` seconds.

    Args:
        file_path: Path of the local file to upload.
        max_retries: Maximum number of upload attempts.
        retry_delay: Seconds to wait between attempts. No wait happens after
            the final attempt.

    Returns:
        ``True`` once an upload has been verified, ``False`` if every attempt
        failed (errors are printed, not raised).
    """
    blob_name = os.path.basename(file_path)
    blob = bucket.blob(blob_name)
    blob.chunk_size = CHUNK_SIZE

    # Kept so the function stays safe to call on its own; note that it may
    # call sys.exit(1) if the bucket name is unusable.
    create_bucket(BUCKET_NAME)

    for attempt in range(1, max_retries + 1):
        try:
            print(f"Uploading {file_path} to {BUCKET_NAME} (Attempt {attempt})...")
            blob.upload_from_filename(file_path)
            print(f"Uploaded: gs://{BUCKET_NAME}/{blob_name}")

            if verify_gcs_upload(blob_name):
                print(f"Verification successful for {blob_name}")
                return True
            print(f"Verification failed for {blob_name}, retrying...")
        except Exception as e:
            print(f"Failed to upload {file_path} to GCS: {e}")

        # Only pause when another attempt will follow.
        if attempt < max_retries:
            time.sleep(retry_delay)

    print(f"Giving up on {file_path} after {max_retries} attempts.")
    return False

In [14]:
# Make sure the target bucket is usable before any transfer starts.
create_bucket(BUCKET_NAME)

# Download all months concurrently; download_file returns None on failure.
with ThreadPoolExecutor(max_workers=4) as executor:
    file_paths = list(executor.map(download_file, MONTHS))

downloaded_paths = [path for path in file_paths if path]
failed_months = [month for month, path in zip(MONTHS, file_paths) if not path]

# Iterate the map() results: an exception raised inside a worker is only
# re-raised when its result is read, so an unconsumed map() hides failures.
with ThreadPoolExecutor(max_workers=4) as executor:
    list(executor.map(upload_to_gcs, downloaded_paths))

# upload_to_gcs reports a give-up only by printing, so confirm independently
# that every downloaded file is now present in the bucket.
missing_uploads = [
    path
    for path in downloaded_paths
    if not verify_gcs_upload(os.path.basename(path))
]

if failed_months:
    print(f"Download failed for month(s): {', '.join(failed_months)}")
if missing_uploads:
    print(f"Upload could not be verified for: {', '.join(missing_uploads)}")
if not failed_months and not missing_uploads:
    print("All files processed and verified.")

Bucket 'tyler_taxi_bucket' exists and belongs to your project. Proceeding...
Downloading https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-01.parquet...
Downloading https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-02.parquet...
Downloading https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-03.parquet...
Downloading https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-04.parquet...
Downloaded: ./yellow_tripdata_2024-01.parquet
Downloading https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-05.parquet...
Downloaded: ./yellow_tripdata_2024-02.parquet
Downloading https://d37ci6vzurychx.cloudfront.net/trip-data/yellow_tripdata_2024-06.parquet...
Downloaded: ./yellow_tripdata_2024-03.parquet
Downloaded: ./yellow_tripdata_2024-04.parquet
Downloaded: ./yellow_tripdata_2024-05.parquet
Downloaded: ./yellow_tripdata_2024-06.parquet
Bucket 'tyler_taxi_bucket' exists and belongs to your project. Proceeding...
