In [2]:
! uv add pandas pyarrow google-cloud-storage
import os
import requests
import pandas as pd
from google.cloud import storage

Resolved 136 packages in 4ms
Audited 123 packages in 1ms


In [119]:
# Export the location of the GCP credentials file for this kernel process
# (presumably a service-account key that storage.Client() picks up — verify).
# NOTE(review): the path is relative, so it resolves against the kernel's working
# directory — confirm the notebook is started from the directory containing extras/.
os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = "extras/gcp.json"

In [120]:
# Name of the destination GCS bucket; web_to_gcs uploads every Parquet file into it.
BUCKET = "tyler_taxi_bucket"
# Also export it, so anything reading GCP_GCS_BUCKET from the environment sees the same value.
os.environ["GCP_GCS_BUCKET"] = BUCKET

In [121]:
# Base URL of the GitHub release assets. web_to_gcs appends "<service>/<file_name>"
# to it, so the trailing slash is required.
# Candidate service names (kept from the original notes): ['fhv','green','yellow']
init_url = 'https://github.com/DataTalksClub/nyc-tlc-data/releases/download/'

In [122]:
def upload_to_gcs(bucket, object_name, local_file):
    """
    Upload a single local file to a Google Cloud Storage bucket.

    Ref: https://cloud.google.com/storage/docs/uploading-objects#storage-upload-object-python

    Args:
        bucket: Name of the destination bucket.
        object_name: Object path to create inside the bucket.
        local_file: Path of the local file whose contents are uploaded.
    """
    # Disabled workaround for upload timeouts on slow links (files > 6 MB at ~800 kbps),
    # see https://github.com/googleapis/python-storage/issues/74. Re-enable if needed:
    # storage.blob._MAX_MULTIPART_SIZE = 5 * 1024 * 1024  # 5 MB
    # storage.blob._DEFAULT_CHUNKSIZE = 5 * 1024 * 1024  # 5 MB

    # A fresh client is created per call; the blob handle is resolved in one chain.
    gcs_client = storage.Client()
    target_blob = gcs_client.bucket(bucket).blob(object_name)
    target_blob.upload_from_filename(local_file)

In [123]:
def web_to_gcs(year, service):
    """
    Download one year of monthly trip data, convert each month to Parquet and upload it to GCS.

    For every month 01..12 this fetches
    ``{init_url}{service}/{service}_tripdata_{year}-{month}.csv.gz``, saves it in the
    current working directory, re-reads it with pandas using explicit column dtypes,
    writes ``{service}_tripdata_{year}-{month}.parquet`` alongside it and uploads that
    file to ``BUCKET`` as ``{service}/{service}_tripdata_{year}-{month}.parquet``.
    Both local files are left on disk. Progress is printed after each step.

    Args:
        year: Year to ingest; it is only interpolated into the file name (e.g. '2019').
        service: Dataset prefix used in the URL, the file name and the GCS path
            (e.g. 'green' or 'yellow').

    Raises:
        requests.HTTPError: If a monthly file cannot be downloaded (4xx/5xx response).
            Months processed before the failure have already been uploaded.
    """
    # Explicit dtypes for the columns we know about (presumably so every month ends up
    # with the same Parquet schema). 'Int64' is pandas' nullable integer type, so
    # integer columns containing missing values are not silently widened to float.
    column_types = {
        'VendorID': 'Int64',
        'passenger_count': 'Int64',
        'trip_distance': 'float64',
        'RatecodeID': 'Int64',
        'store_and_fwd_flag': 'str',
        'payment_type': 'Int64',
        'fare_amount': 'float64',
        'extra': 'float64',
        'mta_tax': 'float64',
        'tip_amount': 'float64',
        'tolls_amount': 'float64',
        'improvement_surcharge': 'float64',
        'total_amount': 'float64',
        'congestion_surcharge': 'float64'
    }

    for month_number in range(1, 13):
        # Zero-padded month part of the file name: 01, 02, ..., 12.
        month = f"{month_number:02d}"
        csv_name = f"{service}_tripdata_{year}-{month}.csv.gz"

        # Download the gzipped CSV.
        request_url = f"{init_url}{service}/{csv_name}"
        response = requests.get(request_url)
        # Fail fast on 404/5xx: otherwise the error page would be saved as a
        # ".csv.gz" file and only surface later as a confusing gzip error in read_csv.
        response.raise_for_status()
        # Context manager guarantees the file is flushed and closed before it is re-read.
        with open(csv_name, 'wb') as csv_file:
            csv_file.write(response.content)
        print(f"Local: {csv_name}")

        # Read it back with the pinned dtypes.
        df = pd.read_csv(csv_name, compression='gzip', dtype=column_types)

        # Parse the pickup/dropoff timestamps. Only the first matching prefix is
        # converted ('tpep' is checked before 'lpep'); frames with neither are left as-is.
        for prefix in ('tpep', 'lpep'):
            if f'{prefix}_pickup_datetime' in df.columns:
                for column in (f'{prefix}_pickup_datetime', f'{prefix}_dropoff_datetime'):
                    df[column] = pd.to_datetime(df[column])
                break

        parquet_name = csv_name.replace('.csv.gz', '.parquet')
        df.to_parquet(parquet_name, engine='pyarrow')
        print(f"Parquet: {parquet_name}")

        # Upload the Parquet file to GCS under a per-service prefix.
        upload_to_gcs(BUCKET, f"{service}/{parquet_name}", parquet_name)
        print(f"GCS: {service}/{parquet_name}")

In [124]:
# Ingest all twelve months of 2019 for the 'green' service: download, convert to Parquet, upload to GCS.
web_to_gcs('2019', 'green')

Local: green_tripdata_2019-01.csv.gz
Parquet: green_tripdata_2019-01.parquet
GCS: green/green_tripdata_2019-01.parquet
Local: green_tripdata_2019-02.csv.gz
Parquet: green_tripdata_2019-02.parquet
GCS: green/green_tripdata_2019-02.parquet
Local: green_tripdata_2019-03.csv.gz
Parquet: green_tripdata_2019-03.parquet
GCS: green/green_tripdata_2019-03.parquet
Local: green_tripdata_2019-04.csv.gz
Parquet: green_tripdata_2019-04.parquet
GCS: green/green_tripdata_2019-04.parquet
Local: green_tripdata_2019-05.csv.gz
Parquet: green_tripdata_2019-05.parquet
GCS: green/green_tripdata_2019-05.parquet
Local: green_tripdata_2019-06.csv.gz
Parquet: green_tripdata_2019-06.parquet
GCS: green/green_tripdata_2019-06.parquet
Local: green_tripdata_2019-07.csv.gz
Parquet: green_tripdata_2019-07.parquet
GCS: green/green_tripdata_2019-07.parquet
Local: green_tripdata_2019-08.csv.gz
Parquet: green_tripdata_2019-08.parquet
GCS: green/green_tripdata_2019-08.parquet
Local: green_tripdata_2019-09.csv.gz
Parquet: gr

In [125]:
# Ingest all twelve months of 2020 for the 'green' service: download, convert to Parquet, upload to GCS.
web_to_gcs('2020', 'green')

Local: green_tripdata_2020-01.csv.gz
Parquet: green_tripdata_2020-01.parquet
GCS: green/green_tripdata_2020-01.parquet
Local: green_tripdata_2020-02.csv.gz
Parquet: green_tripdata_2020-02.parquet
GCS: green/green_tripdata_2020-02.parquet
Local: green_tripdata_2020-03.csv.gz
Parquet: green_tripdata_2020-03.parquet
GCS: green/green_tripdata_2020-03.parquet
Local: green_tripdata_2020-04.csv.gz
Parquet: green_tripdata_2020-04.parquet
GCS: green/green_tripdata_2020-04.parquet
Local: green_tripdata_2020-05.csv.gz
Parquet: green_tripdata_2020-05.parquet
GCS: green/green_tripdata_2020-05.parquet
Local: green_tripdata_2020-06.csv.gz
Parquet: green_tripdata_2020-06.parquet
GCS: green/green_tripdata_2020-06.parquet
Local: green_tripdata_2020-07.csv.gz
Parquet: green_tripdata_2020-07.parquet
GCS: green/green_tripdata_2020-07.parquet
Local: green_tripdata_2020-08.csv.gz
Parquet: green_tripdata_2020-08.parquet
GCS: green/green_tripdata_2020-08.parquet
Local: green_tripdata_2020-09.csv.gz
Parquet: gr

In [126]:
# Ingest all twelve months of 2019 for the 'yellow' service: download, convert to Parquet, upload to GCS.
web_to_gcs('2019', 'yellow')

Local: yellow_tripdata_2019-01.csv.gz
Parquet: yellow_tripdata_2019-01.parquet
GCS: yellow/yellow_tripdata_2019-01.parquet
Local: yellow_tripdata_2019-02.csv.gz
Parquet: yellow_tripdata_2019-02.parquet
GCS: yellow/yellow_tripdata_2019-02.parquet
Local: yellow_tripdata_2019-03.csv.gz
Parquet: yellow_tripdata_2019-03.parquet
GCS: yellow/yellow_tripdata_2019-03.parquet
Local: yellow_tripdata_2019-04.csv.gz
Parquet: yellow_tripdata_2019-04.parquet
GCS: yellow/yellow_tripdata_2019-04.parquet
Local: yellow_tripdata_2019-05.csv.gz
Parquet: yellow_tripdata_2019-05.parquet
GCS: yellow/yellow_tripdata_2019-05.parquet
Local: yellow_tripdata_2019-06.csv.gz
Parquet: yellow_tripdata_2019-06.parquet
GCS: yellow/yellow_tripdata_2019-06.parquet
Local: yellow_tripdata_2019-07.csv.gz
Parquet: yellow_tripdata_2019-07.parquet
GCS: yellow/yellow_tripdata_2019-07.parquet
Local: yellow_tripdata_2019-08.csv.gz
Parquet: yellow_tripdata_2019-08.parquet
GCS: yellow/yellow_tripdata_2019-08.parquet
Local: yellow_tr

In [127]:
# Ingest all twelve months of 2020 for the 'yellow' service: download, convert to Parquet, upload to GCS.
web_to_gcs('2020', 'yellow')

Local: yellow_tripdata_2020-01.csv.gz
Parquet: yellow_tripdata_2020-01.parquet
GCS: yellow/yellow_tripdata_2020-01.parquet
Local: yellow_tripdata_2020-02.csv.gz
Parquet: yellow_tripdata_2020-02.parquet
GCS: yellow/yellow_tripdata_2020-02.parquet
Local: yellow_tripdata_2020-03.csv.gz
Parquet: yellow_tripdata_2020-03.parquet
GCS: yellow/yellow_tripdata_2020-03.parquet
Local: yellow_tripdata_2020-04.csv.gz
Parquet: yellow_tripdata_2020-04.parquet
GCS: yellow/yellow_tripdata_2020-04.parquet
Local: yellow_tripdata_2020-05.csv.gz
Parquet: yellow_tripdata_2020-05.parquet
GCS: yellow/yellow_tripdata_2020-05.parquet
Local: yellow_tripdata_2020-06.csv.gz
Parquet: yellow_tripdata_2020-06.parquet
GCS: yellow/yellow_tripdata_2020-06.parquet
Local: yellow_tripdata_2020-07.csv.gz
Parquet: yellow_tripdata_2020-07.parquet
GCS: yellow/yellow_tripdata_2020-07.parquet
Local: yellow_tripdata_2020-08.csv.gz
Parquet: yellow_tripdata_2020-08.parquet
GCS: yellow/yellow_tripdata_2020-08.parquet
Local: yellow_tr