# FHV retrieval prototype

source: https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_2019-01.parquet, for all 2019

The URL can be split into:

- base: https://d37ci6vzurychx.cloudfront.net/trip-data/
- filename: `<taxi_type>_tripdata_<yyyy>-<mm>.parquet`

In [17]:
from pathlib import Path

import pandas as pd
from logging import getLogger

In [3]:
logger = getLogger(name="fhv.ipynb")


def fetch(dataset_url: str) -> pd.DataFrame:
    """Read taxi trip data in parquet format and return it as a dataframe.

    Args:
        dataset_url: Location of the parquet file; passed unchanged to
            ``pd.read_parquet`` with the pyarrow engine.

    Returns:
        The contents of the parquet file.

    Note:
        Nothing in this function retries a failed download. The earlier
        note about ``retries=3`` presumably referred to a Prefect task
        decorator (TODO confirm); add it there if transient network errors
        become a problem.
    """
    df = pd.read_parquet(dataset_url, engine="pyarrow")
    # Lazy %-style arguments: the message is only built if INFO is enabled.
    logger.info("%d rows loaded from url", len(df))
    return df

In [13]:
from collections import defaultdict

# URLs of the monthly files, keyed by an integer slot. Because this is a
# defaultdict(str), reading a slot that was never set yields "".
urls = defaultdict(
    str,
    {0: "https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_2019-01.parquet"},
)
# Download kept commented out; uncomment to populate df for the cells below.
# df = fetch(urls[0])
# print(f'num records in fhv jan: {len(df)}')

In [7]:
# Peek at the first rows to sanity-check the columns and values.
# df comes from the fetch(urls[0]) call that is commented out in an earlier cell.
df.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00001,2019-01-01 00:30:00,2019-01-01 02:51:55,,,,B00001
1,B00001,2019-01-01 00:45:00,2019-01-01 00:54:49,,,,B00001
2,B00001,2019-01-01 00:15:00,2019-01-01 00:54:52,,,,B00001
3,B00008,2019-01-01 00:19:00,2019-01-01 00:39:00,,,,B00008
4,B00008,2019-01-01 00:27:00,2019-01-01 00:37:00,,,,B00008


In [5]:
# List the dtype pandas assigned to each column.
df.dtypes

dispatching_base_num              object
pickup_datetime           datetime64[ns]
dropOff_datetime          datetime64[ns]
PUlocationID                     float64
DOlocationID                     float64
SR_Flag                          float64
Affiliated_base_number            object
dtype: object

Datatypes are all a-okay. Try with `pd.io.sql.get_schema`?

In [6]:
# Print the CREATE TABLE statement pandas generates for this frame, to see
# which SQL column type each dtype maps to.
print(pd.io.sql.get_schema(df, name='fhv_taxi_data'))

CREATE TABLE "fhv_taxi_data" (
"dispatching_base_num" TEXT,
  "pickup_datetime" TIMESTAMP,
  "dropOff_datetime" TIMESTAMP,
  "PUlocationID" REAL,
  "DOlocationID" REAL,
  "SR_Flag" REAL,
  "Affiliated_base_number" TEXT
)


In [11]:
import pyarrow.parquet as pq

In [14]:
urls[1] = "https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_2019-02.parquet"
# NOTE: the https URL is handed to pq.read_table directly. pyarrow rejects it
# with ArrowInvalid ("Expected a local filesystem path, got a URI"), as the
# recorded output of this cell shows.
table = pq.read_table(urls[1])

ArrowInvalid: Expected a local filesystem path, got a URI: 'https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_2019-02.parquet'

In [18]:
# Treat the URL as a path purely to pull off its last component (the file name).
fp = Path(urls[1])
fname = fp.name
fname

'fhv_tripdata_2019-02.parquet'

In [16]:
# Parameters describing one month of data and where to cache it locally.
data_dir = "../data/taxi_ingest_data"
taxi_type = "fhv"
year = 2019
month = 1
# {month:02} zero-pads the month, giving "fhv_tripdata_2019-01".
dataset_file = f"{taxi_type}_tripdata_{year}-{month:02}"
# Local cache location: <data_dir>/<taxi_type>/<dataset_file>.parquet
fpath = Path(f"{data_dir}/{taxi_type}/{dataset_file}.parquet")
# NOTE(review): write_local is not defined anywhere in the visible notebook;
# confirm where it comes from before running this cell.
local_path = write_local(df, fpath)

NameError: name 'Path' is not defined

In [10]:
from prefect import flow, task
from prefect_gcp.cloud_storage import GcsBucket

In [11]:
@task()
def upload_gcs(block_name: str, fpath: Path) -> None:
    """Upload the local parquet file to GCS.

    Args:
        block_name: Name of the saved ``GcsBucket`` block to load.
        fpath: Local parquet file. Its parent directory name and file name
            form the destination, i.e. ``<taxi_type>/<filename>.parquet``
            for the paths built in this notebook.

    Raises:
        IndexError: If ``fpath`` has fewer than two path components.
    """
    gcs_block = GcsBucket.load(block_name)
    # Only the last two path components are kept: <parent dir>/<file name>.
    gcs_path = Path(fpath.parts[-2]) / fpath.parts[-1]
    # to_path names an object in the bucket rather than a local file, so pass
    # it as a forward-slash string; this keeps the key identical on every OS.
    gcs_block.upload_from_path(from_path=fpath, to_path=gcs_path.as_posix())

In [12]:
@flow()
def web_gcs_parq(
    taxi_type: str, year: int, month: int, block_name: str, data_dir: str = "../data/cache"
) -> None:
    """Make sure one month of trip data exists locally, then upload it to GCS.

    The local file is ``<data_dir>/<taxi_type>/<taxi_type>_tripdata_<yyyy>-<mm>.parquet``.
    If it is missing, the same file name is fetched from the trip-data
    CloudFront bucket and handed to ``write_local`` (defined outside this
    cell). The value ``write_local`` returns is then used as the upload
    path; presumably it is the written file's path (TODO confirm).
    No cleaning step is applied to the downloaded frame.
    """
    dataset_file = f"{taxi_type}_tripdata_{year}-{month:02}"
    local_file = Path(f"{data_dir}/{taxi_type}/{dataset_file}.parquet")

    if not local_file.exists():
        # Cache miss: download the month and persist it before uploading.
        source_url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/{dataset_file}.parquet"
        local_file = write_local(fetch(source_url), local_file)

    upload_gcs(block_name, local_file)

In [13]:
# Name of the saved GcsBucket block that upload_gcs loads.
block_name = "ny-taxi-gcs"
# Run the flow with the taxi_type / year / month / data_dir values set in an earlier cell.
web_gcs_parq(taxi_type, year, month, block_name, data_dir=data_dir)

[Completed(message=None, type=COMPLETED, result=LiteralResult(type='literal', value=None))]

In [2]:
# A "<first>-<last>" month spec splits into two integer bounds.
mths = "1-12"
a, b = (int(part) for part in mths.split("-"))
print(a, b)

1 12


In [7]:
# Parse a month spec that is either a single month ("12") or an inclusive
# range ("1-12") into integer bounds a (first) and b (last).
mths = "12"
if "-" in mths:
    a, b = map(int, mths.split("-"))
else:
    # A single month is the range [a, a]. Bind b as well so it is never left
    # unset (fresh kernel) or stale (value from an earlier cell).
    a = b = int(mths)

In [9]:
# range's stop is exclusive, so the single month a is range(a, a + 1).
list(range(a, a+1))

[12]

In [10]:
# Chained assignment binds both names to the same value in one statement.
a = b = 2
print(a, b)

2 2


In [1]:
# File name to look for, and two blob descriptions (as plain strings) to search in.
fname = "fhv_tripdata_2019-02.parquet"
blobs = [
    '<Blob: dtc_data_lake_de-zoom-83, data/fhv/fhv_tripdata_2019-01.parquet, 1675877819813642>',
    '<Blob: dtc_data_lake_de-zoom-83, data/fhv/fhv_tripdata_2019-02.parquet, 1675878686204679>',
]

In [10]:
# For each blob description keep the first whitespace-separated token that
# contains "/" (the object path). The comma that follows it in the
# description stays attached to the token.
b = [next(tok for tok in blob.split() if "/" in tok) for blob in blobs]
b

['data/fhv/fhv_tripdata_2019-01.parquet,',
 'data/fhv/fhv_tripdata_2019-02.parquet,']

In [13]:
# Substring test per extracted path, so the trailing comma on each path does not matter.
present = [fname in blob_path for blob_path in b]
present

[False, True]

In [15]:
# True when at least one extracted path contained fname.
any(present)

True