# FHV retrieval prototype

Source: https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_2019-01.parquet (January shown; one file like this per month, for all of 2019)

The URL can be split into:

- base: https://d37ci6vzurychx.cloudfront.net/trip-data/
- filename: `<taxi_type>_tripdata_<yyyy>-<mm>.parquet`

In [1]:
from pathlib import Path

import pandas as pd
from logging import getLogger

In [3]:
logger = getLogger(name="fhv.ipynb")


def fetch(dataset_url: str) -> pd.DataFrame:
    """Load one parquet trip-data file and hand it back as a DataFrame.

    The file at ``dataset_url`` is read with the ``pyarrow`` engine and the
    number of rows loaded is logged at INFO level on the notebook logger.

    Note: the read is attempted once; this function does not retry on
    failure.

    Args:
        dataset_url: Location of the parquet file to read.

    Returns:
        The file contents as a pandas DataFrame.
    """
    # The notebook-level logger is used here rather than a Prefect run logger.
    df = pd.read_parquet(dataset_url, engine="pyarrow")
    logger.info(f"{len(df)} rows loaded from url")
    return df

In [13]:
from collections import defaultdict
# Map a zero-based month index to the FHV parquet URL for that month of 2019
# (0 -> January ... 11 -> December).  A plain dict is used so that looking up
# an index that was never registered raises KeyError instead of silently
# yielding an empty string.
urls = {
    idx: f"https://d37ci6vzurychx.cloudfront.net/trip-data/fhv_tripdata_2019-{idx + 1:02}.parquet"
    for idx in range(12)
}
# df = fetch(urls[0])
# print(f'num records in fhv jan: {len(df)}')

In [7]:
df.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00001,2019-01-01 00:30:00,2019-01-01 02:51:55,,,,B00001
1,B00001,2019-01-01 00:45:00,2019-01-01 00:54:49,,,,B00001
2,B00001,2019-01-01 00:15:00,2019-01-01 00:54:52,,,,B00001
3,B00008,2019-01-01 00:19:00,2019-01-01 00:39:00,,,,B00008
4,B00008,2019-01-01 00:27:00,2019-01-01 00:37:00,,,,B00008


In [5]:
df.dtypes

dispatching_base_num              object
pickup_datetime           datetime64[ns]
dropOff_datetime          datetime64[ns]
PUlocationID                     float64
DOlocationID                     float64
SR_Flag                          float64
Affiliated_base_number            object
dtype: object

Datatypes are all a-okay. Try with `pd.io.sql.get_schema`?

In [6]:
print(pd.io.sql.get_schema(df, name='fhv_taxi_data'))

CREATE TABLE "fhv_taxi_data" (
"dispatching_base_num" TEXT,
  "pickup_datetime" TIMESTAMP,
  "dropOff_datetime" TIMESTAMP,
  "PUlocationID" REAL,
  "DOlocationID" REAL,
  "SR_Flag" REAL,
  "Affiliated_base_number" TEXT
)


In [2]:
import pyarrow as pa
import pyarrow.parquet as pq
import pyarrow.compute as pc

In [3]:
# Load the February 2019 FHV file from the local data directory.
path_feb = Path("../data/taxi_ingest_data/fhv/fhv_tripdata_2019-02.parquet")
# Read it as a pyarrow Table first; the conversion to pandas happens below.
feb = pq.read_table(path_feb)
# ignore the casting error:
# pyarrow.lib.ArrowInvalid: Casting from timestamp[us] to timestamp[ns] would result in out of bounds
# safe=False makes the cast to pandas timestamps unchecked, so the conversion
# goes through; out-of-range timestamps are not rejected and may come out as
# nonsensical values.
df_feb = feb.to_pandas(safe=False)
df_feb.head()

Unnamed: 0,dispatching_base_num,pickup_datetime,dropOff_datetime,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number
0,B00037,2019-02-01 00:08:44,2019-02-01 00:23:35,264.0,265.0,,B00037
1,B00037,2019-02-01 00:27:51,2019-02-01 00:32:54,264.0,265.0,,B00037
2,B00037,2019-02-01 00:18:30,2019-02-01 00:25:45,264.0,265.0,,B00037
3,B00037,2019-02-01 00:43:15,2019-02-01 00:48:29,264.0,265.0,,B00037
4,B00037,2019-02-01 00:01:45,2019-02-01 00:09:13,264.0,265.0,,B00037


In [8]:
# but results in weird times; originally meant to be 3019-02-03 17:30:00.000000
print(f"num_recs: {len(df_feb)}\nweird timestamp: {df_feb['dropOff_datetime'].min()}")

num_recs: 1707650
weird timestamp: 1849-12-25 18:20:52.580896768


Filter out the out-of-range values before converting to a pandas `DataFrame`:

In [9]:
# Keep only the rows whose drop-off time is no later than pd.Timestamp.max,
# then convert the filtered table to pandas.
# table = pq.read_table(path_feb)
_dropoff_in_range = pc.less_equal(feb["dropOff_datetime"], pa.scalar(pd.Timestamp.max))
df_feb_clean = feb.filter(_dropoff_in_range).to_pandas()

In [10]:
print(f"num_recs: {len(df_feb_clean)}\nweird timestamp: {df_feb_clean['dropOff_datetime'].min()}")

num_recs: 1707649
weird timestamp: 2019-02-01 00:01:00


In [4]:
# feb.column_names
# Split the table's columns into datetime columns (name contains "datetime")
# and everything else.
dt_cols = [col for col in feb.column_names if "datetime" in col]
nondt_cols = [col for col in feb.column_names if col not in dt_cols]
print(dt_cols, nondt_cols)
# Convert each datetime column on its own with errors='coerce', so values that
# cannot be converted become NaT instead of raising.
df_feb_dts = pd.DataFrame()
for dt_col in dt_cols:
    feb_dt = feb.column(dt_col)
    df_feb_dts[dt_col] = pd.to_datetime(feb_dt, errors='coerce')

# print(feb_dt)
# dts = pd.to_datetime(feb_dt, errors='coerce')
# Convert the remaining columns normally and append the datetime columns on
# the right. This rebinds df_feb, and the datetime columns end up last.
df_feb = feb.select(nondt_cols).to_pandas()
df_feb = pd.concat([df_feb, df_feb_dts], axis=1)
df_feb.head()

['pickup_datetime', 'dropOff_datetime'] ['dispatching_base_num', 'PUlocationID', 'DOlocationID', 'SR_Flag', 'Affiliated_base_number']


Unnamed: 0,dispatching_base_num,PUlocationID,DOlocationID,SR_Flag,Affiliated_base_number,pickup_datetime,dropOff_datetime
0,B00037,264.0,265.0,,B00037,2019-02-01 00:08:44,2019-02-01 00:23:35
1,B00037,264.0,265.0,,B00037,2019-02-01 00:27:51,2019-02-01 00:32:54
2,B00037,264.0,265.0,,B00037,2019-02-01 00:18:30,2019-02-01 00:25:45
3,B00037,264.0,265.0,,B00037,2019-02-01 00:43:15,2019-02-01 00:48:29
4,B00037,264.0,265.0,,B00037,2019-02-01 00:01:45,2019-02-01 00:09:13


In [15]:
# Cast SR_Flag to Int8 and the location ID columns to Int32 (pandas nullable
# integer dtypes, so missing values stay missing).
# NOTE: df_taxi is the same object as df_feb (no copy is made), so df_feb is
# modified as well.
df_taxi = df_feb
id_cols = [name for name in df_taxi.columns if "locationID" in name]
df_taxi['SR_Flag'] = df_taxi['SR_Flag'].astype('Int8', errors='ignore')
df_taxi[id_cols] = df_taxi[id_cols].astype('Int32', errors='ignore')
df_taxi.dtypes

dispatching_base_num              object
PUlocationID                       Int32
DOlocationID                       Int32
SR_Flag                             Int8
Affiliated_base_number            object
pickup_datetime           datetime64[ns]
dropOff_datetime          datetime64[ns]
dtype: object

In [14]:
df_taxi[id_cols + ['SR_Flag']].head().astype('Int32', errors='ignore')

Unnamed: 0,PUlocationID,DOlocationID,SR_Flag
0,264,265,
1,264,265,
2,264,265,
3,264,265,
4,264,265,


In [7]:
# A Series mixing None with integers is stored as float64: the missing value
# is held as NaN, which the default integer dtype cannot represent.
foo = pd.Series([None, 1, 2])
# Inspect the inferred dtype.  (`astype()` requires a target dtype argument
# and raises TypeError when called without one.)
foo.dtype

dtype('float64')

In [12]:
path_dec = Path("../data/taxi_ingest_data/fhv/fhv_tripdata_2019-12.parquet")
dec = pq.read_table(path_dec)
dec.schema

dispatching_base_num: string
pickup_datetime: timestamp[us]
dropOff_datetime: timestamp[us]
PUlocationID: double
DOlocationID: double
SR_Flag: null
Affiliated_base_number: string
-- schema metadata --
pandas: '{"index_columns": [], "column_indexes": [], "columns": [{"name":' + 1000

`table.cast(target_schema)` lets us set the data types before uploading to GCS for use as a BigQuery external table

In [20]:
# Target schema for the FHV files: timestamps stay at microsecond resolution,
# location IDs become int16 and SR_Flag becomes int8.
fhv_schema = pa.schema(
    [
        pa.field('dispatching_base_num', pa.string()),
        pa.field('pickup_datetime', pa.timestamp('us')),
        pa.field('dropOff_datetime', pa.timestamp('us')),
        pa.field('PUlocationID', pa.int16()),
        pa.field('DOlocationID', pa.int16()),
        pa.field('SR_Flag', pa.int8()),
        pa.field('Affiliated_base_number', pa.string()),
    ]
)
# Cast the December table to the target schema and show the result.
dec_typed = dec.cast(fhv_schema)
dec_typed.schema

dispatching_base_num: string
pickup_datetime: timestamp[us]
dropOff_datetime: timestamp[us]
PUlocationID: int16
DOlocationID: int16
SR_Flag: int8
Affiliated_base_number: string

In [25]:
typed_parq_path = Path("../data/cache/fhv-2019-12.parquet")
pq.write_table(dec_typed, typed_parq_path)

In [26]:
# df_schema = {
#     pa.int16() : pd.
# }
df_dec_typed = pd.read_parquet(typed_parq_path)
df_dec_typed.dtypes

dispatching_base_num              object
pickup_datetime           datetime64[ns]
dropOff_datetime          datetime64[ns]
PUlocationID                     float64
DOlocationID                     float64
SR_Flag                          float64
Affiliated_base_number            object
dtype: object

In [27]:
dec_read = pq.read_table(typed_parq_path)
dec_read.schema

dispatching_base_num: string
pickup_datetime: timestamp[us]
dropOff_datetime: timestamp[us]
PUlocationID: int16
DOlocationID: int16
SR_Flag: int8
Affiliated_base_number: string

In [18]:
fp = Path(urls[1])
fname = fp.name
fname

'fhv_tripdata_2019-02.parquet'

In [16]:
# Build the local parquet path for one taxi type / year / month and write the
# DataFrame there.
# NOTE(review): this cell needs `Path` (pathlib), `df` and `write_local` from
# other cells; `write_local` is not defined in the cells shown here, and the
# recorded output of this cell is a NameError for `Path`.
data_dir = "../data/taxi_ingest_data"
taxi_type = "fhv"
year = 2019
month = 1
# {month:02} zero-pads the month, giving "fhv_tripdata_2019-01" here.
dataset_file = f"{taxi_type}_tripdata_{year}-{month:02}"
fpath = Path(f"{data_dir}/{taxi_type}/{dataset_file}.parquet")
local_path = write_local(df, fpath)

NameError: name 'Path' is not defined

In [10]:
from prefect import flow, task
from prefect_gcp.cloud_storage import GcsBucket

In [11]:
@task()
def upload_gcs(block_name: str, fpath: Path) -> None:
    """Send a local parquet file to the GCS bucket behind a Prefect block.

    Args:
        block_name: Name of the ``GcsBucket`` block to load.
        fpath: Local path of the file to upload.  Its last two path parts
            (parent directory and filename) form the destination path, so
            it must contain at least a parent directory component.
    """
    bucket = GcsBucket.load(block_name)
    # Destination inside the bucket is <parent dir>/<filename>, e.g.
    # fhv/fhv_tripdata_2019-01.parquet.
    parent_dir, file_name = fpath.parts[-2], fpath.parts[-1]
    bucket.upload_from_path(from_path=fpath, to_path=Path(parent_dir) / file_name)

In [12]:
@flow()
def web_gcs_parq(
    taxi_type: str, year: int, month: int, block_name: str, data_dir: str = "../data/cache"
) -> None:
    """Fetch one month of trip data (unless cached locally) and upload it to GCS.

    Args:
        taxi_type: Dataset prefix used in the file name, e.g. ``"fhv"``.
        year: Four-digit year of the dataset.
        month: Month number; it is zero-padded to two digits in the file name.
        block_name: Name of the GCS bucket block passed on to ``upload_gcs``.
        data_dir: Local directory under which ``<taxi_type>/<file>.parquet``
            is looked up and stored.
    """
    dataset_file = f"{taxi_type}_tripdata_{year}-{month:02}"
    fpath = Path(f"{data_dir}/{taxi_type}/{dataset_file}.parquet")

    # Only download when there is no local copy yet.
    if not fpath.exists():
        dataset_url = f"https://d37ci6vzurychx.cloudfront.net/trip-data/{dataset_file}.parquet"
        # NOTE(review): write_local is defined outside this cell; its return
        # value is used as the path to upload - confirm it returns that path.
        fpath = write_local(fetch(dataset_url), fpath)

    upload_gcs(block_name, fpath)

In [13]:
block_name = "ny-taxi-gcs"
web_gcs_parq(taxi_type, year, month, block_name, data_dir=data_dir)

[Completed(message=None, type=COMPLETED, result=LiteralResult(type='literal', value=None))]

In [2]:
# Parse a "start-end" month range into its two integer bounds.
mths = "1-12"
a, b = (int(part) for part in mths.split("-"))
print(a, b)

1 12


In [7]:
# Parse a month spec that is either a single month ("12") or a "start-end"
# range ("1-12") into the two bounds a and b.
mths = "12"
if '-' in mths:
    a, b = map(int, mths.split("-"))
else:
    # A single month is the range a..a.  Bind b as well so it is never left
    # undefined (or stale from an earlier cell run).
    a = b = int(mths)

In [9]:
list(range(a, a+1))

[12]

In [10]:
a = b = 2
print(a, b)

2 2


In [1]:
blobs = ['<Blob: dtc_data_lake_de-zoom-83, data/fhv/fhv_tripdata_2019-01.parquet, 1675877819813642>', '<Blob: dtc_data_lake_de-zoom-83, data/fhv/fhv_tripdata_2019-02.parquet, 1675878686204679>']
fname = "fhv_tripdata_2019-02.parquet"

In [10]:
# For each blob repr, keep the first whitespace-separated token that contains
# a '/', i.e. the object path.  The token keeps its trailing comma, so any
# later match against a file name has to be a substring test, not equality.
# NOTE: this rebinds `b`, which other cells use for a month bound.
b = [[fn for fn in blob.split() if '/' in fn][0] for blob in blobs]
b

['data/fhv/fhv_tripdata_2019-01.parquet,',
 'data/fhv/fhv_tripdata_2019-02.parquet,']

In [13]:
present = [fname in n for n in b]
present

[False, True]

In [15]:
any(present)

True