-
Notifications
You must be signed in to change notification settings - Fork 1
/
nyc-taxi.parquet.sh
27 lines (24 loc) · 1.15 KB
/
nyc-taxi.parquet.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
# The DuckDB executable must be on your environment path!
# Use DuckDB version 0.9.2 or later
# Write to a named file as portable file descriptors such as
# (/dev/stdout) appear to be unavailable in GitHub actions.
duckdb :memory: << EOF
-- Load spatial extension
INSTALL spatial; LOAD spatial;
-- Project, following the example at https://github.com/duckdb/duckdb_spatial
CREATE TEMP TABLE rides AS SELECT
pickup_datetime::TIMESTAMP AS datetime,
ST_Transform(ST_Point(pickup_latitude, pickup_longitude), 'EPSG:4326', 'ESRI:102718') AS pick,
ST_Transform(ST_Point(dropoff_latitude, dropoff_longitude), 'EPSG:4326', 'ESRI:102718') AS drop
FROM 'https://uwdata.github.io/mosaic-datasets/data/nyc-rides-2010.parquet';
-- Write output parquet file
COPY (SELECT
HOUR(datetime) + MINUTE(datetime) / 60 AS time,
ST_X(pick)::INTEGER AS px, -- extract pickup x-coord
ST_Y(pick)::INTEGER AS py, -- extract pickup y-coord
ST_X(drop)::INTEGER AS dx, -- extract dropff x-coord
ST_Y(drop)::INTEGER AS dy -- extract dropff y-coord
FROM rides) TO 'trips.parquet' WITH (FORMAT PARQUET);
EOF
cat trips.parquet >&1 # Write output to stdout
rm trips.parquet # Clean up