Possible demand features:
- hour of the day
- day of the week
- month
- holiday indicator

- temperature
- precipitation

- pick-up and drop-off locations


In [65]:
import pandas as pd
from h3 import h3
from dask import dataframe
import geopandas as gpd

In [66]:
# load the preprocessed taxi trips and the prepared hourly weather data
# as dask dataframes (both files are read as parquet)
ddf = dataframe.read_parquet("data/taxi_data_preprocessed.gzip")
ddf_weather = dataframe.read_parquet("data/prepared/weather_data_hourly_prepared.gzip")

In [67]:
# redistribute both dataframes into 20 partitions each
ddf = ddf.repartition(npartitions=20)
ddf_weather = ddf_weather.repartition(npartitions=20)

## Calculate different Hexagons

In [4]:
# H3 resolutions for which an "h3_<resolution>" column is computed
H3_HEXAGON_RESOLUTIONS = [1, 2, 3, 4, 5, 6, 9, 12]

In [5]:
# collect every distinct pickup centroid once (computed eagerly) and keep it
# as a one-column dataframe
df_locations = (
    ddf["pickup_centroid_location"]
    .unique()
    .compute()
    .to_frame()
)

In [6]:
# convert to geodataframe: parse the WKT strings into geometries and use that
# column as the active geometry (crs=4326 is passed as the coordinate system)
df_locations["pickup_centroid_location"] = gpd.GeoSeries.from_wkt(df_locations["pickup_centroid_location"])
df_geo = gpd.GeoDataFrame(df_locations, geometry='pickup_centroid_location', crs=4326)

In [8]:
# compute the H3 cell of every pickup centroid, one column per resolution
# (geo_to_h3 takes latitude first, i.e. the point's y, then longitude, its x)
for resolution in H3_HEXAGON_RESOLUTIONS:
    df_geo[f"h3_{resolution}"] = [
        h3.geo_to_h3(point.y, point.x, resolution)
        for point in df_geo["pickup_centroid_location"]
    ]

In [9]:
# save the hexagons as a CSV file; the index is written as well and shows up
# as the "Unnamed: 0" column when the file is read back
df_geo.to_csv("hexagons.csv")

## Add and Adjust Data Columns

In [68]:
# change data type: parse the trip start and end timestamps into datetimes
ddf["trip_start_timestamp"] = dataframe.to_datetime(ddf.trip_start_timestamp)
ddf["trip_end_timestamp"] = dataframe.to_datetime(ddf.trip_end_timestamp)

In [69]:
# add columns to taxi df: hour of day and calendar date of the trip start;
# both serve as keys when the weather data is joined
ddf["hour"] = ddf.trip_start_timestamp.dt.hour
ddf["date"] = ddf.trip_start_timestamp.dt.date

In [70]:
# add columns to weather df: the same hour/date pair, derived from the
# weather timestamp, so that both frames can be joined on ["date", "hour"]
ddf_weather["hour"] = ddf_weather.time.dt.hour
ddf_weather["date"] = ddf_weather.time.dt.date

In [71]:
# create timebins
def create_timebins(ddf, steps):
    """Add one categorical ``time_bin_<step>`` column per requested bin width.

    The ``hour`` column of ``ddf`` is cut into left-closed intervals
    ``[0, step), [step, 2 * step), ...`` that together cover the whole day;
    the intervals are labelled ``0, 1, 2, ...`` in chronological order.

    Args:
        ddf: dask dataframe with an integer ``hour`` column (0-23). The new
            columns are assigned onto this object.
        steps: iterable of bin widths in hours (positive integers).

    Returns:
        ``ddf`` with the additional ``time_bin_<step>`` columns.

    Raises:
        ValueError: if a step is smaller than 1.
    """
    for step in steps:
        if step < 1:
            raise ValueError(f"time bin width must be a positive number of hours, got {step}")
        # left edges of the bins; the closing edge is added separately
        bins = list(range(0, 24, step))
        # always close the last bin at 24 so that widths which do not divide
        # 24 (e.g. 5) still cover the late hours instead of turning them into NaN
        bins.append(24)
        labels = range(len(bins) - 1)
        ddf[f"time_bin_{step}"] = ddf["hour"].map_partitions(
            pd.cut, bins=bins, labels=labels, right=False, include_lowest=True
        )
    return ddf

In [72]:
# create timebin columns time_bin_1, time_bin_2, time_bin_6 and time_bin_24
# (the number is the bin width in hours)
ddf = create_timebins(ddf, [1,2,6,24])

In [73]:
# add weekday and month columns, both derived from the trip start timestamp
ddf["weekday"] = ddf.trip_start_timestamp.dt.weekday
ddf["month"] = ddf.trip_start_timestamp.dt.month

In [74]:
# keep only the temporal features, the join keys and the pickup location columns
ddf_features = ddf[
    [
        "hour",
        "weekday",
        "time_bin_1",
        "time_bin_2",
        "time_bin_6",
        "time_bin_24",
        "month",
        "date",
        "pickup_census_tract",
        "pickup_centroid_location",
    ]
]

In [75]:
# join dfs: attach the weather of the matching date and hour to every trip;
# trips without a matching weather row are dropped (inner join)
ddf_features = ddf_features.merge(ddf_weather, on=["date", "hour"], how="inner")

### Data grouped by census tract

#### Group using different timebins

In [18]:
def group_by_different_timebins_to_csv(ddf, time_steps, location_dimension):
    """Aggregate trips per date, time bin and location; write one CSV per bin width.

    For every ``step`` in ``time_steps`` the frame is grouped by ``date``,
    ``time_bin_<step>`` and ``location_dimension``. The number of rows per
    group is stored as ``ntrips``; the weather, weekday and month columns are
    averaged. Rows containing missing values are dropped before the result is
    written to ``<location_dimension>_time_bin_<step>.csv``.

    Args:
        ddf: dask dataframe holding the trip features joined with the weather.
        time_steps: iterable of bin widths; a ``time_bin_<step>`` column must
            exist for each of them.
        location_dimension: name of the column describing the pickup location.

    Returns:
        None. The results are only written to disk.
    """
    # columns that are averaged within each group
    averaged_columns = [
        'relativehumidity_2m (%)',
        "temperature_2m (°C)",
        "apparent_temperature (°C)",
        "precipitation (mm)",
        "cloudcover (%)",
        "windspeed_10m (km/h)",
        "weekday",
        "month",
    ]
    # the group size of "time" is the number of trips in the group
    aggregations = {"time": "size"}
    aggregations.update({column: "mean" for column in averaged_columns})

    for step in time_steps:
        group_keys = ["date", f"time_bin_{step}", location_dimension]
        grouped = ddf.groupby(by=group_keys).agg(aggregations)
        grouped = grouped.rename(columns={"time": "ntrips"}).reset_index()
        # materialise as pandas before cleaning up and exporting
        result = grouped.compute().dropna()
        result.to_csv(f"{location_dimension}_time_bin_{step}.csv")


In [191]:
# group data by different timebins and census tracts; writes
# pickup_census_tract_time_bin_<step>.csv for the bin widths 1, 2, 6 and 24
group_by_different_timebins_to_csv(ddf_features, [1,2,6,24], "pickup_census_tract")

### Data grouped by H3

In [76]:
# load hexagons; "Unnamed: 0" is the index column that was written along with
# the data when hexagons.csv was saved and is not needed
ddf_h3 = dataframe.read_csv("hexagons.csv").drop(columns="Unnamed: 0")

In [77]:
# show the structure (column names and dtypes) of the hexagon lookup table
ddf_h3

Unnamed: 0_level_0,pickup_centroid_location,h3_1,h3_2,h3_4,h3_5
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
,object,object,object,object,object
,...,...,...,...,...


In [78]:
# join hexagons: attach the H3 columns to every trip via its pickup centroid
# NOTE(review): the key in hexagons.csv went through a WKT -> geometry -> CSV
# round trip; confirm it still matches the original strings exactly, because
# the merge (inner by default) would silently drop non-matching trips
ddf_features = ddf_features.merge(ddf_h3, on="pickup_centroid_location")

In [57]:
# group by different timebins and hexagon resolutions and save as csv
# NOTE(review): this needs an "h3_<resolution>" column for every entry of
# H3_HEXAGON_RESOLUTIONS; the frame displayed above only shows h3_1, h3_2,
# h3_4 and h3_5 - confirm hexagons.csv was regenerated with the current list
for resolution in H3_HEXAGON_RESOLUTIONS:    
    
    # group data by time and location dimension; one CSV per bin width is
    # written as h3_<resolution>_time_bin_<step>.csv
    group_by_different_timebins_to_csv(ddf_features, [1,2,6,24], f"h3_{resolution}")

### Allow trip demand to be zero

In [79]:
# show the structure (column names and dtypes) of the joined feature table
ddf_features

Unnamed: 0_level_0,hour,weekday,time_bin_1,time_bin_2,time_bin_6,time_bin_24,month,date,pickup_census_tract,pickup_centroid_location,time,temperature_2m (°C),relativehumidity_2m (%),apparent_temperature (°C),precipitation (mm),cloudcover (%),windspeed_10m (km/h),h3_1,h3_2,h3_4,h3_5
npartitions=20,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
,int64,int64,category[known],category[known],category[known],category[known],int64,object,int64,object,datetime64[ns],float64,int64,float64,float64,int64,float64,object,object,object,object
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...


In [80]:
# select columns: keep only what is needed for the hourly demand per
# resolution-5 hexagon
ddf_features = ddf_features[["time", "hour", "weekday", "time_bin_1", "month", "date", "h3_5"]]

In [92]:
# count the trips per date, hourly bin and resolution-5 hexagon
# NOTE(review): time_bin_1 is categorical, which presumably is what lets
# combinations without any trip appear with ntrips = 0 - confirm
df_trip_demand = (
    ddf_features.groupby(by=["date", "time_bin_1", "h3_5"])["time"]
    .size()
    .to_frame()
    .rename(columns={"time": "ntrips"})
    .reset_index()
    .compute()
)

In [103]:
# add columns: weekday and month were not part of the grouping, so they are
# derived again from the date (converted to datetime first)
df_trip_demand["date"] = pd.to_datetime(df_trip_demand.date)
df_trip_demand["weekday"] = df_trip_demand.date.dt.weekday
df_trip_demand["month"] = df_trip_demand.date.dt.month

In [105]:
# materialise the weather data so it can be merged with the computed demand table
df_weather = ddf_weather.compute()

In [108]:
# convert the date to datetime so that it has the same type as
# df_trip_demand.date, which is used as a merge key
df_weather["date"] = pd.to_datetime(df_weather.date)

In [109]:
# join weather data: the hourly bin of the demand table is matched against the
# weather hour of the same date; rows without a match are dropped
df = df_trip_demand.merge(
    df_weather,
    left_on=["date", "time_bin_1"],
    right_on=["date", "hour"],
    how="inner",
)

In [111]:
# write the hourly demand per resolution-5 hexagon, joined with the weather, to CSV
df.to_csv("h3_5_time_bin_1_zero.csv")