In [1]:
import pandas as pd
import h3
import json
from tqdm import tqdm

def timestamp_to_json(obj):
    if isinstance(obj, pd.Timestamp):
        return obj.isoformat()  # 转换为 ISO 格式的字符串
    raise TypeError(f"Type {type(obj)} not serializable")

def get_epoch(days, hours, minutes):
    return ((days-22) * 24 * 60 + hours * 60 + minutes) // 5

# 读取 Parquet 文件
df = pd.read_csv('yellow_tripdata_2016-02.csv')
# df = pd.read_parquet('green_tripdata_2016-02.parquet')

# 确保日期列是 datetime 类型，进行转换
df['tpep_pickup_datetime'] = pd.to_datetime(df['tpep_pickup_datetime'])
df['tpep_dropoff_datetime'] = pd.to_datetime(df['tpep_dropoff_datetime'])
pickup_day = df['tpep_pickup_datetime'].dt.day.astype(int)
dropoff_day = df['tpep_dropoff_datetime'].dt.day.astype(int)
# 设置日期范围
start_date = 22
end_date = 28

# 过滤日期范围，假设你想筛选pickup和dropoff时间都在2016-02-22到2016-02-28之间
filtered_df = df[
    (pickup_day >= start_date) &
    (pickup_day <= end_date) &
    (dropoff_day >= start_date) &
    (dropoff_day <= end_date)
]


"""
Index(['VendorID', 'lpep_pickup_datetime', 'lpep_dropoff_datetime',
       'store_and_fwd_flag', 'RatecodeID', 'PULocationID', 'DOLocationID',
       'passenger_count', 'trip_distance', 'fare_amount', 'extra', 'mta_tax',
       'tip_amount', 'tolls_amount', 'ehail_fee', 'improvement_surcharge',
       'total_amount', 'payment_type', 'trip_type', 'congestion_surcharge',
       'pickup_epoch', 'dropoff_epoch'],
      dtype='object')
372526

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'pickup_longitude',
       'pickup_latitude', 'RatecodeID', 'store_and_fwd_flag',
       'dropoff_longitude', 'dropoff_latitude', 'payment_type', 'fare_amount',
       'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
       'improvement_surcharge', 'total_amount'],
      dtype='object')
2798992
"""


"\nIndex(['VendorID', 'lpep_pickup_datetime', 'lpep_dropoff_datetime',\n       'store_and_fwd_flag', 'RatecodeID', 'PULocationID', 'DOLocationID',\n       'passenger_count', 'trip_distance', 'fare_amount', 'extra', 'mta_tax',\n       'tip_amount', 'tolls_amount', 'ehail_fee', 'improvement_surcharge',\n       'total_amount', 'payment_type', 'trip_type', 'congestion_surcharge',\n       'pickup_epoch', 'dropoff_epoch'],\n      dtype='object')\n372526\n\nIndex(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',\n       'passenger_count', 'trip_distance', 'pickup_longitude',\n       'pickup_latitude', 'RatecodeID', 'store_and_fwd_flag',\n       'dropoff_longitude', 'dropoff_latitude', 'payment_type', 'fare_amount',\n       'extra', 'mta_tax', 'tip_amount', 'tolls_amount',\n       'improvement_surcharge', 'total_amount'],\n      dtype='object')\n2798992\n"

In [2]:
nyc_center_long = -74.0060
nyc_center_lat = 40.7128
center_cell = h3.latlng_to_cell(nyc_center_lat, nyc_center_long, 8)
hexagons = h3.grid_disk(center_cell, 18)
selected_hexagons = list(hexagons)[:370]

In [3]:
def convert_to_h3_index(lng, lat, resolution):
    # 使用h3.latlng_to_cell函数直接将经纬度转换为H3索引
    h3_index = h3.latlng_to_cell(lat, lng, resolution)
    #return h3_index
    
    return h3_index if h3_index in selected_hexagons else -1

def filter_out_of_bounds(data):
    # 纽约市的边界坐标
    bounds = {
        'min_longitude': -74.3280,
        'max_longitude': -73.6317,
        'min_latitude': 40.5503,
        'max_latitude': 40.8445
    }
    
    # 将边长转换为单位
    res = 8
    
    
    # 过滤掉超出边界的行程
    data['pickup_h3_index'] = data.apply(lambda row: convert_to_h3_index(row['pickup_longitude'], row['pickup_latitude'], res), axis=1)
    data['dropoff_h3_index'] = data.apply(lambda row: convert_to_h3_index(row['dropoff_longitude'], row['dropoff_latitude'], res), axis=1)
    
    data = data[data["pickup_h3_index"] != -1]
    data = data[data["dropoff_h3_index"] != -1]
    data = data[data["pickup_h3_index"] != data["dropoff_h3_index"]]
    
    data = data[(data['pickup_longitude'] >= bounds['min_longitude']) & 
               (data['pickup_longitude'] <= bounds['max_longitude']) &
               (data['pickup_latitude'] >= bounds['min_latitude']) & 
               (data['pickup_latitude'] <= bounds['max_latitude'])]
    data = data[(data['dropoff_longitude'] >= bounds['min_longitude']) & 
               (data['dropoff_longitude'] <= bounds['max_longitude']) &
               (data['dropoff_latitude'] >= bounds['min_latitude']) & 
               (data['dropoff_latitude'] <= bounds['max_latitude'])]
    
    return data
filtered_df2 = filter_out_of_bounds(filtered_df)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['pickup_h3_index'] = data.apply(lambda row: convert_to_h3_index(row['pickup_longitude'], row['pickup_latitude'], res), axis=1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['dropoff_h3_index'] = data.apply(lambda row: convert_to_h3_index(row['dropoff_longitude'], row['dropoff_latitude'], res), axis=1)


In [4]:
print(len(filtered_df2), len(set(filtered_df2['pickup_h3_index']).union(set(filtered_df2['dropoff_h3_index']))))

2048640 292


In [5]:
# 生成request
import joblib
requests = []
for i in tqdm(range(len(filtered_df2))):
    record = filtered_df2.iloc[i]
    days = record['tpep_pickup_datetime'].day
    hours = record['tpep_pickup_datetime'].hour
    minutes = record['tpep_pickup_datetime'].minute
    start_epoch = get_epoch(days, hours, minutes)
    
    days = record['tpep_dropoff_datetime'].day
    hours = record['tpep_dropoff_datetime'].hour
    minutes = record['tpep_dropoff_datetime'].minute
    end_epoch = get_epoch(days, hours, minutes)
    requests.append({
        'id': i+1,
        'pickup_zone': record["pickup_h3_index"],
        'dropoff_zone': record["dropoff_h3_index"],
        'pickup_time': start_epoch,
        'dropoff_time': end_epoch
    })
joblib.dump(requests, "requests.pth")

100%|███████████████████████████████████████████████████████████████████████| 2048640/2048640 [05:43<00:00, 5960.38it/s]


['requests.pth']

In [12]:
# 生成zone travel times
import osmnx as ox
from datetime import timedelta
zone_travel_times = {}
G = ox.graph_from_place('New York City, New York, USA', network_type='drive')



In [None]:
for zone1 in tqdm(selected_hexagons):
    for zone2 in selected_hexagons:
        if zone1 == zone2:
            zone_travel_times[(zone1, zone2)] = 0.
        else:
            lat1, lng1 = h3.cell_to_latlng(zone1)
            lat2, lng2 = h3.cell_to_latlng(zone2)
            point1 = (lat1, lng1)
            point2 = (lat2, lng2)
            orig_node = ox.distance.nearest_nodes(G, lng1, lat1)
            dest_node = ox.distance.nearest_nodes(G, lng2, lat2)
            route = ox.shortest_path(G, orig_node, dest_node)
            distance = sum(ox.utils_graph.routing.route_to_gdf(G, route)["length"])
            
            distance_t = distance / 8.5 # 秒
            td = timedelta(seconds=distance_t)
            
            days = td.days + 22
            seconds_remaining = td.seconds 

            hours = seconds_remaining // 3600
            minutes = (seconds_remaining % 3600) // 60

            spend_epoch = get_epoch(days, hours, minutes)

            zone_travel_times[(zone1, zone2)] = spend_epoch
joblib.dump("zone_travel_times.pth", zone_travel_times)

  0%|                                                                                           | 0/370 [00:00<?, ?it/s]