In [1]:
import os
from math import degrees, radians

import numpy as np
import pandas as pd
import pyproj
import requests
import shapefile
from sklearn.cluster import KMeans

In [2]:
# Load the raw trip records. The path has no file extension, so it is
# presumably a directory of parquet files -- confirm against the data layout.
df = pd.read_parquet(path="nyc-dataset/data/trips")

In [3]:
# Draw a 10% random subset of the trips; the fixed seed makes the draw repeatable.
sampled_df = df.sample(random_state=42, frac=0.1)
print(sampled_df.shape)

# Persist the subset as a single parquet file for later reuse.
sampled_df.to_parquet(path="nyc-dataset/sampled_yellow_tripdata_2022.parquet")

(3625655, 19)


In [4]:
# Peek at three random trips, one trip per column so every field stays readable.
# The draw is unseeded, so the rows shown differ from run to run.
df.sample(n=3).T

Unnamed: 0,11677130,15963909,23420511
VendorID,2,2,2
tpep_pickup_datetime,2022-04-23 15:00:11,2022-05-30 12:19:26,2022-08-05 08:33:50
tpep_dropoff_datetime,2022-04-23 15:00:21,2022-05-30 12:24:31,2022-08-05 08:53:54
passenger_count,1.0,1.0,1.0
trip_distance,0.1,0.89,5.32
RatecodeID,1.0,1.0,1.0
store_and_fwd_flag,N,N,N
PULocationID,132,141,52
DOLocationID,132,141,68
payment_type,3,1,1


In [5]:
# References
#   LocationID lookup: https://data.cityofnewyork.us/Transportation/NYC-Taxi-Zones/d3c5-ddgc
#   Shape file source: https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page
#   Related analysis:  https://chih-ling-hsu.github.io/2018/05/14/NYC

# Distinct pickup zone IDs that occur in the trip data.
df.loc[:, "PULocationID"].unique()

array([142, 236, 166, 114,  68, 138, 233, 238, 141, 234, 246,  43, 239,
       148, 237,   7, 107, 263, 161,  79, 170, 162,   4, 262, 249, 132,
       137,  90,  45,  70,  48, 211, 113, 164,  50, 265,  88, 186, 144,
       224,  95,  24, 158,  74, 140, 163,  75, 229, 209, 264, 219, 232,
       151, 256, 231,  87, 116,  65, 188,  42,  13,  33,  41, 220, 146,
       100, 261, 125, 152, 127, 143, 243,  66, 260, 181, 195, 112, 129,
        10, 226,  25, 255, 244,  12, 168, 230, 189,  97,  82, 190,  52,
        49,  61, 145, 223, 202, 228, 179,  36,  40, 159,  80,  17, 167,
       208,  69, 254, 213, 193,  77,  37, 225, 171, 106,  93, 247, 165,
       117, 212, 250,  14, 157, 198,  83, 169, 136, 217, 173, 123, 160,
         3,   1,  76,  89,  22,  39,  86,  18,  28, 252, 227, 135,  91,
       124,  38, 216, 119, 205,  81, 122, 194,  92, 133,  35, 134, 177,
       215, 147, 248,  63,  47, 210, 197, 201, 131, 200,  56,   5, 130,
       191,  85, 218,  51,  46, 149, 241, 139, 235,  71, 102,  6

In [6]:
def get_lat_lon(sf):
    """Return one representative longitude/latitude point per shapefile record.

    For every shape the midpoint of its bounding box is taken (this is not the
    true polygon centroid) and converted from EPSG:2263 to EPSG:4326
    (longitude/latitude).

    Parameters
    ----------
    sf : shapefile.Reader
        Open reader whose records contain a ``LocationID`` field and whose
        shape coordinates are expressed in EPSG:2263.

    Returns
    -------
    pandas.DataFrame
        Columns ``LocationID``, ``longitude`` and ``latitude``; one row per
        shape record, in the order the reader yields them.

    Raises
    ------
    ValueError
        If the reader has no field named ``LocationID``.
    """
    # Resolve the field position from the reader itself rather than from a
    # notebook-level lookup table, so the function works with any reader and
    # does not depend on cell execution order. The first entry of sf.fields is
    # skipped (pyshp lists a deletion-flag pseudo-field there, which has no
    # slot in the record values).
    field_names = [field[0] for field in sf.fields[1:]]
    loc_id_idx = field_names.index("LocationID")

    # always_xy=True fixes the axis order to (x, y) in and (lon, lat) out.
    transformer = pyproj.Transformer.from_crs(2263, 4326, always_xy=True)

    content = []
    for sr in sf.shapeRecords():
        min_x, min_y, max_x, max_y = sr.shape.bbox
        lon, lat = transformer.transform(
            (min_x + max_x) / 2, (min_y + max_y) / 2
        )
        content.append((sr.record[loc_id_idx], lon, lat))

    return pd.DataFrame(
        content, columns=["LocationID", "longitude", "latitude"]
    )


# Read the taxi-zone shapefile and collect each record's attributes by field name.
sf = shapefile.Reader("nyc-dataset/data/taxi_zones/taxi_zones.shp")
fields_name = [field[0] for field in sf.fields[1:]]
# Field name -> position inside a record. Kept at notebook level because other
# code (e.g. get_lat_lon) may look fields up through it.
shp_dic = dict(zip(fields_name, list(range(len(fields_name)))))
attributes = sf.records()
shp_attr = [dict(zip(fields_name, attr)) for attr in attributes]

# Attributes and coordinates both come from the same reader in the same record
# order, so they are combined by position. A key join on LocationID would
# multiply rows whenever an ID repeats in the shapefile.
# NOTE(review): the published TLC shapefile is reported to contain a few
# repeated LocationIDs -- confirm, and de-duplicate before joining onto trips.
zone_attrs = pd.DataFrame(shp_attr)
zone_coords = get_lat_lon(sf)
assert len(zone_attrs) == len(zone_coords), "records and shapes are misaligned"
df_loc = pd.concat(
    [zone_attrs, zone_coords[["longitude", "latitude"]]], axis=1
)

# Perform K-means clustering on the zone coordinates (expressed in radians).
df_loc["latitude_rad"] = np.radians(df_loc["latitude"])
df_loc["longitude_rad"] = np.radians(df_loc["longitude"])
coordinates = df_loc[["latitude_rad", "longitude_rad"]]
kmeans = KMeans(n_clusters=10, random_state=42, n_init=100)
kmeans.fit(coordinates)
df_loc["cluster"] = kmeans.labels_

# Calculate GPS coordinates of cluster centroids (back from radians to degrees).
cluster_centroids_rad = kmeans.cluster_centers_
cluster_centroids_deg = np.degrees(cluster_centroids_rad)
cluster_centroids_df = pd.DataFrame(
    cluster_centroids_deg, columns=["centroid_latitude", "centroid_longitude"]
)
cluster_centroids_df.index.name = "cluster"

# Attach each zone's cluster centroid; the centroid frame's index is the
# cluster label, so this join cannot add rows.
df_loc = df_loc.join(cluster_centroids_df, on="cluster")

df_loc.head()

Unnamed: 0,OBJECTID,Shape_Leng,Shape_Area,zone,LocationID,borough,longitude,latitude,latitude_rad,longitude_rad,cluster,centroid_latitude,centroid_longitude
0,1,0.116357,0.000782,Newark Airport,1,EWR,-74.171526,40.689488,0.710166,-1.294537,4,40.590677,-74.139021
1,2,0.43347,0.004866,Jamaica Bay,2,Queens,-73.82249,40.610791,0.708792,-1.288446,2,40.643594,-73.803186
2,3,0.084341,0.000314,Allerton/Pelham Gardens,3,Bronx,-73.844947,40.865745,0.713242,-1.288837,0,40.855965,-73.846324
3,4,0.043567,0.000112,Alphabet City,4,Manhattan,-73.977726,40.724137,0.71077,-1.291155,1,40.701514,-74.001664
4,5,0.092146,0.000498,Arden Heights,5,Staten Island,-74.187537,40.550665,0.707743,-1.294817,4,40.590677,-74.139021


In [7]:
def fetch_weather_data(lat, lon, start_date, end_date, timeout=30):
    """Fetch daily historical weather from the Open-Meteo archive API.

    Parameters
    ----------
    lat, lon : float
        Coordinates sent as the ``latitude`` / ``longitude`` query parameters.
    start_date, end_date : str
        Date range sent as ``start_date`` / ``end_date`` (the notebook passes
        ``YYYY-MM-DD`` strings).
    timeout : float, optional
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook. Defaults to 30.

    Returns
    -------
    dict or None
        The decoded JSON body on HTTP 200. ``None`` when the request fails
        (network error, timeout, or any other status code); a message is
        printed in that case.
    """
    # NOTE(review): daily values are requested with timezone=GMT, so each "day"
    # follows GMT rather than New York local time -- confirm this is intended.
    params = {
        "latitude": lat,
        "longitude": lon,
        "start_date": start_date,
        "end_date": end_date,
        "daily": (
            "weathercode,temperature_2m_max,temperature_2m_min,"
            "temperature_2m_mean,precipitation_sum,rain_sum"
        ),
        "timezone": "GMT",
    }

    try:
        # Let requests build and encode the query string.
        response = requests.get(
            "https://archive-api.open-meteo.com/v1/archive",
            params=params,
            timeout=timeout,
        )
    except requests.RequestException as exc:
        # Treat transport failures like HTTP failures: report and return None.
        print(
            f"Failed to fetch weather data for latitude={lat}, longitude={lon}"
            f" ({exc})"
        )
        return None

    if response.status_code == 200:
        return response.json()

    print(
        f"Failed to fetch weather data for latitude={lat}, longitude={lon}"
        f" (HTTP {response.status_code})"
    )
    return None


# Download daily weather for every cluster centroid, one CSV file per cluster.
output_dir = "nyc-dataset/data/weather"
os.makedirs(output_dir, exist_ok=True)

for cluster, cluster_zones in df_loc.groupby("cluster"):
    # Only the first row of each group is read for the centroid coordinates.
    centroid_lat = cluster_zones["centroid_latitude"].iloc[0]
    centroid_lon = cluster_zones["centroid_longitude"].iloc[0]
    filename = f"{output_dir}/cluster_{cluster}.csv"

    weather_data = fetch_weather_data(
        centroid_lat, centroid_lon, "2021-01-01", "2023-01-01"
    )

    # Nothing to save when the fetch failed; report it and move on.
    if weather_data is None:
        print(f"No weather data available for cluster {cluster}")
        continue

    # The "daily" entry of the response becomes the table written to disk.
    df_weather = pd.DataFrame(weather_data["daily"])
    df_weather.to_csv(filename, index=False)
    print(f"Weather data for cluster {cluster} saved to {filename}")

Weather data for cluster 0 saved to nyc-dataset/data/weather/cluster_0.csv
Weather data for cluster 1 saved to nyc-dataset/data/weather/cluster_1.csv
Weather data for cluster 2 saved to nyc-dataset/data/weather/cluster_2.csv
Weather data for cluster 3 saved to nyc-dataset/data/weather/cluster_3.csv
Weather data for cluster 4 saved to nyc-dataset/data/weather/cluster_4.csv
Weather data for cluster 5 saved to nyc-dataset/data/weather/cluster_5.csv
Weather data for cluster 6 saved to nyc-dataset/data/weather/cluster_6.csv
Weather data for cluster 7 saved to nyc-dataset/data/weather/cluster_7.csv
Weather data for cluster 8 saved to nyc-dataset/data/weather/cluster_8.csv
Weather data for cluster 9 saved to nyc-dataset/data/weather/cluster_9.csv
