In [None]:
import glob
import os
import random
import time
from math import degrees, radians

import numpy as np
import pandas as pd
import pyproj
import requests
import shapefile
from sklearn.cluster import KMeans

In [None]:
# Load the raw trip records from the local parquet dataset.
df = pd.read_parquet(path="nyc-dataset/data/trips")

In [None]:
# Parse pickup timestamps, keep only trips picked up in 2021-2022, and
# bucket every trip into an hourly slot.
df["tpep_pickup_datetime"] = pd.to_datetime(df["tpep_pickup_datetime"])

# .copy() detaches the filtered rows from the original frame so that the
# column assignment below does not write into a slice of another DataFrame
# (which triggers SettingWithCopyWarning on pandas < 3).
df = df[df["tpep_pickup_datetime"].dt.year.between(2021, 2022)].copy()

# NOTE(review): round("1h") maps pickups at minute >= 30 to the *next* hour,
# so late-evening trips can land on the following calendar day. Use
# .dt.floor("1h") instead if "hour in which the pickup started" is intended.
df["tpep_pickup_hour"] = df["tpep_pickup_datetime"].dt.round("1h")

In [None]:
# Collapse individual trips into one row per (pickup hour, pickup zone) with
# the number of trips, attach the calendar date of each hour bucket as
# "time", and expose the zone id under the name "location_id".
df = (
    df.groupby(["tpep_pickup_hour", "PULocationID"])
    .size()
    .reset_index(name="trip_count")
    .assign(time=lambda counts: counts["tpep_pickup_hour"].dt.date)
    .rename(columns={"PULocationID": "location_id"})
)

In [None]:
# LocationID: https://data.cityofnewyork.us/Transportation/NYC-Taxi-Zones/d3c5-ddgc
# Shape file: https://www.nyc.gov/site/tlc/about/tlc-trip-record-data.page
# Analysis: https://chih-ling-hsu.github.io/2018/05/14/NYC

# Number of distinct pickup zones present in the aggregated trips.
df["location_id"].nunique(dropna=False)

In [None]:
def get_lat_lon(sf):
    """Return one approximate longitude/latitude point per shape record.

    For every shape record in ``sf`` the midpoint of the shape's bounding
    box is computed and transformed from EPSG:2263 to EPSG:4326. Note that
    this is the bounding-box center, not the polygon centroid.

    Parameters
    ----------
    sf : shapefile.Reader
        Open shapefile reader whose records contain a ``LocationID`` field
        and whose coordinates are expressed in EPSG:2263.

    Returns
    -------
    pandas.DataFrame
        One row per shape record with the columns ``LocationID``,
        ``longitude`` and ``latitude``.

    Raises
    ------
    ValueError
        If the shapefile has no ``LocationID`` field.
    """
    # Resolve the LocationID position from the reader itself instead of
    # relying on a module-level lookup table, so the function works with any
    # reader passed in. The first entry of sf.fields is skipped because it
    # has no counterpart in the record values (in pyshp it is the
    # DeletionFlag marker).
    field_names = [field[0] for field in sf.fields[1:]]
    loc_id_index = field_names.index("LocationID")

    # always_xy=True fixes the axis order to (x, y) in and (lon, lat) out.
    transformer = pyproj.Transformer.from_crs(2263, 4326, always_xy=True)

    content = []
    for sr in sf.shapeRecords():
        bbox = sr.shape.bbox
        loc_id = sr.record[loc_id_index]

        # Midpoint of the bounding box in the source projection.
        x = (bbox[0] + bbox[2]) / 2
        y = (bbox[1] + bbox[3]) / 2
        lon, lat = transformer.transform(x, y)

        content.append((loc_id, lon, lat))

    return pd.DataFrame(
        content, columns=["LocationID", "longitude", "latitude"]
    )


# Load the taxi-zone shapefile and build a per-zone table holding the
# shapefile attributes plus an approximate longitude/latitude.
sf = shapefile.Reader("nyc-dataset/data/taxi_zones/taxi_zones.shp")

# The leading entry of sf.fields is skipped so that names line up with the
# values stored in each record.
fields_name = [descriptor[0] for descriptor in sf.fields[1:]]
shp_dic = {name: position for position, name in enumerate(fields_name)}

# One dict per record, keyed by field name.
attributes = sf.records()
shp_attr = [dict(zip(fields_name, row)) for row in attributes]

# Attach the coordinates to the attribute table via LocationID.
zone_coords = get_lat_lon(sf).set_index("LocationID")
df_loc = pd.DataFrame(shp_attr).join(zone_coords, on="LocationID")


df_loc.head()

In [None]:
def fetch_weather_data(lat, lon, start_date, end_date, timeout=30):
    """Download daily historical weather for one point from Open-Meteo.

    Requests the daily variables weathercode, max/min/mean 2 m temperature,
    precipitation sum and rain sum from the Open-Meteo archive API. The
    request uses ``timezone=GMT``, so daily values are aggregated over GMT
    days.

    Parameters
    ----------
    lat, lon : float
        Latitude and longitude of the point, interpolated into the URL.
    start_date, end_date : str
        Date range bounds, interpolated into the URL as given
        (the notebook passes ``YYYY-MM-DD`` strings).
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        ``requests`` may block forever on an unresponsive connection.

    Returns
    -------
    dict or None
        The decoded JSON payload on HTTP 200; ``None`` (after printing a
        message) on any other status code or on a network-level failure.
    """
    url = (
        "https://archive-api.open-meteo.com/v1/archive"
        f"?latitude={lat}&longitude={lon}"
        f"&start_date={start_date}&end_date={end_date}"
        "&daily=weathercode,temperature_2m_max,temperature_2m_min,"
        "temperature_2m_mean,precipitation_sum,rain_sum"
        "&timezone=GMT"
    )

    try:
        response = requests.get(url, timeout=timeout)
    except requests.RequestException as exc:
        # Treat connection errors/timeouts like a failed HTTP response so a
        # single bad request does not abort the caller's download loop.
        print(
            f"Failed to fetch weather data for latitude={lat}, longitude={lon}"
            f": {exc}"
        )
        return None

    if response.status_code == 200:
        return response.json()

    print(
        f"Failed to fetch weather data for latitude={lat}, longitude={lon}"
    )
    return None


# Fetch weather data once per taxi zone (LocationID) and cache each result
# as a CSV file, so re-running this cell only downloads what is missing.
output_dir = "nyc-dataset/data/weather"
os.makedirs(output_dir, exist_ok=True)

for cluster, centroid in df_loc.groupby("LocationID"):
    filename = f"{output_dir}/cluster_{cluster}.csv"

    # Already downloaded on a previous run: nothing to do for this zone.
    if os.path.exists(filename):
        continue

    # A LocationID may map to several rows; the first row's point is used.
    centroid_lat = centroid["latitude"].values[0]
    centroid_lon = centroid["longitude"].values[0]

    weather_data = fetch_weather_data(
        centroid_lat, centroid_lon, "2021-01-01", "2023-01-01"
    )

    if weather_data is not None:
        df_weather = pd.DataFrame(weather_data["daily"])
        df_weather.to_csv(filename, index=False)
    else:
        print(f"No weather data available for cluster {cluster}")

    # Throttle after every request, successful or not: a failure is often
    # caused by rate limiting, and firing the next request immediately would
    # only make that worse.
    time.sleep(random.randint(1, 5))

In [None]:
# Load every cached per-zone weather CSV and stack them into one table,
# tagging each row with the zone id encoded in the file name.
dfs = []
# sorted() makes the row order independent of the file system's listing order.
for filename in sorted(glob.glob(f"{output_dir}/*.csv")):
    df_weather = pd.read_csv(filename)
    df_weather["time"] = pd.to_datetime(df_weather["time"]).dt.date

    # Files are named "cluster_<id>.csv". Parse the id from the base name
    # only, so an underscore or dot in a directory name cannot break it.
    stem = os.path.splitext(os.path.basename(filename))[0]
    df_weather["location_id"] = int(stem.rsplit("_", 1)[-1])

    dfs.append(df_weather)
df_weather = pd.concat(dfs)

In [None]:
# Preview the first rows of the combined weather table.
df_weather.head(5)

In [None]:
# Left-join the daily weather onto the hourly trip counts by date and zone;
# trip rows without a matching weather record are kept with missing values.
dataset = pd.merge(
    df,
    df_weather,
    how="left",
    on=["time", "location_id"],
)

In [None]:
# Inspect the last ten rows of the merged dataset.
dataset.iloc[-10:]

In [None]:
# Persist the merged trips + weather dataset for downstream use.
dataset.to_parquet(path="nyc-dataset/data/dataset.parquet")