In [None]:
import pandas as pd
import pytz
import os
from pathlib import Path
# Working directory the notebook kernel was started in.
pwd = Path.cwd()

import sys
# Put the sibling "config" directory on the import path so that the
# `setting` module can be imported below.
sys.path.append(str(pwd.parent.joinpath("config")))

from setting import LocalRegionBound

# LocalRegionBound is indexed as
# (longitude lower, longitude upper, latitude lower, latitude upper).
(
    LONGITUDE_LOWER_BOUND,
    LONGITUDE_UPPER_BOUND,
    LATITUDE_LOWER_BOUND,
    LATITUDE_UPPER_BOUND,
) = (LocalRegionBound[position] for position in range(4))

# Raw yellow-taxi trip records for June 2016.
df = pd.read_csv("../data/Order/original/yellow_tripdata_2016-06.csv")

In [None]:
# Display the column names available in the raw trip file.
df.columns

In [None]:
# Mapping from the raw trip-file column names to the names used in the
# generated order files. Insertion order is the output column order.
RENAME_DICT = {
    "tpep_pickup_datetime": "Start_time",
    "tpep_dropoff_datetime": "End_time",
    "pickup_longitude": "PointS_Longitude",
    "pickup_latitude": "PointS_Latitude",
    "dropoff_longitude": "PointE_Longitude",
    "dropoff_latitude": "PointE_Latitude",
}
# The raw columns to keep are exactly the ones that get renamed.
USE_COLUMNS = list(RENAME_DICT.keys())


In [None]:
# Keep only the columns of interest, give them the project's names and order
# the trips by pickup time (the timestamps are still strings at this point).
rename_df = (
    df[USE_COLUMNS]
    .rename(columns=RENAME_DICT)
    .sort_values("Start_time")
    .reset_index(drop=True)
)

# Remove records outside the New York area: both the pickup point and the
# drop-off point must lie strictly inside the longitude/latitude bounds.
# The trailing .copy() makes the result an independent frame, so the columns
# added to it in later cells do not raise SettingWithCopyWarning.
rename_without_outlier_df = rename_df[
    (rename_df["PointS_Longitude"] > LONGITUDE_LOWER_BOUND)
    & (rename_df["PointS_Longitude"] < LONGITUDE_UPPER_BOUND)
    & (rename_df["PointE_Longitude"] > LONGITUDE_LOWER_BOUND)
    & (rename_df["PointE_Longitude"] < LONGITUDE_UPPER_BOUND)
    & (rename_df["PointS_Latitude"] > LATITUDE_LOWER_BOUND)
    & (rename_df["PointS_Latitude"] < LATITUDE_UPPER_BOUND)
    & (rename_df["PointE_Latitude"] > LATITUDE_LOWER_BOUND)
    & (rename_df["PointE_Latitude"] < LATITUDE_UPPER_BOUND)
].copy()

In [None]:
# Load the node table and preview its coordinates as a two-column array
# of (longitude, latitude).
node_df = pd.read_csv("../data/Node.csv")
lnglat = node_df.loc[:, ["Longitude", "Latitude"]].values
lnglat

In [None]:
# Number of orders left after the region filter.
len(rename_without_outlier_df)

In [None]:
# Peek at the first node ID. It is read straight from node_df because the
# NodeIDs array is only created in a later cell; referencing NodeIDs here
# raises NameError on a fresh kernel (Restart & Run All).
node_df["NodeID"].values[0]

In [None]:
from tqdm import tqdm
from dataclasses import dataclass
from sklearn.neighbors import NearestNeighbors
from concurrent.futures import ThreadPoolExecutor
# Snap every order's pickup and drop-off point to its nearest node.
# The search uses NearestNeighbors' default metric on raw (longitude, latitude)
# pairs.
nn = NearestNeighbors(algorithm='ball_tree')
nn.fit(node_df[["Longitude", "Latitude"]].values)

NodeIDs = node_df["NodeID"].values

# Query with plain arrays, matching how the model was fitted; passing a
# DataFrame to a model fitted on an ndarray triggers a feature-name warning
# in recent scikit-learn versions.
start_indices = nn.kneighbors(
    rename_without_outlier_df[["PointS_Longitude", "PointS_Latitude"]].values,
    n_neighbors=1,
    return_distance=False,
)
end_indices = nn.kneighbors(
    rename_without_outlier_df[["PointE_Longitude", "PointE_Latitude"]].values,
    n_neighbors=1,
    return_distance=False,
)

# kneighbors returns an (n_orders, 1) index array; take its only column and
# look up all node IDs in one vectorised step instead of a per-row Python loop.
# assign() builds a new frame, so re-running the cell is idempotent and no
# column is written onto a filtered slice.
rename_without_outlier_df = rename_without_outlier_df.assign(
    NodeS=NodeIDs[start_indices[:, 0]],
    NodeE=NodeIDs[end_indices[:, 0]],
)

In [None]:
# Preview the first rows of the order table, which now carries the NodeS / NodeE columns.
rename_without_outlier_df.head()

In [None]:
from datetime import datetime, timedelta

# Parse the pickup timestamp strings into a datetime column used for the
# per-day split below. pd.to_datetime with an explicit format parses the whole
# column at once (far faster than a row-wise strptime) and, like strptime,
# raises ValueError on a string that does not match the format.
# assign() returns a new frame instead of writing onto a filtered slice.
rename_without_outlier_df = rename_without_outlier_df.assign(
    Start_datetime=pd.to_datetime(
        rename_without_outlier_df["Start_time"], format="%Y-%m-%d %H:%M:%S"
    )
)

In [None]:
def newyork_datetime_to_utc(datetime_str: str) -> int:
    """Convert a New York wall-clock timestamp to Unix epoch seconds.

    Args:
        datetime_str: Local New York time formatted as "%Y-%m-%d %H:%M:%S".

    Returns:
        The corresponding number of whole seconds since 1970-01-01 00:00:00 UTC.

    Raises:
        ValueError: If ``datetime_str`` does not match the expected format.
    """
    eastern = pytz.timezone('US/Eastern')
    naive = datetime.strptime(datetime_str, "%Y-%m-%d %H:%M:%S")
    # Attach the US/Eastern zone (with the DST offset valid on that date) to the
    # naive value. Calling .timestamp() on the naive datetime would interpret it
    # in the machine's local time zone, so the result would depend on the host.
    # For a wall-clock time that is ambiguous at the autumn DST change, pytz's
    # default (is_dst=False, i.e. standard time) is used.
    aware = eastern.localize(naive)
    return int(aware.timestamp())

In [None]:
# Give every remaining order a sequential ID (0 .. n-1) in the current row order.
# assign() returns a new frame, so no column is written onto a filtered slice
# (which would raise SettingWithCopyWarning).
rename_without_outlier_df = rename_without_outlier_df.assign(
    ID=range(len(rename_without_outlier_df))
)

In [None]:
# Write one CSV per calendar day of June 2016. Days 1-23 go to the "train"
# directory, days 24 and later go to "test".
start_date = datetime(year=2016, month=6, day=1)
while start_date.month == 6:
    next_date = start_date + timedelta(days=1)

    # Half-open window [start_date, next_date). The lower bound is inclusive so
    # that orders stamped exactly 00:00:00 belong to their day; with a strict
    # ">" they would be missing from every file.
    in_day = (
        (rename_without_outlier_df["Start_datetime"] >= start_date)
        & (rename_without_outlier_df["Start_datetime"] < next_date)
    )
    # Work on an independent copy: the time columns are overwritten below and
    # doing that on a slice raises SettingWithCopyWarning.
    tmp_df = rename_without_outlier_df[in_day].copy()

    # Replace the local-time strings with Unix epoch seconds.
    tmp_df["Start_time"] = tmp_df["Start_time"].apply(newyork_datetime_to_utc)
    tmp_df["End_time"] = tmp_df["End_time"].apply(newyork_datetime_to_utc)

    directory = "test" if start_date.day >= 24 else "train"
    output_dir = Path(f"../data/Order/modified/{directory}")
    # Create the target directory on first use instead of assuming it exists.
    output_dir.mkdir(parents=True, exist_ok=True)

    tmp_df[["ID"] + list(RENAME_DICT.values()) + ["NodeS", "NodeE"]].to_csv(
        output_dir / f"order_2016{str(start_date.month).zfill(2)}{str(start_date.day).zfill(2)}.csv",
        index=False
    )
    start_date = next_date