In [88]:
import pandas as pd
import pytz
import os
from pathlib import Path
pwd = Path(os.getcwd())

import sys
sys.path.append(str(pwd.parent / "config"))

from setting import LocalRegionBound

LONGITUDE_LOWER_BOUND = LocalRegionBound[0]
LONGITUDE_UPPER_BOUND = LocalRegionBound[1]
LATITUDE_LOWER_BOUND = LocalRegionBound[2]
LATITUDE_UPPER_BOUND = LocalRegionBound[3]

df = pd.read_csv("../data/Order/original/yellow_tripdata_2016-06.csv")

In [89]:
df.columns

Index(['VendorID', 'tpep_pickup_datetime', 'tpep_dropoff_datetime',
       'passenger_count', 'trip_distance', 'pickup_longitude',
       'pickup_latitude', 'RatecodeID', 'store_and_fwd_flag',
       'dropoff_longitude', 'dropoff_latitude', 'payment_type', 'fare_amount',
       'extra', 'mta_tax', 'tip_amount', 'tolls_amount',
       'improvement_surcharge', 'total_amount'],
      dtype='object')

In [90]:
USE_COLUMNS = [
    "tpep_pickup_datetime",
    "tpep_dropoff_datetime",
    "pickup_longitude",
    "pickup_latitude",
    "dropoff_longitude",
    "dropoff_latitude",
]
RENAME_DICT = {
    "tpep_pickup_datetime": "Start_time",
    "tpep_dropoff_datetime": "End_time",
    "pickup_longitude": "PointS_Longitude",
    "pickup_latitude": "PointS_Latitude",
    "dropoff_longitude": "PointE_Longitude",
    "dropoff_latitude": "PointE_Latitude",
}


In [91]:
len(df)

11135470

In [92]:
rename_df = df[USE_COLUMNS].rename(columns=RENAME_DICT).sort_values("Start_time").reset_index(drop=True)

# NewYorkエリア外のレコードを除去する.
rename_without_outlier_df = rename_df[
    (rename_df["PointS_Longitude"] > LONGITUDE_LOWER_BOUND)
    & (rename_df["PointS_Longitude"] < LONGITUDE_UPPER_BOUND)
    & (rename_df["PointE_Longitude"] > LONGITUDE_LOWER_BOUND)
    & (rename_df["PointE_Longitude"] < LONGITUDE_UPPER_BOUND)
    & (rename_df["PointS_Latitude"] > LATITUDE_LOWER_BOUND)
    & (rename_df["PointS_Latitude"] < LATITUDE_UPPER_BOUND)
    & (rename_df["PointE_Latitude"] > LATITUDE_LOWER_BOUND)
    & (rename_df["PointE_Latitude"] < LATITUDE_UPPER_BOUND)
]

In [93]:
from datetime import datetime, timedelta

rename_without_outlier_df["Start_datetime"] = rename_without_outlier_df["Start_time"].apply(
    lambda x: datetime.strptime(x, "%Y-%m-%d %H:%M:%S")
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  rename_without_outlier_df["Start_datetime"] = rename_without_outlier_df["Start_time"].apply(


In [94]:
def newyork_datetime_to_utc(datetime_str: str) -> int:
    eastern = pytz.timezone('US/Eastern')
    date = datetime.strptime(datetime_str, "%Y-%m-%d %H:%M:%S")
    date.astimezone(eastern)
    return int(date.timestamp())

In [95]:
start_date = datetime(year=2016, month=6, day=1)
while True:
    next_date = start_date + timedelta(days=1)
    tmp_df = rename_without_outlier_df[
        (rename_without_outlier_df["Start_datetime"]<next_date)
        & (rename_without_outlier_df["Start_datetime"]>start_date)
    ]
    tmp_df["Start_time"] = tmp_df["Start_time"].apply(newyork_datetime_to_utc)
    tmp_df["End_time"] = tmp_df["End_time"].apply(newyork_datetime_to_utc)
    tmp_df[RENAME_DICT.values()].to_csv(
        f"../data/Order/modified/order_2016{str(start_date.month).zfill(2)}{str(start_date.day).zfill(2)}.csv",
        index=False
    )
    start_date = next_date
    if start_date.month != 6:
        break

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp_df["Start_time"] = tmp_df["Start_time"].apply(newyork_datetime_to_utc)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp_df["End_time"] = tmp_df["End_time"].apply(newyork_datetime_to_utc)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  tmp_df["Start_time"] = tmp_df["Start_time"].apply(newyork_d