In [10]:
import pandas as pd
import numpy as np

In [11]:
# Load only the four columns used in this exercise from the July 2019 taxi
# extract, converting the pickup timestamp strings to datetimes in the same
# expression.
df = pd.read_csv(
    "../data/nyc_taxi_2019-07.csv",
    usecols=[
        "tpep_pickup_datetime",
        "passenger_count",
        "trip_distance",
        "total_amount",
    ],
).assign(
    tpep_pickup_datetime=lambda frame: pd.to_datetime(frame["tpep_pickup_datetime"])
)
df.head()

Unnamed: 0,tpep_pickup_datetime,passenger_count,trip_distance,total_amount
0,2019-07-01 00:51:04,1.0,0.0,4.94
1,2019-07-01 00:46:04,1.0,4.16,20.3
2,2019-07-01 00:25:09,1.0,18.8,70.67
3,2019-07-01 00:33:32,1.0,18.46,66.36
4,2019-07-01 00:00:55,0.0,1.7,15.3


1. Export the dataframe to a tab-delimited CSV
  - Datetimes should be in `DD/MM/YY HHh:MMm:SSs` format
2. Ingest the newly written file back into a dataframe, parsing the datetime column appropriately.

In [12]:
# Preview the target export format on the first five pickups. The column itself
# is not modified here; this cell only displays the formatted strings.
df["tpep_pickup_datetime"].head(5).dt.strftime("%d/%m/%y %Hh:%Mm:%Ss")

0    01/07/19 00h:51m:04s
1    01/07/19 00h:46m:04s
2    01/07/19 00h:25m:09s
3    01/07/19 00h:33m:32s
4    01/07/19 00h:00m:55s
Name: tpep_pickup_datetime, dtype: object

In [16]:
# Export the frame as tab-separated text: header row kept, index column
# dropped, datetimes rendered through the custom date_format string.
df.to_csv(
    "ex40_out.csv",
    sep="\t",
    header=True,
    index=False,
    date_format="%d/%m/%y %Hh:%Mm:%Ss",
)

In [19]:
# re-ingest the data from the new file and verify the format
df_in = pd.read_csv(
    "ex40_out.csv",
    usecols=[
        "tpep_pickup_datetime",
        "passenger_count",
        "trip_distance",
        "total_amount",
    ],
    sep="\t",
)
df_in["tpep_pickup_datetime"] = pd.to_datetime(
    df_in["tpep_pickup_datetime"], format="%d/%m/%y %Hh:%Mm:%Ss"
)
df_in.head()

Unnamed: 0,tpep_pickup_datetime,passenger_count,trip_distance,total_amount
0,2019-07-01 00:51:04,1.0,0.0,4.94
1,2019-07-01 00:46:04,1.0,4.16,20.3
2,2019-07-01 00:25:09,1.0,18.8,70.67
3,2019-07-01 00:33:32,1.0,18.46,66.36
4,2019-07-01 00:00:55,0.0,1.7,15.3


In [24]:
# Bucket each pickup by its hour of day, then show the share of trips per
# bucket (as a percentage). Trip distance, passenger count and total amount are
# summarised per bucket in the next cell.
#
# With right-closed bins and include_lowest=True the hour ranges are:
#   graveyard   hours 0-5    (12am - 5:59am)
#   pre-work    hours 6-8    (6am - 8:59am)
#   work        hours 9-17   (9am - 5:59pm)
#   after-work  hours 18-23  (6pm - 11:59pm)
# NOTE(review): hour 17 (5pm - 5:59pm) is counted as "work". If "work" is meant
# to end at 5pm, use bins=[0, 6, 9, 17, 24] with right=False instead - confirm
# the intended boundary before changing, since it alters the results.
#
# The hours are read from df_in (the frame being annotated) rather than from the
# original df, so the new column does not rely on index alignment with a
# different frame.
df_in["time_of_day"] = pd.cut(
    df_in["tpep_pickup_datetime"].dt.hour,
    bins=[0, 5, 8, 17, 23],
    labels=["graveyard", "pre-work", "work", "after-work"],
    include_lowest=True,
)
df_in["time_of_day"].value_counts(normalize=True) * 100

time_of_day
work          47.553277
after-work    33.303066
pre-work      10.215724
graveyard      8.927933
Name: proportion, dtype: float64

In [29]:
# Summary statistics of fare, passenger count and distance for each time-of-day
# bucket; observed=False keeps every category in the result, including any
# with no rows.
(
    df_in
    .groupby("time_of_day", observed=False)
    [["total_amount", "passenger_count", "trip_distance"]]
    .describe()
)

Unnamed: 0_level_0,total_amount,total_amount,total_amount,total_amount,total_amount,total_amount,total_amount,total_amount,passenger_count,passenger_count,passenger_count,passenger_count,passenger_count,trip_distance,trip_distance,trip_distance,trip_distance,trip_distance,trip_distance,trip_distance,trip_distance
Unnamed: 0_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
time_of_day,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2
graveyard,563390.0,20.587932,17.001265,-450.8,11.3,15.3,23.3,843.36,560581.0,1.585516,...,2.0,9.0,563390.0,3.878165,4.48918,0.0,1.2,2.2,4.69,168.44
pre-work,644655.0,18.903439,15.52671,-199.8,10.8,13.92,19.8,807.8,637830.0,1.496496,...,1.0,9.0,644655.0,3.134645,4.12093,0.0,1.0,1.65,3.1,169.47
work,3000811.0,19.640332,16.410195,-442.8,11.16,14.76,20.8,6667.45,2982876.0,1.572868,...,2.0,9.0,3000811.0,2.964033,4.063094,0.0,0.9,1.54,2.84,311.56
after-work,2101563.0,19.528164,14.636061,-360.8,11.8,15.3,21.3,750.07,2095173.0,1.590269,...,2.0,9.0,2101563.0,3.10533,3.907007,0.0,1.04,1.79,3.3,139.56
