In [47]:
import pandas as pd
import numpy as np

In [48]:
# Read only the five columns the exercises use, then convert both timestamp
# columns to datetimes right after loading.
df19 = pd.read_csv(
    "../data/nyc_taxi_2019-07.csv",
    usecols=[
        "tpep_pickup_datetime",
        "tpep_dropoff_datetime",
        "passenger_count",
        "trip_distance",
        "total_amount",
    ],
).assign(
    tpep_dropoff_datetime=lambda raw: pd.to_datetime(raw["tpep_dropoff_datetime"]),
    tpep_pickup_datetime=lambda raw: pd.to_datetime(raw["tpep_pickup_datetime"]),
)
df19.head()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,total_amount
0,2019-07-01 00:51:04,2019-07-01 00:51:33,1.0,0.0,4.94
1,2019-07-01 00:46:04,2019-07-01 01:05:46,1.0,4.16,20.3
2,2019-07-01 00:25:09,2019-07-01 01:00:56,1.0,18.8,70.67
3,2019-07-01 00:33:32,2019-07-01 01:15:27,1.0,18.46,66.36
4,2019-07-01 00:00:55,2019-07-01 00:13:05,0.0,1.7,15.3


1. Create a new timedelta column called `trip_time` for the trip duration.
2. Determine the number and percentage of rides that took less than 1 minute.
3. Determine the average fare paid by people taking these short trips.
4. Determine the number and percentage of rides that took more than 10 hours.
5. Create a new column `trip_time_group` in which the values are short (<= 10 mins), medium (> 10 mins and <= 1 hr), or long (> 1 hr).
6. Determine the proportion of rides in each group.
7. For each value in trip_time_group determine the average number of passengers.

In [49]:
# 1. trip duration (drop-off minus pick-up) stored as a timedelta column
df19["trip_time"] = df19["tpep_dropoff_datetime"].sub(df19["tpep_pickup_datetime"])
df19["trip_time"].head()

0   0 days 00:00:29
1   0 days 00:19:42
2   0 days 00:35:47
3   0 days 00:41:55
4   0 days 00:12:10
Name: trip_time, dtype: timedelta64[ns]

In [50]:
# 2. number and percentage of rides that took less than 1 minute
# The count comes from the filtered durations; the percentage comes from the
# share of True values in the same "< 1 minute" comparison.
short_rides = df19["trip_time"][df19["trip_time"] < pd.Timedelta("1 minute")]
print(f"There were {short_rides.count()} rides of less than 1 minute.")
(df19["trip_time"] < pd.Timedelta("1 minute")).value_counts(normalize=True).mul(100)
# almost 99% of rides were longer than 1 minute

There were 70212 rides of less than 1 minute.


trip_time
False    98.887364
True      1.112636
Name: proportion, dtype: float64

In [51]:
# 3. determine the average fare paid by people taking the short rides
# (the "mean" row of the summary is the average fare)
df19["total_amount"][df19["trip_time"] < pd.Timedelta("1 minute")].describe()

count    70212.000000
mean        30.397584
std         41.240120
min       -442.800000
25%          4.300000
50%          8.800000
75%         57.300000
max       1569.920000
Name: total_amount, dtype: float64

In [52]:
# 4. > 10 hour trips
# Build the "> 10 hours" filter once from df19 and reuse it for both the count
# and the fare summary. (The filter previously read from `df`, a name that no
# visible cell defines, so the cell failed on a fresh kernel.)
over_10_hours = df19["trip_time"] > "10 hours"
long_rides = df19.loc[over_10_hours, "trip_time"]
print(f"There were {long_rides.count()} trips over 10 hours.")
df19.loc[over_10_hours, "total_amount"].describe()
# most of these look like errors since the mean cost of a long trip is less than
# the mean cost of a short trip

There were 16698 trips over 10 hours.


count    16698.000000
mean        23.773606
std         23.748988
min       -403.300000
25%         12.430000
50%         17.300000
75%         26.300000
max        690.300000
Name: total_amount, dtype: float64

In [53]:
# 5. bin trips into short (<= 10 min), medium (10 min - 1 hour), long (> 1 hour)
# Bin edges are 0 s, 10 min, 1 h and 100 h. Durations outside those edges
# (negative, or above 100 hours) fall in no bin and are left as NaN.
trip_bins = list(
    map(pd.to_timedelta, ("0 seconds", "10 minutes", "1 hour", "100 hours"))
)
df19["trip_time_group"] = pd.cut(
    x=df19["trip_time"],
    bins=trip_bins,
    labels=["short", "medium", "long"],
    include_lowest=True,  # a duration of exactly 0 s lands in "short"
)
df19.head()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,total_amount,trip_time,trip_time_group
0,2019-07-01 00:51:04,2019-07-01 00:51:33,1.0,0.0,4.94,0 days 00:00:29,short
1,2019-07-01 00:46:04,2019-07-01 01:05:46,1.0,4.16,20.3,0 days 00:19:42,medium
2,2019-07-01 00:25:09,2019-07-01 01:00:56,1.0,18.8,70.67,0 days 00:35:47,medium
3,2019-07-01 00:33:32,2019-07-01 01:15:27,1.0,18.46,66.36,0 days 00:41:55,medium
4,2019-07-01 00:00:55,2019-07-01 00:13:05,0.0,1.7,15.3,0 days 00:12:10,medium


In [54]:
# 6. proportion of each ride group
df19["trip_time_group"].value_counts(normalize=True) * 100

trip_time_group
medium    55.251886
short     43.535023
long       1.213091
Name: proportion, dtype: float64

In [55]:
# 7. average number of passengers in each `trip_time_group`
df19.groupby("trip_time_group", observed=False)["passenger_count"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
trip_time_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
short,2746242.0,1.551241,1.205293,0.0,1.0,1.0,2.0,9.0
medium,3455863.0,1.585806,1.21909,0.0,1.0,1.0,2.0,9.0
long,74353.0,1.700859,1.303305,0.0,1.0,1.0,2.0,9.0


# Extension questions
1. How many trips are not from July 2019 and are in the wrong file?
2. What was the mean trip time for each number of passengers?
3. Load the taxi data from July 2019 and July 2020. For each year, and then for each number of passengers, what was the mean amount paid?

In [92]:
# load the 2020 data
# Same preparation as the 2019 frame: read the five columns, convert the two
# timestamp columns to datetimes, derive `trip_time`, then bin it.
df20 = pd.read_csv(
    "../data/nyc_taxi_2020-07.csv",
    usecols=[
        "tpep_pickup_datetime",
        "tpep_dropoff_datetime",
        "passenger_count",
        "trip_distance",
        "total_amount",
    ],
).assign(
    tpep_dropoff_datetime=lambda raw: pd.to_datetime(raw["tpep_dropoff_datetime"]),
    tpep_pickup_datetime=lambda raw: pd.to_datetime(raw["tpep_pickup_datetime"]),
    # evaluated after the two conversions above, so this subtracts datetimes
    trip_time=lambda raw: raw["tpep_dropoff_datetime"] - raw["tpep_pickup_datetime"],
)
# Bin edges are 0 s, 10 min, 1 h and 100 h; durations outside them become NaN.
trip_bins = list(
    map(pd.to_timedelta, ("0 seconds", "10 minutes", "1 hour", "100 hours"))
)
df20["trip_time_group"] = pd.cut(
    x=df20["trip_time"],
    bins=trip_bins,
    labels=["short", "medium", "long"],
    include_lowest=True,  # a duration of exactly 0 s lands in "short"
)
df20.head()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,total_amount,trip_time,trip_time_group
0,2020-07-01 00:25:32,2020-07-01 00:33:39,1.0,1.5,9.3,0 days 00:08:07,short
1,2020-07-01 00:03:19,2020-07-01 00:25:43,1.0,9.5,27.8,0 days 00:22:24,medium
2,2020-07-01 00:15:11,2020-07-01 00:29:24,1.0,5.85,22.3,0 days 00:14:13,medium
3,2020-07-01 00:30:49,2020-07-01 00:38:26,1.0,1.9,14.16,0 days 00:07:37,short
4,2020-07-01 00:31:26,2020-07-01 00:38:02,1.0,1.25,7.8,0 days 00:06:36,short


In [133]:
# 1. how many trips not from July 2019 are in the 2019 data file?
# A pickup is outside July 2019 when it is on/after 1 Aug 2019 or before
# 1 Jul 2019; the "Length" of the displayed Series is the number of such trips.
df19["tpep_pickup_datetime"][
    (df19["tpep_pickup_datetime"] >= "2019-08-01")
    | (df19["tpep_pickup_datetime"] < "2019-07-01")
]


184       2019-06-30 14:54:49
185       2019-06-30 15:19:34
206       2019-06-30 23:41:12
274       2019-06-30 23:52:06
421       2019-06-30 23:56:48
                  ...        
6275762   2019-08-01 00:00:11
6275859   2019-08-01 00:00:23
6275948   2019-08-01 00:00:19
6276069   2019-08-01 00:04:50
6276300   2019-08-01 00:00:34
Name: tpep_pickup_datetime, Length: 285, dtype: datetime64[ns]

In [138]:
# Same question answered from the date parts: a pickup is outside July 2019
# when its year is not 2019 or its month is not 7.
df19["tpep_pickup_datetime"][
    (df19["tpep_pickup_datetime"].dt.year != 2019)
    | (df19["tpep_pickup_datetime"].dt.month != 7)
]

184       2019-06-30 14:54:49
185       2019-06-30 15:19:34
206       2019-06-30 23:41:12
274       2019-06-30 23:52:06
421       2019-06-30 23:56:48
                  ...        
6275762   2019-08-01 00:00:11
6275859   2019-08-01 00:00:23
6275948   2019-08-01 00:00:19
6276069   2019-08-01 00:04:50
6276300   2019-08-01 00:00:34
Name: tpep_pickup_datetime, Length: 285, dtype: datetime64[ns]

In [139]:
# book version
# NOTE(review): the earlier note here said this filter "still contains July
# dates"; the output displayed for this cell matches the `>= "2019-08-01"`
# filter used earlier (Length: 285) — TODO confirm against the book's bound.
df19.loc[
    (df19["tpep_pickup_datetime"] < "2019-07-01")
    | (df19["tpep_pickup_datetime"] > "2019-07-31 23:59:59"),
    "tpep_pickup_datetime",
]

184       2019-06-30 14:54:49
185       2019-06-30 15:19:34
206       2019-06-30 23:41:12
274       2019-06-30 23:52:06
421       2019-06-30 23:56:48
                  ...        
6275762   2019-08-01 00:00:11
6275859   2019-08-01 00:00:23
6275948   2019-08-01 00:00:19
6276069   2019-08-01 00:04:50
6276300   2019-08-01 00:00:34
Name: tpep_pickup_datetime, Length: 285, dtype: datetime64[ns]

In [85]:
# 2. mean trip time for each number of passengers
# Group the durations by the passenger-count column and average each group.
df19["trip_time"].groupby(df19["passenger_count"]).mean()

passenger_count
0.0   0 days 00:14:18.929810752
1.0   0 days 00:17:46.148103924
2.0   0 days 00:18:34.024342704
3.0   0 days 00:19:02.079604271
4.0   0 days 00:20:10.057290100
5.0   0 days 00:22:29.870464324
6.0   0 days 00:20:54.109564300
7.0   0 days 00:16:38.206896551
8.0      0 days 00:11:00.500000
9.0      0 days 00:49:16.125000
Name: trip_time, dtype: timedelta64[ns]

In [111]:
# 3. Compare mean fare and mean per-passenger-count fare for 2019 and 2020
# Stack both frames, tag each row with the year of its pickup timestamp, and
# keep every row whose year is not earlier than 2019.
df_19_20 = (
    pd.concat([df19, df20])
    .assign(year=lambda trips: trips["tpep_pickup_datetime"].dt.year)
    .loc[lambda trips: ~(trips["year"] < 2019)]
)
df_19_20.head()

Unnamed: 0,tpep_pickup_datetime,tpep_dropoff_datetime,passenger_count,trip_distance,total_amount,trip_time,trip_time_group,year
0,2019-07-01 00:51:04,2019-07-01 00:51:33,1.0,0.0,4.94,0 days 00:00:29,short,2019
1,2019-07-01 00:46:04,2019-07-01 01:05:46,1.0,4.16,20.3,0 days 00:19:42,medium,2019
2,2019-07-01 00:25:09,2019-07-01 01:00:56,1.0,18.8,70.67,0 days 00:35:47,medium,2019
3,2019-07-01 00:33:32,2019-07-01 01:15:27,1.0,18.46,66.36,0 days 00:41:55,medium,2019
4,2019-07-01 00:00:55,2019-07-01 00:13:05,0.0,1.7,15.3,0 days 00:12:10,medium,2019


In [112]:
# Mean total amount paid per pickup year.
df_19_20["total_amount"].groupby(df_19_20["year"]).mean()

year
2019    19.612254
2020    18.631500
Name: total_amount, dtype: float64

In [117]:
# Mean fare and mean trip time for every (year, passenger_count) pair
# (pivot_table aggregates with the mean by default).
# Passing the grouping keys as `index=` produces one row per pair directly,
# so no transpose is needed and each value column keeps its own dtype instead
# of passing through a mixed float/timedelta table.
df_19_20.pivot_table(
    index=["year", "passenger_count"], values=["total_amount", "trip_time"]
)

Unnamed: 0_level_0,Unnamed: 1_level_0,total_amount,trip_time
year,passenger_count,Unnamed: 2_level_1,Unnamed: 3_level_1
2019,0.0,18.981793,0 days 00:14:18.929810752
2019,1.0,19.284646,0 days 00:17:46.094980284
2019,2.0,20.097442,0 days 00:18:33.760405013
2019,3.0,20.208111,0 days 00:19:02.079604271
2019,4.0,21.063172,0 days 00:20:10.057290100
2019,5.0,19.419311,0 days 00:22:29.877636583
2019,6.0,19.386516,0 days 00:20:54.109564300
2019,7.0,70.08069,0 days 00:16:38.206896551
2019,8.0,74.760455,0 days 00:11:00.500000
2019,9.0,93.509375,0 days 00:49:16.125000
