In [126]:
import pandas as pd
import numpy as np

# Render every float in DataFrame/Series output with five decimal places.
pd.options.display.float_format = "{:.5f}".format

In [127]:
# Monthly NYC taxi CSV extracts to combine into one frame.
files = ["../data/nyc_taxi_2019-01.csv", "../data/nyc_taxi_2019-07.csv"]

# Only the pickup timestamp, the trip basics and the payment columns are used
# below, so everything else is skipped at parse time.
columns_to_load = [
    "tpep_pickup_datetime",
    "passenger_count",
    "trip_distance",
    "fare_amount",
    "extra",
    "mta_tax",
    "tip_amount",
    "tolls_amount",
    "improvement_surcharge",
    "total_amount",
    "congestion_surcharge",
]

# ignore_index=True gives the combined frame a fresh 0..n-1 index. Without it
# each file keeps its own 0-based labels, so every label would appear once per
# file and label-based lookups/alignment on the result become ambiguous.
df = pd.concat(
    (pd.read_csv(fn, usecols=columns_to_load) for fn in files),
    ignore_index=True,
)

# The CSV stores timestamps as text; convert once so the .dt accessor and
# resample() work in the analysis cells.
df["tpep_pickup_datetime"] = pd.to_datetime(df["tpep_pickup_datetime"])
df.head()

Unnamed: 0,tpep_pickup_datetime,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,2019-01-01 00:46:40,1.0,1.5,7.0,0.5,0.5,1.65,0.0,0.3,9.95,
1,2019-01-01 00:59:47,1.0,2.6,14.0,0.5,0.5,1.0,0.0,0.3,16.3,
2,2018-12-21 13:48:30,3.0,0.0,4.5,0.5,0.5,0.0,0.0,0.3,5.8,
3,2018-11-28 15:52:25,5.0,0.0,3.5,0.5,0.5,0.0,0.0,0.3,7.55,
4,2018-11-28 15:56:57,5.0,0.0,52.0,0.0,0.5,0.0,0.0,0.3,55.55,


1. Create a new column `pre_tip_amount` containing the sum of all the payment columns except `total_amount` and `tip_amount`. (`total_amount` is itself the sum of the other payment columns.)
2. Create a new column `tip_percentage` showing the tip as a proportion of `pre_tip_amount` (multiply by 100 for a percentage).

In [128]:
# Sanity check: the pickup column should be datetime64 and every payment column numeric.
df.dtypes

tpep_pickup_datetime     datetime64[ns]
passenger_count                 float64
trip_distance                   float64
fare_amount                     float64
extra                           float64
mta_tax                         float64
tip_amount                      float64
tolls_amount                    float64
improvement_surcharge           float64
total_amount                    float64
congestion_surcharge            float64
dtype: object

In [129]:
# Treat a missing payment component as 0 so it adds nothing to the sums below.
# Only the payment columns are filled; passenger_count and trip_distance are not touched.
payment_cols = [
    "fare_amount",
    "extra",
    "mta_tax",
    "tolls_amount",
    "improvement_surcharge",
    "congestion_surcharge",
    "tip_amount",
    "total_amount",
]
df[payment_cols] = df[payment_cols].fillna(0)
df.head()

Unnamed: 0,tpep_pickup_datetime,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge
0,2019-01-01 00:46:40,1.0,1.5,7.0,0.5,0.5,1.65,0.0,0.3,9.95,0.0
1,2019-01-01 00:59:47,1.0,2.6,14.0,0.5,0.5,1.0,0.0,0.3,16.3,0.0
2,2018-12-21 13:48:30,3.0,0.0,4.5,0.5,0.5,0.0,0.0,0.3,5.8,0.0
3,2018-11-28 15:52:25,5.0,0.0,3.5,0.5,0.5,0.0,0.0,0.3,7.55,0.0
4,2018-11-28 15:56:57,5.0,0.0,52.0,0.0,0.5,0.0,0.0,0.3,55.55,0.0


In [130]:
# Everything charged before the tip: the sum of each payment component other
# than tip_amount and total_amount.
pre_tip_parts = [
    "fare_amount",
    "extra",
    "mta_tax",
    "tolls_amount",
    "improvement_surcharge",
    "congestion_surcharge",
]
df["pre_tip_amount"] = df[pre_tip_parts].sum(axis="columns")

# Tip as a fraction of the pre-tip amount (0.2 means 20%).
# A non-zero tip over a zero pre-tip amount divides to +/-inf, and a single inf
# would swamp every later mean(); treat those rows as undefined (NaN), the same
# way 0/0 already comes out.
df["tip_percentage"] = (df["tip_amount"] / df["pre_tip_amount"]).replace(
    [np.inf, -np.inf], np.nan
)
df.head()

Unnamed: 0,tpep_pickup_datetime,passenger_count,trip_distance,fare_amount,extra,mta_tax,tip_amount,tolls_amount,improvement_surcharge,total_amount,congestion_surcharge,pre_tip_amount,tip_percentage
0,2019-01-01 00:46:40,1.0,1.5,7.0,0.5,0.5,1.65,0.0,0.3,9.95,0.0,8.3,0.1988
1,2019-01-01 00:59:47,1.0,2.6,14.0,0.5,0.5,1.0,0.0,0.3,16.3,0.0,15.3,0.06536
2,2018-12-21 13:48:30,3.0,0.0,4.5,0.5,0.5,0.0,0.0,0.3,5.8,0.0,5.8,0.0
3,2018-11-28 15:52:25,5.0,0.0,3.5,0.5,0.5,0.0,0.0,0.3,7.55,0.0,4.8,0.0
4,2018-11-28 15:56:57,5.0,0.0,52.0,0.0,0.5,0.0,0.0,0.3,55.55,0.0,52.8,0.0


1. What was the mean tip percentage across all trips in the data set?
2. How many times did people tip more than the pre-tip amount?
3. On which day of the week do people tip the greatest percentage of the fare, on average?
4. At which hour of the day do people tip the greatest percentage? (Note to self: look at the standard deviation for this one too.)
5. Do people typically tip more in January or July?
6. What was the 1-day period in our data set when people tipped the greatest percentage?

In [131]:
# 1. Mean tip fraction over every trip (mean() skips rows where it is NaN).
overall_mean_tip = df["tip_percentage"].mean()
overall_mean_tip

0.13003974566357937

In [132]:
# What share of trips left no tip, and what share tipped more than 100%?
# Both shares are taken over trips with a defined (non-NaN) tip_percentage.
tip_pct = df["tip_percentage"]
n_defined = tip_pct.count()

# Summing boolean masks counts matching rows without copying the frame, and
# yields 0 (rather than a KeyError) when no trip matches.
share_no_tip = (tip_pct == 0).sum() / n_defined
share_over_100 = (tip_pct > 1).sum() / n_defined

# Values are computed outside the f-strings: reusing double quotes inside an
# f-string expression only parses on Python 3.12+.
print(f"Non-tippers: {share_no_tip * 100:.2f}%")
print(f"Tipped more than 100%: {share_over_100 * 100:.2f}%")
tip_pct.describe()

Non-tippers: 32.08%
Tipped more than 100%: 0.06%


count   13975205.00000
mean           0.13004
std            0.59431
min           -3.96552
25%            0.00000
50%            0.16129
75%            0.20000
max          733.33333
Name: tip_percentage, dtype: float64

In [133]:
# 2. Number of trips whose tip exceeded the pre-tip amount.
# No filter on pre_tip_amount is applied, so zero or negative pre-tip rows can qualify too.
tipped_over_pre_tip = df["tip_amount"] > df["pre_tip_amount"]
tipped_over_pre_tip.sum()


28232

In [134]:
# What did the pre-tip amount look like on trips tipped more than 100%?
# Only rows with a positive pre-tip amount are considered.
positive_pre_tip = df["pre_tip_amount"].gt(0)
tipped_over_100 = df["tip_percentage"].gt(1)
df.loc[positive_pre_tip & tipped_over_100, "pre_tip_amount"].describe()

count   7821.00000
mean       9.28611
std        9.56468
min        0.30000
25%        3.80000
50%        6.30000
75%       10.30000
max      131.06000
Name: pre_tip_amount, dtype: float64

In [135]:
# 3. Average tip fraction per pickup weekday, highest first.
pickup_day = df["tpep_pickup_datetime"].dt.day_name()
mean_tip_by_day = df.groupby(pickup_day)["tip_percentage"].mean()
mean_tip_by_day.sort_values(ascending=False)

tpep_pickup_datetime
Thursday    0.13397
Wednesday   0.13222
Tuesday     0.13142
Friday      0.12914
Monday      0.12872
Sunday      0.12663
Saturday    0.12580
Name: tip_percentage, dtype: float64

In [136]:
# 4. Summary statistics of the tip fraction per pickup hour; show the five
# hours with the highest mean. describe() keeps std/min/max alongside the mean
# so hours driven by extreme values are visible.
pickup_hour = df["tpep_pickup_datetime"].dt.hour
tip_stats_by_hour = df.groupby(pickup_hour)["tip_percentage"].describe()
tip_stats_by_hour.sort_values(by="mean", ascending=False).head()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
tpep_pickup_datetime,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
22,692750.0,0.13882,0.89857,-0.2,0.0,0.17341,0.2,466.66667
20,779002.0,0.13816,0.55686,-0.0,0.0,0.17213,0.2,320.0
21,764025.0,0.13768,0.30895,-0.25,0.0,0.17409,0.2,200.0
8,658122.0,0.13712,0.51878,-0.07519,0.0,0.16667,0.2,316.66667
19,868721.0,0.13517,0.46691,-0.2,0.0,0.16772,0.2,266.66667


In [137]:
# 5. Average tip fraction by pickup month; report only January and July.
pickup_month = df["tpep_pickup_datetime"].dt.month_name()
mean_tip_by_month = df.groupby(pickup_month)["tip_percentage"].mean()
mean_tip_by_month.loc[["January", "July"]]

tpep_pickup_datetime
January   0.13701
July      0.12157
Name: tip_percentage, dtype: float64

In [138]:
# 6. Mean tip fraction per calendar day; show the five highest days.
# No date filter is applied here, so pickups dated outside January/July 2019
# take part in the ranking as well.
daily_mean_tip = df.resample("1D", on="tpep_pickup_datetime")["tip_percentage"].mean()
daily_mean_tip.sort_values(ascending=False).iloc[:5]

tpep_pickup_datetime
2019-02-13   0.35813
2019-02-25   0.25000
2019-08-20   0.24187
2019-11-27   0.20000
2019-08-15   0.20000
Name: tip_percentage, dtype: float64

In [139]:
# Repeat the daily ranking using only pickups dated in January or July 2019.
pickup = df["tpep_pickup_datetime"]
in_jan_or_jul_2019 = pickup.dt.year.eq(2019) & pickup.dt.month.isin((1, 7))

daily_mean_tip_valid = (
    df.loc[in_jan_or_jul_2019]
    .resample("1D", on="tpep_pickup_datetime")["tip_percentage"]
    .mean()
    .dropna()  # resample also emits the empty days between January and July
    .sort_values(ascending=False)
)
daily_mean_tip_valid

tpep_pickup_datetime
2019-01-31   0.14435
2019-01-30   0.14353
2019-01-24   0.14343
2019-01-22   0.14277
2019-01-15   0.14233
               ...  
2019-07-03   0.11462
2019-07-20   0.11454
2019-07-04   0.10699
2019-07-06   0.10676
2019-07-05   0.10572
Name: tip_percentage, Length: 62, dtype: float64

# Extension questions
1. You saw that 32% of riders don't tip at all. Of those that do, what percentage do they tip, on average?
2. How many of the rides in the data set, supposedly from January and July 2019, are from outside the specified date range?
3. Looking only at dates in January and July, in what week did passengers tip the greatest percentage?

In [140]:
# 1. Among passengers who left a tip, how much did they tip on average?
# Build the "left a tip" mask once and reuse it for both figures.
tipped = df["tip_amount"] > 0
mean_tip_dollars = df.loc[tipped, "tip_amount"].mean()
mean_tip_fraction = df.loc[tipped, "tip_percentage"].mean()

# Values are computed outside the f-strings: reusing double quotes inside an
# f-string expression only parses on Python 3.12+.
print(f"Tip on average ${mean_tip_dollars:.2f}")
print(f"This is {mean_tip_fraction * 100:.2f}%")

Tip on average $2.93
This is 19.15%


In [None]:
# 2. How many rides in the data set are from outside of the specified January and July 2019 range?
pickup = df["tpep_pickup_datetime"]
in_range = (pickup.dt.year == 2019) & pickup.dt.month.isin((1, 7))

# Count the out-of-range rows from the mask itself instead of materialising
# the filtered frame just to read its length.
outside_range = int((~in_range).sum())
outside_pct = outside_range / pickup.count() * 100

# The percentage is computed outside the f-string: reusing double quotes inside
# an f-string expression only parses on Python 3.12+.
print(
    f"There are {outside_range} values outside 2019 January or July, or {outside_pct:.5f}%"
)

There are 816 values outside 2019 January or July, or 0.00584%


In [142]:
# 3. In the valid date range, which week did passengers tip the greatest percentage?
pickup = df["tpep_pickup_datetime"]
in_jan_or_jul_2019 = pickup.dt.year.eq(2019) & pickup.dt.month.isin((1, 7))

weekly_mean_tip = (
    df.loc[in_jan_or_jul_2019]
    .resample("1W", on="tpep_pickup_datetime")["tip_percentage"]
    .mean()
    .dropna()  # drop the empty weeks between January and July
    .sort_values(ascending=False)
)
# Weekly bins are labelled by the Sunday that closes them, so the 2019-02-03
# row holds the pickups from Jan 28 onward that survived the filter above.
# the last week of January was the highest tipping week
weekly_mean_tip

tpep_pickup_datetime
2019-02-03   0.14198
2019-01-27   0.13893
2019-01-20   0.13854
2019-01-13   0.13790
2019-01-06   0.12698
2019-08-04   0.12491
2019-07-14   0.12346
2019-07-21   0.12334
2019-07-28   0.12304
2019-07-07   0.11295
Name: tip_percentage, dtype: float64