In [12]:
import pandas as pd
import numpy as np


In [13]:
# Read only the four columns that the questions below actually use.
taxi_columns = ["passenger_count", "trip_distance", "payment_type", "total_amount"]
df = pd.read_csv("../data/nyc_taxi_2019-01.csv", usecols=taxi_columns)
df.head()

Unnamed: 0,passenger_count,trip_distance,payment_type,total_amount
0,1,1.5,1,9.95
1,1,2.6,1,16.3
2,3,0.0,1,5.8
3,5,0.0,2,7.55
4,5,0.0,2,55.55


In [14]:
# Summary statistics (count, mean, std, min, quartiles, max) for each loaded column.
# In the saved output, total_amount has a negative minimum and a very large maximum,
# which is what motivates the "less than $0" and "more than $1000" questions below.
df.describe()

Unnamed: 0,passenger_count,trip_distance,payment_type,total_amount
count,7667792.0,7667792.0,7667792.0,7667792.0
mean,1.567078,2.801084,1.291776,15.68222
std,1.224431,3.737529,0.4733229,262.2932
min,0.0,0.0,1.0,-362.8
25%,1.0,0.9,1.0,8.19
50%,1.0,1.53,1.0,11.27
75%,2.0,2.8,2.0,16.56
max,9.0,831.8,4.0,623261.7


1. How many taxi rides had more than eight passengers?
2. How many taxi rides had zero passengers?
3. How many taxi rides were paid for in cash and cost more than $1000?
4. How many taxi rides cost less than $0?
5. How many taxi rides were for below average distances but cost above average?

In [15]:
# Q1: rides with more than eight passengers.
# Run the query once and reuse the result for both the printed count and the
# displayed frame, instead of filtering every row of df twice.
many_passengers = df.query("passenger_count > 8")
print(f"{many_passengers.shape[0]} trips with more than 8 passengers.")
many_passengers

9 trips with more than 8 passengers.


Unnamed: 0,passenger_count,trip_distance,payment_type,total_amount
949907,9,0.0,1,12.6
1296224,9,0.0,1,9.3
2012030,9,0.0,1,11.3
2883943,9,0.0,1,12.25
4534691,9,0.0,1,110.76
4852210,9,0.0,1,12.74
4997772,9,0.0,2,9.8
7286548,9,0.0,1,10.3
7373737,9,13.38,1,90.8


In [16]:
# Q2: rides recorded with a passenger count of zero.
zero_passengers = df.query("passenger_count == 0")
zero_passenger_rides = len(zero_passengers)
print(f"{zero_passenger_rides} trips with 0 passengers.")
zero_passengers

117381 trips with 0 passengers.


Unnamed: 0,passenger_count,trip_distance,payment_type,total_amount
156,0,5.3,1,4.55
228,0,18.0,1,59.80
229,0,8.9,1,38.50
298,0,1.0,2,9.30
905,0,0.7,1,8.30
...,...,...,...,...
7667552,0,2.8,1,16.30
7667581,0,4.8,1,20.80
7667612,0,3.2,1,15.95
7667764,0,1.4,2,8.80


In [17]:
# Q3: fares paid in cash (payment_type 2) that cost more than $1000.
# Inside DataFrame.query, `&` is given the precedence of `and`, so the two
# comparisons do not need parentheses.
cash_over_1000 = "payment_type == 2 & total_amount > 1000"
big_cash = df.query(cash_over_1000)
print(f"{len(big_cash)} fares paid in cash and more than $1000")
big_cash

5 fares paid in cash and more than $1000


Unnamed: 0,passenger_count,trip_distance,payment_type,total_amount
478791,1,0.1,2,6667.45
3715690,1,0.0,2,1079.4
4300990,1,21.7,2,3345.3
5323430,1,0.0,2,356214.78
6827160,1,1.0,2,3004.8


In [None]:
# Q4: rides whose total_amount is below $0.
#
# Payment type codes, per the TLC yellow-taxi data dictionary
# (https://www.nyc.gov/assets/tlc/downloads/pdf/data_dictionary_trip_records_yellow.pdf):
#   1 = Credit card
#   2 = Cash
#   3 = No charge
#   4 = Dispute
#   5 = Unknown
#   6 = Voided trip
taxi_pays = df.query("total_amount < 0")
negative_fare_rides = len(taxi_pays)
print(f"{negative_fare_rides} fares with negative totals.")
taxi_pays

7131 fares with negative totals.


Unnamed: 0,passenger_count,trip_distance,payment_type,total_amount
663,2,0.10,3,-3.8
2402,1,4.13,4,-20.3
2541,1,1.35,4,-9.8
2544,1,0.00,4,-3.8
2547,1,0.16,4,-4.3
...,...,...,...,...
7665612,1,0.34,3,-4.3
7666343,1,0.17,3,-4.3
7666452,2,0.58,3,-9.3
7666601,1,0.68,4,-6.8


In [None]:
# Q5: rides shorter than the average distance that still cost more than the
# average total. Both means are computed over the whole frame inside the query.
below_avg_distance_above_avg_cost = (
    "trip_distance < trip_distance.mean() & total_amount > total_amount.mean()"
)
short_expensive = df.query(below_avg_distance_above_avg_cost)
print(f"{len(short_expensive)} short but expensive trips.")
short_expensive

411255 short but expensive trips.


Unnamed: 0,passenger_count,trip_distance,payment_type,total_amount
1,1,2.60,1,16.30
4,5,0.00,2,55.55
6,5,0.00,2,55.55
10,2,2.80,1,19.55
32,1,1.03,2,52.80
...,...,...,...,...
7667648,1,1.90,1,22.55
7667659,1,0.00,1,63.36
7667665,1,2.34,1,24.96
7667729,2,2.72,1,15.96


# Extension questions
1. Repeat this using query (which I've already done).
2. How many rides that cost less than $0 involved a payment dispute (type 4) or a voided trip (type 6)?
3. Find the percentage of payments of credit card vs cash.

In [None]:
# 2. Among the negative-total rides (taxi_pays, built in the "less than $0" cell),
#    count how many were payment disputes (type 4) and how many were voided trips (type 6).
negative_payment_types = taxi_pays["payment_type"]
dispute_count = negative_payment_types.eq(4).sum()
voided_count = negative_payment_types.eq(6).sum()
print(f"{dispute_count} payment dispute trips.")
print(f"{voided_count} voided trips.")

2666 payment dispute trips.
0 voided trips.


In [23]:
# 3. Credit card (type 1) vs cash (type 2) share, measured only against rides
#    paid by one of those two methods.
payment_codes = df["payment_type"]
cc_payments = int(payment_codes.eq(1).sum())
cash_payments = int(payment_codes.eq(2).sum())
# Rides with type 1 or 2 are exactly the card rides plus the cash rides.
total_payments = cc_payments + cash_payments
cc_share = cc_payments / total_payments * 100
cash_share = cash_payments / total_payments * 100
print(f"Of {total_payments} {cc_share:.2f}% were credit card.")
print(f"Of {total_payments} {cash_share:.2f}% were cash.")

Of 7623442 71.96% were credit card.
Of 7623442 28.04% were cash.


In [None]:
# The book points out that value_counts(normalize=True) gives proportions directly.
# Caveat: the proportions here are taken over ALL payment types (disputes,
# no-charge rides, etc.), not just cash and card, so they come out slightly lower
# than the card-vs-cash percentages computed in the previous cell. IMO those
# other types shouldn't be included in the denominator.
payment_type_share = df["payment_type"].value_counts(normalize=True)
payment_type_share.loc[[1, 2]]

payment_type
1    0.715464
2    0.278752
Name: proportion, dtype: float64