In [2]:
import pandas as pd
import numpy as np

In [None]:
data2019 = pd.read_csv(
    "../data/nyc_taxi_2019-07.csv",
    usecols=["passenger_count", "total_amount", "payment_type"],
)
data2019["year"] = 2019
data2020 = pd.read_csv(
    "../data/nyc_taxi_2020-07.csv",
    usecols=["passenger_count", "total_amount", "payment_type"],
)
data2020["year"] = 2020
df = pd.concat([data2019, data2020], ignore_index=True)
df.head()

Unnamed: 0,passenger_count,payment_type,total_amount,year
0,1.0,1.0,4.94,2019
1,1.0,2.0,20.3,2019
2,1.0,1.0,70.67,2019
3,1.0,1.0,66.36,2019
4,0.0,1.0,15.3,2019


1. How many rides were taken in 2019 and 2020, and what is the difference between these two figures?
2. How much money was taken in total in the two years and what was the difference between them?
3. Did the proportion of trips with more than one passenger change dramatically?
4. Did people use cash (`payment_type` 2) less in 2020 than 2019?

In [None]:
# difference in trip numbers
trips19 = df.loc[df["year"] == 2019, "passenger_count"].count()
trips20 = df.loc[df["year"] == 2020, "passenger_count"].count()
print(f"Trips in 2019: {trips19}")
print(f"Trips in 2020: {trips20}")
print(f"Difference: {abs(trips19 - trips20):,}")

Trips in 2019: 6276460
Trips in 2020: 737565
Difference: 5,538,895


In [9]:
# money collected
money19 = df.loc[df["year"] == 2019, "total_amount"].sum()
money20 = df.loc[df["year"] == 2020, "total_amount"].sum()
print(f"Money collected in 2019: ${money19:,.2f}")
print(f"Money collected in 2020: ${money20:,.2f}")
print(f"Difference: ${abs(money19 - money20):,.2f}")

Money collected in 2019: $123,761,823.33
Money collected in 2020: $14,912,844.09
Difference: $108,848,979.24


In [None]:
# single passenger trips
single19a = (
    df.loc[
        (df["year"] == 2019) & (df["passenger_count"] == 1), "passenger_count"
    ].count()
    / df.loc[df["year"] == 2019, "passenger_count"].count()
)
single20a = (
    df.loc[
        (df["year"] == 2020) & (df["passenger_count"] == 1), "passenger_count"
    ].count()
    / df.loc[df["year"] == 2020, "passenger_count"].count()
)
print(
    f"Single fare proportion in 2019 is {single19a*100:.2f}% and 2020 is {single20a*100:.2f}%"
)


single19 = (
    df.loc[df["year"] == 2019, "passenger_count"]
    .value_counts(normalize=True)[[1]]
    .sum()
)
single20 = (
    df.loc[df["year"] == 2020, "passenger_count"]
    .value_counts(normalize=True)[[1]]
    .sum()
)
print(
    f"Single fare proportion in 2019 is {single19*100:.2f}% and 2020 is {single20*100:.2f}%"
)

Single fare proportion in 2019 is 69.80% and 2020 is 76.74%
Single fare proportion in 2019 is 69.80% and 2020 is 76.74%


In [None]:
# proportion of cash users
cash19 = (
    df.loc[(df["year"] == 2019) & (df["payment_type"] == 2), "payment_type"].count()
    / df.loc[df["year"] == 2019, "payment_type"].count()
).sum()
cash20 = (
    df.loc[(df["year"] == 2020) & (df["payment_type"] == 2), "payment_type"].count()
    / df.loc[df["year"] == 2020, "payment_type"].count()
).sum()

print(
    f"Cash user percentage in 2019 is {cash19*100:.2f}% and in 2020 is {cash20*100:.2f}%"
)

Cash user percentage in 2019 is 28.71% and in 2020 is 32.06%


# Beyond the exercise
1. Use the `corr` method on `df` to find the correlations among the columns. How would you interpret these results?
2. Show, with a single command, the difference in descriptive statistics for `total_amount` between 2019 and 2020. Round values to use no more than two decimal places.
3. If we assume that zero-passenger trips are for delivering packages, how were these affected in 2020? Show the proportion of these in 2019 and 2020.

In [None]:
df.corr()
# looking for larger magnitude numbers. The biggest is `payment_type` vs `total_amount` which is
# a very weakly negative correlated number. However, `payment_type` is not a progression, it's
# categorical data, so not super meaningful

Unnamed: 0,passenger_count,payment_type,total_amount,year
passenger_count,1.0,0.01641,0.014943,-0.049558
payment_type,0.01641,1.0,-0.138561,0.029277
total_amount,0.014943,-0.138561,1.0,-0.019706
year,-0.049558,0.029277,-0.019706,1.0


In [None]:
(
    df.loc[df["year"] == 2019, "total_amount"].describe()
    - df.loc[df["year"] == 2020, "total_amount"].describe()
).round(2)
# not much difference overall - shift in the mean of around 1, and the s.d. of 0.75

count    5510007.00
mean           0.98
std            0.75
min          -53.20
25%            0.50
50%            0.60
75%            0.75
max         4672.45
Name: total_amount, dtype: float64

In [None]:
# using `.value_counts(normalize=True)` - but this has produced some different results with others
zero19 = (
    df.loc[df["year"] == 2019, "passenger_count"]
    .value_counts(normalize=True)[[0]]
    .sum()
)
zero20 = (
    df.loc[df["year"] == 2020, "passenger_count"]
    .value_counts(normalize=True)[[0]]
    .sum()
)
print(
    f"Zero passenger in 2019 is {zero19*100:.2f}% and 2020 is {zero20*100:.2f}% delta is {(zero19-zero20)*100:.2f}%"
)

# a comparison using df.loc method
zero19a = (
    df.loc[
        (df["year"] == 2019) & (df["passenger_count"] == 0), "passenger_count"
    ].count()
    / df.loc[df["year"] == 2019, "passenger_count"].count()
)
zero20a = (
    df.loc[
        (df["year"] == 2020) & (df["passenger_count"] == 0), "passenger_count"
    ].count()
    / df.loc[df["year"] == 2020, "passenger_count"].count()
)
print(
    f"Zero passenger in 2019 is {zero19a*100:.2f}% and 2020 is {zero20a*100:.2f}% delta is {(zero19a-zero20a)*100:.2f}%"
)

Zero passenger in 2019 is 1.86% and 2020 is 2.64% delta is -0.78%
Zero passenger in 2019 is 1.86% and 2020 is 2.64% delta is -0.78%


In [None]:
# working out the previous one using counts instead
zero19b = (
    df.loc[
        (df["year"] == 2019) & (df["passenger_count"] == 0), "passenger_count"
    ].count()
    / df["passenger_count"].count()
)
zero20b = (
    df.loc[
        (df["year"] == 2020) & (df["passenger_count"] == 0), "passenger_count"
    ].count()
    / df["passenger_count"].count()
)
print(
    f"Zero passenger in 2019 is {zero19b*100:.2f}% and 2020 is {zero20b*100:.2f}% delta is {(zero19b-zero20b)*100:.2f}%"
)

Zero passenger in 2019 is 1.67% and 2020 is 0.28% delta is 1.39%


In [36]:
df.info

<bound method DataFrame.info of          passenger_count  payment_type  total_amount  year
0                    1.0           1.0          4.94  2019
1                    1.0           2.0         20.30  2019
2                    1.0           1.0         70.67  2019
3                    1.0           1.0         66.36  2019
4                    0.0           1.0         15.30  2019
...                  ...           ...           ...   ...
7110826              NaN           NaN         83.50  2020
7110827              NaN           NaN         19.78  2020
7110828              NaN           NaN         38.45  2020
7110829              NaN           NaN         29.77  2020
7110830              NaN           NaN         51.90  2020

[7110831 rows x 4 columns]>

In [39]:
# how many passenger_count entries do we have that are NaN? We can't use `.count()` since that
# skips NaN values
df.loc[np.isnan(df["passenger_count"]), "passenger_count"].shape[0]

96806

In [None]:
# how many payment_type entries do we have are NaN?
df.loc[np.isnan(df["payment_type"]), "payment_type"].shape[0]
# same number as the passenger_count - bunch of junk data at the end?

96806

In [41]:
# let's find out what the descriptive stats are of those values?
df.loc[np.isnan(df["passenger_count"])].describe()

Unnamed: 0,passenger_count,payment_type,total_amount,year
count,0.0,0.0,96806.0,96806.0
mean,,,39.950912,2019.649206
std,,,18.395952,0.477221
min,,,-91.0,2019.0
25%,,,25.5,2019.0
50%,,,37.75,2020.0
75%,,,51.9,2020.0
max,,,198.51,2020.0
