In [1]:
import pandas as pd
import numpy as np

In [None]:
df = pd.read_csv(
    "../data/nyc_taxi_2019-01.csv",
    usecols=["passenger_count", "trip_distance", "total_amount"],
)
df.head()

Unnamed: 0,passenger_count,trip_distance,total_amount
0,1,1.5,9.95
1,1,2.6,16.3
2,3,0.0,5.8
3,5,0.0,7.55
4,5,0.0,55.55


1. Load the taxi data from Jan 2019 using `passenger_count`, `trip_distance` and `total_amount`
2. For each number of passengers, find the mean cost of a taxi ride. Sort this result from lowest to highest.
3. Sort the results again by increasing the number of passengers.
4. Create a new column, `trip_distance_group` in which the values a short (< 2 miles), medium (>= 2 miles and <= 10 miles) and long (> 10 miles). What is the average number of passengers per trip length category? Sort this result from highest (most passengers) to lowest (fewest passengers)

In [None]:
df["trip_distance_group"] = pd.cut(
    df["trip_distance"],
    bins=[0, 2, 10, float("inf")],
    labels=["short", "medium", "long"],
    include_lowest=True,
)
df.head()

Unnamed: 0,passenger_count,trip_distance,total_amount,trip_distance_group
0,1,1.5,9.95,short
1,1,2.6,16.3,medium
2,3,0.0,5.8,short
3,5,0.0,7.55,short
4,5,0.0,55.55,short


In [None]:
# Average cost of trip by passenger count from lowest mean cost to highest mean cost
df.groupby("passenger_count")["total_amount"].mean().sort_values()

passenger_count
6    15.437892
5    15.546940
3    15.604015
1    15.609601
4    15.650307
2    15.831294
0    18.663658
9    31.094444
7    48.278421
8    64.105517
Name: total_amount, dtype: float64

In [12]:
# Average cost of trip by passenger count from lowest passenger count to highest passenger count
df.groupby("passenger_count")["total_amount"].mean()

passenger_count
0    18.663658
1    15.609601
2    15.831294
3    15.604015
4    15.650307
5    15.546940
6    15.437892
7    48.278421
8    64.105517
9    31.094444
Name: total_amount, dtype: float64

In [None]:
# What is the average number of passengers per trip length category? Sort this result from highest (most passengers) to lowest (fewest passengers)
df.groupby("trip_distance_group", observed=True)["passenger_count"].mean().sort_values(
    ascending=False
)

trip_distance_group
long      1.590035
medium    1.585319
short     1.555906
Name: passenger_count, dtype: float64

# Extension questions
1. Create a single data frame containing rides from both January 2019 and January 2020 with a column `year` indicating which year the ride comes from. Use a `groupby` to compare the average cost of a taxi in January from each of these two years.
2. Create a two-level grouping, first by `year` and then by `passenger_count`.
3. The `corr` method allows us to see how strongly two columns correlate with one another. Use `corr` and then `sort_values()` to find which columns have the highest correlation.

In [None]:
t19 = pd.read_csv(
    "../data/nyc_taxi_2019-01.csv",
    usecols=["passenger_count", "total_amount", "trip_distance"],
)
t20 = pd.read_csv(
    "../data/nyc_taxi_2020-01.csv",
    usecols=["passenger_count", "total_amount", "trip_distance"],
)
t19["year"] = 2019
t20["year"] = 2020
trips = pd.concat([t19, t20], ignore_index=True)
trips.head()

Unnamed: 0,passenger_count,trip_distance,total_amount,year
0,1.0,1.5,9.95,2019
1,1.0,2.6,16.3,2019
2,3.0,0.0,5.8,2019
3,5.0,0.0,7.55,2019
4,5.0,0.0,55.55,2019


In [18]:
trips.groupby("year")["total_amount"].mean()

year
2019    15.682222
2020    18.663149
Name: total_amount, dtype: float64

In [29]:
trips.groupby(["year", "passenger_count"])["total_amount"].mean()

year  passenger_count
2019  0.0                18.663658
      1.0                15.609601
      2.0                15.831294
      3.0                15.604015
      4.0                15.650307
      5.0                15.546940
      6.0                15.437892
      7.0                48.278421
      8.0                64.105517
      9.0                31.094444
2020  0.0                18.059724
      1.0                18.343110
      2.0                19.050504
      3.0                18.736862
      4.0                19.128092
      5.0                18.234443
      6.0                18.367962
      7.0                71.143103
      8.0                58.197059
      9.0                81.244211
Name: total_amount, dtype: float64

In [None]:
trips.corr().sort_values("passenger_count", ascending=False)
# essentially no correlation between passenger_count and other values so
# no real incentive to take on more passengers (particularly since there
# are far fewer larger passenger groups anyway)


Unnamed: 0,passenger_count,trip_distance,total_amount,year
passenger_count,1.0,0.008974,-0.000136,-0.021602
trip_distance,0.008974,1.0,0.004331,0.00114
total_amount,-0.000136,0.004331,1.0,0.007657
year,-0.021602,0.00114,0.007657,1.0


In [None]:
# but let's look at removing some of the junk data from this first
# strip out:
#   - 0 passenger rides
#   - trips with 0 distance
#   - chargebacks and other negative fare totals
(
    trips.loc[
        (trips["passenger_count"] > 0)
        & (trips["trip_distance"] > 0)
        & (trips["total_amount"] > 0)
    ]
    .corr()
    .sort_values("passenger_count", ascending=False)
)
# ...and we get: even weaker correlations!

Unnamed: 0,passenger_count,trip_distance,total_amount,year
passenger_count,1.0,0.007072,8.7e-05,-0.019575
trip_distance,0.007072,1.0,0.074333,0.007913
total_amount,8.7e-05,0.074333,1.0,0.008473
year,-0.019575,0.007913,0.008473,1.0
