In [1]:
import pandas as pd
from pandas import Series
import numpy as np
import math

In [None]:
# Load the single-column CSV of trip distances as a named Series.
# `squeeze("columns")` collapses only the column axis, so a one-column file
# always comes back as a Series; a bare `squeeze()` would also collapse the row
# axis and hand back a scalar if the file happened to contain a single row.
s = (
    pd.read_csv("../data/taxi-distance.csv", header=None)
    .squeeze("columns")
    .rename("Taxi Trip Distance")
)
s.describe()

count    9999.000000
mean        3.158511
std         4.037516
min         0.000000
25%         1.000000
50%         1.700000
75%         3.300000
max        64.600000
Name: Taxi Trip Distance, dtype: float64

In [16]:
# Categorise trips into Short (<= 2mi), Medium (<= 10mi), Long (> 10mi).
# Hand-rolled approach, written before consulting the book: build one boolean
# mask per category, count the True values in each mask, and collect the three
# totals in a small Series indexed by the category name.
s2 = Series(
    {
        "Short": (s <= 2.0).sum(),
        "Medium": ((s > 2.0) & (s <= 10.0)).sum(),
        "Long": (s > 10.0).sum(),
    }
)
print(f"Short trips: {s2['Short']}")
print(f"Medium trips: {s2['Medium']}")
print(f"Long trips: {s2['Long']}")
s2

Short trips: 5890
Medium trips: 3402
Long trips: 707


Short     5890
Medium    3402
Long       707
dtype: int64

In [None]:
# The recommended method is to use `pd.cut` to cut the series into a series of bins
# that can have category labels applied to it, and seems much more sensible.
# Bin edges are right-inclusive; `include_lowest=True` also closes the first bin on
# the left so 0-mile trips land in "Short".
# The top edge is open-ended (np.inf) rather than `s.max()`, so the edges stay
# strictly increasing even if no trip in the data is longer than 10 miles.
s3 = pd.cut(
    s,
    bins=[0, 2, 10, np.inf],
    include_lowest=True,
    labels=["Short", "Medium", "Long"],
)
# Only the last expression of a cell is displayed; run `s3.value_counts()` as the
# final line of its own cell to see the per-category counts.
s3  # note the output that shows the hierarchy of bins when they are printed out

0        Short
1        Short
2        Short
3       Medium
4        Short
         ...  
9994    Medium
9995    Medium
9996    Medium
9997     Short
9998    Medium
Name: Taxi Trip Distance, Length: 9999, dtype: category
Categories (3, object): ['Short' < 'Medium' < 'Long']

In [19]:
# Extension questions
# 1. Compare mean and median trip distances. What does this tell you about the distribution of data?
# The median is printed on its own line; the mean appears in the summary table below it.
median_distance = s.median()
print(f"Median: {median_distance}")
s.describe()

Median: 1.7


count    9999.000000
mean        3.158511
std         4.037516
min         0.000000
25%         1.000000
50%         1.700000
75%         3.300000
max        64.600000
Name: Taxi Trip Distance, dtype: float64

The median is 1.7 mi, while the mean is 3.16 mi with a standard deviation of about 4 mi.

The max is 64.6 mi, so most trips sit at the low end of the range and the distribution is
right-skewed (a long tail of high values): the mean is being brought up by a few long-distance
trips, which is why it is well above the median.

In [None]:
# 2. How many short, medium, and long trips were for trips that only had one
#    passenger?
# Load the single-column passenger counts; `squeeze("columns")` collapses only the
# column axis, so the result stays a Series even if the file has just one row.
passengers = (
    pd.read_csv("../data/taxi-passenger-count.csv", header=None)
    .squeeze("columns")
    .rename("Taxi Passengers")
)
# Use a mask of passengers == 1 with the trip distance category data,
# since we have been told the values in the two files correspond row by row.
# This is the point where keeping both columns in one DataFrame would make
# the pairing explicit instead of relying on matching indexes.
s3.loc[passengers == 1].value_counts()

Taxi Trip Distance
Short     4333
Medium    2387
Long       487
Name: count, dtype: int64

In [23]:
# 3. What happens if we don't provide interval cutoffs to cut and just ask for 3 bins?
# Print max / 3 first so it can be compared with the bin edges pandas picks.
third_of_max = s.max() / 3
print(f"A third of the max value is: {third_of_max}")
pd.cut(s, bins=3)

A third of the max value is: 21.53333333333333


0       (-0.0646, 21.533]
1       (-0.0646, 21.533]
2       (-0.0646, 21.533]
3       (-0.0646, 21.533]
4       (-0.0646, 21.533]
              ...        
9994    (-0.0646, 21.533]
9995    (-0.0646, 21.533]
9996    (-0.0646, 21.533]
9997    (-0.0646, 21.533]
9998    (-0.0646, 21.533]
Name: Taxi Trip Distance, Length: 9999, dtype: category
Categories (3, interval[float64, right]): [(-0.0646, 21.533] < (21.533, 43.067] < (43.067, 64.6]]

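This looks like it has just split the data into three equal-width ranges based on the minimum and
maximum values in the data series, so the bins are equal-width rather than equal-count. The lowest
edge sits slightly below the minimum (-0.0646 rather than 0) so that the minimum value itself is included.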