In [2]:
import numpy as np
import pandas as pd
from scipy.stats import trimboth, zscore

In [35]:
# Each CSV holds a single unnamed column. Squeeze only the column axis so the
# result is always a Series (a bare .squeeze() collapses a one-row file to a scalar).
passengers = pd.read_csv("../data/taxi-passenger-count.csv", header=None).squeeze(
    "columns"
)
distances = pd.read_csv("../data/taxi-distance.csv", header=None).squeeze("columns")

# Bucket trips by distance: [0, 2] short, (2, 10] medium, above 10 long.
# The open-ended top edge keeps the bin edges strictly increasing even when the
# longest trip is 10 or less; using distances.max() as the last edge makes
# pd.cut raise in that case.
trip_type = pd.cut(
    distances,
    bins=[0, 2, 10, np.inf],
    labels=["short", "medium", "long"],
    include_lowest=True,
)

# Build the frame from a dict of Series, as the book recommends.
# pd.DataFrame([passengers, distances]) also works, but needs a .transpose().
df = pd.DataFrame(
    {"distance": distances, "passengers": passengers, "trip_type": trip_type}
)
df.head()

Unnamed: 0,distance,passengers,trip_type
0,1.63,1,short
1,0.46,1,short
2,0.87,1,short
3,2.13,1,medium
4,1.4,1,short


In [None]:
# The book's IQR rule for outliers: with Q1 = quantile(0.25), Q3 = quantile(0.75)
# and IQR = Q3 - Q1, a value is an outlier when it lies outside the range
# (Q1 - 1.5 * IQR) to (Q3 + 1.5 * IQR).
# An alternate definition is values more than 2 s.d. from the mean.

# Compute each pair of quartiles once instead of re-evaluating them per use.
q1_p, q3_p = df["passengers"].quantile([0.25, 0.75])
q1_d, q3_d = df["distance"].quantile([0.25, 0.75])

# NOTE: iqr_p / iqr_d hold the fence offset (1.5 * IQR), not the raw IQR.
# Later cells reference them by these names, so the names are kept.
iqr_p = (q3_p - q1_p) * 1.5
iqr_d = (q3_d - q1_d) * 1.5

# The printed range is the non-outlier band; outliers are the values outside it.
print(
    f"IQR modifier for distance is {iqr_d:.2f} so outliers fall outside {q1_d - iqr_d:.2f} to {q3_d + iqr_d:.2f}"
)
print(
    f"IQR modifier for passengers is {iqr_p:.2f} so outliers fall outside {q1_p - iqr_p:.2f} to {q3_p + iqr_p:.2f}"
)

IQR modifier for distance is 3.45 so outliers from -2.45 to 6.75
IQR modifier for passengers is 1.50 so outliers from -0.50 to 3.50


In [None]:
# How many trip distances fall outside the IQR fences?
# iqr_d is the 1.5 * IQR offset computed in the previous cell.
d_low = df["distance"].quantile(0.25) - iqr_d
d_high = df["distance"].quantile(0.75) + iqr_d
is_distance_outlier = (df["distance"] < d_low) | (df["distance"] > d_high)
distance_outliers = df[is_distance_outlier]

# Compare passenger counts on those trips against all trips (mean and median).
summary_lines = (
    f"There are {distance_outliers['distance'].count()} distance outliers.",
    f"The mean passengers for these trips is {distance_outliers['passengers'].mean():.2f}",
    f"The overall passenger mean is {df['passengers'].mean():.2f}",
    f"Overall passenger median is {df['passengers'].median()} vs {distance_outliers['passengers'].median()} for outliers.",
)
for line in summary_lines:
    print(line)

There are 1219 distance outliers.
The mean passengers for these trips is 1.73
The overall passenger mean is 1.66
Overall passenger median is 1.0 vs 1.0 for outliers.


In [None]:
# what is the mean for outlier passengers? how does it compare with the mean for all passengers?
# Single quotes inside the f-string: reusing the outer double quotes is only
# valid syntax on Python 3.12+, and the other cells already use this form.
print(f"Mean for all trips for passengers: {df['passengers'].mean()}")

# Passenger counts outside the IQR fences; iqr_p is the 1.5 * IQR offset
# computed in an earlier cell and pulled into the query with @.
p_outliers = df.query(
    "passengers < passengers.quantile(0.25) - @iqr_p | passengers > passengers.quantile(0.75) + @iqr_p"
)["passengers"]

print(
    f"Mean for passenger outliers is: {p_outliers.mean()} vs median of {p_outliers.median()}"
)


Mean for all trips for passengers: 1.6594659465946595
Mean for passenger outliers is: 5.174603174603175 vs median of 5.0


# Extension questions
1. Define the outliers to be the top and bottom 10% of values. How many are there? Why is or isn't this a good measure?
2. How many short, medium, and long trips only had one passenger? You can use the `scipy.stats.trimboth` function to remove outlier values: it slices off a given proportion of the values from both the bottom and the top.
3. The `scipy.stats.zscore` function rescales and centres the data set (it normalises it), so the mean becomes 0 and values can fall above or below it. Find all instances for which the absolute value of the z-score is greater than 3.

In [None]:
# 1. Treat the top and bottom 10% of distances as outliers
#    (cut-offs at `.quantile(0.9)` and `.quantile(0.1)`).
trip_distance = df["distance"]
top_n = trip_distance[trip_distance > trip_distance.quantile(0.9)]
bottom_n = trip_distance[trip_distance < trip_distance.quantile(0.1)]

print(
    f"There are {bottom_n.count()} low end outliers and {top_n.count()} top end outliers ({bottom_n.count() + top_n.count()} total)."
)

# Notes on the result:
# - 1984 total vs the 1219 from the IQR method (which had 0 low end outliers
#   and all top end).
# - This method includes the ultra short trips, which the IQR method ignored.
# - The upper end cutoff is roughly 1.5-2mi higher than the IQR method's, which
#   from eyeballing the data distribution is not actually that bad.
# Swap in bottom_n.describe() to inspect the low end instead.
top_n.describe()

There are 985 low end outliers and 999 top end outliers (1984 total).


count    999.000000
mean      13.304845
std        4.999922
min        8.210000
25%        9.770000
50%       11.400000
75%       16.445000
max       64.600000
Name: distance, dtype: float64

In [None]:
# 2. How many short, medium, and long trips only had one passenger?
# (This was actually a dupe of the Ex 7 question.)
# Only the last expression of a cell is echoed, so print the counts explicitly;
# otherwise the answer is computed and silently discarded.
solo_trip_counts = df.query("passengers == 1")["trip_type"].value_counts()
print(solo_trip_counts)

# trimboth slices the given proportion off both ends of the values and returns
# what remains as an array: here, the distances with the bottom and top 10% removed.
trimboth(df["distance"], 0.1)

array([0.63, 0.63, 0.63, ..., 8.2 , 8.2 , 8.2 ])

In [None]:
# 3. Calculate the z-score and find all values where the absolute z-score is greater than 3.
# zscore is documented as returning an array, so wrap the result in a Series
# that shares df's index; the boolean mask below then selects rows of df directly.
dists = pd.Series(zscore(df["distance"].to_numpy()), index=df.index, name="zscore")
z_outliers = df.loc[dists.abs() > 3, "distance"]

# Summarise the outlying distances themselves (count, min, max) rather than
# counting z-scores: a bare count cannot show the range quoted in the notes below.
z_outliers.describe()

# 306 values are outside 3 standard deviations of the mean for distances
# 15.3mi - 64.6mi range (unsurprisingly nothing on the low end)

count    306.000000
mean      19.247418
std        4.898164
min       15.300000
25%       16.902500
50%       18.005000
75%       19.907500
max       64.600000
Name: distance, dtype: float64