In [1]:
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import statsmodels.formula.api as sm

# Render every float in displayed frames/series with exactly five decimal places.
pd.set_option('display.float_format', '{:.5f}'.format)


In [2]:
# Load the cleaned trip tables, one parquet file per study month.

def _read_rides(parquet_path):
    """Read a parquet file with memory mapping enabled and return it as a pandas DataFrame."""
    return pq.read_table(parquet_path, memory_map=True).to_pandas()


rides2022 = _read_rides("april_2022_cleaned.parquet")
rides2019 = _read_rides("february_2019_cleaned.parquet")


In [3]:
# CONSTANTS
#
# Rate multipliers for the minimum-rate cells below: each *_PER_MILE_* value is
# multiplied by trip_miles and each *_PER_MIN_* value by trip_time/60.
# The _22 constants feed the rides2022 calculation, the _19 constants the
# rides2019 calculation. WAV / NON_WAV map to wav_request_flag 'Y' / 'N';
# OUT_CITY_* apply to rows where out_of_city is True.
# NOTE(review): presumably dollar amounts from the applicable driver pay
# standard for each period -- confirm the source and effective dates.

# 2022, in-city trips
NON_WAV_PER_MILE_22 = 1.161
NON_WAV_PER_MIN_22 = 0.529

WAV_PER_MILE_22 = 1.504
WAV_PER_MIN_22 = 0.529

# 2022, out-of-city trips (one per-minute rate shared by WAV and non-WAV)
OUT_CITY_NON_WAV_PER_MILE_22 = 1.348
OUT_CITY_WAV_PER_MILE_22 = 1.746
OUT_CITY_PER_MIN_22 = 0.613


# 2019, in-city trips
NON_WAV_PER_MILE_19 = 0.914
NON_WAV_PER_MIN_19 = 0.416

WAV_PER_MILE_19 = 1.186
WAV_PER_MIN_19 = 0.416


# 2019, out-of-city trips (one per-minute rate shared by WAV and non-WAV)
OUT_CITY_NON_WAV_PER_MILE_19 = 1.262
OUT_CITY_WAV_PER_MILE_19 = 1.636
OUT_CITY_PER_MIN_19 = 0.574

# Company codes; not referenced by any cell visible in this notebook section.
# NOTE(review): presumably values of a license-number column identifying each
# company -- confirm against the dataset schema.
JUNO = "HV0002"
UBER = "HV0003"
VIA = "HV0004"
LYFT = "HV0005"

In [4]:
# Minimum required pay for each 2022 trip, selected by WAV request flag and
# by whether the trip is flagged out_of_city.

miles22 = rides2022['trip_miles']
minutes22 = rides2022['trip_time'] / 60

non_wav22 = rides2022['wav_request_flag'].eq('N')
wav22 = rides2022['wav_request_flag'].eq('Y')
inside22 = rides2022['out_of_city'].eq(False)
outside22 = rides2022['out_of_city'].eq(True)

# The order of the conditions must match the order of the choices below.
conditions2022 = [
    non_wav22 & inside22,
    wav22 & inside22,
    non_wav22 & outside22,
    wav22 & outside22,
]

choices2022 = [
    miles22 * NON_WAV_PER_MILE_22 + minutes22 * NON_WAV_PER_MIN_22,
    miles22 * WAV_PER_MILE_22 + minutes22 * WAV_PER_MIN_22,
    miles22 * OUT_CITY_NON_WAV_PER_MILE_22 + minutes22 * OUT_CITY_PER_MIN_22,
    miles22 * OUT_CITY_WAV_PER_MILE_22 + minutes22 * OUT_CITY_PER_MIN_22,
]

# Rows matching none of the four conditions fall back to the in-city non-WAV
# formula, which is exactly the first choice.
rides2022['min_rate'] = np.select(conditions2022, choices2022, default=choices2022[0])

# Drop the cell-local helpers so they do not linger in the kernel namespace.
del miles22, minutes22, non_wav22, wav22, inside22, outside22


In [5]:
# Minimum required pay for each 2019 trip, selected by WAV request flag and
# by whether the trip is flagged out_of_city.

miles19 = rides2019['trip_miles']
minutes19 = rides2019['trip_time'] / 60

non_wav19 = rides2019['wav_request_flag'].eq('N')
wav19 = rides2019['wav_request_flag'].eq('Y')
inside19 = rides2019['out_of_city'].eq(False)
outside19 = rides2019['out_of_city'].eq(True)

# The order of the conditions must match the order of the choices below.
conditions2019 = [
    non_wav19 & inside19,
    wav19 & inside19,
    non_wav19 & outside19,
    wav19 & outside19,
]

choices2019 = [
    miles19 * NON_WAV_PER_MILE_19 + minutes19 * NON_WAV_PER_MIN_19,
    miles19 * WAV_PER_MILE_19 + minutes19 * WAV_PER_MIN_19,
    miles19 * OUT_CITY_NON_WAV_PER_MILE_19 + minutes19 * OUT_CITY_PER_MIN_19,
    miles19 * OUT_CITY_WAV_PER_MILE_19 + minutes19 * OUT_CITY_PER_MIN_19,
]

# Rows matching none of the four conditions fall back to the in-city non-WAV
# formula, which is exactly the first choice.
rides2019['min_rate'] = np.select(conditions2019, choices2019, default=choices2019[0])

# Drop the cell-local helpers so they do not linger in the kernel namespace.
del miles19, minutes19, non_wav19, wav19, inside19, outside19



In [6]:
# Dollar gap between actual driver pay and the computed minimum rate
# (negative when the driver was paid less than min_rate).

rides2022['over_min_rate'] = rides2022['driver_pay'].sub(rides2022['min_rate'])
rides2019['over_min_rate'] = rides2019['driver_pay'].sub(rides2019['min_rate'])

In [7]:
# The same gap expressed as a fraction of the minimum rate (0.10 == 10% over).

rides2022['perc_over_min_rate'] = rides2022['over_min_rate'].div(rides2022['min_rate'])
rides2019['perc_over_min_rate'] = rides2019['over_min_rate'].div(rides2019['min_rate'])

In [8]:
# Per-trip passenger fare rates: base fare per mile and per minute (trip_time / 60).

for rides in (rides2022, rides2019):
    base_fare = rides['base_passenger_fare']
    rides['passenger_fare_mile'] = base_fare / rides['trip_miles']
    rides['passenger_fare_min'] = base_fare / (rides['trip_time'] / 60)

## Driver Pay Analysis


In [9]:
# Restate 2019 driver pay in 2022 dollars.
# 1.159 is the multiplier used here for inflation between 2019 and 2022.

rides2019['driver_pay_adj'] = rides2019['driver_pay'].mul(1.159)


#### Distribution of driver pay (in 2022 dollars)

In [10]:
# Summary statistics of inflation-adjusted 2019 driver pay.
rides2019['driver_pay_adj'].describe()


count   17889002.00000
mean          17.95874
std           16.98783
min            0.03477
25%            7.84643
50%           12.62151
75%           21.25606
max          994.09748
Name: driver_pay_adj, dtype: float64

In [11]:
# Summary statistics of 2022 driver pay.
rides2022['driver_pay'].describe()

count   17669620.00000
mean          19.09101
std           15.26613
min            0.04000
25%            9.13000
50%           14.86000
75%           23.96000
max         2053.10000
Name: driver_pay, dtype: float64

#### Distribution of the fraction by which driver earnings exceed the minimum required rate for the trip (In 2022, the bottom 25% of trips paid less than 0.5% over the minimum rate, the top 25% of trips paid at least 31% over the minimum rate, and the median trip paid about 7.7% over the minimum rate)

In [12]:
# Summary statistics of the 2022 fraction over the minimum rate.
rides2022['perc_over_min_rate'].describe()

count   17669620.00000
mean           0.30308
std            1.52763
min           -0.99965
25%            0.00473
50%            0.07713
75%            0.31204
max         3272.78962
Name: perc_over_min_rate, dtype: float64

In [13]:
# Summary statistics of the 2019 fraction over the minimum rate.
rides2019['perc_over_min_rate'].describe()

count   17889002.00000
mean           0.34397
std            1.09115
min           -0.99987
25%            0.19565
50%            0.21160
75%            0.39533
max         3170.68937
Name: perc_over_min_rate, dtype: float64

#### Share of rides (as a fraction of all rides) where the driver made more than 5% over the minimum required rate

In [14]:
# Fraction of 2022 rides whose driver pay is more than 5% above the minimum rate.
# Counting the True entries of the boolean mask gives the same number of rows as
# len() of the filtered frame, without copying the filtered frame first.
int((rides2022['perc_over_min_rate'] > 0.05).sum()) / len(rides2022)

0.5582869354292849

In [15]:
# Fraction of 2019 rides whose driver pay is more than 5% above the minimum rate.
# Counting the True entries of the boolean mask gives the same number of rows as
# len() of the filtered frame, without copying the filtered frame first.
int((rides2019['perc_over_min_rate'] > 0.05).sum()) / len(rides2019)

0.8319822424973735

#### Share of rides (as a fraction of all rides) where the driver made more than 10% over the minimum required rate

In [16]:
# Fraction of 2022 rides whose driver pay is more than 10% above the minimum rate.
# Counting the True entries of the boolean mask gives the same number of rows as
# len() of the filtered frame, without copying the filtered frame first.
int((rides2022['perc_over_min_rate'] > 0.10).sum()) / len(rides2022)

0.45310289638373663

In [17]:
# Fraction of 2019 rides whose driver pay is more than 10% above the minimum rate.
# Counting the True entries of the boolean mask gives the same number of rows as
# len() of the filtered frame, without copying the filtered frame first.
int((rides2019['perc_over_min_rate'] > 0.10).sum()) / len(rides2019)

0.8197035251044189

#### Share of rides (as a fraction of all rides) where the driver made more than 20% over the minimum required rate

In [18]:
# Fraction of 2022 rides whose driver pay is more than 20% above the minimum rate.
# Counting the True entries of the boolean mask gives the same number of rows as
# len() of the filtered frame, without copying the filtered frame first.
int((rides2022['perc_over_min_rate'] > 0.20).sum()) / len(rides2022)

0.325850640817403

In [19]:
# Fraction of 2019 rides whose driver pay is more than 20% above the minimum rate.
# Counting the True entries of the boolean mask gives the same number of rows as
# len() of the filtered frame, without copying the filtered frame first.
int((rides2019['perc_over_min_rate'] > 0.20).sum()) / len(rides2019)

0.5801210151354447

#### Distribution of driver pay for rides where the driver made more than 20% over the minimum rate (2022 dollars)

In [20]:
# Driver-pay summary for 2022 rides paying more than 20% above the minimum rate.
# .loc picks the single column under the mask, rather than copying every column
# of the filtered frame and then selecting one.
rides2022.loc[rides2022['perc_over_min_rate'] > .2, 'driver_pay'].describe()

count   5757657.00000
mean         17.64329
std          15.35353
min           0.83000
25%           7.82000
50%          13.57000
75%          22.19000
max        2053.10000
Name: driver_pay, dtype: float64

In [22]:
# Inflation-adjusted driver-pay summary for 2019 rides paying more than 20%
# above the minimum rate. .loc picks the single column under the mask, rather
# than copying every column of the filtered frame and then selecting one.
rides2019.loc[rides2019['perc_over_min_rate'] > .2, 'driver_pay_adj'].describe()

count   10377786.00000
mean          18.72158
std           18.39862
min            0.09272
25%            7.26693
50%           12.32017
75%           22.43824
max          982.25250
Name: driver_pay_adj, dtype: float64

## Passenger Fare Analysis

In [23]:
# Restate the 2019 passenger fare columns in 2022 dollars.
# 1.159 is the multiplier used here for inflation between 2019 and 2022.

for fare_col, adjusted_col in (
    ('base_passenger_fare', 'passenger_fare_adj'),
    ('passenger_fare_min', 'passenger_fare_min_adj'),
    ('passenger_fare_mile', 'passenger_fare_mile_adj'),
):
    rides2019[adjusted_col] = rides2019[fare_col] * 1.159



#### Distribution of passenger fare (2022 Dollars)

In [24]:
# Summary statistics of the 2022 base passenger fare.
rides2022['base_passenger_fare'].describe()

count   17669620.00000
mean          24.12110
std           19.48718
min            0.02000
25%           11.91000
50%           18.87000
75%           29.81000
max         3732.90000
Name: base_passenger_fare, dtype: float64

In [25]:
# Summary statistics of the inflation-adjusted 2019 passenger fare.
rides2019['passenger_fare_adj'].describe()

count   17889002.00000
mean          19.14969
std           19.31890
min            0.02318
25%            8.26367
50%           13.23578
75%           23.14523
max         1271.75911
Name: passenger_fare_adj, dtype: float64

#### Average fare per minute cost (2022 Dollars)

In [26]:
# Mean of the per-trip fare-per-minute values for 2022
# (an average of per-trip rates, not total fare divided by total minutes).
rides2022['passenger_fare_min'].mean()

1.3393001986143314

In [27]:
# Mean of the inflation-adjusted per-trip fare-per-minute values for 2019
# (an average of per-trip rates, not total fare divided by total minutes).
rides2019['passenger_fare_min_adj'].mean()

1.0677118565277983

#### Average fare per mile cost (2022 Dollars)

In [28]:
# Mean of the per-trip fare-per-mile values for 2022
# (an average of per-trip rates, not total fare divided by total miles).
rides2022['passenger_fare_mile'].mean()

6.909444600134648

In [29]:
# Mean of the inflation-adjusted per-trip fare-per-mile values for 2019
# (an average of per-trip rates, not total fare divided by total miles).
rides2019['passenger_fare_mile_adj'].mean()

5.542751940034122

#### Regression of passenger fare on trip miles and trip minutes (In 2022, each additional mile per trip is associated with a 2.03 dollar higher fare and each additional minute per trip with a 0.47 dollar higher fare, holding the other variable fixed)



In [32]:
# Trip duration in minutes (trip_time divided by 60), used as a regressor below.
for rides in (rides2019, rides2022):
    rides['trip_time_min'] = rides['trip_time'].div(60)



In [33]:
# OLS of the 2022 base passenger fare on trip distance and duration.
model = sm.ols(
    formula="base_passenger_fare ~ trip_miles + trip_time_min",
    data=rides2022,
)
result = model.fit()
print(result.params)

Intercept       4.74943
trip_miles      2.02529
trip_time_min   0.47004
dtype: float64


In [34]:
# OLS of the inflation-adjusted 2019 passenger fare on trip distance and duration.
model = sm.ols(
    formula="passenger_fare_adj ~ trip_miles + trip_time_min",
    data=rides2019,
)
result = model.fit()
print(result.params)

Intercept       1.84069
trip_miles      2.47411
trip_time_min   0.29684
dtype: float64
