In [2]:
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import statsmodels.formula.api as sm
#import matplotlib.pyplot as plt

def _format_five_decimals(value):
    """Render a number with exactly five digits after the decimal point."""
    return '%.5f' % value


# Show every float in pandas output with five decimal places.
pd.set_option('display.float_format', _format_five_decimals)


In [3]:
# Load the cleaned trip extracts into pandas DataFrames.
# NOTE(review): the 2022 extract is April and the 2019 extract is February, so
# year-over-year comparisons below may also pick up seasonal differences — confirm intended.

def _read_cleaned_trips(parquet_path):
    """Read one cleaned Parquet extract (memory-mapped) and return it as a pandas DataFrame."""
    return pq.read_table(parquet_path, memory_map=True).to_pandas()


rides2022 = _read_cleaned_trips("april_2022_cleaned.parquet")
rides2019 = _read_cleaned_trips("february_2019_cleaned.parquet")


In [4]:
# CONSTANTS

# TODO: change the rates so that there are company-specific values as well
# (for 2019, need Via rates); maybe store them in a DataFrame.

# How the rates are used further down when the per-trip minimum pay ("min_rate") is built:
#   min_rate = trip_miles * <..._PER_MILE_..> + (trip_time / 60) * <..._PER_MIN_..>
# The _22 constants are applied to rides2022 and the _19 constants to rides2019.
# NOTE(review): values are presumably dollars per mile / per minute — confirm against the source pay rules.

# 2022, out_of_city == False, wav_match_flag == 'N' (also the fallback when no condition matches)
NON_WAV_PER_MILE_22 = 1.161
NON_WAV_PER_MIN_22 = 0.529

# 2022, out_of_city == False, wav_match_flag == 'Y'
WAV_PER_MILE_22 = 1.504
WAV_PER_MIN_22 = 0.529

# 2022, out_of_city == True; the per-minute rate is shared by WAV and non-WAV trips
OUT_CITY_NON_WAV_PER_MILE_22 = 1.348
OUT_CITY_WAV_PER_MILE_22 = 1.746
OUT_CITY_PER_MIN_22 = 0.613


# 2019, out_of_city == False, wav_match_flag == 'N' (also the fallback when no condition matches)
NON_WAV_PER_MILE_19 = 0.914
NON_WAV_PER_MIN_19 = 0.416

# 2019, out_of_city == False, wav_match_flag == 'Y'
WAV_PER_MILE_19 = 1.186
WAV_PER_MIN_19 = 0.416


# 2019, out_of_city == True; the per-minute rate is shared by WAV and non-WAV trips
OUT_CITY_NON_WAV_PER_MILE_19 = 1.262
OUT_CITY_WAV_PER_MILE_19 = 1.636
OUT_CITY_PER_MIN_19 = 0.574

# Company codes as they appear in the hvfhs_license_num column.
# In the cells shown here only VIA is used (to filter Via trips out of both years).
JUNO = "HV0002"
UBER = "HV0003"
VIA = "HV0004"
LYFT = "HV0005"

In [5]:
# Minimum required pay per 2022 trip, selected by WAV flag and out-of-city status

def _min_pay_2022(per_mile, per_min):
    """Return trip_miles * per_mile + (trip_time / 60) * per_min for every 2022 trip."""
    return rides2022['trip_miles'] * per_mile + (rides2022['trip_time'] / 60) * per_min


# (wav_match_flag value, out_of_city value, per-mile rate, per-minute rate);
# the order is the priority order handed to np.select.
rate_rules_2022 = (
    ('N', False, NON_WAV_PER_MILE_22, NON_WAV_PER_MIN_22),
    ('Y', False, WAV_PER_MILE_22, WAV_PER_MIN_22),
    ('N', True, OUT_CITY_NON_WAV_PER_MILE_22, OUT_CITY_PER_MIN_22),
    ('Y', True, OUT_CITY_WAV_PER_MILE_22, OUT_CITY_PER_MIN_22),
)

conditions2022 = [
    (rides2022['wav_match_flag'] == wav_flag) & (rides2022['out_of_city'] == outside_city)
    for wav_flag, outside_city, _, _ in rate_rules_2022
]

choices2022 = [
    _min_pay_2022(per_mile, per_min)
    for _, _, per_mile, per_min in rate_rules_2022
]

# Trips matching none of the four rules fall back to the in-city non-WAV formula.
# NOTE(review): that silently covers any wav_match_flag other than 'Y'/'N' — confirm intended.
rides2022['min_rate'] = np.select(
    conditions2022,
    choices2022,
    default=_min_pay_2022(NON_WAV_PER_MILE_22, NON_WAV_PER_MIN_22),
)


In [6]:
# Minimum required pay per 2019 trip, selected by WAV flag and out-of-city status

def _min_pay_2019(per_mile, per_min):
    """Return trip_miles * per_mile + (trip_time / 60) * per_min for every 2019 trip."""
    return rides2019['trip_miles'] * per_mile + (rides2019['trip_time'] / 60) * per_min


# (wav_match_flag value, out_of_city value, per-mile rate, per-minute rate);
# the order is the priority order handed to np.select.
rate_rules_2019 = (
    ('N', False, NON_WAV_PER_MILE_19, NON_WAV_PER_MIN_19),
    ('Y', False, WAV_PER_MILE_19, WAV_PER_MIN_19),
    ('N', True, OUT_CITY_NON_WAV_PER_MILE_19, OUT_CITY_PER_MIN_19),
    ('Y', True, OUT_CITY_WAV_PER_MILE_19, OUT_CITY_PER_MIN_19),
)

conditions2019 = [
    (rides2019['wav_match_flag'] == wav_flag) & (rides2019['out_of_city'] == outside_city)
    for wav_flag, outside_city, _, _ in rate_rules_2019
]

choices2019 = [
    _min_pay_2019(per_mile, per_min)
    for _, _, per_mile, per_min in rate_rules_2019
]

# Trips matching none of the four rules fall back to the in-city non-WAV formula.
# NOTE(review): that silently covers any wav_match_flag other than 'Y'/'N' — confirm intended.
rides2019['min_rate'] = np.select(
    conditions2019,
    choices2019,
    default=_min_pay_2019(NON_WAV_PER_MILE_19, NON_WAV_PER_MIN_19),
)



In [7]:
# Dollar amount by which driver pay exceeds the computed minimum rate
# (negative when pay is below the minimum)

rides2022['over_min_rate'] = rides2022['driver_pay'].sub(rides2022['min_rate'])
rides2019['over_min_rate'] = rides2019['driver_pay'].sub(rides2019['min_rate'])

In [8]:
# Overage relative to the minimum rate, stored as a fraction (0.10 means 10% over)

rides2022['perc_over_min_rate'] = rides2022['over_min_rate'].div(rides2022['min_rate'])
rides2019['perc_over_min_rate'] = rides2019['over_min_rate'].div(rides2019['min_rate'])

In [9]:
# Base passenger fare expressed per trip mile and per trip minute (trip_time / 60)

rides2022['passenger_fare_mile'] = rides2022['base_passenger_fare'].div(rides2022['trip_miles'])
rides2022['passenger_fare_min'] = rides2022['base_passenger_fare'].div(rides2022['trip_time'] / 60)

rides2019['passenger_fare_mile'] = rides2019['base_passenger_fare'].div(rides2019['trip_miles'])
rides2019['passenger_fare_min'] = rides2019['base_passenger_fare'].div(rides2019['trip_time'] / 60)

In [10]:
# Restrict both years to the comparison sample:
#   * 2019 only: keep trips whose shared_request_flag is "N"
#   * both years: drop Via trips and keep trips with out_of_city == False
# NOTE(review): no share of out-of-city trips is computed in this cell, and shared
# rides are only dropped for 2019 — confirm both are intended.

keep_2019 = (
    (rides2019['shared_request_flag'] == "N")
    & (rides2019['hvfhs_license_num'] != VIA)
    & (rides2019['out_of_city'] == False)
)
keep_2022 = (
    (rides2022['hvfhs_license_num'] != VIA)
    & (rides2022['out_of_city'] == False)
)

rides2019 = rides2019.loc[keep_2019]
rides2022 = rides2022.loc[keep_2022]




## Driver Pay Analysis


In [13]:
# Express 2019 driver pay in 2022 dollars

inflation_2019_to_2022 = 1.159  # multiplier used for inflation between 2019 and 2022
rides2019['driver_pay_adj'] = rides2019['driver_pay'] * inflation_2019_to_2022


#### Distribution of driver pay (in 2022 dollars)

In [14]:
# Summary statistics of 2019 driver pay after the inflation adjustment
rides2019['driver_pay_adj'].describe()


count   13395779.00000
mean          17.67265
std           14.41927
min            0.03477
25%            8.18254
50%           12.98080
75%           21.61535
max          902.28150
Name: driver_pay_adj, dtype: float64

In [15]:
# Summary statistics of 2022 driver pay
rides2022['driver_pay'].describe()

count   16954718.00000
mean          17.89936
std           12.51835
min            0.04000
25%            8.95000
50%           14.41000
75%           22.93000
max         2053.10000
Name: driver_pay, dtype: float64

#### Distribution of percentage of driver earnings over the minimum required rate for the trip (In 2022 the bottom 25% of trips made less than 0.5% over the minimum rate, the top 25% of trips made at least 31% over the minimum rate, and the median trip made about 7.7% over the minimum rate)

In [16]:
# Summary statistics of the 2022 fractional overage (0.10 means 10% over the minimum)
rides2022['perc_over_min_rate'].describe()

count   16954718.00000
mean           0.30767
std            1.54314
min           -0.99874
25%            0.00476
50%            0.07676
75%            0.31287
max         3272.78962
Name: perc_over_min_rate, dtype: float64

In [17]:
# Summary statistics of the 2019 fractional overage (0.10 means 10% over the minimum)
rides2019['perc_over_min_rate'].describe()

count   13395779.00000
mean           0.45270
std            1.16549
min           -0.99918
25%            0.19720
50%            0.22892
75%            0.46325
max         3170.68937
Name: perc_over_min_rate, dtype: float64

#### !! Percentage of rides where the driver didn't make more than the minimum required rate

In [18]:
# Share of 2022 trips paid at or below the computed minimum rate
int((rides2022['perc_over_min_rate'] <= 0).sum()) / len(rides2022)

0.03693149010204711

In [19]:
# Share of 2019 trips paid at or below the computed minimum rate
int((rides2019['perc_over_min_rate'] <= 0).sum()) / len(rides2019)

0.004164371478508268

#### !! Percentage of rides where the driver made 1% or less over the minimum required rate

In [20]:
# Share of 2022 trips paid 1% or less over the minimum rate
int((rides2022['perc_over_min_rate'] <= 0.01).sum()) / len(rides2022)

0.35353587125424324

In [22]:
# Share of 2019 trips paid 1% or less over the minimum rate
int((rides2019['perc_over_min_rate'] <= 0.01).sum()) / len(rides2019)

0.0046787126004392875

#### !! Percentage of rides where the driver made 10% or less over the minimum required rate

In [23]:
# Share of 2022 trips paid 10% or less over the minimum rate
int((rides2022['perc_over_min_rate'] <= 0.10).sum()) / len(rides2022)

0.5460042449541184

In [24]:
# Share of 2019 trips paid 10% or less over the minimum rate
int((rides2019['perc_over_min_rate'] <= 0.10).sum()) / len(rides2019)

0.014464257733723436

#### Percentage of rides where the driver made more than 5% over the minimum required rate

In [25]:
# Share of 2022 trips paid more than 5% over the minimum rate
int((rides2022['perc_over_min_rate'] > 0.05).sum()) / len(rides2022)

0.5561762808440694

In [26]:
# Share of 2019 trips paid more than 5% over the minimum rate
int((rides2019['perc_over_min_rate'] > 0.05).sum()) / len(rides2019)

0.9920654857026232

#### Percentage of rides where the driver made more than 10% over the minimum required rate

In [27]:
# Share of 2022 trips paid more than 10% over the minimum rate
int((rides2022['perc_over_min_rate'] > 0.10).sum()) / len(rides2022)

0.45399575504588163

In [28]:
# Share of 2019 trips paid more than 10% over the minimum rate
int((rides2019['perc_over_min_rate'] > 0.10).sum()) / len(rides2019)

0.9855357422662766

#### Percentage of rides where the driver made more than 20% over the minimum required rate

In [29]:
# Share of 2022 trips paid more than 20% over the minimum rate
int((rides2022['perc_over_min_rate'] > 0.20).sum()) / len(rides2022)

0.3258008183916713

In [30]:
# Share of 2019 trips paid more than 20% over the minimum rate
int((rides2019['perc_over_min_rate'] > 0.20).sum()) / len(rides2019)

0.7033880597761429

#### Distribution of driver pay for rides where the driver made more than 20% over the minimum rate (2022 dollars)

In [31]:
# Driver pay for 2022 trips paid more than 20% over the minimum rate
rides2022.loc[rides2022['perc_over_min_rate'] > 0.2, 'driver_pay'].describe()

count   5523861.00000
mean         16.85373
std          13.23824
min           0.83000
25%           7.69000
50%          13.20000
75%          21.56000
max        2053.10000
Name: driver_pay, dtype: float64

In [32]:
# Inflation-adjusted driver pay for 2019 trips paid more than 20% over the minimum rate
rides2019.loc[rides2019['perc_over_min_rate'] > 0.2, 'driver_pay_adj'].describe()

count   9422431.00000
mean         17.80667
std          15.64119
min           0.09272
25%           7.40601
50%          12.36653
75%          21.81238
max         902.28150
Name: driver_pay_adj, dtype: float64

#### !! Percent of trips where overage is more than \$1

In [33]:
# Share of 2022 trips whose overage exceeds 1 dollar
int((rides2022['over_min_rate'] > 1).sum()) / len(rides2022)

0.49542740846530153

In [34]:
# Share of 2019 trips whose overage exceeds 1 dollar
int((rides2019['over_min_rate'] > 1).sum()) / len(rides2019)

0.9671474872793885

#### !! Percent of trips where overage was more than \$20

In [35]:
# Share of 2022 trips whose overage exceeds 20 dollars
int((rides2022['over_min_rate'] > 20).sum()) / len(rides2022)

0.012994377140333446

In [36]:
# Share of 2019 trips whose overage exceeds 20 dollars
int((rides2019['over_min_rate'] > 20).sum()) / len(rides2019)

0.01806778090322332

#### !! Average overage of all trips in dollars

In [38]:
# Summary statistics of the 2022 dollar overage across all retained trips
rides2022.over_min_rate.describe()

count   16954718.00000
mean           2.67810
std            4.81934
min         -383.92999
25%            0.06373
50%            0.98173
75%            3.05683
max         1256.59406
Name: over_min_rate, dtype: float64

In [39]:
# Summary statistics of the 2019 dollar overage across all retained trips
rides2019.over_min_rate.describe()

count   13395779.00000
mean           4.03285
std            5.49055
min         -499.60857
25%            1.62681
50%            2.46624
75%            4.00717
max          480.33676
Name: over_min_rate, dtype: float64

#### !! Average overage, in dollars, of trips that were 10% or less over the minimum rate

In [40]:
# Dollar overage among 2022 trips paid 10% or less over the minimum rate.
# Uses `<=` so the subset matches the "10% or less" heading and the `<= 0.10`
# share computed earlier in the notebook (the original used a strict `<`).
rides2022.loc[rides2022['perc_over_min_rate'] <= 0.1, 'over_min_rate'].describe()

count   9257348.00000
mean          0.37021
std           0.79652
min        -383.92999
25%           0.00941
50%           0.07589
75%           0.56360
max          37.07645
Name: over_min_rate, dtype: float64

In [41]:
# Dollar overage among 2019 trips paid 10% or less over the minimum rate.
# Uses `<=` so the subset matches the "10% or less" heading and the `<= 0.10`
# share computed earlier in the notebook (the original used a strict `<`).
rides2019.loc[rides2019['perc_over_min_rate'] <= 0.1, 'over_min_rate'].describe()

count   193760.00000
mean        -0.07531
std          5.05036
min       -499.60857
25%         -0.19237
50%          0.59934
75%          1.06871
max         25.45025
Name: over_min_rate, dtype: float64

#### !! Average overage, in dollars, of trips paid above the minimum required rate

In [42]:
# Dollar overage among 2022 trips paid strictly above the minimum rate
rides2022.loc[rides2022['over_min_rate'] > 0, 'over_min_rate'].describe()

count   16328555.00000
mean           2.78558
std            4.86613
min            0.00000
25%            0.07868
50%            1.05495
75%            3.20112
max         1256.59406
Name: over_min_rate, dtype: float64

In [43]:
# Dollar overage among 2019 trips paid strictly above the minimum rate
rides2019.loc[rides2019['over_min_rate'] > 0, 'over_min_rate'].describe()

count   13339994.00000
mean           4.06103
std            5.45497
min            0.00001
25%            1.63543
50%            2.47480
75%            4.01805
max          480.33676
Name: over_min_rate, dtype: float64

## Passenger Fare Analysis

In [44]:
# Express the 2019 passenger-fare columns in 2022 dollars

inflation_2019_to_2022 = 1.159  # multiplier used for inflation between 2019 and 2022

for source_col, adjusted_col in (
    ('base_passenger_fare', 'passenger_fare_adj'),
    ('passenger_fare_min', 'passenger_fare_min_adj'),
    ('passenger_fare_mile', 'passenger_fare_mile_adj'),
):
    rides2019[adjusted_col] = rides2019[source_col] * inflation_2019_to_2022



#### Distribution of passenger fare (2022 Dollars)

In [45]:
# Summary statistics of the 2022 base passenger fare
rides2022['base_passenger_fare'].describe()

count   16954718.00000
mean          22.56126
std           15.56407
min            0.02000
25%           11.70000
50%           18.39000
75%           28.59000
max         3732.90000
Name: base_passenger_fare, dtype: float64

In [46]:
# Summary statistics of the inflation-adjusted 2019 base passenger fare
rides2019['passenger_fare_adj'].describe()

count   13395779.00000
mean          19.94154
std           16.15856
min            0.02318
25%            9.26041
50%           14.92792
75%           24.86055
max         1117.51939
Name: passenger_fare_adj, dtype: float64

#### Average fare per minute cost (2022 Dollars)

In [47]:
# Mean 2022 passenger fare per trip minute
rides2022['passenger_fare_min'].mean()

1.3263106126948536

In [48]:
# Mean inflation-adjusted 2019 passenger fare per trip minute
rides2019['passenger_fare_min_adj'].mean()

1.1866627545108976

#### Average fare per mile cost (2022 Dollars)

In [49]:
# Mean 2022 passenger fare per trip mile
rides2022['passenger_fare_mile'].mean()

7.021462991427306

In [50]:
# Mean inflation-adjusted 2019 passenger fare per trip mile
rides2019['passenger_fare_mile_adj'].mean()

6.381262262839378

#### Regression of passenger fare on trip miles and minutes (In 2022, each additional mile per trip is associated with a fare about 1.62 dollars higher and each additional minute per trip with a fare about 0.54 dollars higher)



In [51]:
# Trip duration in minutes (trip_time divided by 60), used as a regressor below
rides2019['trip_time_min'] = rides2019['trip_time'].div(60)
rides2022['trip_time_min'] = rides2022['trip_time'].div(60)



In [52]:
# OLS fit of the 2022 base passenger fare on trip miles and trip minutes
fare_model_2022 = sm.ols(
    formula="base_passenger_fare ~ trip_miles + trip_time_min",
    data=rides2022,
)
result = fare_model_2022.fit()
print(result.params)

Intercept       5.10090
trip_miles      1.62144
trip_time_min   0.53950
dtype: float64


In [53]:
# OLS fit of the inflation-adjusted 2019 passenger fare on trip miles and trip minutes
fare_model_2019 = sm.ols(
    formula="passenger_fare_adj ~ trip_miles + trip_time_min",
    data=rides2019,
)
result = fare_model_2019.fit()
print(result.params)

Intercept       1.69551
trip_miles      1.85531
trip_time_min   0.58618
dtype: float64
