In [1]:
import os
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq
import statsmodels.formula.api as sm

# Render every float in DataFrame/Series output with five decimal places.
pd.set_option('display.float_format', '{:.5f}'.format)


In [2]:
# Import Cleaned Data
#
# The data directory can be overridden with the NYTWA_DATA_DIR environment
# variable. It defaults to the folder this notebook was originally written
# against, so existing runs read exactly the same files as before.
DATA_DIR = Path(os.environ.get("NYTWA_DATA_DIR", "/Users/aya/Desktop/Aya_NYTWA/Data"))

# read_table is given a plain string path; memory_map=True is kept from the original call.
rides2022 = pq.read_table(str(DATA_DIR / "april_2022_cleaned.parquet"), memory_map=True).to_pandas()
rides2019 = pq.read_table(str(DATA_DIR / "february_2019_cleaned.parquet"), memory_map=True).to_pandas()


In [3]:
# CONSTANTS
#
# TODO: add company-specific rates as well (2019 still needs Via rates);
# consider storing the rates in a DataFrame.
#
# Each rate is either a per-mile or a per-minute amount. The min_rate cells
# multiply the per-mile rate by trip_miles and the per-minute rate by
# trip_time / 60.

# --- 2022, trips not flagged out_of_city ---
NON_WAV_PER_MILE_22, NON_WAV_PER_MIN_22 = 1.161, 0.529
WAV_PER_MILE_22, WAV_PER_MIN_22 = 1.504, 0.529

# --- 2022, out-of-city trips (one per-minute rate for WAV and non-WAV) ---
OUT_CITY_NON_WAV_PER_MILE_22, OUT_CITY_WAV_PER_MILE_22 = 1.348, 1.746
OUT_CITY_PER_MIN_22 = 0.613

# --- 2019, trips not flagged out_of_city ---
NON_WAV_PER_MILE_19, NON_WAV_PER_MIN_19 = 0.914, 0.416
WAV_PER_MILE_19, WAV_PER_MIN_19 = 1.186, 0.416

# --- 2019, out-of-city trips (one per-minute rate for WAV and non-WAV) ---
OUT_CITY_NON_WAV_PER_MILE_19, OUT_CITY_WAV_PER_MILE_19 = 1.262, 1.636
OUT_CITY_PER_MIN_19 = 0.574

# Company identifier codes.
# NOTE(review): presumably these match a license-number column in the trip data — confirm.
JUNO, UBER, VIA, LYFT = "HV0002", "HV0003", "HV0004", "HV0005"

In [4]:
# Minimum rate per 2022 trip: (per-mile rate * trip_miles) plus
# (per-minute rate * trip_time / 60), with the rate pair chosen from the
# trip's wav_match_flag and out_of_city values.

miles_22 = rides2022['trip_miles']
minutes_22 = rides2022['trip_time'] / 60
wav_flag_22 = rides2022['wav_match_flag']
out_of_city_22 = rides2022['out_of_city']

# In-city, non-WAV pay; also used as the fallback for rows matching no condition.
in_city_non_wav_pay_22 = miles_22*NON_WAV_PER_MILE_22 + minutes_22*NON_WAV_PER_MIN_22

conditions2022 = [
    (wav_flag_22 == 'N') & (out_of_city_22 == False),
    (wav_flag_22 == 'Y') & (out_of_city_22 == False),
    (wav_flag_22 == 'N') & (out_of_city_22 == True),
    (wav_flag_22 == 'Y') & (out_of_city_22 == True),
]

choices2022 = [
    in_city_non_wav_pay_22,
    miles_22*WAV_PER_MILE_22 + minutes_22*WAV_PER_MIN_22,
    miles_22*OUT_CITY_NON_WAV_PER_MILE_22 + minutes_22*OUT_CITY_PER_MIN_22,
    miles_22*OUT_CITY_WAV_PER_MILE_22 + minutes_22*OUT_CITY_PER_MIN_22,
]

rides2022['min_rate'] = np.select(conditions2022, choices2022, default=in_city_non_wav_pay_22)


In [None]:
# Minimum rate per 2019 trip: (per-mile rate * trip_miles) plus
# (per-minute rate * trip_time / 60), with the rate pair chosen from the
# trip's wav_match_flag and out_of_city values.

miles_19 = rides2019['trip_miles']
minutes_19 = rides2019['trip_time'] / 60
wav_flag_19 = rides2019['wav_match_flag']
out_of_city_19 = rides2019['out_of_city']

# In-city, non-WAV pay; also used as the fallback for rows matching no condition.
in_city_non_wav_pay_19 = miles_19*NON_WAV_PER_MILE_19 + minutes_19*NON_WAV_PER_MIN_19

conditions2019 = [
    (wav_flag_19 == 'N') & (out_of_city_19 == False),
    (wav_flag_19 == 'Y') & (out_of_city_19 == False),
    (wav_flag_19 == 'N') & (out_of_city_19 == True),
    (wav_flag_19 == 'Y') & (out_of_city_19 == True),
]

choices2019 = [
    in_city_non_wav_pay_19,
    miles_19*WAV_PER_MILE_19 + minutes_19*WAV_PER_MIN_19,
    miles_19*OUT_CITY_NON_WAV_PER_MILE_19 + minutes_19*OUT_CITY_PER_MIN_19,
    miles_19*OUT_CITY_WAV_PER_MILE_19 + minutes_19*OUT_CITY_PER_MIN_19,
]

rides2019['min_rate'] = np.select(conditions2019, choices2019, default=in_city_non_wav_pay_19)



In [5]:
# Dollar amount by which driver pay exceeds the trip's minimum rate
# (negative when the driver was paid less than the minimum).

rides2022['over_min_rate'] = rides2022['driver_pay'] - rides2022['min_rate']
rides2019['over_min_rate'] = rides2019['driver_pay'] - rides2019['min_rate']

In [6]:
# Overage as a fraction of the minimum rate (0.10 means 10% over the minimum).

rides2022['perc_over_min_rate'] = rides2022['over_min_rate'] / rides2022['min_rate']
rides2019['perc_over_min_rate'] = rides2019['over_min_rate'] / rides2019['min_rate']

In [None]:
# Passenger fare expressed per trip mile and per trip minute (trip_time / 60),
# added to both years' frames.

for rides in (rides2022, rides2019):
    rides['passenger_fare_mile'] = rides['base_passenger_fare'] / rides['trip_miles']
    rides['passenger_fare_min'] = rides['base_passenger_fare'] / (rides['trip_time'] / 60)

## Driver Pay Analysis


In [7]:
# Express 2019 driver pay in 2022 dollars.
# 1.159 is the inflation factor between 2019 and 2022 used throughout this notebook.

rides2019['driver_pay_adj'] = rides2019['driver_pay'] * 1.159


NameError: name 'rides2019' is not defined

#### Distribution of driver pay (in 2022 dollars)

In [None]:
# Summary statistics of 2019 driver pay, inflation-adjusted to 2022 dollars
rides2019['driver_pay_adj'].describe()


In [None]:
# Summary statistics of 2022 driver pay
rides2022['driver_pay'].describe()

#### Distribution of percentage of driver earnings over the minimum required rate for the trip (In 2022 the bottom 25% of trips made less than 0.5% over the minimum rate, the top 25% of trips made at least 30% over the minimum rate, and the median percent made over the minimum rate was 7%)

In [8]:
# Summary statistics of the 2022 fraction-over-minimum
rides2022['perc_over_min_rate'].describe()

count   17669379.00000
mean           0.29790
std            1.52166
min           -0.99965
25%            0.00463
50%            0.07211
75%            0.30177
max         3272.78962
Name: perc_over_min_rate, dtype: float64

In [None]:
# Summary statistics of the 2019 fraction-over-minimum
rides2019['perc_over_min_rate'].describe()

#### !! Percentage of rides where the driver did not make more than the minimum required rate

In [9]:
# Share of 2022 trips paid at or below the minimum rate
int((rides2022['perc_over_min_rate'] <= 0).sum()) / len(rides2022)

0.05774945457902057

In [10]:
# Share of 2019 trips paid at or below the minimum rate
int((rides2019['perc_over_min_rate'] <= 0).sum()) / len(rides2019)

NameError: name 'rides2019' is not defined

#### !! Percentage of rides where the driver made 1% or less over the minimum required rate

In [11]:
# Share of 2022 trips paid 1% or less over the minimum rate
int((rides2022['perc_over_min_rate'] <= 0.01).sum()) / len(rides2022)

0.36280171476315043

In [12]:
# Share of 2019 trips paid 1% or less over the minimum rate.
# The mask and the denominator both come from rides2019; the original filtered
# rides2022 with a mask built from rides2019, mixing two unrelated frames.
int((rides2019['perc_over_min_rate'] <= 0.01).sum()) / len(rides2019)

NameError: name 'rides2019' is not defined

#### !! Percentage of rides where the driver made 10% or less over the minimum required rate

In [13]:
# Share of 2022 trips paid 10% or less over the minimum rate
int((rides2022['perc_over_min_rate'] <= 0.10).sum()) / len(rides2022)

0.5548346096373845

In [None]:
# Share of 2019 trips paid 10% or less over the minimum rate
int((rides2019['perc_over_min_rate'] <= 0.10).sum()) / len(rides2019)

#### Percentage of rides where the driver made more than 5% over the minimum required rate

In [None]:
# Share of 2022 trips paid more than 5% over the minimum rate
int((rides2022['perc_over_min_rate'] > 0.05).sum()) / len(rides2022)

In [None]:
# Share of 2019 trips paid more than 5% over the minimum rate
int((rides2019['perc_over_min_rate'] > 0.05).sum()) / len(rides2019)

#### Percentage of rides where the driver made more than 10% over the minimum required rate

In [None]:
# Share of 2022 trips paid more than 10% over the minimum rate
int((rides2022['perc_over_min_rate'] > 0.10).sum()) / len(rides2022)

In [None]:
# Share of 2019 trips paid more than 10% over the minimum rate
int((rides2019['perc_over_min_rate'] > 0.10).sum()) / len(rides2019)

#### Percentage of rides where the driver made more than 20% over the minimum required rate

In [None]:
# Share of 2022 trips paid more than 20% over the minimum rate
int((rides2022['perc_over_min_rate'] > 0.20).sum()) / len(rides2022)

In [None]:
# Share of 2019 trips paid more than 20% over the minimum rate
int((rides2019['perc_over_min_rate'] > 0.20).sum()) / len(rides2019)

#### Distribution of driver pay for rides where the driver made more than 20% over the minimum rate (2022 dollars)

In [None]:
# Driver pay on 2022 trips paid more than 20% over the minimum rate
rides2022.loc[rides2022['perc_over_min_rate'] > 0.2, 'driver_pay'].describe()

In [None]:
# Inflation-adjusted driver pay on 2019 trips paid more than 20% over the minimum rate
rides2019.loc[rides2019['perc_over_min_rate'] > 0.2, 'driver_pay_adj'].describe()

#### !! Percent of trips where overage is more than \$1

In [14]:
# Share of 2022 trips whose overage exceeds $1
int((rides2022.over_min_rate > 1).sum()) / len(rides2022)

0.4896549561815387

In [None]:
# Share of 2019 trips whose overage exceeds $1.
# The mask and the denominator both come from rides2019; the original filtered
# rides2022 with a mask built from rides2019, mixing two unrelated frames.
int((rides2019['over_min_rate'] > 1).sum()) / len(rides2019)

#### !! Percent of trips where overage was more than \$20

In [15]:
# Share of 2022 trips whose overage exceeds $20
int((rides2022.over_min_rate > 20).sum()) / len(rides2022)

0.014234625902811864

In [15]:
# Share of 2019 trips whose overage exceeds $20.
# NOTE(review): the saved output under this cell is identical to the 2022
# cell's output and execution count; re-run before citing a 2019 figure.
int((rides2019.over_min_rate > 20).sum()) / len(rides2019)

0.014234625902811864

#### !! Average overage of all trips in dollars

In [16]:
# Summary statistics of the 2022 dollar overage across all trips
rides2022.over_min_rate.describe()

count   17669379.00000
mean           2.64485
std            5.06055
min         -228.01909
25%            0.05774
50%            0.95019
75%            3.04933
max          515.03386
Name: over_min_rate, dtype: float64

In [16]:
# Summary statistics of the 2019 dollar overage across all trips.
# NOTE(review): the saved output under this cell is identical to the 2022
# cell's output and execution count; re-run before citing 2019 figures.
rides2019.over_min_rate.describe()

count   17669379.00000
mean           2.64485
std            5.06055
min         -228.01909
25%            0.05774
50%            0.95019
75%            3.04933
max          515.03386
Name: over_min_rate, dtype: float64

#### !! Average overage, in dollars, of trips that were 10% or less over the minimum rate

In [17]:
# Dollar overage on 2022 trips paid 10% or less over the minimum rate.
# Uses <= 0.10 so the subset is the same one counted by the
# "10% or less over" share cell (the original used a strict < .1).
rides2022.loc[rides2022['perc_over_min_rate'] <= 0.10, 'over_min_rate'].describe()

count   9803583.00000
mean          0.26669
std           1.14075
min        -228.01909
25%           0.00763
50%           0.07168
75%           0.55715
max          37.07645
Name: over_min_rate, dtype: float64

In [17]:
# Dollar overage on 2019 trips paid 10% or less over the minimum rate.
# Uses <= 0.10 so the subset is the same one counted by the
# "10% or less over" share cell (the original used a strict < .1).
# NOTE(review): the saved output under this cell is identical to the 2022
# cell's output and execution count; re-run before citing 2019 figures.
rides2019.loc[rides2019['perc_over_min_rate'] <= 0.10, 'over_min_rate'].describe()

count   9803583.00000
mean          0.26669
std           1.14075
min        -228.01909
25%           0.00763
50%           0.07168
75%           0.55715
max          37.07645
Name: over_min_rate, dtype: float64

#### !! Average overage, in dollars, of all trips paid above the minimum rate

In [18]:
# Dollar overage on 2022 trips paid strictly above the minimum rate
rides2022.loc[rides2022['over_min_rate'] > 0, 'over_min_rate'].describe()

count   16648982.00000
mean           2.87750
std            5.08797
min            0.00000
25%            0.08143
50%            1.08386
75%            3.26391
max          515.03386
Name: over_min_rate, dtype: float64

In [18]:
# Dollar overage on 2019 trips paid strictly above the minimum rate.
# NOTE(review): the saved output under this cell is identical to the 2022
# cell's output and execution count; re-run before citing 2019 figures.
rides2019.loc[rides2019['over_min_rate'] > 0, 'over_min_rate'].describe()

count   16648982.00000
mean           2.87750
std            5.08797
min            0.00000
25%            0.08143
50%            1.08386
75%            3.26391
max          515.03386
Name: over_min_rate, dtype: float64

## Passenger Fare Analysis

In [None]:
# Express the 2019 passenger-fare columns in 2022 dollars by applying the
# 2019 -> 2022 inflation factor to the total, per-minute and per-mile fares.

inflation_factor = 1.159  # inflation rate between 2019 and 2022

for adjusted_col, source_col in [
    ('passenger_fare_adj', 'base_passenger_fare'),
    ('passenger_fare_min_adj', 'passenger_fare_min'),
    ('passenger_fare_mile_adj', 'passenger_fare_mile'),
]:
    rides2019[adjusted_col] = rides2019[source_col] * inflation_factor



#### Distribution of passenger fare (2022 Dollars)

In [None]:
# Summary statistics of the 2022 base passenger fare
rides2022['base_passenger_fare'].describe()

In [None]:
# Summary statistics of the 2019 base passenger fare in 2022 dollars
rides2019['passenger_fare_adj'].describe()

#### Average fare per minute cost (2022 Dollars)

In [None]:
# Mean passenger fare per trip minute, 2022
np.mean(rides2022.passenger_fare_min)

In [None]:
# Mean passenger fare per trip minute, 2019 in 2022 dollars
np.mean(rides2019.passenger_fare_min_adj)

#### Average fare per mile cost (2022 Dollars)

In [None]:
# Mean passenger fare per trip mile, 2022
np.mean(rides2022.passenger_fare_mile)

In [None]:
# Mean passenger fare per trip mile, 2019 in 2022 dollars
np.mean(rides2019.passenger_fare_mile_adj)

#### Regression of passenger fare on trip miles and minutes (In 2022, each additional mile per trip increases the fare by 2.03 dollars and each additional minute per trip increases the fare by 0.47 dollars)



In [None]:
# Add trip duration in minutes (trip_time / 60) as a regressor column
rides2019['trip_time_min'] = rides2019['trip_time'] / 60
rides2022['trip_time_min'] = rides2022['trip_time'] / 60



In [None]:
# OLS of the 2022 base passenger fare on trip miles and trip minutes;
# the fitted coefficients are printed.
fare_model_2022 = sm.ols(
    formula="base_passenger_fare ~ trip_miles + trip_time_min",
    data=rides2022,
)
result = fare_model_2022.fit()
print(result.params)

In [None]:
# OLS of the inflation-adjusted 2019 passenger fare on trip miles and trip
# minutes; the fitted coefficients are printed.
fare_model_2019 = sm.ols(
    formula="passenger_fare_adj ~ trip_miles + trip_time_min",
    data=rides2019,
)
result = fare_model_2019.fit()
print(result.params)