In [1]:
# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
!pip install plotly_express

## Goal
The goal of this notebook is essentially to recommend a driver's lifetime value (LTV).


In [0]:
import os

# change to path
PATH='/content/drive/My Drive/Colab Notebooks/lyft'
os.chdir(PATH)

DATA_DIR = os.path.join(PATH, 'data')

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from datetime import datetime

import warnings
warnings.simplefilter(action='ignore')

## Read in

In [0]:
ride_timestamps = pd.read_csv(os.path.join(DATA_DIR, 'ride_timestamps.csv'))
ride_ids = pd.read_csv(os.path.join(DATA_DIR, 'ride_ids.csv'))
driver_ids = pd.read_csv(os.path.join(DATA_DIR, 'driver_ids.csv'))

In [6]:
ride_timestamps.head()

Unnamed: 0,ride_id,event,timestamp
0,00003037a262d9ee40e61b5c0718f7f0,requested_at,2016-06-13 09:39:19
1,00003037a262d9ee40e61b5c0718f7f0,accepted_at,2016-06-13 09:39:51
2,00003037a262d9ee40e61b5c0718f7f0,arrived_at,2016-06-13 09:44:31
3,00003037a262d9ee40e61b5c0718f7f0,picked_up_at,2016-06-13 09:44:33
4,00003037a262d9ee40e61b5c0718f7f0,dropped_off_at,2016-06-13 10:03:05


In [7]:
ride_ids.head()

Unnamed: 0,driver_id,ride_id,ride_distance,ride_duration,ride_prime_time
0,002be0ffdc997bd5c50703158b7c2491,006d61cf7446e682f7bc50b0f8a5bea5,1811,327,50
1,002be0ffdc997bd5c50703158b7c2491,01b522c5c3a756fbdb12e95e87507eda,3362,809,0
2,002be0ffdc997bd5c50703158b7c2491,029227c4c2971ce69ff2274dc798ef43,3282,572,0
3,002be0ffdc997bd5c50703158b7c2491,034e861343a63ac3c18a9ceb1ce0ac69,65283,3338,25
4,002be0ffdc997bd5c50703158b7c2491,034f2e614a2f9fc7f1c2f77647d1b981,4115,823,100


In [8]:
driver_ids.head()

Unnamed: 0,driver_id,driver_onboard_date
0,002be0ffdc997bd5c50703158b7c2491,2016-03-29 00:00:00
1,007f0389f9c7b03ef97098422f902e62,2016-03-29 00:00:00
2,011e5c5dfc5c2c92501b8b24d47509bc,2016-04-05 00:00:00
3,0152a2f305e71d26cc964f8d4411add9,2016-04-23 00:00:00
4,01674381af7edd264113d4e6ed55ecda,2016-04-29 00:00:00


## Hypotheses
1. The longer the driver is onboard (until the last day the `driver_id` appears in the `ride_timestamps`), the higher his or her LTV.
2. Driving more frequently (or full-time or continuously picking up, will be the `drop_requested_lag` feature later) corresponds to higher LTV.
3. Longer distance corresponds to higher LTV.
4. Longer duration corresponds to higher LTV. (This alludes to the more the driver make the higher its LTV.)
5. Higher Prime Time (high demand hours) corresponds to higher LTV. 
6. Short `requested_at` and `accepted_at` window (`delta_requested_at` feature later) corresponds to higher LTV.




## EDA

### `drivers_ids`

In [9]:
driver_ids['driver_id'].nunique() # len(driver_ids)

937

In [10]:
driver_ids['driver_onboard_date'].min(), driver_ids['driver_onboard_date'].max()

('2016-03-28 00:00:00', '2016-05-15 00:00:00')

The file `driver_ids` consists of drivers who signed up as a driver with Lyft between 2016/03/28 to 2016/05/15.

### `ride_timestamps`

Check whether do the rides start after all the drivers have registered.

In [0]:
ride_timestamps['timestamp'] = pd.to_datetime(ride_timestamps['timestamp'])

In [12]:
ride_timestamps['timestamp'].min()

Timestamp('2016-03-28 05:48:18')

Apparently, according to `timestamp` in `ride_timestamps`, some drivers who have registered earlier started driving earlier.

In [13]:
ride_timestamps['timestamp'].max()

Timestamp('2016-06-27 00:50:50')

The dataset for rides ended on 2016/06/27.

In [0]:
# Feature engineer time features
def add_datepart(df, fldname, drop=False, time=True):
    """
    Taken from fast.ai
    Helper function that adds columns relevant to a date.
    """
    import re 
    
    fld = df[fldname]
    fld_dtype = fld.dtype
    if isinstance(fld_dtype, pd.core.dtypes.dtypes.DatetimeTZDtype):
        fld_dtype = np.datetime64

    if not np.issubdtype(fld_dtype, np.datetime64):
        df[fldname] = fld = pd.to_datetime(fld, infer_datetime_format=True)
    targ_pre = re.sub('[Dd]ate$', '', fldname)
    attr = ['Month', 'Week', 'Day', 'Dayofweek', 'Dayofyear']
    if time: attr = attr + ['Hour', 'Minute']
    for n in attr: df[targ_pre + n] = getattr(fld.dt, n.lower())
    df[targ_pre + 'Elapsed'] = fld.astype(np.int64) // 10 ** 9
    if drop: df.drop(fldname, axis=1, inplace=True)

In [15]:
add_datepart(ride_timestamps, 'timestamp')

ride_timestamps.head()

Unnamed: 0,ride_id,event,timestamp,timestampMonth,timestampWeek,timestampDay,timestampDayofweek,timestampDayofyear,timestampHour,timestampMinute,timestampElapsed
0,00003037a262d9ee40e61b5c0718f7f0,requested_at,2016-06-13 09:39:19,6.0,24.0,13.0,0.0,165.0,9.0,39.0,1465810759
1,00003037a262d9ee40e61b5c0718f7f0,accepted_at,2016-06-13 09:39:51,6.0,24.0,13.0,0.0,165.0,9.0,39.0,1465810791
2,00003037a262d9ee40e61b5c0718f7f0,arrived_at,2016-06-13 09:44:31,6.0,24.0,13.0,0.0,165.0,9.0,44.0,1465811071
3,00003037a262d9ee40e61b5c0718f7f0,picked_up_at,2016-06-13 09:44:33,6.0,24.0,13.0,0.0,165.0,9.0,44.0,1465811073
4,00003037a262d9ee40e61b5c0718f7f0,dropped_off_at,2016-06-13 10:03:05,6.0,24.0,13.0,0.0,165.0,10.0,3.0,1465812185


In [0]:
def seconds_lag(t):
    delta_to_t0 = t - t.iloc[0]
    return delta_to_t0

In [17]:
%%time
ride_timestamps['delta_requested_at'] = ride_timestamps.groupby('ride_id')['timestamp'].apply(seconds_lag)

ride_timestamps['delta_requested_at'] = ride_timestamps['delta_requested_at'].apply(lambda o: o.total_seconds())

CPU times: user 1min 46s, sys: 1.62 s, total: 1min 48s
Wall time: 1min 46s


In [18]:
ride_timestamps.head()

Unnamed: 0,ride_id,event,timestamp,timestampMonth,timestampWeek,timestampDay,timestampDayofweek,timestampDayofyear,timestampHour,timestampMinute,timestampElapsed,delta_requested_at
0,00003037a262d9ee40e61b5c0718f7f0,requested_at,2016-06-13 09:39:19,6.0,24.0,13.0,0.0,165.0,9.0,39.0,1465810759,0.0
1,00003037a262d9ee40e61b5c0718f7f0,accepted_at,2016-06-13 09:39:51,6.0,24.0,13.0,0.0,165.0,9.0,39.0,1465810791,32.0
2,00003037a262d9ee40e61b5c0718f7f0,arrived_at,2016-06-13 09:44:31,6.0,24.0,13.0,0.0,165.0,9.0,44.0,1465811071,312.0
3,00003037a262d9ee40e61b5c0718f7f0,picked_up_at,2016-06-13 09:44:33,6.0,24.0,13.0,0.0,165.0,9.0,44.0,1465811073,314.0
4,00003037a262d9ee40e61b5c0718f7f0,dropped_off_at,2016-06-13 10:03:05,6.0,24.0,13.0,0.0,165.0,10.0,3.0,1465812185,1426.0


In [19]:
accepted_filter = ride_timestamps[ride_timestamps['event'] == 'accepted_at']

driver_accepted_filter = pd.merge(accepted_filter, ride_ids,
                                  on='ride_id')

window_g = driver_accepted_filter.groupby('driver_id')['delta_requested_at'].mean()

window_g = window_g.reset_index()

window_g.head()

Unnamed: 0,driver_id,delta_requested_at
0,002be0ffdc997bd5c50703158b7c2491,8.407942
1,007f0389f9c7b03ef97098422f902e62,11.83871
2,011e5c5dfc5c2c92501b8b24d47509bc,7.294118
3,0152a2f305e71d26cc964f8d4411add9,14.198953
4,01674381af7edd264113d4e6ed55ecda,12.528


In [20]:
len(window_g)

844

In [21]:
ride_timestamps['ride_id'].nunique(), ride_ids['ride_id'].nunique()

(194081, 193502)

## `ride_ids`

In [22]:
ride_id_g = ride_ids.groupby('driver_id')['ride_id'].count()

ride_id_g = ride_id_g.reset_index()

ride_id_g.rename(columns={'ride_id': 'ride_count'}, inplace=True)

ride_id_g.sort_values(by='ride_count', ascending=False).head()

Unnamed: 0,driver_id,ride_count
354,5ccc0e6dc9c7475caf785cdce7b8eb7a,919
308,4eb382d1f7d50fae1294964263d1ce82,831
509,844e9be5a30d8d9c1f8e9ddb086ff717,821
401,689bdf87fb2de49f98bf4946cfaa5068,794
217,3788dc9e91f1548816ce8b5af07ddadc,783


In [23]:
len(driver_ids[~driver_ids['driver_id'].isin(ride_ids['driver_id'].unique())])

83

Apparently, 83 drivers that appear in `ride_ids` do not have their `driver_ids` information.

In [0]:
ride_ids['ride_distance'] = ride_ids['ride_distance'] / 1609.34

ride_ids['ride_duration'] = ride_ids['ride_duration'] / 60

Calculate fare.

In [25]:
ride_ids['price'] = (2 + ride_ids['ride_distance'] * 1.15 + ride_ids['ride_duration'] * 0.22) * ((ride_ids['ride_prime_time'] + 100) / 100) + 1.75 

ride_ids['price'].describe()

def limit_bound(price):
    return min(max(5, price), 400)

ride_ids['price'] = ride_ids['price'].apply(limit_bound)

ride_ids['price'].describe()

count    193502.000000
mean         13.536544
std           9.881417
min           5.000000
25%           8.036033
50%          10.571234
75%          15.115603
max         400.000000
Name: price, dtype: float64

In [26]:
price_g = ride_ids.groupby('driver_id')['price'].sum()

price_g = price_g.reset_index()

price_g.sort_values(by='price', ascending=False).head()

Unnamed: 0,driver_id,price
354,5ccc0e6dc9c7475caf785cdce7b8eb7a,12350.288077
217,3788dc9e91f1548816ce8b5af07ddadc,12304.887351
308,4eb382d1f7d50fae1294964263d1ce82,10781.942344
412,6b65c06851e944351dd285a1eb729499,10708.94377
509,844e9be5a30d8d9c1f8e9ddb086ff717,10613.901682


The higher a driver's `ride_prime_time`, the more it is valuable to Lyft since they help to meet passengers' demand.

In [27]:
# make the variable NaN to calculate mean
ride_ids['ride_prime_time'].replace(0, np.nan, inplace=True)

ride_prime_time_g = ride_ids.groupby('driver_id')['ride_prime_time'].mean()

ride_prime_time_g = ride_prime_time_g.reset_index()

ride_prime_time_g.sort_values(by='ride_prime_time', ascending=False).head()

Unnamed: 0,driver_id,ride_prime_time
807,dae249fc394c9bdf02f7d8bb1ff55733,146.875
676,b2d3f2fb171a12cac427107690c10089,105.0
420,6cb35e276085548f3f095a85aa63af7b,98.076923
443,7419cd5c573ff9994c0f8ff5d92b4408,91.071429
655,acd7dc6118befb6724aa3752d1cdbea1,90.277778


In [0]:
# import seaborn as sns
# import matplotlib.pyplot as plt

# ride_ids.loc[ride_ids['ride_prime_time'] > 100, 'ride_prime_time'] = 100

# ride_ids['ride_prime_time'].fillna(0, inplace=True)

# sns.distplot(ride_ids['ride_prime_time'])
# plt.show()

In [29]:
len(ride_prime_time_g)

937

Merge both `driver_ids` and `driver_timestamps`.

In [30]:
ids_timestamps = pd.merge(ride_ids, ride_timestamps,
                          left_on='ride_id',
                          right_on='ride_id',
                          how='outer')

ids_timestamps = ids_timestamps.sort_values(by=['driver_id', 'timestamp'])

ids_timestamps.head()

Unnamed: 0,driver_id,ride_id,ride_distance,ride_duration,ride_prime_time,price,event,timestamp,timestampMonth,timestampWeek,timestampDay,timestampDayofweek,timestampDayofyear,timestampHour,timestampMinute,timestampElapsed,delta_requested_at
1235,002be0ffdc997bd5c50703158b7c2491,e228d93dc13a2a9d83661321992cd48e,1.541004,5.766667,,6.790822,requested_at,2016-03-29 18:46:50,3.0,13.0,29.0,1.0,89.0,18.0,46.0,1459277000.0,0.0
1236,002be0ffdc997bd5c50703158b7c2491,e228d93dc13a2a9d83661321992cd48e,1.541004,5.766667,,6.790822,accepted_at,2016-03-29 18:47:01,3.0,13.0,29.0,1.0,89.0,18.0,47.0,1459277000.0,11.0
1237,002be0ffdc997bd5c50703158b7c2491,e228d93dc13a2a9d83661321992cd48e,1.541004,5.766667,,6.790822,arrived_at,2016-03-29 18:50:00,3.0,13.0,29.0,1.0,89.0,18.0,50.0,1459277000.0,190.0
1238,002be0ffdc997bd5c50703158b7c2491,e228d93dc13a2a9d83661321992cd48e,1.541004,5.766667,,6.790822,picked_up_at,2016-03-29 18:50:01,3.0,13.0,29.0,1.0,89.0,18.0,50.0,1459277000.0,191.0
1239,002be0ffdc997bd5c50703158b7c2491,e228d93dc13a2a9d83661321992cd48e,1.541004,5.766667,,6.790822,dropped_off_at,2016-03-29 18:55:47,3.0,13.0,29.0,1.0,89.0,18.0,55.0,1459278000.0,537.0


In [31]:
ids_timestamps['driver_id'].nunique()

937

Can we determine who are the part-time drivers? We assume drivers who drive mainly on weekends are part-time drivers.

In [32]:
# Monday 0, Sunday 6
ids_timestamps['is_weekday'] = 0 
ids_timestamps.loc[ids_timestamps['timestampDayofweek'] < 5, 'is_weekday'] = 1

weekday_drivers_g = ids_timestamps.groupby('driver_id')['is_weekday'].mean()

weekday_drivers_g = weekday_drivers_g.reset_index()

weekday_drivers_g.sort_values(by='is_weekday').head()

Unnamed: 0,driver_id,is_weekday
870,ea71f79a86672486e018deeb47a545ec,0.0
296,4bbf15c7280e29c1df6edd7bf6dfa56a,0.0
108,1cf6fa07dcec364af2acf257b2d3731e,0.0
732,c64e642876910be4a1f0b57404dc9710,0.0
210,364c929f1d6535aa17df83ebcb349e87,0.0


In [33]:
ids_timestamps[ids_timestamps['driver_id'] == 'ea71f79a86672486e018deeb47a545ec']['ride_id'].nunique()

72

Driver `ea71f79a86672486e018deeb47a545ec` made 72 rides in the entire dataset and all rides only happen on weekends.

Next, we look into drivers who prefer to drive in the morning v.s. drivers who prefer to drive at night. We define `is_nighter` as drivers who driver between 11:00 PM and 6:00 AM.

In [34]:
ids_timestamps['is_nighter'] = 0 
ids_timestamps.loc[(ids_timestamps['timestampHour'] >= 23) |
                   (ids_timestamps['timestampHour'] <= 6), 
                   'is_nighter'] = 1

is_nighter_g = ids_timestamps.groupby('driver_id')['is_nighter'].mean()

is_nighter_g = is_nighter_g.reset_index()

is_nighter_g.sort_values(by='is_nighter', ascending=False).head()

Unnamed: 0,driver_id,is_nighter
523,8969d175c4676642b245af55d2dba19b,0.903448
222,3857da988688981b67c878fc179eb0dc,0.898947
551,905f8007cd46415eba8b9dce088b4395,0.891892
466,793e2072826ee96904a5f03ae2357e96,0.873927
77,175540c364f94b6adbc0ce3541a3254a,0.87037


In [35]:
ids_timestamps[ids_timestamps['driver_id'] == '3857da988688981b67c878fc179eb0dc']['ride_id'].nunique()

190

Driver `3857da988688981b67c878fc179eb0dc` completed 190 rides in the entire dataset and on most of the rides are complete at night.

In the original `driver_ids`, we have 937 unique `driver_ids` but after doing an inner join for the `ride_ids` and `ride_timestamps`, there are only 844 unique `driver_ids`. We can conclude that some `ride_timestamp`s are absent for 93 drivers.

Next, we create a feature `drop_requested_lag` to measure how fast do drivers pick up the next ride after they drop off passengers. The feature `drop_requested_lag` only considers same day pickups. So there exists a condition that a driver only picks up a passenger a day and this might lead to low `drop_requested_lag` value.

In [36]:
ids_timestamps['timestamp_shift'] = ids_timestamps.groupby('driver_id')['timestamp'].apply(lambda o: o.shift())

ids_timestamps['delta_timestamp'] = ids_timestamps['timestamp'] - ids_timestamps['timestamp_shift']

ids_timestamps.head(6)

Unnamed: 0,driver_id,ride_id,ride_distance,ride_duration,ride_prime_time,price,event,timestamp,timestampMonth,timestampWeek,timestampDay,timestampDayofweek,timestampDayofyear,timestampHour,timestampMinute,timestampElapsed,delta_requested_at,is_weekday,is_nighter,timestamp_shift,delta_timestamp
1235,002be0ffdc997bd5c50703158b7c2491,e228d93dc13a2a9d83661321992cd48e,1.541004,5.766667,,6.790822,requested_at,2016-03-29 18:46:50,3.0,13.0,29.0,1.0,89.0,18.0,46.0,1459277000.0,0.0,1,0,NaT,NaT
1236,002be0ffdc997bd5c50703158b7c2491,e228d93dc13a2a9d83661321992cd48e,1.541004,5.766667,,6.790822,accepted_at,2016-03-29 18:47:01,3.0,13.0,29.0,1.0,89.0,18.0,47.0,1459277000.0,11.0,1,0,2016-03-29 18:46:50,00:00:11
1237,002be0ffdc997bd5c50703158b7c2491,e228d93dc13a2a9d83661321992cd48e,1.541004,5.766667,,6.790822,arrived_at,2016-03-29 18:50:00,3.0,13.0,29.0,1.0,89.0,18.0,50.0,1459277000.0,190.0,1,0,2016-03-29 18:47:01,00:02:59
1238,002be0ffdc997bd5c50703158b7c2491,e228d93dc13a2a9d83661321992cd48e,1.541004,5.766667,,6.790822,picked_up_at,2016-03-29 18:50:01,3.0,13.0,29.0,1.0,89.0,18.0,50.0,1459277000.0,191.0,1,0,2016-03-29 18:50:00,00:00:01
1239,002be0ffdc997bd5c50703158b7c2491,e228d93dc13a2a9d83661321992cd48e,1.541004,5.766667,,6.790822,dropped_off_at,2016-03-29 18:55:47,3.0,13.0,29.0,1.0,89.0,18.0,55.0,1459278000.0,537.0,1,0,2016-03-29 18:50:01,00:05:46
5,002be0ffdc997bd5c50703158b7c2491,01b522c5c3a756fbdb12e95e87507eda,2.089055,13.483333,,9.118747,requested_at,2016-03-29 19:00:49,3.0,13.0,29.0,1.0,89.0,19.0,0.0,1459278000.0,0.0,1,0,2016-03-29 18:55:47,00:05:02


In [0]:
ids_timestamps.loc[(ids_timestamps['event'] == 'requested_at') &
                   (~ids_timestamps['delta_timestamp'].isna()), 'drop_requested_lag'] = 1

ids_timestamps['drop_requested_lag'].fillna(0, inplace=True)

ids_timestamps['drop_requested_lag'] = ids_timestamps['drop_requested_lag'] * ids_timestamps['delta_timestamp']

ids_timestamps['drop_requested_lag'] = ids_timestamps['drop_requested_lag'].apply(lambda o: o.total_seconds())

ids_timestamps.loc[ids_timestamps['delta_timestamp'].dt.days > 0, 'drop_requested_lag'] = np.nan

In [0]:
# ids_timestamps.head(31)

In [39]:
filtered_requested_at = ids_timestamps[ids_timestamps['event'] == 'requested_at']

drop_requested_lag_g = filtered_requested_at.groupby('driver_id')['drop_requested_lag'].mean()

drop_requested_lag_g = drop_requested_lag_g.reset_index()

drop_requested_lag_g.sort_values(by='drop_requested_lag').head()

Unnamed: 0,driver_id,drop_requested_lag
441,7ff85c5c0e9324e28d1e0d0589c364bd,181.0
526,9a54684a69721c1075c2af5fc077665b,754.351351
495,905f8007cd46415eba8b9dce088b4395,936.774194
226,42256e33936dfa69088f540a720edc97,942.870968
313,5cf93f7d1d3a8f0cf395c84053c31b1b,976.818182


Merge `ids_timestamps` with `driver_id`.

In [0]:
df = pd.merge(ids_timestamps, driver_ids,
              left_on='driver_id', right_on='driver_id')

We create the feature `driving_period` to account for the duration between the last trip a driver made and the day the driver was onboard. 

To calculate the average churn rate, we first get the driver's last activity based on `timestamp` in `ride_timestamps`. Since the last day of the `timestamp` in `ride_timestamps` was 2016/06/27, we calculate the last activity of a driver with respect to the day. We assume drivers who have more than 14 days of inactivity have churned.

In [41]:
df['date'] = df['timestamp'].dt.date

unique_dates_g = df.groupby('driver_id')['date'].nunique()

unique_dates_g = unique_dates_g.reset_index()

unique_dates_g.rename(columns={'date': 'unique_days'}, inplace=True)

unique_dates_g.head()

Unnamed: 0,driver_id,unique_days
0,002be0ffdc997bd5c50703158b7c2491,56
1,007f0389f9c7b03ef97098422f902e62,12
2,011e5c5dfc5c2c92501b8b24d47509bc,12
3,0152a2f305e71d26cc964f8d4411add9,42
4,01674381af7edd264113d4e6ed55ecda,40


In [0]:
df['driving_period'] = (df.groupby('driver_id')['timestamp'].tail(1) - pd.to_datetime(df['driver_onboard_date'])).dt.days

df['last_activity'] = (df['timestamp'].max() - df.groupby('driver_id')['timestamp'].tail(1)).dt.days

last_activity_df = df.groupby('driver_id').tail(1)

In [43]:
churn_rate = len(last_activity_df[last_activity_df['last_activity'] > 14]) / len(last_activity_df)

churn_rate

0.28805620608899296

In [44]:
onboard_period = df[~df['driving_period'].isna()]

onboard_period = onboard_period[['driver_id', 'driving_period']].copy()

onboard_period.sort_values('driving_period').head()

Unnamed: 0,driver_id,driving_period
715947,bd057e02f75c92917389d90bb215fe91,1.0
404840,68b546b2f4102641d6774c1ce4f57457,1.0
429430,6eb2e0f41fc9f1fbb70e7751035fdf87,1.0
609573,a2334fdb829cf96fbae920df0cce1587,2.0
841116,e4f3a9d5cf57b9b518136afd9757f76d,3.0


In [0]:
ltv = ride_id_g

to_merge = (drop_requested_lag_g, ride_prime_time_g, price_g, 
            onboard_period, window_g, unique_dates_g,
            weekday_drivers_g, is_nighter_g)

for ser in to_merge:
    ltv = pd.merge(ltv, ser, on='driver_id', how='outer')

In [46]:
len(ltv)

937

In [47]:
ltv.isna().sum()

driver_id               0
ride_count              0
drop_requested_lag     94
ride_prime_time         3
price                   0
driving_period        101
delta_requested_at     93
unique_days            83
is_weekday              0
is_nighter              0
dtype: int64

To impute a feature's `NaN`s, if the higher the value of the feature the better, we impute with 0, otherwise we impute with 999,999. Note that having a `driving_period` of 0 indicates the possibility that a driver has signed up but never started to drive.

In [0]:
ltv['ride_prime_time'].fillna(0, inplace=True) # high better

ltv['driving_period'].fillna(ltv['driving_period'], inplace=True) # high better

ltv['drop_requested_lag'].fillna(ltv['drop_requested_lag'], inplace=True) # low better

ltv['delta_requested_at'].fillna(ltv['delta_requested_at'], inplace=True) # low better

ltv['unique_days'].fillna(ltv['unique_days'].mean(), inplace=True)
ltv['unique_days'].replace(0, ltv['unique_days'].mean(), inplace=True)

ltv['drop_requested_lag'].fillna(ltv['drop_requested_lag'].mean(), inplace=True)

ltv['driving_period'].fillna(ltv['driving_period'].mean(), inplace=True)

ltv['delta_requested_at'].fillna(ltv['delta_requested_at'].mean(), inplace=True)

In [49]:
ltv.head()

Unnamed: 0,driver_id,ride_count,drop_requested_lag,ride_prime_time,price,driving_period,delta_requested_at,unique_days,is_weekday,is_nighter
0,002be0ffdc997bd5c50703158b7c2491,277,6594.516129,48.863636,3560.926071,86.0,8.407942,56.0,0.769675,0.238989
1,007f0389f9c7b03ef97098422f902e62,31,4615.55,52.083333,321.494948,85.0,11.83871,12.0,0.709677,0.0
2,011e5c5dfc5c2c92501b8b24d47509bc,34,7185.041667,42.1875,482.42839,68.0,7.294118,12.0,0.529412,0.058824
3,0152a2f305e71d26cc964f8d4411add9,191,10028.79661,42.708333,2610.871561,64.0,14.198953,42.0,0.796859,0.459686
4,01674381af7edd264113d4e6ed55ecda,375,6633.404372,47.474747,5381.097539,56.0,12.528,40.0,1.0,0.054933


In [0]:
# ltv.describe()

# ltv.isna().sum()

## Questions
* What are the main factors that affect a driver's lifetime value?
* What is the average projected lifetime of a driver? That is, once a driver is
onboarded, how long do they typically continue driving with Lyft?
* Do all drivers act alike? Are there specific segments of drivers that generate more
value for Lyft than the average driver?
* What actionable recommendations are there for the business?

In [0]:
# consider onboard periods only?

In [0]:
# https://blog.hubspot.com/service/how-to-calculate-customer-lifetime-value
ltv['average_ride_val'] = ltv['price'] / ltv['ride_count']

ltv['average_ride_freq'] = ltv['ride_count'] / ltv['unique_days']

ltv['average_val'] = ltv['average_ride_val'] * ltv['average_ride_freq']

ltv['ltv'] = ltv['average_val'] * 365 / churn_rate

In [53]:
base_metrics = ['ride_count', 'price', 'unique_days', 'ltv']

ltv[['driver_id'] + base_metrics].sort_values(by='ltv', ascending=False).head()

Unnamed: 0,driver_id,ride_count,price,unique_days,ltv
412,6b65c06851e944351dd285a1eb729499,718,10708.94377,45.0,301543.348024
217,3788dc9e91f1548816ce8b5af07ddadc,783,12304.887351,56.0,278423.086246
156,297e507cc9f1da096e51c2223657255b,588,8657.558363,40.0,274252.796465
318,51b528390e8e7780595f6009b1f4cf72,240,3018.980483,14.0,273242.278251
426,6eb2e0f41fc9f1fbb70e7751035fdf87,35,421.815299,2.0,267243.997669


The main factors that affect lifetime value (LTV) are the number of rides a driver has completed, `ride_count`, total revenue generated by the driver, `price` and the number of days the driver worked, `unique_days`.

In [54]:
last_activity_df['onboard_period'] = last_activity_df['driving_period']
last_activity_df.loc[last_activity_df['last_activity'] < 14, 'onboard_period'] = 90 # unable to decide the last day for drivers

1 / churn_rate

3.4715447154471546

We predict the average lifetime for drivers is approximately 3.5 years.

In [0]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

In [56]:
all_metrics = [col for col in ltv.columns if col not in ['driver_id']]

all_metrics

['ride_count',
 'drop_requested_lag',
 'ride_prime_time',
 'price',
 'driving_period',
 'delta_requested_at',
 'unique_days',
 'is_weekday',
 'is_nighter',
 'average_ride_val',
 'average_ride_freq',
 'average_val',
 'ltv']

In [57]:
# only fit base_metrics for KMeans
lvt_matrix = ltv[base_metrics].values

CLUSTERS = 3

kmeans = KMeans(n_clusters=CLUSTERS, init='k-means++', n_init=40)
kmeans.fit(lvt_matrix)

clusters = kmeans.predict(lvt_matrix)

print("The average silhouette_score is :", silhouette_score(lvt_matrix, clusters))

The average silhouette_score is : 0.5658765040390985


In [58]:
pd.Series(clusters).value_counts()

0    401
2    375
1    161
dtype: int64

In [59]:
ltv['clusters'] = clusters

scale_max = ltv[all_metrics].max().values

scale_max

array([9.19000000e+02, 3.85809615e+04, 1.46875000e+02, 1.23502881e+04,
       9.00000000e+01, 4.12500000e+01, 8.10000000e+01, 1.00000000e+00,
       9.03448276e-01, 2.59015036e+01, 1.75000000e+01, 2.37976528e+02,
       3.01543348e+05])

In [0]:
import plotly_express as px

In [62]:
for i in range(3):
    radar = pd.DataFrame(ltv[ltv['clusters'] == i][all_metrics].mean()).reset_index()

    radar['r'] = radar.loc[:, 0] / scale_max

    radar.rename(columns={'index': 'theta'}, inplace=True)

    print('Cluster {}'.format(str(i)))
    fig = px.line_polar(radar, r='r', theta='theta', line_close=True)
    fig.show()

Cluster 0


Cluster 1


Cluster 2
