In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

# Load Data

In [4]:
df = pd.read_csv('data/driver_feature.csv')
df.head()

Unnamed: 0,driver_id,driver_onboard_date,first_drive,last_drive,num_drive,day_drive,min_ride_distance,max_ride_distance,total_ride_distance,min_ride_duration,...,min_earning_ride,max_earning_ride,earnings_per_active_day,ride_freq_during_active,avg_primetime_multiplier,num_rides_in_1st_30days,n_days_after_onboarding,n_days_after_last_drive,n_days_to_first_drive,churn
0,002be0ffdc997bd5c50703158b7c2491,2016-03-29 00:00:00.000000 UTC,2016-03-29,2016-06-23,277,277,606,109289,1740287,117,...,4.0,78.746694,50.870315,3.22093,19.404332,112,89.0,3.0,0.0,0
1,007f0389f9c7b03ef97098422f902e62,2016-03-29 00:00:00.000000 UTC,2016-03-29,2016-06-22,31,31,548,10028,117531,200,...,4.99361,14.602911,21.432978,0.364706,20.16129,11,89.0,4.0,0.0,0
2,011e5c5dfc5c2c92501b8b24d47509bc,2016-04-05 00:00:00.000000 UTC,2016-04-05,2016-06-12,34,34,1148,86080,269653,223,...,4.310401,65.760606,32.161853,0.5,19.852941,12,82.0,14.0,0.0,0
3,0152a2f305e71d26cc964f8d4411add9,2016-04-23 00:00:00.000000 UTC,2016-04-25,2016-06-26,191,191,22,31789,1471239,24,...,4.0,29.293182,52.217368,3.080645,10.732984,56,64.0,0.0,2.0,0
4,01674381af7edd264113d4e6ed55ecda,2016-04-29 00:00:00.000000 UTC,2016-04-29,2016-06-24,375,375,673,72609,3123644,145,...,4.0,54.34029,107.621815,6.696429,12.533333,188,58.0,2.0,0.0,0


In [7]:
df['first_drive'] = pd.to_datetime(df['first_drive'])
df['last_drive'] = pd.to_datetime(df['last_drive'])

df['total_tenure'] = (df['last_drive'] - df['first_drive']).dt.days

In [10]:
df['avg_rides_per_day'] = df['num_drive'] / ((df['last_drive'] - df['first_drive']).dt.days + 1)

In [11]:
df['LTV'] = df['avg_earning_per_ride'] * df['avg_rides_per_day'] * df['total_tenure']

In [8]:
df.columns

Index(['driver_id', 'driver_onboard_date', 'first_drive', 'last_drive',
       'num_drive', 'day_drive', 'min_ride_distance', 'max_ride_distance',
       'total_ride_distance', 'min_ride_duration', 'max_ride_duration',
       'total_ride_duration', 'avg_distance_per_day', 'avg_duration_per_day',
       'avg_distance_per_drive', 'avg_duration_per_drive',
       'min_acceptance_duration', 'max_acceptance_duration',
       'avg_acceptance_duration', 'min_arrival_duration',
       'max_arrival_duration', 'avg_arrival_duration', 'min_wait_duration',
       'max_wait_duration', 'avg_wait_duration', 'avg_earning_per_ride',
       'min_earning_ride', 'max_earning_ride', 'earnings_per_active_day',
       'ride_freq_during_active', 'avg_primetime_multiplier',
       'num_rides_in_1st_30days', 'n_days_after_onboarding',
       'n_days_after_last_drive', 'n_days_to_first_drive', 'churn',
       'total_tenure'],
      dtype='object')

In [13]:
driver_data = df[['avg_distance_per_drive','avg_duration_per_drive','avg_arrival_duration','avg_wait_duration','ride_freq_during_active','avg_primetime_multiplier', 'LTV']]

In [16]:
driver_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 937 entries, 0 to 936
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   avg_distance_per_drive    937 non-null    float64
 1   avg_duration_per_drive    937 non-null    float64
 2   avg_arrival_duration      844 non-null    float64
 3   avg_wait_duration         844 non-null    float64
 4   ride_freq_during_active   937 non-null    float64
 5   avg_primetime_multiplier  937 non-null    float64
 6   LTV                       844 non-null    float64
dtypes: float64(7)
memory usage: 51.4 KB


In [18]:
driver_data.dropna(subset=['avg_arrival_duration','avg_wait_duration', 'LTV'], inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  driver_data.dropna(subset=['avg_arrival_duration','avg_wait_duration', 'LTV'], inplace=True)


In [19]:
driver_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 844 entries, 0 to 936
Data columns (total 7 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   avg_distance_per_drive    844 non-null    float64
 1   avg_duration_per_drive    844 non-null    float64
 2   avg_arrival_duration      844 non-null    float64
 3   avg_wait_duration         844 non-null    float64
 4   ride_freq_during_active   844 non-null    float64
 5   avg_primetime_multiplier  844 non-null    float64
 6   LTV                       844 non-null    float64
dtypes: float64(7)
memory usage: 52.8 KB


In [22]:
driver_data.corr()['LTV'].sort_values()

avg_arrival_duration       -0.222126
avg_distance_per_drive     -0.050489
avg_duration_per_drive      0.090370
avg_primetime_multiplier    0.218600
avg_wait_duration           0.278651
ride_freq_during_active     0.685345
LTV                         1.000000
Name: LTV, dtype: float64

In [20]:
X = driver_data.drop(columns='LTV')
y = driver_data['LTV']

In [21]:
X = sm.add_constant(X)
model = sm.OLS(y, X).fit()
print(model.summary())

coefficients = model.params
print("\nCoefficients:")
print(coefficients)

                            OLS Regression Results                            
Dep. Variable:                    LTV   R-squared:                       0.535
Model:                            OLS   Adj. R-squared:                  0.532
Method:                 Least Squares   F-statistic:                     160.5
Date:                Thu, 04 Jul 2024   Prob (F-statistic):          1.63e-135
Time:                        03:02:55   Log-Likelihood:                -7248.3
No. Observations:                 844   AIC:                         1.451e+04
Df Residuals:                     837   BIC:                         1.454e+04
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   