In [1]:
from pathlib import Path
import pandas as pd 
import numpy as np
import pingouin as pg
from dowhy import CausalModel
import statsmodels.api as sm

import warnings
warnings.filterwarnings('ignore')

import modules

pd.set_option('display.max_columns', None)

### Data preprocessing (Training)

In [2]:
mode = 'Train' # 'Train' or 'Test'
    
trn_rts_df = modules.data_prep(mode)

res_df = modules.strat_folds(trn_rts_df, n_folds=5, stratify_cols=["day_of_week", "time_of_day"])

print(len(res_df))
res_df.head()


199805


Unnamed: 0,date_of_trip,day_of_week,time_of_day,PULocationID,DOLocationID,uber_vol,uber_wait_ratio,uber_pay_ratio,uber_fare_per_mile,uber_pay_per_mile,uber_adj_pay_per_mile,uber_rev_pos,uber_rev_per_mile,uber_trip_dur,uber_wait_dur,lyft_vol,lyft_wait_ratio,lyft_pay_ratio,lyft_fare_per_mile,lyft_pay_per_mile,lyft_adj_pay_per_mile,lyft_rev_pos,lyft_rev_per_mile,lyft_trip_dur,lyft_wait_dur,tot_vol,uber_pay,uber_fare,lyft_pay,lyft_fare,uber_wait,lyft_wait,lyft_share,fold
0,2024-01-01,0,2,7,7,41,1.085918,0.668816,11.898381,4.188248,4.188248,0.95122,4.076855,4.982439,5.090488,28,0.472643,0.961423,9.712873,6.518245,6.518245,0.571429,0.516913,6.850714,2.7325,69,4.188248,11.898381,6.518245,9.712873,1.085918,0.472643,0.405797,2
1,2024-01-01,0,2,7,129,18,0.473235,0.774078,6.144271,3.362486,3.362486,0.944444,1.38211,13.285,5.641111,19,0.207341,0.939334,5.407455,4.21966,4.21966,0.578947,0.405809,10.046316,1.936316,37,3.362486,6.144271,4.21966,5.407455,0.473235,0.207341,0.513514,3
2,2024-01-01,0,2,7,179,16,0.787224,0.713398,8.809631,3.641561,3.641561,0.9375,2.649557,7.094375,5.09625,15,0.342444,1.012388,8.523135,6.561819,6.561819,0.4,-0.035974,5.872,1.76,31,3.641561,8.809631,6.561819,8.523135,0.787224,0.342444,0.483871,0
3,2024-01-01,0,2,7,223,34,0.833741,0.614408,8.230382,2.841334,2.841334,0.970588,3.350593,8.185882,5.561471,16,0.303613,0.926691,7.988269,5.790445,5.790445,0.6875,0.670853,7.8825,2.12875,50,2.841334,8.230382,5.790445,7.988269,0.833741,0.303613,0.32,3
4,2024-01-01,0,2,7,265,14,0.194777,0.649962,4.51729,2.305692,2.305692,1.0,1.813762,40.161429,7.330714,12,0.059844,0.813631,3.332146,2.489306,2.489306,0.916667,0.704673,34.518333,2.03,26,2.305692,4.51729,2.489306,3.332146,0.194777,0.059844,0.461538,1


### Nuisance Function Models (Cross Validation)

In [3]:
res_df, ufare_report, ufare_model = modules.xgb_coeffs(
    df=res_df,
    fold_col="fold",
    objective='reg:squarederror',
    predictor_cols=["uber_wait", "lyft_wait"],
    target_col="uber_fare",
    learning_rate=0.03,
    max_depth=5,
    n_estimators=800,
)

res_df, lfare_report, lfare_model = modules.xgb_coeffs(
    df=res_df,
    fold_col="fold",
    objective='reg:squarederror',
    predictor_cols=["uber_wait", "lyft_wait"],
    target_col="lyft_fare",
    learning_rate=0.03,
    max_depth=5,
    n_estimators=1000,
)

res_df, lshare_report, lshare_model = modules.xgb_coeffs(
    df=res_df,
    fold_col="fold",
    objective='reg:squarederror',
    predictor_cols=["uber_wait", "lyft_wait"],
    target_col="lyft_share",
    learning_rate=0.03,
    max_depth=5,
    n_estimators=800,
)

### Orthogonal Regression Model (Cross Validation)

In [4]:
results = modules.fit_linear_regression(
    df=res_df,
    predictor_cols=['uber_fare_error', 'lyft_fare_error'],
    target_col='lyft_share_error',
    intercept=False
)

print(results.summary())


                                 OLS Regression Results                                
Dep. Variable:       lyft_share_error   R-squared (uncentered):                   0.046
Model:                            OLS   Adj. R-squared (uncentered):              0.046
Method:                 Least Squares   F-statistic:                              4805.
Date:                Thu, 05 Feb 2026   Prob (F-statistic):                        0.00
Time:                        20:04:22   Log-Likelihood:                      2.2033e+05
No. Observations:              199805   AIC:                                 -4.407e+05
Df Residuals:                  199803   BIC:                                 -4.406e+05
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                      coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------

### Nuisance Function Models (Training)

In [5]:
mod_df, lyft_fare_model = modules.xgb_predictor(
    df=trn_rts_df,
    objective='reg:squarederror',
    predictor_cols=["uber_wait", "lyft_wait"],
    target_col="lyft_fare",
    new_col="lyft_fare_pred",
    learning_rate=0.03,
    max_depth=5,
    n_estimators=1000
)

mod_df, uber_fare_model = modules.xgb_predictor(
    df=mod_df,
    objective='reg:squarederror',
    predictor_cols=["uber_wait", "lyft_wait"],
    target_col="uber_fare",
    new_col="uber_fare_pred",
    learning_rate=0.03,
    max_depth=5,
    n_estimators=800
)

mod_df, lyft_share_model = modules.xgb_predictor(
    df=mod_df,
    objective='reg:squarederror',
    predictor_cols=["uber_wait", "lyft_wait"],
    target_col="lyft_share",
    new_col="lyft_share_pred",
    learning_rate=0.03,
    max_depth=5,
    n_estimators=800,
)

mod_df['uber_fare_error'] = mod_df['uber_fare'] - mod_df['uber_fare_pred']
mod_df['lyft_fare_error'] = mod_df['lyft_fare'] - mod_df['lyft_fare_pred']
mod_df['lyft_share_error'] = mod_df['lyft_share'] - mod_df['lyft_share_pred']

### Orthogonal Regression Model (Training)

In [6]:
ols_model, results = modules.lin_regression(
    df=mod_df,
    predictor_cols=['uber_fare_error', 'lyft_fare_error'],
    target_col='lyft_share_error',
)

print(results.summary())



                                 OLS Regression Results                                
Dep. Variable:       lyft_share_error   R-squared (uncentered):                   0.047
Model:                            OLS   Adj. R-squared (uncentered):              0.047
Method:                 Least Squares   F-statistic:                              4883.
Date:                Thu, 05 Feb 2026   Prob (F-statistic):                        0.00
Time:                        20:04:25   Log-Likelihood:                      2.2244e+05
No. Observations:              199805   AIC:                                 -4.449e+05
Df Residuals:                  199803   BIC:                                 -4.449e+05
Df Model:                           2                                                  
Covariance Type:            nonrobust                                                  
                      coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------

### Data preprocessing (Test)

In [7]:
mode = 'Test' # 'Train' or 'Test'

tst_rts_df = modules.data_prep(mode)

### Predicting baseline lyft share (using trained nuisance functions based on uber wait dur and lyft wait dur)

In [8]:
res_df, lyft_fare_model = modules.xgb_predictor(
    df=tst_rts_df,
    objective='reg:squarederror',
    predictor_cols=["uber_wait", "lyft_wait"],
    target_col="lyft_fare",
    model = lyft_fare_model,
    new_col="lyft_fare_pred",
    learning_rate=0.03,
    max_depth=5,
    n_estimators=1000
)

res_df, uber_fare_model = modules.xgb_predictor(
    df=res_df,
    objective='reg:squarederror',
    predictor_cols=["uber_wait", "lyft_wait"],
    target_col="lyft_fare",
    model = uber_fare_model,
    new_col="uber_fare_pred",
    learning_rate=0.03,
    max_depth=5,
    n_estimators=800
)

res_df, lyft_share_model = modules.xgb_predictor(
    df=res_df,
    objective='reg:squarederror',
    predictor_cols=["uber_wait", "lyft_wait"],
    target_col="lyft_share",
    model = lyft_share_model,
    new_col="lyft_share_pred",
    learning_rate=0.03,
    max_depth=5,
    n_estimators=800,
)

### Predicting causal lyft share (after using the uber fare per mile, lyft fare per mile)

In [9]:
res_df['lyft_fare_residual'] = res_df['lyft_fare'] -  res_df['lyft_fare_pred']
res_df['uber_fare_residual'] = res_df['uber_fare'] -  res_df['uber_fare_pred']

# Step 3: Assemble prediction
res_df['lyft_share_pred_causal'] = res_df['lyft_share_pred'] + (results.params['uber_fare_error'] * res_df['uber_fare_residual']) + (results.params['lyft_fare_error'] * res_df['lyft_fare_residual'])

res_df.head()

Unnamed: 0,date_of_trip,day_of_week,time_of_day,PULocationID,DOLocationID,uber_vol,uber_wait_ratio,uber_pay_ratio,uber_fare_per_mile,uber_pay_per_mile,uber_adj_pay_per_mile,uber_rev_pos,uber_rev_per_mile,uber_trip_dur,uber_wait_dur,lyft_vol,lyft_wait_ratio,lyft_pay_ratio,lyft_fare_per_mile,lyft_pay_per_mile,lyft_adj_pay_per_mile,lyft_rev_pos,lyft_rev_per_mile,lyft_trip_dur,lyft_wait_dur,tot_vol,uber_pay,uber_fare,lyft_pay,lyft_fare,uber_wait,lyft_wait,lyft_share,lyft_fare_pred,uber_fare_pred,lyft_share_pred,lyft_fare_residual,uber_fare_residual,lyft_share_pred_causal
0,2025-01-01,2,2,7,7,39,1.008489,0.686902,10.464277,3.685993,3.685993,0.948718,3.30167,5.438718,4.785641,23,0.397376,0.943194,9.715177,6.61542,6.61542,0.521739,0.608063,6.393043,2.27087,62,3.685993,10.464277,6.61542,9.715177,1.008489,0.397376,0.370968,12.712036,13.899877,0.360623,-2.996859,-3.4356,0.382549
1,2025-01-01,2,2,7,138,38,0.458189,0.78578,4.74979,2.498394,2.498394,0.868421,1.233755,10.145526,4.503421,14,0.189548,0.782308,4.378333,2.913213,2.913213,0.928571,1.001036,10.477143,1.792857,52,2.498394,4.74979,2.913213,4.378333,0.458189,0.189548,0.269231,9.582954,10.69711,0.293122,-5.204621,-5.94732,0.331281
2,2025-01-01,2,2,7,223,39,0.639487,0.797748,8.156095,3.999705,3.999705,0.871795,1.745035,8.725641,4.912821,14,0.314251,0.810102,7.319902,4.602183,4.602183,0.714286,1.482602,8.327143,2.356429,53,3.999705,8.156095,4.602183,7.319902,0.639487,0.314251,0.264151,10.535579,11.685065,0.31705,-3.215677,-3.52897,0.341223
3,2025-01-01,2,2,10,132,27,0.661967,0.827803,4.381039,2.052732,2.052732,0.740741,1.085482,8.964444,5.705556,18,0.339722,0.680332,4.783655,2.520645,2.520645,1.0,1.574531,9.950556,3.105556,45,2.052732,4.381039,2.520645,4.783655,0.661967,0.339722,0.4,10.444571,11.111013,0.317606,-5.660917,-6.729974,0.358038
4,2025-01-01,2,2,17,17,13,0.899227,0.773829,11.184254,4.901926,4.901926,0.769231,2.783407,11.397692,4.098462,17,0.407482,1.056017,9.576977,7.277742,7.277742,0.294118,-0.2221,5.924118,2.435882,30,4.901926,11.184254,7.277742,9.576977,0.899227,0.407482,0.566667,11.005808,12.31707,0.340012,-1.428831,-1.132816,0.352539


### Evaluating RMSE and R2 for the final lyft share

In [10]:
from sklearn.metrics import mean_squared_error, r2_score


# Step 4: Evaluate
rmse = mean_squared_error(res_df['lyft_share'], res_df['lyft_share_pred_causal'])
r2 = r2_score(res_df['lyft_share'], res_df['lyft_share_pred_causal'])

print(f'RMSE: {round(rmse, 3)}, R2: {(round(r2,3))}')

RMSE: 0.007, R2: 0.034
