In [51]:
from sklearn.metrics import r2_score, mean_absolute_error

from gdc.cms.data_access import *
from gdc.utils import filter_data, is_greater_than
import seaborn as sns

import numpy as np

In [52]:
df_merged_payments.describe()

Unnamed: 0,payments_2008,payments_2009,payments_2010
count,116352.0,114538.0,112754.0
mean,3998.503163,4297.398069,2521.582826
std,9847.702381,8551.804491,5703.427942
min,-430.0,-1320.0,-1770.0
25%,0.0,130.0,70.0
50%,940.0,1530.0,880.0
75%,3040.0,4230.0,2290.0
max,170190.0,154970.0,116910.0


In [53]:
(df_merged_payments < 0).mean(axis=0)

payments_2008    0.000026
payments_2009    0.000017
payments_2010    0.000120
dtype: float64

In [54]:
(df_merged_payments == 0).mean(axis=0)

payments_2008    0.263794
payments_2009    0.194255
payments_2010    0.205128
dtype: float64

In [55]:
(df_merged_payments < 100).mean(axis=0)

payments_2008    0.298456
payments_2009    0.234332
payments_2010    0.254125
dtype: float64

In [56]:
df_merged_payments.corr()

Unnamed: 0,payments_2008,payments_2009,payments_2010
payments_2008,1.0,0.342798,0.170055
payments_2009,0.342798,1.0,0.200042
payments_2010,0.170055,0.200042,1.0


In [57]:
filter_data(
    df_merged_payments,
    {'payments_2008': is_greater_than(5000)}).mean()

payments_2008    18521.044859
payments_2009    10605.247216
payments_2010     4826.262663
dtype: float64

In [58]:
relevant_cols(CB.cc_cancer)

['SP_CNCR_2_2008', 'SP_CNCR_2_2009', 'SP_CNCR_2_2010']

In [59]:
df_merged_covariates[relevant_cols(CB.cc_cancer)].corr()

Unnamed: 0,SP_CNCR_2_2008,SP_CNCR_2_2009,SP_CNCR_2_2010
SP_CNCR_2_2008,1.0,0.429547,0.317119
SP_CNCR_2_2009,0.429547,1.0,0.28955
SP_CNCR_2_2010,0.317119,0.28955,1.0


In [60]:
df_merged_covariates[relevant_cols(CB.cc_esrd_indicator)].corr()

Unnamed: 0,BENE_ESRD_IND_Y_2008,BENE_ESRD_IND_Y_2009,BENE_ESRD_IND_Y_2010
BENE_ESRD_IND_Y_2008,1.0,0.227481,0.172674
BENE_ESRD_IND_Y_2009,0.227481,1.0,0.189684
BENE_ESRD_IND_Y_2010,0.172674,0.189684,1.0


In [61]:
df_merged_covariates[relevant_cols(CB.cc_diabetes)].corr()

Unnamed: 0,SP_DIABETES_2_2008,SP_DIABETES_2_2009,SP_DIABETES_2_2010
SP_DIABETES_2_2008,1.0,0.533018,0.394265
SP_DIABETES_2_2009,0.533018,1.0,0.400199
SP_DIABETES_2_2010,0.394265,0.400199,1.0


In [62]:
relevant_cols('2008', df_merged_payments)

['payments_2008']

In [106]:
year = str(2010)
y = df_merged_payments[relevant_cols(year, df_merged_payments)]
X = sm.add_constant(df_merged_covariates[relevant_cols(year)])
idx = X.dropna().index.intersection(y.dropna().index)
X = X.loc[idx]
y = y.loc[idx]

In [107]:
model = sm.OLS(y, X, missing='drop')
res = model.fit()

In [108]:
print(res.summary())
print('MAE', mean_absolute_error(y, res.predict(X)))

                            OLS Regression Results                            
Dep. Variable:          payments_2010   R-squared:                       0.214
Model:                            OLS   Adj. R-squared:                  0.214
Method:                 Least Squares   F-statistic:                     2052.
Date:                Fri, 16 Jan 2026   Prob (F-statistic):               0.00
Time:                        07:55:43   Log-Likelihood:            -1.1216e+06
No. Observations:              112754   AIC:                         2.243e+06
Df Residuals:                  112738   BIC:                         2.243e+06
Df Model:                          15                                         
Covariance Type:            nonrobust                                         
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                   

In [109]:
# L1 norm

model = sm.QuantReg(y, X, missing='drop')
res = model.fit(q=.5)
print(res.summary())
print('MAE', mean_absolute_error(y, res.predict(X)))

                         QuantReg Regression Results                          
Dep. Variable:          payments_2010   Pseudo R-squared:               0.2207
Model:                       QuantReg   Bandwidth:                       68.95
Method:                 Least Squares   Sparsity:                        591.9
Date:                Fri, 16 Jan 2026   No. Observations:               112754
Time:                        07:55:48   Df Residuals:                   112738
                                        Df Model:                           15
                               coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------------
const                     8109.0011      9.596    845.034      0.000    8090.193    8127.809
PLAN_CVRG_MOS_NUM_2010       6.0000      0.211     28.485      0.000       5.587       6.413
age_2010                  3.345e-09      0.071   4.73e-08      1.000      -

### Other predictive models 

#### Use x [Amt | Use]

In [110]:
# --- 1) Usage model: P(y > 0 | X) ---
use = (y > 0).astype(int)

res_use = sm.Logit(use, X, missing="drop").fit(disp=False)

# predicted probability of positive spending
p_use = pd.Series(res_use.predict(X), index=X.index, name="p_use")

In [111]:
# --- 2) Amount model among users: E[y | y > 0, X] ---
mask_pos = (y > 0).values


y_pos = y.loc[mask_pos]
X_pos = X.loc[mask_pos]

y_pos_ihs = np.arcsinh(y_pos)

# res_amt = sm.OLS(y_pos_ihs, X_pos, missing="drop").fit()
res_amt = sm.QuantReg(y_pos_ihs, X_pos, missing="drop").fit(q=.5)

In [112]:
# predicted IHS amount among users
yhat_pos_ihs = pd.Series(res_amt.predict(X_pos), index=y_pos.index, name="yhat_pos_ihs")

# back-transform to dollars (conditional on use)
yhat_pos = pd.Series(np.sinh(yhat_pos_ihs), index=y_pos.index, name="yhat_pos")

In [113]:
# --- 3) Combine: overall predicted spending (dollars) ---
# overall E[y|X] = P(use|X) * E[y|use,X]

yhat_allpos_ihs = pd.Series(res_amt.predict(X), index=X.index, name="yhat_allpos_ihs")
yhat_allpos = pd.Series(np.sinh(yhat_allpos_ihs), index=X.index, name="yhat_allpos")

yhat = (p_use * yhat_allpos).rename("yhat")  # overall predicted spending for everyone

In [114]:
mean_absolute_error(y, yhat)

1855.90127221372

### random forest

In [115]:

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)


In [116]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(
    n_estimators=800,
    max_depth=None,
    min_samples_leaf=50,      # critical for cost data
    max_features="sqrt",
    n_jobs=-1,
    random_state=123,
    oob_score=True,
    criterion="absolute_error"
)

rf.fit(X_train, y_train)


  return fit_method(estimator, *args, **kwargs)


0,1,2
,n_estimators,800
,criterion,'absolute_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,50
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [117]:

y_pred = rf.predict(X_test)

print("R2:", r2_score(y_test, y_pred))
print("MAE ($):", mean_absolute_error(y_test, y_pred))


R2: 0.1186595526193932
MAE ($): 1775.8144297924703


In [118]:

print("MAE Overall ($):", mean_absolute_error(y, rf.predict(X)))


MAE Overall ($): 1776.9223623441296


In [119]:
top = np.percentile(y_test, 90)
capture = y_test[y_pred >= np.percentile(y_pred, 90)].mean() / y_test.mean()
print("Top-decile capture ratio:", capture)


Top-decile capture ratio: payments_2010    3.362202
dtype: float64
