In [None]:
from gdc.cms.data_access import *
from gdc.utils import filter_data, is_greater_than
import seaborn as sns

import numpy as np

In [2]:
df_merged_payments.describe()

Unnamed: 0,payments_2008,payments_2009,payments_2010
count,116352.0,114538.0,112754.0
mean,3998.503163,4297.398069,2521.582826
std,9847.702381,8551.804491,5703.427942
min,-430.0,-1320.0,-1770.0
25%,0.0,130.0,70.0
50%,940.0,1530.0,880.0
75%,3040.0,4230.0,2290.0
max,170190.0,154970.0,116910.0


In [23]:
(df_merged_payments < 0).mean(axis=0)

payments_2008    0.000026
payments_2009    0.000017
payments_2010    0.000120
dtype: float64

In [24]:
(df_merged_payments == 0).mean(axis=0)

payments_2008    0.263794
payments_2009    0.194255
payments_2010    0.205128
dtype: float64

In [25]:
(df_merged_payments < 100).mean(axis=0)

payments_2008    0.298456
payments_2009    0.234332
payments_2010    0.254125
dtype: float64

In [3]:
df_merged_payments.corr()

Unnamed: 0,payments_2008,payments_2009,payments_2010
payments_2008,1.0,0.342798,0.170055
payments_2009,0.342798,1.0,0.200042
payments_2010,0.170055,0.200042,1.0


In [4]:
filter_data(
    df_merged_payments,
    {'payments_2008': is_greater_than(5000)}).mean()

payments_2008    18521.044859
payments_2009    10605.247216
payments_2010     4826.262663
dtype: float64

In [5]:
def relevant_cols(kw, df=df_merged_covariates):
    return [c for c in df.columns if c.find(kw) > -1]

relevant_cols(CB.cc_cancer)

['SP_CNCR_2_2008', 'SP_CNCR_2_2009', 'SP_CNCR_2_2010']

In [6]:
df_merged_covariates[relevant_cols(CB.cc_cancer)].corr()

Unnamed: 0,SP_CNCR_2_2008,SP_CNCR_2_2009,SP_CNCR_2_2010
SP_CNCR_2_2008,1.0,0.429547,0.317119
SP_CNCR_2_2009,0.429547,1.0,0.28955
SP_CNCR_2_2010,0.317119,0.28955,1.0


In [7]:
df_merged_covariates[relevant_cols(CB.cc_esrd_indicator)].corr()

Unnamed: 0,BENE_ESRD_IND_Y_2008,BENE_ESRD_IND_Y_2009,BENE_ESRD_IND_Y_2010
BENE_ESRD_IND_Y_2008,1.0,0.227481,0.172674
BENE_ESRD_IND_Y_2009,0.227481,1.0,0.189684
BENE_ESRD_IND_Y_2010,0.172674,0.189684,1.0


In [8]:
df_merged_covariates[relevant_cols(CB.cc_diabetes)].corr()

Unnamed: 0,SP_DIABETES_2_2008,SP_DIABETES_2_2009,SP_DIABETES_2_2010
SP_DIABETES_2_2008,1.0,0.533018,0.394265
SP_DIABETES_2_2009,0.533018,1.0,0.400199
SP_DIABETES_2_2010,0.394265,0.400199,1.0


In [9]:
relevant_cols('2008', df_merged_payments)

['payments_2008']

In [None]:
year = str(2009)
y = df_merged_payments[relevant_cols(year, df_merged_payments)]
X = sm.add_constant(df_merged_covariates[relevant_cols(year)])

model = sm.OLS(y, X, missing='drop')
res = model.fit()
res.summary()

In [12]:
res.resid

DESYNPUF_ID
57FC84F24745CFA6    -1638.462702
0B582944B8C71C58       53.831037
8272FB0B28E248CC    44818.028967
9249DE518539A0D1    -7464.193772
52C2A0117A675ADB      428.936513
                        ...     
4658E62CA16335E3      388.563478
D7BE91E25C6F02F4       53.831037
350BE146D5D17455    -6740.519716
F245C5C422BC61F6      379.471243
3CF456D138A9C938      200.188731
Length: 112754, dtype: float64

In [26]:
y.values[:,0]

array([ 3000.,     0., 59120., ...,  1900.,     0.,     0.],
      shape=(112754,))

In [13]:
res.predict()

array([ 4638.4627021 ,   -53.83103678, 14301.97103256, ...,
        8640.51971561,  -379.47124285,  -200.1887313 ], shape=(112754,))

In [20]:
y.values[:,0] - res.predict()

array([-1638.4627021 ,    53.83103678, 44818.02896744, ...,
       -6740.51971561,   379.47124285,   200.1887313 ], shape=(112754,))

### Other predictive models

In [42]:
year = str(2008)
y = df_merged_payments[relevant_cols(year, df_merged_payments)[0]]
X = sm.add_constant(df_merged_covariates[relevant_cols(year)])

In [43]:
# --- 1) Usage model: P(y > 0 | X) ---
use = (y > 0).astype(int)

res_use = sm.Logit(use, X, missing="drop").fit(disp=False)

# predicted probability of positive spending
p_use = pd.Series(res_use.predict(X), index=X.index, name="p_use")

In [48]:
# --- 2) Amount model among users: E[y | y > 0, X] ---
mask_pos = (y > 0) & y.notna()

y_pos = y.loc[mask_pos]
X_pos = X.loc[mask_pos]

y_pos_ihs = np.arcsinh(y_pos)

res_amt = sm.OLS(y_pos_ihs, X_pos, missing="drop").fit()
res_amt.summary()

0,1,2,3
Dep. Variable:,payments_2008,R-squared:,0.541
Model:,OLS,Adj. R-squared:,0.541
Method:,Least Squares,F-statistic:,6729.0
Date:,"Thu, 15 Jan 2026",Prob (F-statistic):,0.0
Time:,06:25:25,Log-Likelihood:,-128050.0
No. Observations:,85656,AIC:,256100.0
Df Residuals:,85640,BIC:,256300.0
Df Model:,15,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,11.8270,0.031,378.906,0.000,11.766,11.888
PLAN_CVRG_MOS_NUM_2008,-0.0142,0.001,-21.860,0.000,-0.015,-0.013
age_2008,0.0017,0.000,5.723,0.000,0.001,0.002
BENE_ESRD_IND_Y_2008,0.2825,0.014,20.348,0.000,0.255,0.310
SP_ALZHDMTA_2_2008,-0.3092,0.009,-33.903,0.000,-0.327,-0.291
SP_CHF_2_2008,-0.4299,0.009,-48.861,0.000,-0.447,-0.413
SP_CHRNKIDN_2_2008,-0.6123,0.011,-55.836,0.000,-0.634,-0.591
SP_CNCR_2_2008,-0.6451,0.013,-47.994,0.000,-0.671,-0.619
SP_COPD_2_2008,-0.5271,0.011,-49.571,0.000,-0.548,-0.506

0,1,2,3
Omnibus:,1372.448,Durbin-Watson:,1.991
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2119.864
Skew:,-0.17,Prob(JB):,0.0
Kurtosis:,3.692,Cond. No.,646.0


In [49]:
# predicted IHS amount among users
yhat_pos_ihs = pd.Series(res_amt.predict(X_pos), index=y_pos.index, name="yhat_pos_ihs")

# back-transform to dollars (conditional on use)
yhat_pos = pd.Series(np.sinh(yhat_pos_ihs), index=y_pos.index, name="yhat_pos")

In [50]:
# --- 3) Combine: overall predicted spending (dollars) ---
# overall E[y|X] = P(use|X) * E[y|use,X]

yhat_allpos_ihs = pd.Series(res_amt.predict(X), index=X.index, name="yhat_allpos_ihs")
yhat_allpos = pd.Series(np.sinh(yhat_allpos_ihs), index=X.index, name="yhat_allpos")

yhat = (p_use * yhat_allpos).rename("yhat")  # overall predicted spending for everyone

In [51]:
# --- 4) Residuals ---
# Dollar residuals for everyone (works with negative y too)
resid_dollars = (y - yhat).rename("resid_dollars")


In [52]:
# --- 5) Package outputs ---
out = pd.DataFrame({
    "y": y,
    "p_use": p_use,
    "yhat_allpos": yhat_allpos,   # predicted amount among users, for everyone
    "yhat": yhat,                 # overall predicted spending
    "resid_dollars": resid_dollars,
})



In [53]:
def adjusted_r2(out, k):
    """
    Compute adjusted R^2 treating yhat as a black-box predictor.

    Parameters
    ----------
    y : array-like or pd.Series
        True outcome (levels, dollars)
    yhat : array-like or pd.Series
        Predicted outcome (levels, dollars)
    k : int
        Number of estimated parameters (effective degrees of freedom)

    Returns
    -------
    adj_r2 : float
    """
    y = out['y']
    yhat = out['yhat']
    
    y = pd.Series(y)
    yhat = pd.Series(yhat)

    mask = y.notna() & yhat.notna()
    y = y[mask]
    yhat = yhat[mask]

    n = len(y)
    if n <= k + 1:
        raise ValueError("Not enough observations for adjusted R^2")

    ssr = np.sum((y - yhat) ** 2)
    sst = np.sum((y - y.mean()) ** 2)

    r2 = 1 - ssr / sst
    adj_r2 = 1 - (1 - r2) * (n - 1) / (n - k - 1)

    return adj_r2

In [54]:
adjusted_r2(out,0)

np.float64(0.3405284745194038)

### random forest

In [56]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=123
)


In [59]:
from sklearn.ensemble import RandomForestRegressor

rf = RandomForestRegressor(
    n_estimators=800,
    max_depth=None,
    min_samples_leaf=50,      # critical for cost data
    max_features="sqrt",
    n_jobs=-1,
    random_state=123,
    oob_score=True
)

rf.fit(X_train, y_train)


0,1,2
,n_estimators,800
,criterion,'squared_error'
,max_depth,
,min_samples_split,2
,min_samples_leaf,50
,min_weight_fraction_leaf,0.0
,max_features,'sqrt'
,max_leaf_nodes,
,min_impurity_decrease,0.0
,bootstrap,True


In [60]:
from sklearn.metrics import r2_score, mean_absolute_error

y_pred = rf.predict(X_test)

print("R2:", r2_score(y_test, y_pred))
print("MAE ($):", mean_absolute_error(y_test, y_pred))


R2: 0.40165710854311587
MAE ($): 3072.4263989619453


In [61]:
top = np.percentile(y_test, 90)
capture = y_test[y_pred >= np.percentile(y_pred, 90)].mean() / y_test.mean()
print("Top-decile capture ratio:", capture)


Top-decile capture ratio: 4.943727871480036


#### With concave transform before estimation

In [63]:
y_train_t = np.arcsinh(y_train)
y_test_t  = np.arcsinh(y_test)

rf.fit(X_train, y_train_t)

y_pred_t = rf.predict(X_test)
y_pred = np.sinh(y_pred_t)    # back to dollars

print("R2 (IHS):", r2_score(y_test_t, y_pred_t))
print("R2 ($):", r2_score(y_test, y_pred))
print("MAE ($):", mean_absolute_error(y_test, y_pred))


R2 (IHS): 0.7049390793565182
R2 ($): 0.30076434411712294
MAE ($): 2744.8142975308283


In [64]:
from sklearn.model_selection import RandomizedSearchCV

param_grid = {
    "max_depth": [None, 10, 20, 30],
    "min_samples_leaf": [20, 50, 100],
    "max_features": ["sqrt", 0.3, 0.5]
}

search = RandomizedSearchCV(
    rf,
    param_grid,
    n_iter=20,
    scoring="neg_mean_absolute_error",
    cv=3,
    n_jobs=-1
)

search.fit(X_train, y_train)
rf_best = search.best_estimator_


In [None]:
y_train_t = np.arcsinh(y_train)
y_test_t  = np.arcsinh(y_test)

rf.fit(X_train, y_train_t)

y_pred_t = rf.predict(X_test)
y_pred = np.sinh(y_pred_t)    # back to dollars

print("R2 (IHS):", r2_score(y_test_t, y_pred_t))
print("R2 ($):", r2_score(y_test, y_pred))
print("MAE ($):", mean_absolute_error(y_test, y_pred))
