# Description

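Notebook for the diabetes-screening RDD (regression discontinuity design) discovery analysis. It exercises the end-to-end RDD discovery process: loading data, cleaning, subgroup discovery with a sample split, and extraction of baseline and subgroup data.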

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle
import seaborn as sns
import sys

from tqdm import tqdm

In [None]:
# sample split for TMLR rebuttal
from sklearn.model_selection import train_test_split

In [None]:
# user imports 
sys.path.append("../../")

from rddd.feat import gen_feat_df, get_descriptives
#from rdsgd import *

In [None]:
sys.path.append("/home/liutony/optum-pipeline/notebooks/tmlr/")

import rdsgd

In [None]:
# notebook magics
%load_ext autoreload

%autoreload 2

%matplotlib inline

# Load data

In [None]:
%%time
# Load the merged diabetes table; later cells read its 'patid' and
# 'fst_dt_pre' columns.
diabetes_df = pd.read_parquet("/project/liu_optum_causal_inference/data/diabetes/merge/diabetes.parq")

## Load up rx data


In [None]:
# Directory holding one pickled DataFrame per year/quarter
# (file name pattern: ses_r<year>q<quarter>_rx_dm.df).
rx_path = "/project/liu_optum_causal_inference/archive/threshold-optimization/rx_data/dm_data"

# Inclusive range of years to load.
START_DATE = 2001
END_DATE = 2016

# Collect the per-quarter frames and concatenate once at the end. Growing a
# DataFrame with pd.concat inside the loop re-copies every previously loaded
# row on each iteration, which is quadratic in the number of files.
# NOTE(review): pd.read_pickle can execute arbitrary code embedded in the
# file; only point rx_path at trusted, project-generated extracts.
quarterly_frames = []
for yr in range(START_DATE, END_DATE + 1):
    for q in range(1, 5):
        df = pd.read_pickle(f"{rx_path}/ses_r{yr}q{q}_rx_dm.df")
        # Lower-case the column names so 'patid' and 'fill_dt' can be selected.
        df.columns = df.columns.str.lower()
        quarterly_frames.append(df[['patid', 'fill_dt']])

# pd.concat raises on an empty list, so fall back to an empty frame (what the
# original accumulator would have produced) if the year range is empty.
rx_df = pd.concat(quarterly_frames) if quarterly_frames else pd.DataFrame()


In [None]:
# Cast the patient identifier to int.
# NOTE(review): presumably so it matches the dtype of diabetes_df['patid']
# for the index-on-index merge further down; confirm.
rx_df['patid'] = rx_df['patid'].astype(int)

In [None]:
# Write the combined (patid, fill_dt) table to parquet.
rx_df.to_parquet("/project/liu_optum_causal_inference/data/rx_dm.parquet")
#met_df = pd.read_parquet("/project/liu_optum_causal_inference/data/rx_met.parquet")

In [None]:
# Index the fills by patient id; the merge with dm_id below joins on the index.
rx_df = rx_df.set_index('patid')

In [None]:
# Keep only the patient id and the 'fst_dt_pre' date, which is later compared
# against each fill date.
dm_id = diabetes_df[['patid', 'fst_dt_pre']]

In [None]:
# Index by patient id so dm_id can be joined to rx_df on the index.
dm_id = dm_id.set_index('patid')

In [None]:
# Sort both frames by their patid index before the index-on-index merge.
dm_id = dm_id.sort_index()
rx_df = rx_df.sort_index()

In [None]:
# Left join on patid: every dm_id row is kept, repeated once per matching
# fill; rows with no fill get a missing 'fill_dt'.
dm_met = dm_id.merge(rx_df, left_index=True, right_index=True, how='left')

In [None]:
# Display the (rows, columns) of the merged table.
dm_met.shape

In [None]:
# Count the merged rows that have no fill date.
# NOTE(review): as written this is not the last expression in the cell, so
# the notebook will not display the value.
dm_met['fill_dt'].isna().sum()

# A fill dated strictly before 'fst_dt_pre' counts as a prior prescription.
# Rows with a missing fill date compare as False and drop out.
filled_before_first_date = dm_met['fill_dt'] < dm_met['fst_dt_pre']
prior_rx = dm_met.loc[filled_before_first_date].reset_index()
# Distinct patient ids that have at least one such fill.
prior_rx_patids = prior_rx['patid'].unique()

In [None]:
# 1 if the patient has any fill dated before 'fst_dt_pre', else 0.
diabetes_df['prior_rx_indicator'] = diabetes_df['patid'].isin(prior_rx_patids).astype(int)
# Display how many rows fall in each group.
diabetes_df['prior_rx_indicator'].value_counts()

# Clean data

In [None]:
# Keep rows whose 'fst_dt_pre' is on or before 2017-01-01 and whose patient
# has no prior prescription fill.
on_or_before_cutoff_date = diabetes_df['fst_dt_pre'] <= '2017-01-01'
without_prior_rx = diabetes_df['prior_rx_indicator'] == 0
sel_dm = diabetes_df[on_or_before_cutoff_date & without_prior_rx]

In [None]:
%%time
# Build the feature table from the selected cohort via the project helper
# gen_feat_df, passing 'indicator' and 'lr_fmt' as the RDD columns.
# NOTE(review): a later cell assumes these two columns come first in the
# output (feat_cols = columns[2:]); confirm against gen_feat_df.
#rdd_dm
diabetes_feat = gen_feat_df(sel_dm, rdd_cols=['indicator', 'lr_fmt'], compute_age=False)#, 'loinc_cd', 'diag'])

In [None]:
# Display the (rows, columns) of the feature table.
diabetes_feat.shape

In [None]:
# Display summary statistics of 'lr_fmt', formatted to two decimals.
diabetes_feat['lr_fmt'].describe().to_frame().style.format("{:.2f}")

In [None]:
# Keep only rows whose 'lr_fmt' lies strictly between 0 and 20.
lr_values = diabetes_feat['lr_fmt']
within_range = (lr_values > 0) & (lr_values < 20)
diabetes_feat = diabetes_feat[within_range]

In [None]:
# Every column after the first two is treated as a feature.
# NOTE(review): this relies on column position; presumably the first two
# columns are the RDD columns ('indicator', 'lr_fmt'). Verify.
feat_cols = list(diabetes_feat.columns[2:])
feat_cols

# Run assignment tree discovery

In [None]:
# Work on a copy of the feature table with 'lr_fmt' rounded to one decimal,
# the same resolution as the candidate cutoff grid defined in a later cell.
test_df = diabetes_feat.assign(lr_fmt=diabetes_feat['lr_fmt'].round(1))


In [None]:
import warnings
# Ignore warnings raised from any module whose name starts with "sk".
# The `module` argument is a regex matched against the start of the module name.
warnings.filterwarnings("ignore", module='sk.*')

In [None]:
# Display the feature column names.
feat_cols

In [None]:
%%time
# Candidate cutoffs on the running variable: 6.0, 6.1, ..., 7.5 (16 values).
# np.linspace with an explicit point count replaces np.arange(6, 7.6, 0.1):
# with a fractional step, the number of elements arange returns depends on
# floating-point rounding of (stop - start) / step, so the end point can
# silently appear or disappear.
grid_dict = {
    'lr_fmt': np.round(np.linspace(6.0, 7.5, 16), 1)
}
alpha = 0.05               # passed as `alpha` to the subgroup discovery call
treat = 'indicator'        # treatment column name
running_cols = ['lr_fmt']  # running-variable column name(s)
random_state = 42          # seed for the train/test sample split
bw = 0.4                   # passed as `bw` to discovery and the holdout test
# Restrict the working frame to the feature columns plus the treatment
# indicator and the running variable.
selected_columns = [*feat_cols, 'indicator', 'lr_fmt']
sample_df = test_df[selected_columns]


In [None]:

# Sample splitting: a seeded 50/50 split of the working frame. s1_df is used
# for subgroup discovery; s2_df is the holdout on which the discovered
# subgroups are re-tested.
s1_df, s2_df = train_test_split(sample_df, test_size=0.5, random_state=random_state)

In [None]:

# Run RD subgroup discovery on the first half of the split (s1_df).
# tree_kwargs and random_state are intentionally not passed, so
# rd_subgroup_discovery uses whatever defaults it defines for them.
discovery_kwargs = {
    'running_cols': running_cols,
    'grid_dict': grid_dict,
    'treat': treat,
    'alpha': alpha,
    'rescale': False,
    'bw': bw,
    'omit_mask': True,
    'kernel': 'rectangular',
}
subgroup_dict, num_tests = rdsgd.rd_subgroup_discovery(s1_df, **discovery_kwargs)

In [None]:
# update llr_results with holdout
for cutoff, nodes in subgroup_dict['lr_fmt'].items():
    for node in nodes:
        rule_path = node['rule_path']
        holdout = s2_df.copy()
        for rule in rule_path[:-1]:
            if rule.path_dir == '<':
                holdout = holdout[holdout[rule.feature] < rule.threshold]
            elif rule.path_dir == '>=':
                holdout = holdout[holdout[rule.feature] >= rule.threshold]
            elif rule.path_dir == '<=':
                holdout = holdout[holdout[rule.feature] <= rule.threshold]
            elif rule.path_dir == '>':
                holdout = holdout[holdout[rule.feature] > rule.threshold]
            elif rule.path_dir == '==':
                holdout = holdout[holdout[rule.feature] == rule.threshold]

        llr_results, _, _ = rdsgd.test_discontinuity(holdout, cutoff, 'lr_fmt', treat=treat, bw=bw, kernel='rectangular')
        node['llr_results'] = llr_results

In [None]:
# Persist the discovery output (subgroup_dict, num_tests) as a pickle.
out_dir = "/project/liu_optum_causal_inference/results/tmlr_sample_split"
results_path = f"{out_dir}/diabetes_subgroup_results_tmlr.pkl"
# A context manager guarantees the handle is flushed and closed even if
# pickling fails; a bare open() passed to pickle.dump is never closed
# explicitly.
with open(results_path, "wb") as results_file:
    pickle.dump((subgroup_dict, num_tests), results_file)

#subgroup_dict, num_tests = )

# Extract baseline and subgroup data

In [None]:
# Rank the selected nodes by 'posthoc_pwr', highest first. Each element is
# indexed as element[1]['posthoc_pwr'].
# NOTE(review): sel_nodes is not defined in any visible cell, so this raises
# NameError on a fresh run; presumably it is filtered from subgroup_dict.
# Confirm and add the missing cell.
sorted_nodes = sorted(sel_nodes, key=lambda x: x[1]['posthoc_pwr'], reverse=True)

In [None]:
# Build the baseline frame at cutoff 6.5 on 'lr_fmt' with bandwidth 0.4.
# Note that `cutoff` and `bw` are reassigned at notebook scope here.
# NOTE(review): create_feat_df is not imported in any visible cell (only
# gen_feat_df and get_descriptives are, and `from rdsgd import *` is
# commented out). Presumably it is rdsgd.create_feat_df; confirm.
cutoff = 6.5
running = 'lr_fmt'
bw = 0.4
baseline_df = create_feat_df(diabetes_feat, running=running, 
                             cutoff=cutoff, bw=bw)

In [None]:
# Flag rows that belong to the top-ranked node (first entry of sorted_nodes).
# NOTE(review): discovery above ran with omit_mask=True on s1_df only; confirm
# that 'subgroup_mask' is present on the node and aligned with baseline_df.
baseline_df['in_subgroup'] = (sorted_nodes[0][1]['subgroup_mask']).astype(int)

# Save the running variable, treatment indicator and subgroup flag.
baseline_df[[running, 'indicator', 'in_subgroup']].to_parquet("/project/liu_optum_causal_inference/results/diabetes_running.parq")

In [None]:
# Display how many baseline rows are inside and outside the subgroup.
baseline_df['in_subgroup'].value_counts()

In [None]:
# Display the (rows, columns) of the baseline frame.
baseline_df.shape

In [None]:
# Run the project's get_descriptives helper on every column of the baseline
# frame. Iterating a DataFrame yields its column labels.
for column_name in baseline_df:
    get_descriptives(baseline_df, column_name)

In [None]:
# Display summary statistics of the 'age' column of the baseline frame.
baseline_df['age'].describe()