# Description

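Notebook for the colon cancer screening regression discontinuity design (RDD) discovery analysis. It exercises the end-to-end RDD discovery process: loading the data, generating features, running the assignment tree discovery, and selecting the significant subgroups.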

**Note**: Because the Optum claims data is private, this notebook cannot be run without access to that data and is provided for informational purposes only.

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import os
import pandas as pd
import pickle
import seaborn as sns
import sys

from tqdm import tqdm

In [None]:
# user imports
# Extend the module search path with two relative directories so that the
# project-local packages imported below can be resolved.
# NOTE(review): the paths are relative, so they depend on the working
# directory the notebook is started from — confirm before running elsewhere.
sys.path.append("../../../optum-pipeline")
sys.path.append("../../")


# Project helpers; presumably `rddd` and `utils` live under the directories
# appended above — verify against the repository layout.
from rddd.feat import gen_feat_df, get_descriptives
from utils.rddd import policy_tree_discovery, test_discontinuity, create_feat_df

In [None]:
# notebook magics
# Load IPython's autoreload extension ...
%load_ext autoreload

# ... and use mode 2, which reloads imported modules before each cell is
# executed so edits to the project packages take effect without a restart.
%autoreload 2

# Render matplotlib figures inline in the notebook output.
%matplotlib inline

# Load data

In [None]:
%%time
# Read the colon cancer dataset from its Parquet file into `cc_df`.
# The %%time cell magic above reports how long the cell takes to run.
cc_df = pd.read_parquet("/REDACTED/colon_cancer.parq")

In [None]:
# Display the column labels of the loaded data.
cc_df.columns

In [None]:
# Print the dimensions of the loaded data as (rows, columns).
print(cc_df.shape)

# Clean data

In [None]:
%%time
# Build the feature table from the raw data with the project helper
# `gen_feat_df` (imported from rddd.feat). The transformations it applies are
# not visible in this notebook — see the helper's own documentation.
cc_feat = gen_feat_df(cc_df)

In [None]:
# Display the column labels of the generated feature table.
cc_feat.columns

# Run assignment tree discovery

In [None]:
# Run the discovery on a copy so that `cc_feat` itself stays untouched by the
# steps below (assumes cc_feat is a pandas DataFrame, whose copy() is deep by
# default — TODO confirm).
test_df = cc_feat.copy()

In [None]:
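# Optional: uncomment the two lines below to suppress warnings raised from
# modules whose names match the pattern 'sk.*'.
# import warnings
# warnings.filterwarnings("ignore", module='sk.*')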

In [None]:
%%time
# Configuration shared by the discovery call and the node selection further
# down in the notebook.

# Grid of values per running variable: for 'age' this is 40, 45, 50, 55, 60
# (presumably the candidate cutoffs to test — verify against utils.rddd).
grid_dict = {
    'age': np.arange(40, 61, 5)
}
# Significance level; it is divided by the number of tests when the
# significant nodes are selected.
alpha = 0.05
# Value passed as the `treat` argument of policy_tree_discovery
# (presumably the name of the treatment indicator column — verify).
treat = 'indicator'
# Running variable column(s) handed to the discovery routine.
running_cols = ['age']
# Seed forwarded as `random_state` to the discovery call.
random_state = 42

In [None]:
%%time
subgroup_dict, num_tests = policy_tree_discovery(test_df,
                                                 running_cols=running_cols,
                                                 grid_dict=grid_dict,
                                                 treat=treat,
                                                 alpha=alpha,
                                                 rescale=False,
                                                 bw=4,
                                                 random_state=random_state)

In [None]:
# Persist the discovery output (subgroup dictionary and number of tests) so it
# can be reloaded later without re-running the discovery step.
out_dir = "/REDACTED/results/colon_cancer/"
# Make sure the target directory exists so the write below cannot fail on a
# missing folder after the discovery step has already been run.
os.makedirs(out_dir, exist_ok=True)
# Write through a context manager so the file is flushed and closed
# deterministically; passing a bare open(...) to pickle.dump leaves the handle
# open until garbage collection. os.path.join also avoids the doubled slash
# that string formatting produced with the trailing "/" in out_dir.
with open(os.path.join(out_dir, "colon_cancer_subgroup_results.pkl"), "wb") as out_file:
    # HIGHEST_PROTOCOL is what the previous positional -1 selected.
    pickle.dump((subgroup_dict, num_tests), out_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Collect the (cutoff, node) pairs for the 'age' running variable that qualify
# as discoveries. A node is kept when both of the following hold:
#   1. its net benefit is positive, or its rule path has exactly one element;
#   2. the 'z' p-value of its `llr_results` is below alpha / num_tests, i.e.
#      alpha split evenly across the number of tests performed.
sel_nodes = []
for cutoff, nodes in subgroup_dict['age'].items():
    for node in nodes:
        # First criterion: benefit / rule-path check.
        node_passes_benefit = node['net_benefit'] > 0 or len(node['rule_path']) == 1
        if not node_passes_benefit:
            continue
        # Second criterion: significance at the adjusted threshold.
        node_p_value = node['llr_results'].pvalues['z']
        if node_p_value < alpha / num_tests:
            sel_nodes.append((cutoff, node))