# Description

Runs the multi-dimensional covariate simulations and processes their results into power estimates.

# Imports

In [2]:
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches
import multiprocessing
import numpy as np
import os
import pandas as pd
import pingouin as pg


import pickle
import seaborn as sns
import sklearn
import sys
from tqdm import tqdm

# to hide warnings for pretty notebook rendering in repo
import warnings
warnings.filterwarnings('ignore')

# user imports
sys.path.append("../../")

from utils.sim import *
from utils.rddd import *
from utils.pwr import *

In [3]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

In [4]:
def _multidim_wrapper(seed, n_feats):
    """Run one simulation trial with ``n_feats`` covariates.

    Builds a dataset with ``generate_blended_rdd_with_covars`` and passes it to
    ``policy_tree_discovery``, searching a grid of candidate cutoffs on the
    running variable ``x``.

    The sample size ``n`` and ``fuzzy_gap`` are NOT parameters: they are read
    from notebook-level globals and must be defined before this function runs.

    Args:
        seed: Seed handed to the data generator and, as ``random_state``, to
            the discovery routine. Also echoed back in the return value so
            results can be matched to their trial.
        n_feats: Used for both ``n_informative`` and ``n_features`` in the
            regression settings given to the generator.

    Returns:
        Tuple ``(seed, result, num_tests)`` where ``result`` and ``num_tests``
        are exactly what ``policy_tree_discovery`` returned.
    """
    # presumably forwarded to sklearn's make_regression by the generator -- confirm in utils.sim
    regression_dict = dict(n_informative=n_feats,
                           noise=0,
                           n_features=n_feats)

    df = generate_blended_rdd_with_covars(seed, n, fuzzy_gap=fuzzy_gap, take=0.05, reg_dict=regression_dict)

    running_cols = ['x']
    treat = 't'
    # we use a fixed bandwidth to simplify power analysis
    bw = 0.25
    alpha = 0.05

    # candidate cutoffs on the running variable: 0.05, 0.10, ..., 0.95
    grid_dict = {
        'x': np.arange(0.05, 0.96, 0.05),
    }

    # Remove these generator columns before discovery (presumably ground-truth /
    # bookkeeping fields -- confirm); errors='ignore' tolerates any that are absent.
    discovery_df = df.drop(['comply_coeff', 'p', 'z', 'cutoff'], axis='columns', errors='ignore')

    result, num_tests = policy_tree_discovery(discovery_df, running_cols, grid_dict, treat=treat, bw=bw,
                                              alpha=alpha, omit_mask=True, random_state=seed)

    return seed, result, num_tests

In [16]:
%%time
%%capture
# we'll fix both sample size and fuzzy gap
n = 1000
fuzzy_gap = 0.5
n_trials = 500

RESULT_DIR = "../../results/kdd/multidim_xgrid/"

for n_feats in [2, 4, 8, 16]:
    f_args = [(seed, n_feats) for seed in range(n_trials)]
    
    with multiprocessing.Pool(4) as pool:
        results = pool.starmap(_multidim_wrapper, f_args)
        
        pickle.dump(results, open("{}n_feats{}.pkl".format(RESULT_DIR, n_feats), "wb"), -1)
    

CPU times: user 13min 26s, sys: 1min 21s, total: 14min 47s
Wall time: 3h 7min 2s


# Process results

In [17]:
%%time
all_pwr_dict = {}
alpha = 0.05
n_feat = 8

for n_feat in tqdm([2, 4, 8, 16]):
    res = pickle.load(open("{}n_feats{}.pkl".format(RESULT_DIR, n_feat), "rb"))
    pwr_dict = {
        "lower_all": [],
        "upper_all": [],
        "lower_max": [],
        "upper_max": []
    }

    for seed, result, n_tests in res:
        cur_trial = seed
        x_dict = result['x']

        for x_cutoff, label in [(0.25, "lower"), (0.75, "upper")]:
            nodes = x_dict[x_cutoff]
            pwrs = []
            for node in nodes:

                if ((node['net_benefit'] > 0) or len(node['rule_path']) == 1) and node['llr_results'].pvalues['z'] < (alpha / n_tests):
                    sig_power = rdd_power(node['llr_results'].params['z'], node['llr_results'].std_errors['z']**2, alpha=alpha / n_tests)
                    pwrs.append(sig_power)
                    pwr_dict["{}_all".format(label)].append(sig_power)
            if len(pwrs) > 0:
                pwr_dict["{}_max".format(label)].append(max(pwrs))

    all_pwr_dict[n_feat] = pwr_dict

100%|██████████| 4/4 [07:56<00:00, 119.25s/it]

CPU times: user 7min 43s, sys: 14.4 s, total: 7min 58s
Wall time: 7min 56s





In [18]:
# Persist the aggregated power lists; the context manager ensures the file is
# flushed and closed instead of leaking the handle.
with open("../../results/kdd/multdim_pwr_xgrid.dict", "wb") as out_file:
    pickle.dump(all_pwr_dict, out_file, pickle.HIGHEST_PROTOCOL)