In [1]:
from utilities import *
from plot_utilities import *
import timeit

'''
Goals:
1. compute_ksstat runs in constant time, regardless of length of sample
2. accuracy still within 1% of true ksstat
3. holds for range of choices for true CDF (low/high r/eta)
4. holds for incorrectly specified CDFs as well (i.e. not the best fit)
'''
os.getcwd()

'c:\\Users\\yashd\\Desktop\\hierarchical-bayesian-model-validation\\testing-framework'

In [2]:
# sample_dict = pd.read_pickle('full-pastis-gray-wavelet-oldCDFs/obs_x_dict.pickle')
sample_df = pd.read_pickle(f'../transformed-data/toy-agriVision-red-fourier.pickle')
all_cdfs = combine_pickles('scipy_10000') | combine_pickles('mtlb_10000')

In [3]:
x = sample_df[(sample_df['band'] == 14)]['data'].iloc[0]
cdf = all_cdfs[(0.22, 0.1)]
n = x.size
large_x = np.array([])
for _ in range(20):
    noise = stats.norm.rvs(loc = 1, scale=0.1, size=x.size)
    large_x = np.append(large_x, x*noise)
large_n = large_x.size
large_n

596494080

In [4]:
def ks_1samp_test(subsample, cdf):
    return stats.ks_1samp(subsample, cdf).statistic

true_stat = stats.ks_1samp(large_x, cdf).statistic
true_time = timeit.timeit(lambda: ks_1samp_test(large_x, cdf), number=10) / 10
print(f"True Stat: {true_stat} and True Time: {true_time}, {large_n} samples")

True Stat: 0.025371806588293744 and True Time: 147.92736257999204, 596494080 samples


In [5]:
sliced_df = pd.DataFrame(columns = ['subsample_size', 'ksstat', 'abs_error', 'time'])
i = 0

print(f"True Stat: {true_stat} and True Time: {true_time}, {large_n} samples")
for k in np.logspace(3, np.floor(np.log10(large_n)), int(np.floor(np.log10(large_n))) - 2):
    subsample = np.sort(large_x)[np.round(np.linspace(0, large_n - 1, int(k))).astype(int)]
    time = timeit.timeit(lambda: ks_1samp_test(subsample, cdf), number=10) / 10
    ksstat = stats.ks_1samp(subsample, cdf).statistic
    sliced_df.loc[i] = subsample.size, ksstat, round_to_sigfigs(ksstat - true_stat), time
    i += 1

sliced_df['percent_time'] = np.round(sliced_df['time']/true_time * 100, 8)
sliced_df['percent_sample'] = np.round(sliced_df['subsample_size']/large_n * 100, 8)
sliced_df['percent_error'] = np.round(np.abs(sliced_df['ksstat'] - true_stat)/true_stat * 100, 8)
sliced_df.to_csv('approx_kstest_slice_results.csv')
sliced_df.sort_values('percent_sample')


True Stat: 0.025371806588293744 and True Time: 147.92736257999204, 596494080 samples


Unnamed: 0,subsample_size,ksstat,abs_error,time,percent_time,percent_sample,percent_error
0,1000.0,0.025816,0.00044371,0.002469,0.001669,0.000168,1.748812
1,10000.0,0.025416,4.4234e-05,0.020086,0.013578,0.001676,0.174345
2,100000.0,0.025376,4.3778e-06,0.266005,0.179821,0.016765,0.017255
3,1000000.0,0.025372,4.2562e-07,0.038471,0.026006,0.167646,0.001678
4,10000000.0,0.025372,4.1225e-08,0.695326,0.470045,1.676463,0.000162
5,100000000.0,0.025372,3.4782e-09,6.077918,4.108718,16.764626,1.4e-05


In [6]:
random_df = pd.DataFrame(columns = ['subsample_size', 'ksstat', 'abs_error', 'time'])
i = 0

print(f"True Stat: {true_stat} and True Time: {true_time}, {large_n} samples")
for k in np.logspace(3, np.floor(np.log10(large_n)), int(np.floor(np.log10(large_n))) - 2):
    subsample = np.sort(large_x[np.random.permutation(large_n)[:int(k)]])
    time = timeit.timeit(lambda: ks_1samp_test(subsample, cdf), number=10) / 10
    ksstat = stats.ks_1samp(subsample, cdf).statistic
    random_df.loc[i] = subsample.size, ksstat, round_to_sigfigs(ksstat - true_stat), time
    i += 1

random_df['percent_time'] = np.round(random_df['time']/true_time * 100, 8)
random_df['percent_sample'] = np.round(random_df['subsample_size']/large_n * 100, 8)
random_df['percent_error'] = np.round(np.abs(random_df['ksstat'] - true_stat)/true_stat * 100, 8)
random_df.to_csv('approx_kstest_random_results.csv')
random_df.sort_values('percent_sample')

True Stat: 0.025371806588293744 and True Time: 147.92736257999204, 596494080 samples


Unnamed: 0,subsample_size,ksstat,abs_error,time,percent_time,percent_sample,percent_error
0,1000.0,0.039403,0.014032,0.00083,0.000561,0.000168,55.304085
1,10000.0,0.028839,0.003467,0.030732,0.020775,0.001676,13.663567
2,100000.0,0.024704,-0.000668,0.231126,0.156243,0.016765,2.631632
3,1000000.0,0.025316,-5.6e-05,0.060076,0.040612,0.167646,0.220184
4,10000000.0,0.025403,3.1e-05,0.645428,0.436314,1.676463,0.122506
5,100000000.0,0.025308,-6.4e-05,5.835607,3.944914,16.764626,0.250483


In [7]:
averaged_df = pd.DataFrame(columns = ['subsample_size', 'ksstat', 'abs_error', 'time'])
i = 0

print(f"True Stat: {true_stat} and True Time: {true_time}, {large_n} samples")
for k in np.logspace(3, np.floor(np.log10(large_n)), int(np.floor(np.log10(large_n))) - 2):
    subsample = np.array([np.mean(s) for s in np.array_split(np.sort(large_x), k)])
    time = timeit.timeit(lambda: ks_1samp_test(subsample, cdf), number=10) / 10
    ksstat = stats.ks_1samp(subsample, cdf).statistic
    averaged_df.loc[i] = subsample.size, ksstat, round_to_sigfigs(ksstat - true_stat), time
    i += 1

averaged_df['percent_time'] = np.round(averaged_df['time']/true_time * 100, 8)
averaged_df['percent_sample'] = np.round(averaged_df['subsample_size']/large_n * 100, 8)
averaged_df['percent_error'] = np.round(np.abs(averaged_df['ksstat'] - true_stat)/true_stat * 100, 8)
averaged_df.to_csv('approx_kstest_averaged_results.csv')
averaged_df.sort_values('percent_sample')


True Stat: 0.025371806588293744 and True Time: 147.92736257999204, 596494080 samples


Unnamed: 0,subsample_size,ksstat,abs_error,time,percent_time,percent_sample,percent_error
0,1000.0,0.025871,0.00049894,0.000888,0.000601,0.000168,1.966518
1,10000.0,0.025418,4.6638e-05,0.018842,0.012738,0.001676,0.183817
2,100000.0,0.025371,-5.7766e-07,0.349399,0.236196,0.016765,0.002277
3,1000000.0,0.025004,-0.00036808,0.081869,0.055344,0.167646,1.450726
4,10000000.0,0.026922,0.0015502,0.679466,0.459324,1.676463,6.109934
5,100000000.0,0.026922,0.0015502,6.182317,4.179292,16.764626,6.109788


In [8]:
median_df = pd.DataFrame(columns = ['subsample_size', 'ksstat', 'abs_error', 'time'])
i = 0

print(f"True Stat: {true_stat} and True Time: {true_time}, {large_n} samples")
for k in np.logspace(3, np.floor(np.log10(large_n)), int(np.floor(np.log10(large_n))) - 2):
    subsample = np.array([np.mean(s) for s in np.array_split(np.sort(large_x), k)])
    time = timeit.timeit(lambda: ks_1samp_test(subsample, cdf), number=10) / 10
    ksstat = stats.ks_1samp(subsample, cdf).statistic
    median_df.loc[i] = subsample.size, ksstat, round_to_sigfigs(ksstat - true_stat), time
    i += 1

median_df['percent_time'] = np.round(median_df['time']/true_time * 100, 8)
median_df['percent_sample'] = np.round(median_df['subsample_size']/large_n * 100, 8)
median_df['percent_error'] = np.round(np.abs(median_df['ksstat'] - true_stat)/true_stat * 100, 8)
median_df.to_csv('approx_kstest_median_results.csv')
median_df.sort_values('percent_sample')

True Stat: 0.025371806588293744 and True Time: 147.92736257999204, 596494080 samples


Unnamed: 0,subsample_size,ksstat,abs_error,time,percent_time,percent_sample,percent_error
0,1000.0,0.025871,0.00049894,0.001159,0.000783,0.000168,1.966518
1,10000.0,0.025418,4.6638e-05,0.019691,0.013311,0.001676,0.183817
2,100000.0,0.025371,-5.7766e-07,0.182852,0.123609,0.016765,0.002277
3,1000000.0,0.025004,-0.00036808,0.037732,0.025507,0.167646,1.450726
4,10000000.0,0.026922,0.0015502,0.538699,0.364164,1.676463,6.109934
5,100000000.0,0.026922,0.0015502,5.350352,3.616878,16.764626,6.109788
