# Distribution of p-values for C code  
Verification of results of C implementations of KS, $\chi^2$ from batteries and additional code (KS proposed by Marsaglia). For each uniformity test $10^5$ random samples with various sample sizes [10, 100, 1000, 10000] are randomly selected from devurandom generated floats from (0,1). Then uniformity test was applied and $10^5$ saved into file **2nd_simulated&name&test_id&sample_size**.
- TODO: sample size 10000, test ids= 10, 13...  

# Distribution of p-values for python code  
Verification of results of python tests. Analyzed were tests form scipy:
- KS (uniformity) 1 sample and 2 sample: **KS_scipy, KS_2samp_scipy**
- chi2 1 sample and 2 sample: **chisquare_scipy, chisquare_2sample_scipy**
- binomial test: **binom_scipy, binom_as_normal_scipy**

For each uniformity test $10^5$ random samples with various sample sizes [10, 100, 1000, 10000] are randomly selected from devurandom generated floats from (0,1). Then uniformity test was applied and $10^5$ saved into file **2nd_simulated&name&test_id&sample_size**.

TODO: all except chisquare_scipy, KS_scipy


In [1]:
import json
import sys, time

sys.path.insert(1, "../../python")
from statistical_tests import GoF_pvals_wrapper
from names import GoF_test_ids, GoF_test_names

uniform_pvals_path = "/mnt/d/Data/batteries_testing/1st/ideal-devu/uniform_pvals_devurand.pval"
sample_sizes = [10, 100, 1000, 10000]
save_path = "/mnt/d/Data/batteries_testing/2nd/simulated/"
repetitions = 100000
for GoF_idx in GoF_test_ids[:]:
    for sample_size in sample_sizes:
        if (GoF_idx in [12, 13, 14]) and (sample_size == 10000):
            continue
        filename = f"2nd_simulated&{GoF_idx}&{sample_size}&{GoF_test_names[GoF_idx]}"
        file_path = save_path + filename
        print(file_path)
        start = time.time()
        GoF_pvals_wrapper(src_pvals_filepath = uniform_pvals_path,
                          dst_pvals_filepath = file_path,
                          sample_size=sample_size, repetitions = repetitions + 200, GoF_idx = GoF_idx, seed=1)
        print(f"time={time.time()-start}" )
        # with open(file_path) as f1:
        #     lines = f1.readlines()
        # 
        # with open(file_path, 'w') as f2:
        #     f2.writelines(lines[:repetitions]) 


/mnt/d/Data/batteries_testing/2nd/simulated/2nd_simulated&0&10&dieharder_default
time=1.0107998847961426
/mnt/d/Data/batteries_testing/2nd/simulated/2nd_simulated&0&100&dieharder_default
time=3.957066774368286
/mnt/d/Data/batteries_testing/2nd/simulated/2nd_simulated&0&1000&dieharder_default
time=199.38254380226135
/mnt/d/Data/batteries_testing/2nd/simulated/2nd_simulated&0&10000&dieharder_default
time=111.43191361427307
/mnt/d/Data/batteries_testing/2nd/simulated/2nd_simulated&1&10&dieharder_kuiper
time=1.0107879638671875
/mnt/d/Data/batteries_testing/2nd/simulated/2nd_simulated&1&100&dieharder_kuiper
time=1.8212292194366455
/mnt/d/Data/batteries_testing/2nd/simulated/2nd_simulated&1&1000&dieharder_kuiper
time=9.830022096633911
/mnt/d/Data/batteries_testing/2nd/simulated/2nd_simulated&1&10000&dieharder_kuiper
time=116.11903023719788
/mnt/d/Data/batteries_testing/2nd/simulated/2nd_simulated&2&10&nist_chi2
time=1.0995161533355713
/mnt/d/Data/batteries_testing/2nd/simulated/2nd_simulated

In [2]:
import random, sys
from collections import defaultdict
sys.path.insert(1, "../../python")
from statistical_tests import KS_scipy, chisquare_scipy, binom_scipy, binom_as_normal_scipy
from histograms import histogram_sorted
from utils import read_pvalues
uniform_pvals_path = '/mnt/d/Data/batteries_testing/1st/ideal-devu/uniform_pvals_devurand.pval'
pvals_1stlvl = read_pvalues(uniform_pvals_path) 

save_path = "/mnt/d/Data/batteries_testing/2nd/simulated/"
repetitions = 100000
sample_sizes = [10, 100, 1000, 10000]

names = ['KS_greater', 'KS_less', 'KS_both','chi2_&10&bins', 'chi2_&100&bins', 'chi2_&1000&bins',
         'binom_greater', 'binom_less', 'binom_both',
         ]
datasets = defaultdict(list)

for sample_size in sample_sizes:
    print(f"sample_size={sample_size}")
    for _ in range(10**5):
        sample = random.choices(pvals_1stlvl, k=sample_size)
        filename = f"2nd_simulated&17&{sample_size}&scipy_KS_greater"
        datasets[filename].append(KS_scipy(sample, alternative='greater').pvalue)
        
        filename = f"2nd_simulated&18&{sample_size}&scipy_KS_less"
        datasets[filename].append(KS_scipy(sample, alternative='less').pvalue)
        
        filename = f"2nd_simulated&19&{sample_size}&scipy_KS_both"
        datasets[filename].append( KS_scipy(sample, alternative='two-sided').pvalue)
  
        sample_sorted = sorted(sample)      
        for num_bins in [10, 100, 1000]:
            if sample_size // num_bins < 10:
                continue
            hist = histogram_sorted(sample_sorted, num_bins=num_bins)
            filename = f"2nd_simulated&20&{sample_size}&{num_bins}&scipy_chi2"
            datasets[filename].append(chisquare_scipy(f_obs = list(hist.values())).pvalue)
            filename = f"2nd_simulated&21&{sample_size}&{num_bins}&binom_scipy"
            datasets[filename].append(binom_scipy(f_obs = list(hist.values())[0], n=10**7, p=num_bins/10**7).pvalue)
            filename = f"2nd_simulated&22&{sample_size}&{num_bins}&binom_as_normal_scipy"
            datasets[filename].append(binom_as_normal_scipy(f_obs = list(hist.values())[0], n=10**7, p=num_bins/10**7))

            
for filename in datasets:
    file_path = save_path + filename
    pvals = datasets[filename]
    with open(file_path, 'w') as f:
        f.writelines( '\n'.join(map(str, pvals)) + '\n')
        

sample_size=10
sample_size=100
sample_size=1000
sample_size=10000


# Distribution of 1st level (extraction of data)


In [2]:
import random, sys, math, json
from collections import defaultdict, namedtuple
sys.path.insert(1, "../../python")
from utils import results_traverse, read_pvalues, custom_log, data_to_csv
from statistical_tests import KS_scipy, chisquare_scipy
from histograms import histogram_sorted, bin_frequency
from names import extract_from_path

In [3]:
# pvals_path = '/mnt/d/Data/batteries_testing/1st/'
pvals_path = '/mnt/d/Data/batteries_testing/2nd/'
# save_path = "/mnt/d/Data/batteries_testing/2nd/simulated/"
chi2_num_bins = [10, 100, 1000, 10000]

chi2 = [f"chi2_{math.log10(num_bins)}" for num_bins in  chi2_num_bins]
log_chi2 = [f"log_{item}" for item in chi2]

absolute_left_tails = [f"L_{e}" for e in range(1, 10)][::-1]
absolute_right_tails = [f"R_{e}" for e in range(1, 10)]
relative_left_tails = [f"l_{e}" for e in range(1, 10)][::-1]
relative_right_tails = [f"r_{e}" for e in range(1, 10)]

log_relative_left_tails = [f"log_{item}" for item in relative_left_tails]
log_relative_right_tails = [f"log_{item}" for item in relative_right_tails]

atributes = ['bat','subb', 'test', 'id','num_pvals', 'unique','ratio', 'min', 'max', 'KS', 'log_KS'] 
atributes += chi2 + log_chi2 + absolute_left_tails + ['num_pvals2'] + absolute_right_tails
atributes += relative_left_tails + relative_right_tails + log_relative_left_tails + log_relative_right_tails

exps_backward = [9,8,7,6,5,4,3,2,1]
exps_forward = exps_backward[::-1]
print(atributes)
# Extracted_values = namedtuple('Extracted_values', atributes)

['bat', 'subb', 'test', 'id', 'num_pvals', 'unique', 'ratio', 'min', 'max', 'KS', 'log_KS', 'chi2_1.0', 'chi2_2.0', 'chi2_3.0', 'chi2_4.0', 'log_chi2_1.0', 'log_chi2_2.0', 'log_chi2_3.0', 'log_chi2_4.0', 'L_9', 'L_8', 'L_7', 'L_6', 'L_5', 'L_4', 'L_3', 'L_2', 'L_1', 'num_pvals2', 'R_1', 'R_2', 'R_3', 'R_4', 'R_5', 'R_6', 'R_7', 'R_8', 'R_9', 'l_9', 'l_8', 'l_7', 'l_6', 'l_5', 'l_4', 'l_3', 'l_2', 'l_1', 'r_1', 'r_2', 'r_3', 'r_4', 'r_5', 'r_6', 'r_7', 'r_8', 'r_9', 'log_l_9', 'log_l_8', 'log_l_7', 'log_l_6', 'log_l_5', 'log_l_4', 'log_l_3', 'log_l_2', 'log_l_1', 'log_r_1', 'log_r_2', 'log_r_3', 'log_r_4', 'log_r_5', 'log_r_6', 'log_r_7', 'log_r_8', 'log_r_9']


In [7]:
rows = []
# pvals_path = '/mnt/d/Data/batteries_testing/1st/ideal-devu'
pvals_path = '/mnt/d/Data/batteries_testing/2nd/'
for path in results_traverse(pvals_path, endswith='*'):
    pvals = read_pvalues(path)
    pvals_sorted = sorted(pvals)
    res = []
    print(path) 
    names = extract_from_path(path)
    res += list(names)
    num_pvals = len(pvals) 
    unique = len(set(pvals_sorted))
    ratio = round(num_pvals/unique)
    min_pval, max_pval = pvals_sorted[0], pvals_sorted[-1]
    res += [num_pvals, unique,ratio, min_pval, max_pval] 
    print([num_pvals, unique,ratio, min_pval, max_pval])
    KS_pvalue = KS_scipy(pvals_sorted).pvalue
    res += [KS_pvalue, custom_log(KS_pvalue, round_to=1)]
    print([KS_pvalue, custom_log(KS_pvalue, round_to=1)])
    chi2_pvals = [None]*len(chi2_num_bins)
    for i in range(len(chi2_num_bins)):
        num_bins = chi2_num_bins[i]
        if num_pvals / num_bins >= 0:
            hist = histogram_sorted(pvals_sorted, num_bins=num_bins, domain = (0,1))
            chi2_pvals[i] = chisquare_scipy(f_obs=list(hist.values())).pvalue 
    chi2_pvals_logs = [custom_log(chi2pval, round_to=1) for chi2pval in chi2_pvals]
    res += chi2_pvals + chi2_pvals_logs
    
    absolute_left_tails = [bin_frequency(pvals_sorted, interval=(0, 10**(-e)), interval_type="[]" ) for e in exps_backward]
    absolute_right_tails = [bin_frequency(pvals_sorted, interval=(1-10**(-e),1), interval_type="[]" ) for e in exps_forward]
    absolute_tails = absolute_left_tails + absolute_right_tails
    res += absolute_left_tails + [num_pvals] + absolute_right_tails
    relative_tails = [round(abs_freq*10**(exp)/num_pvals,1) for abs_freq,exp in zip(absolute_tails, exps_backward+exps_forward)]
    res += relative_tails
    relative_tails_logs = [custom_log(relative_tail, round_to=1) for relative_tail in relative_tails]
    res += relative_tails_logs
    # print(res)
    rows.append(res)
    # c+= 1
    # if c % 10 == 0:
# data_to_csv(header= atributes, rows=rows, filename_csv='../../../data/excel/1st_extracted.csv', 
#     filename_table = '../../../data/table/1st_extracted.txt')
# 
# res = {'header':atributes, 'rows': rows}
# json.dump(res, open('../../../data/json/1st_extracted.json', 'w'))
# # print(dict(zip(atributes,res)))
# data_to_csv(header= atributes, rows=rows, filename_csv='../../../data/excel/1st_extracted.csv', 
#             filename_table = '../../../data/table/1st_extracted.txt')
# 
# res = {'header':atributes, 'rows': rows}
# json.dump(res, open('../../../data/json/1st_extracted.json', 'w'))
data_to_csv(header= atributes, rows=rows, filename_csv='../../../data/excel/2nd_extracted.csv', 
    filename_table = '../../../data/table/2nd_extracted.txt')

res = {'header':atributes, 'rows': rows}
json.dump(res, open('../../../data/json/2nd_extracted.json', 'w'))
# print(dict(zip(atributes,res)))
data_to_csv(header= atributes, rows=rows, filename_csv='../../../data/excel/2nd_extracted.csv', 
            filename_table = '../../../data/table/1st_extracted.txt')

res = {'header':atributes, 'rows': rows}
json.dump(res, open('../../../data/json/2nd_extracted.json', 'w'))

/mnt/d/Data/batteries_testing/2nd/empirical/default/2nd_Dieharder&000_000_000&10&0&default


ValueError: invalid literal for int() with base 10: '2nd_Dieharder&000_000_000&10&0&defaul'

# Histograms and selected values

In [6]:
import random, sys, math, json
from collections import defaultdict, namedtuple
sys.path.insert(1, "../../python")
from utils import results_traverse, read_pvalues, custom_log, data_to_csv
from statistical_tests import KS_scipy, chisquare_scipy
from histograms import histogram_sorted, bin_frequency, select_equiv
from names import extract_from_path

pvals_path = '/mnt/d/Data/batteries_testing/1st/'
res = defaultdict(dict)
for path in results_traverse(pvals_path):
    print(path)
    row = []
    names = extract_from_path(path)

    pvals = read_pvalues(path)
    pvals_sorted = sorted(pvals)
    key = ','.join(names)
    hist = histogram_sorted(pvals_sorted, num_bins=10, domain = (0,1))
    res[key]['10 hist'] =  dict(zip(list(map(float,hist.keys())), list(map(int,hist.values()))))
    res[key]['10 select'] = list(map(float, select_equiv(pvals_sorted, 10, values = True)))
    hist = histogram_sorted(pvals_sorted, num_bins=100, domain = (0,1))
    res[key]['100 hist'] =  dict(zip(list(map(float,hist.keys())), list(map(int,hist.values()))))
    res[key]['100 select'] = list(map(float, select_equiv(pvals_sorted, 100, values = True)))
    hist = histogram_sorted(pvals_sorted, num_bins=1000, domain = (0,1))
    res[key]['1000 hist'] =  dict(zip(list(map(float,hist.keys())), list(map(int,hist.values()))))
    res[key]['1000 select'] = list(map(float, select_equiv(pvals_sorted, 1000, values = True)))

json.dump(res, open('../../../data/json/1st_histograms.json', 'w'))

/mnt/d/Data/batteries_testing/1st/dieharder/Dieharder(0) Diehard Birthdays Test.pval
/mnt/d/Data/batteries_testing/1st/dieharder/Dieharder(1) Diehard OPERM5 Test.pval


KeyboardInterrupt: 

# 1st Images

In [1]:
# import sys, json
# sys.path.insert(1, "../../python")
# import numpy as np
# import matplotlib.pyplot as plt
# from images import draw_hist,draw_ECDF, draw_KS_relative, draw_tails
# 
# extracted = json.load(open('../../../data/json/1st_extracted.json', 'r'))
# histograms = json.load(open('../../../data/json/1st_histograms.json', 'r'))
# header = extracted['header']
# dir_path = "../../../data/image/1st/"
# 
# print(list(zip(range(len(header)), header)))
# for row in extracted['rows']:
#     key = ','.join(row[:4])
#     fig, axs = plt.subplots(3)
#     fig.set_figheight(10)
#     fig.set_figwidth(10)
#     hist10 = histograms[key]["10 hist"]
#     hist100 = histograms[key]["100 hist"]
#     freqs10 = list(hist10.values())
#     freqs100 = list(hist100.values())
#     pvals10 = histograms[key]["10 select"]
#     pvals100 = histograms[key]["100 select"]
#     extracted_n = row[4]
#     tails = row[42:52]
#     # left_tails = 
#     # right_tails = 
#     # bar histogram
#     draw_hist(freqs=freqs10, expected_freq=extracted_n/10, axis=axs[0], text="Histogram 10")
#     draw_hist(freqs=freqs100, expected_freq=extracted_n/100, axis=axs[1], text="Histogram 100")
#     draw_KS_relative(values=pvals100, axis=axs[2], text="Uniform(0, 1)")
#     # draw_tails(tails=tails, axis=axs[1][0], text="tails")    
#     # draw_ECDF(values=pvals10, axis=axs[1][0], label="$D_n$ and ECDF_abs")
#     # draw_KS_relative(values=pvals10, axis=axs[1][0], text="Uniform(0, 1)")
#     # fig.tight_layout()
#     filepath = dir_path + 'all/' + f"{row[1]}&{row[3]}&{row[2]}"
#     print(filepath)
#     plt.savefig(filepath.replace('|','_')) 
#     
#     
#     
#     # draw_hist(freqs=freqs100, expected_freq=extracted_n/100, axis=axs[0][1], text="Histogram 100")
#     # draw_ECDF(values=pvals100, axis=axs[1][0], label="$D_n$ and ECDF_abs")
#     # draw_KS_relative(values=pvals100, axis=axs[1][1], text="Uniform(0, 1)")
#         
# 
# # json.load(open('../../../data/json/1st_equal_histograms.json', 'w'))

# Distribution of 1st level (extraction for fix)
Find intervals of 1st level p-values with equal length.  


In [1]:
import random, sys, math, json
from collections import defaultdict, namedtuple
sys.path.insert(1, "../../python")
from utils import results_traverse, read_pvalues, custom_log, data_to_csv
from statistical_tests import KS_scipy, chisquare_scipy
from histograms import histogram_sorted, bin_frequency, find_equal_bins
from names import extract_from_path

In [None]:
pvals_path = '/mnt/d/Data/batteries_testing/1st/'
depths = range(3, 11)
depths = [10]
res = dict(zip(depths, [{}]*len(depths)))
rows = []
attributes = ['depth', 'bat', 'subb', 'test', 'id', 'num_pvals', 'unique', 'ratio'] 
for depth in depths:
    attributes  += [f"min{2**depth}", f"max{2**depth}"]
print(attributes)
for path in results_traverse(pvals_path):
    print(path)
    row = []
    names = extract_from_path(path)
    row += names
    for depth in depths:
        pvals = read_pvalues(path)
        pvals_sorted = sorted(pvals)
        
        num_pvals = len(pvals) 
        unique = len(set(pvals_sorted))
        ratio = round(num_pvals/unique)
        row += [num_pvals, unique, ratio]
        if ratio < 10 or unique < 2**depth:
            res[depth][names] = [-1]*(len(attributes) - len(row))
            row += [-1, -1]
        else:       
            limits = find_equal_bins(pvals_sorted, depth=depth)
            hist = histogram_sorted(pvals_sorted, limits=limits)
            freqs = list(hist.values())
            min_freq, max_freq = min(freqs), max(freqs)
            row += [min_freq, max_freq]
            res[depth][names] = hist 
        rows += row
        
data_to_csv(header= atributes, rows=rows, filename_csv='../../../data/excel/1st_equal_histograms.csv', 
    filename_table = '../../../data/table/1st_equal_histograms.txt')

json.dump(res, open('../../../data/json/1st_equal_histograms.json', 'w'))


['depth', 'bat', 'subb', 'test', 'id', 'num_pvals', 'unique', 'ratio', 'min1024', 'max1024']
/mnt/d/Data/batteries_testing/1st/dieharder/Dieharder(0) Diehard Birthdays Test.pval
/mnt/d/Data/batteries_testing/1st/dieharder/Dieharder(1) Diehard OPERM5 Test.pval
/mnt/d/Data/batteries_testing/1st/dieharder/Dieharder(10) Diehard Parking Lot Test.pval
/mnt/d/Data/batteries_testing/1st/dieharder/Dieharder(100) STS Monobit Test.pval
/mnt/d/Data/batteries_testing/1st/dieharder/Dieharder(101) STS Runs Test.pval
/mnt/d/Data/batteries_testing/1st/dieharder/Dieharder(102) STS Serial Test (Generalized) Subtest 1.pval
/mnt/d/Data/batteries_testing/1st/dieharder/Dieharder(102) STS Serial Test (Generalized) Subtest 10.pval
/mnt/d/Data/batteries_testing/1st/dieharder/Dieharder(102) STS Serial Test (Generalized) Subtest 11.pval
/mnt/d/Data/batteries_testing/1st/dieharder/Dieharder(102) STS Serial Test (Generalized) Subtest 12.pval
/mnt/d/Data/batteries_testing/1st/dieharder/Dieharder(102) STS Serial Test

# 2nd level p-values

## default 2nd level 

In [None]:
import json
import sys, time
sys.path.insert(1, "../../python")

from utils import results_traverse, read_pvalues
from statistical_tests import GoF_pvals_wrapper
from names import GoF_test_ids, GoF_test_names, testname_to_GoF_idx, extract_from_path

pvals_path = '/mnt/d/Data/batteries_testing/1st/'
save_path = '/mnt/d/Data/batteries_testing/2nd/empirical/default/'
sample_sizes =[10, 100, 1000]
repetitions = 100000
new_start = False
for path in results_traverse(pvals_path):
    print(path)
    names = extract_from_path(path)
    bat, subb, test, id = names[:4]
    # if '008|000|092' == names[3]:
    #     new_start = True
    if not new_start:
        continue
    for sample_size in sample_sizes:
        print(subb)
        start = time.time()
        # GoF_idx = testname_to_GoF_idx(test)
        GoF_idx = 2
        filename = f"2nd_{bat}&{id}&{sample_size}&{GoF_idx}&default"

        file_path = save_path + filename
        file_path = file_path.replace('|', '_')
        print(file_path)
        GoF_pvals_wrapper(src_pvals_filepath=path,
                          dst_pvals_filepath=file_path,
                          sample_size=sample_size, repetitions=repetitions+400, GoF_idx=GoF_idx, seed=1)
        print(f"time={time.time() - start}")
        with open(file_path) as f1:
            lines = f1.readlines()

        with open(file_path, 'w') as f2:
            f2.writelines(lines[:repetitions])

## NIST proportions


In [9]:
import json
import sys, time, random, math
sys.path.insert(1, "../../python")
from statistical_tests import NIST_proportion
from utils import results_traverse, read_pvalues
from names import GoF_test_ids, GoF_test_names, testname_to_GoF_idx, extract_from_path

pvals_path = '/mnt/d/Data/batteries_testing/1st/'
save_path = '/mnt/d/Data/batteries_testing/2nd/empirical/default/'
sample_sizes =[10, 100, 1000]
repetitions = 100000
new_start = False
for path in results_traverse(pvals_path):

    names = extract_from_path(path)
    bat, subb, test, id = names[:4]
    if bat != 'NIST':
        continue
    else:
        print(path)
    pvals = read_pvalues(path)
    
    for sample_size in sample_sizes:
        start = time.time()
        ALPHA = 0.01
        p_hat = 1 - ALPHA      
        proportion_threshold_max = (p_hat + 3.0 * math.sqrt((p_hat*ALPHA)/sample_size)) * sample_size;
        proportion_threshold_min = (p_hat - 3.0 * math.sqrt((p_hat*ALPHA)/sample_size)) * sample_size;
        res = []
        for _ in range(repetitions):
            sample = random.choices(pvals, k=sample_size)
            prop = sum([val > ALPHA for val in sample])/len(sample)
            res.append(prop)
        filename = f"2nd_{bat}&{id}&{sample_size}&{24}&default.prop"
        # f"2nd_simulated&22&{sample_size}&{num_bins}&binom_as_normal_scipy"
        file_path = save_path + filename
        file_path = file_path.replace('|', '_')
        print(f"time={time.time() - start}")
        with open(file_path, 'w') as f:
            f.write('\n'.join(map(str,res)))


/mnt/d/Data/batteries_testing/1st/nist/NIST Statistical Testing Suite(1) Frequency (monobits) test.pval
time=1.1791212558746338
time=6.679841756820679
time=63.06130123138428
/mnt/d/Data/batteries_testing/1st/nist/NIST Statistical Testing Suite(10) Maurer's Universal Statistical Test.pval
time=0.6027684211730957
time=5.987804174423218
time=58.63269639015198
/mnt/d/Data/batteries_testing/1st/nist/NIST Statistical Testing Suite(11) Approximate Entropy Test.pval
time=0.6164321899414062
time=5.502378225326538
time=54.102142572402954
/mnt/d/Data/batteries_testing/1st/nist/NIST Statistical Testing Suite(12) Random Excursions Test Subtest 1.pval
time=0.6261782646179199
time=6.258124828338623
time=60.49539542198181
/mnt/d/Data/batteries_testing/1st/nist/NIST Statistical Testing Suite(12) Random Excursions Test Subtest 2.pval
time=0.6602804660797119
time=5.799468278884888
time=62.783287525177
/mnt/d/Data/batteries_testing/1st/nist/NIST Statistical Testing Suite(12) Random Excursions Test Subtest

In [2]:
import json
import sys, time, random, math
sys.path.insert(1, "../../python")
from statistical_tests import KS_scipy, chisquare_scipy
from utils import results_traverse, read_pvalues
from names import GoF_test_ids, GoF_test_names, testname_to_GoF_idx, extract_from_path
from histograms import histogram_sorted

def save(res, save_path, filename):
    file_path = save_path + filename
    file_path = file_path.replace('|', '_')
    
    with open(file_path, 'w') as f:
        f.write('\n'.join(map(str,res)))
pvals_path = '/mnt/d/Data/batteries_testing/1st/'
save_path = '/mnt/d/Data/batteries_testing/2nd/empirical/scipy/'
sample_sizes =[10, 100, 1000]
bins_count = [10, 100]
repetitions = 10000

for path in results_traverse(pvals_path):
    print(path)
    names = extract_from_path(path)
    bat, subb, test, id = names[:4]
    pvals = read_pvalues(path)
    for sample_size in sample_sizes:
        start = time.time()
        KS_res = []
        chi2 = [[],[],[]]
        for _ in range(repetitions):
            sample = random.choices(pvals, k=sample_size)
            sample_sorted = sorted(sample)   
            
            KS_res.append( KS_scipy(sample, alternative='two-sided').pvalue)
            for i, num_bins in enumerate(bins_count):
                if sample_size // num_bins < 10:
                    continue
                hist = histogram_sorted(sample_sorted, num_bins=num_bins)
                pval = chisquare_scipy(f_obs = list(hist.values())).pvalue
                chi2[i].append(pval)
        
        filename = f"2nd_{bat}&{id}&{sample_size}&19&scipy_KS_both"
        save(KS_res, save_path, filename)
        for i, num_bins in enumerate(bins_count):
            if sample_size // num_bins < 10:
                continue
            filename = f"2nd_{bat}&{id}&{sample_size}&20&scipy_chi2"
            print(filename)
            save(chi2[i], save_path, filename)
        print(f"time={time.time() - start}")


/mnt/d/Data/batteries_testing/1st/dieharder/Dieharder(0) Diehard Birthdays Test.pval
time=6.660298109054565
2nd_Dieharder&000|000|000&100&20&scipy_chi2
time=18.41854739189148
2nd_Dieharder&000|000|000&1000&20&scipy_chi2
2nd_Dieharder&000|000|000&1000&20&scipy_chi2
time=21.18041729927063
/mnt/d/Data/batteries_testing/1st/dieharder/Dieharder(1) Diehard OPERM5 Test.pval
time=5.439454793930054
2nd_Dieharder&001|000|000&100&20&scipy_chi2
time=21.30632519721985
2nd_Dieharder&001|000|000&1000&20&scipy_chi2
2nd_Dieharder&001|000|000&1000&20&scipy_chi2
time=20.248220682144165
/mnt/d/Data/batteries_testing/1st/dieharder/Dieharder(10) Diehard Parking Lot Test.pval
time=4.532387971878052
2nd_Dieharder&010|000|000&100&20&scipy_chi2
time=21.395550966262817
2nd_Dieharder&010|000|000&1000&20&scipy_chi2
2nd_Dieharder&010|000|000&1000&20&scipy_chi2
time=20.44685459136963
/mnt/d/Data/batteries_testing/1st/dieharder/Dieharder(100) STS Monobit Test.pval
time=4.688415050506592
2nd_Dieharder&100|000|000&100&

KeyboardInterrupt: 

In [3]:
import json
import sys, time, random, math
sys.path.insert(1, "../../python")
from statistical_tests import KS_scipy, chisquare_scipy
from utils import results_traverse, read_pvalues
from names import GoF_test_ids, GoF_test_names, testname_to_GoF_idx, extract_from_path
from histograms import histogram_sorted

def save(res, save_path, filename):
    file_path = save_path + filename
    file_path = file_path.replace('|', '_')
    
    with open(file_path, 'w') as f:
        f.write('\n'.join(map(str,res)))
pvals_path = '/mnt/d/Data/batteries_testing/1st/'
save_path = '/mnt/d/Data/batteries_testing/2nd/empirical/scipy/'
repetitions = 10000
sample_size = 1000
num_bins = 10

for path in results_traverse(pvals_path):
    print(path)
    names = extract_from_path(path)
    bat, subb, test, id = names[:4]
    pvals = read_pvalues(path)
    start = time.time()
    chi2_pvals = []
    for _ in range(repetitions):
        sample = random.choices(pvals, k=sample_size)
        sample_sorted = sorted(sample)   
        hist = histogram_sorted(sample_sorted, num_bins=num_bins)
        pval = chisquare_scipy(f_obs = list(hist.values())).pvalue
        chi2_pvals.append(pval)
    filename = f"2nd_{bat}&{id}&{sample_size}&{10}&20&scipy_chi2"
    save(chi2_pvals, save_path, filename)
    print(f"time={time.time() - start}")


/mnt/d/Data/batteries_testing/1st/dieharder/Dieharder(0) Diehard Birthdays Test.pval
time=11.385201692581177
/mnt/d/Data/batteries_testing/1st/dieharder/Dieharder(1) Diehard OPERM5 Test.pval
time=11.569670915603638
/mnt/d/Data/batteries_testing/1st/dieharder/Dieharder(10) Diehard Parking Lot Test.pval
time=14.503270626068115
/mnt/d/Data/batteries_testing/1st/dieharder/Dieharder(100) STS Monobit Test.pval
time=13.081437587738037
/mnt/d/Data/batteries_testing/1st/dieharder/Dieharder(101) STS Runs Test.pval
time=12.903582334518433
/mnt/d/Data/batteries_testing/1st/dieharder/Dieharder(102) STS Serial Test (Generalized) Subtest 1.pval
time=9.614129304885864
/mnt/d/Data/batteries_testing/1st/dieharder/Dieharder(102) STS Serial Test (Generalized) Subtest 10.pval
time=8.957174062728882
/mnt/d/Data/batteries_testing/1st/dieharder/Dieharder(102) STS Serial Test (Generalized) Subtest 11.pval
time=8.221542835235596
/mnt/d/Data/batteries_testing/1st/dieharder/Dieharder(102) STS Serial Test (General

In [15]:
import json
import sys, time, random, math
sys.path.insert(1, "../../python")
from utils import results_traverse


src_dir = '/mnt/d/Data/batteries_testing/2nd/empirical/scipy'
for path in results_traverse(src_dir, endswith = '*'):
    if '1000&20' in path:
        print(path)
        path_new = path.replace('1000&20', '1000&100&20')
        with open(path) as f:
            content = f.read()
        with open(path_new, 'w') as f_new:
            f_new.write(content)

    

/mnt/d/Data/batteries_testing/2nd/empirical/scipy/2nd_Dieharder&000_000_000&1000&20&scipy_chi2
/mnt/d/Data/batteries_testing/2nd/empirical/scipy/2nd_Dieharder&001_000_000&1000&20&scipy_chi2
/mnt/d/Data/batteries_testing/2nd/empirical/scipy/2nd_Dieharder&002_000_000&1000&20&scipy_chi2
/mnt/d/Data/batteries_testing/2nd/empirical/scipy/2nd_Dieharder&003_000_000&1000&20&scipy_chi2
/mnt/d/Data/batteries_testing/2nd/empirical/scipy/2nd_Dieharder&004_000_000&1000&20&scipy_chi2
/mnt/d/Data/batteries_testing/2nd/empirical/scipy/2nd_Dieharder&008_000_000&1000&20&scipy_chi2
/mnt/d/Data/batteries_testing/2nd/empirical/scipy/2nd_Dieharder&009_000_000&1000&20&scipy_chi2
/mnt/d/Data/batteries_testing/2nd/empirical/scipy/2nd_Dieharder&010_000_000&1000&20&scipy_chi2
/mnt/d/Data/batteries_testing/2nd/empirical/scipy/2nd_Dieharder&011_000_000&1000&20&scipy_chi2
/mnt/d/Data/batteries_testing/2nd/empirical/scipy/2nd_Dieharder&012_000_000&1000&20&scipy_chi2
/mnt/d/Data/batteries_testing/2nd/empirical/scipy/

In [None]:
import random, sys, math, json
from collections import defaultdict, namedtuple
sys.path.insert(1, "../../python")
from utils import results_traverse, read_pvalues, custom_log, data_to_csv
from statistical_tests import KS_scipy, chisquare_scipy
from histograms import histogram_sorted, bin_frequency, find_equal_bins
from names import extract_from_path
# pvals_path = '/mnt/d/Data/batteries_testing/1st/'
pvals_path = '/mnt/d/Data/batteries_testing/2nd/'
# save_path = "/mnt/d/Data/batteries_testing/2nd/simulated/"
chi2_num_bins = [10, 100, 1000, 10000]

chi2 = [f"chi2_{math.log10(num_bins)}" for num_bins in  chi2_num_bins]
log_chi2 = [f"log_{item}" for item in chi2]

absolute_left_tails = [f"L_{e}" for e in range(1, 10)][::-1]
absolute_right_tails = [f"R_{e}" for e in range(1, 10)]
relative_left_tails = [f"l_{e}" for e in range(1, 10)][::-1]
relative_right_tails = [f"r_{e}" for e in range(1, 10)]

log_relative_left_tails = [f"log_{item}" for item in relative_left_tails]
log_relative_right_tails = [f"log_{item}" for item in relative_right_tails]

atributes = ['bat','subb', 'test', 'id','num_pvals', 'unique','ratio', 'min', 'max', 'KS', 'log_KS'] 
atributes += chi2 + log_chi2 + absolute_left_tails + ['num_pvals2'] + absolute_right_tails
atributes += relative_left_tails + relative_right_tails + log_relative_left_tails + log_relative_right_tails

exps_backward = [9,8,7,6,5,4,3,2,1]
exps_forward = exps_backward[::-1]
print(atributes)
# Extracted_values = namedtuple('Extracted_values', atributes)
rows = []

# pvals_path = '/mnt/d/Data/batteries_testing/1st/ideal-devu'
pvals_path = '/mnt/d/Data/batteries_testing/2nd/'
for path in results_traverse(pvals_path, endswith='*'):
    pvals = read_pvalues(path)
    pvals_sorted = sorted(pvals)
    res = []
    print(path) 
    names = extract_from_path(path)
    res += list(names)
    num_pvals = len(pvals) 
    unique = len(set(pvals_sorted))
    ratio = round(num_pvals/unique)
    min_pval, max_pval = pvals_sorted[0], pvals_sorted[-1]
    res += [num_pvals, unique,ratio, min_pval, max_pval] 
    print([num_pvals, unique,ratio, min_pval, max_pval])
    KS_pvalue = KS_scipy(pvals_sorted).pvalue
    res += [KS_pvalue, custom_log(KS_pvalue, round_to=1)]
    print([KS_pvalue, custom_log(KS_pvalue, round_to=1)])
    chi2_pvals = [None]*len(chi2_num_bins)
    for i in range(len(chi2_num_bins)):
        num_bins = chi2_num_bins[i]
        if num_pvals / num_bins >= 0:
            hist = histogram_sorted(pvals_sorted, num_bins=num_bins, domain = (0,1))
            chi2_pvals[i] = chisquare_scipy(f_obs=list(hist.values())).pvalue 
    chi2_pvals_logs = [custom_log(chi2pval, round_to=1) for chi2pval in chi2_pvals]
    res += chi2_pvals + chi2_pvals_logs
    
    absolute_left_tails = [bin_frequency(pvals_sorted, interval=(0, 10**(-e)), interval_type="[]" ) for e in exps_backward]
    absolute_right_tails = [bin_frequency(pvals_sorted, interval=(1-10**(-e),1), interval_type="[]" ) for e in exps_forward]
    absolute_tails = absolute_left_tails + absolute_right_tails
    res += absolute_left_tails + [num_pvals] + absolute_right_tails
    relative_tails = [round(abs_freq*10**(exp)/num_pvals,1) for abs_freq,exp in zip(absolute_tails, exps_backward+exps_forward)]
    res += relative_tails
    relative_tails_logs = [custom_log(relative_tail, round_to=1) for relative_tail in relative_tails]
    res += relative_tails_logs
    # print(res)
    rows.append(res)
    # c+= 1
    # if c % 10 == 0:
# data_to_csv(header= atributes, rows=rows, filename_csv='../../../data/excel/1st_extracted.csv', 
#     filename_table = '../../../data/table/1st_extracted.txt')
# 
# res = {'header':atributes, 'rows': rows}
# json.dump(res, open('../../../data/json/1st_extracted.json', 'w'))
# # print(dict(zip(atributes,res)))
# data_to_csv(header= atributes, rows=rows, filename_csv='../../../data/excel/1st_extracted.csv', 
#             filename_table = '../../../data/table/1st_extracted.txt')
# 
# res = {'header':atributes, 'rows': rows}
# json.dump(res, open('../../../data/json/1st_extracted.json', 'w'))
data_to_csv(header= atributes, rows=rows, filename_csv='../../../data/excel/2nd_extracted.csv', 
    filename_table = '../../../data/table/2nd_extracted.txt')

res = {'header':atributes, 'rows': rows}
json.dump(res, open('../../../data/json/2nd_extracted.json', 'w'))
# print(dict(zip(atributes,res)))
data_to_csv(header= atributes, rows=rows, filename_csv='../../../data/excel/2nd_extracted.csv', 
            filename_table = '../../../data/table/1st_extracted.txt')

res = {'header':atributes, 'rows': rows}
json.dump(res, open('../../../data/json/2nd_extracted.json', 'w'))