# Distribution of p-values for C code  
Verification of the results of the C implementations of the KS and $\chi^2$ tests taken from the batteries, plus additional code (the KS test proposed by Marsaglia). For each uniformity test, $10^5$ random samples of each sample size in [10, 100, 1000, 10000] are drawn at random from `/dev/urandom`-generated floats in (0,1). The uniformity test is then applied to each sample and the $10^5$ resulting p-values are saved into the file **2nd_simulated&name&test_id&sample_size**.
- TODO: sample size 10000, test ids= 10, 13...  

In [None]:
import json
import sys, time

sys.path.insert(1, "../../python")
from statistical_tests import GoF_pvals_wrapper
from names import GoF_test_ids, GoF_test_names

# Produce second-level p-values with the C goodness-of-fit implementations.
# For every selected test id and sample size, GoF_pvals_wrapper writes its
# p-values to a file; the file is then cut back to exactly `repetitions` lines.
uniform_pvals_path = '/mnt/d/Data/batteries_testing/1st/ideal-devu/uniform_pvals_devurand.pval'
save_path = "/mnt/d/Data/batteries_testing/2nd/simulated/"
sample_sizes = [10000]
repetitions = 100000

for GoF_idx in GoF_test_ids[11:]:
    for sample_size in sample_sizes:
        filename = f"2nd_simulated&{GoF_test_names[GoF_idx]}&{GoF_idx}&{sample_size}"
        file_path = save_path + filename
        print(file_path)

        started_at = time.time()
        # The wrapper is asked for 200 more repetitions than are kept.
        # NOTE(review): presumably a safety margin in case the C code emits
        # fewer lines than requested — confirm.
        GoF_pvals_wrapper(
            src_pvals_filepath=uniform_pvals_path,
            dst_pvals_filepath=file_path,
            sample_size=sample_size,
            repetitions=repetitions + 200,
            GoF_idx=GoF_idx,
            seed=1,
        )
        elapsed = time.time() - started_at
        print(f"time={elapsed}")

        # Keep only the first `repetitions` lines of the generated file.
        with open(file_path) as src:
            kept_lines = src.readlines()[:repetitions]

        with open(file_path, 'w') as dst:
            dst.writelines(kept_lines)


# Distribution of p-values for python code  
Verification of the results of the Python tests. The analyzed tests come from scipy:
- KS (uniformity) 1 sample and 2 sample: **KS_scipy, KS_2samp_scipy**
- chi2 1 sample and 2 sample: **chisquare_scipy, chisquare_2sample_scipy**
- binomial test: **binom_scipy, binom_as_normal_scipy**

For each uniformity test, $10^5$ random samples of each sample size in [10, 100, 1000, 10000] are drawn at random from `/dev/urandom`-generated floats in (0,1). The uniformity test is then applied to each sample and the $10^5$ resulting p-values are saved into the file **2nd_simulated&name&test_id&sample_size**.

TODO: all except chisquare_scipy, KS_scipy


In [19]:
import random, sys
from collections import defaultdict
sys.path.insert(1, "../../python")
from statistical_tests import KS_scipy, chisquare_scipy
from histograms import histogram_sorted
from utils import read_pvalues
# Simulate second-level p-values with the scipy-based tests.
# For every sample size, `repetitions` samples are drawn with replacement from
# the first-level p-values; each sample is fed to the KS test (three
# alternatives) and to the chi2 test for every bin count that leaves at least
# 10 observations per bin on average. The p-values are collected per output
# file name and written out at the end, one value per line.
uniform_pvals_path = '/mnt/d/Data/batteries_testing/1st/ideal-devu/uniform_pvals_devurand.pval'
pvals_1stlvl = read_pvalues(uniform_pvals_path)

save_path = "/mnt/d/Data/batteries_testing/2nd/simulated/"
repetitions = 100000
sample_sizes = [10, 100, 1000, 10000]

names = ['KS_greater', 'KS_less', 'KS_both','chi2_&10&bins', 'chi2_&100&bins', 'chi2_&1000&bins',
         'binom_greater', 'binom_less', 'binom_both',
         ]
# Maps output file name -> list of simulated p-values.
datasets = defaultdict(list)

# NOTE(review): `random` is not seeded here, so the simulated files differ
# between runs — confirm whether reproducibility is required.
for sample_size in sample_sizes:
    print(f"sample_size={sample_size}")
    # The set of usable bin counts depends only on the sample size, so it is
    # computed once per size instead of once per repetition.
    chi2_bin_counts = [num_bins for num_bins in [10, 100, 1000]
                       if sample_size // num_bins >= 10]

    for _ in range(repetitions):
        sample = random.choices(pvals_1stlvl, k=sample_size)

        filename = f"2nd_simulated&scipy_KS_greater&17&{sample_size}"
        datasets[filename].append(KS_scipy(sample, alternative='greater').pvalue)

        filename = f"2nd_simulated&scipy_KS_less&18&{sample_size}"
        datasets[filename].append(KS_scipy(sample, alternative='less').pvalue)

        filename = f"2nd_simulated&scipy_KS_both&19&{sample_size}"
        datasets[filename].append(KS_scipy(sample, alternative='two-sided').pvalue)

        if not chi2_bin_counts:
            # No chi2 variant applies to this sample size; skip the sort.
            continue

        sample_sorted = sorted(sample)
        for num_bins in chi2_bin_counts:
            hist = histogram_sorted(sample_sorted, num_bins=num_bins)
            filename = f"2nd_simulated&scipy_chi2&20&{sample_size}&{num_bins}"
            datasets[filename].append(chisquare_scipy(f_obs=list(hist.values())).pvalue)

for filename, pvals in datasets.items():
    file_path = save_path + filename
    with open(file_path, 'w') as f:
        # A single string is written with write(); writelines() would iterate
        # over it character by character.
        f.write('\n'.join(map(str, pvals)) + '\n')
        

sample_size=10
sample_size=100
sample_size=1000
sample_size=10000


# Distribution of 1st level (extraction of data)


In [1]:
import random, sys, math, json
from collections import defaultdict, namedtuple
sys.path.insert(1, "../../python")
from utils import results_traverse, read_pvalues, custom_log, data_to_csv
from statistical_tests import KS_scipy, chisquare_scipy
from histograms import histogram_sorted, bin_frequency
from names import extract_from_path

In [2]:
# Column layout of the first-level extraction table.
pvals_path = '/mnt/d/Data/batteries_testing/1st/'
# save_path = "/mnt/d/Data/batteries_testing/2nd/simulated/"
chi2_num_bins = [10, 100, 1000, 10000]

# Tail exponents: descending for the left tails, ascending for the right ones.
exps_backward = list(range(9, 0, -1))
exps_forward = exps_backward[::-1]

# chi2 columns are labelled by log10 of the bin count, formatted as a float
# (e.g. 'chi2_1.0' for 10 bins).
chi2 = [f"chi2_{math.log10(bin_count)}" for bin_count in chi2_num_bins]
log_chi2 = [f"log_{label}" for label in chi2]

# Upper-case labels are absolute tail columns, lower-case are relative ones.
absolute_left_tails = [f"L_{e}" for e in exps_backward]
absolute_right_tails = [f"R_{e}" for e in exps_forward]
relative_left_tails = [f"l_{e}" for e in exps_backward]
relative_right_tails = [f"r_{e}" for e in exps_forward]

log_relative_left_tails = [f"log_{label}" for label in relative_left_tails]
log_relative_right_tails = [f"log_{label}" for label in relative_right_tails]

# Full header, in the order in which the values of a row are appended.
atributes = (
    ['bat', 'subb', 'test', 'id', 'num_pvals', 'unique', 'ratio', 'min', 'max', 'KS', 'log_KS']
    + chi2 + log_chi2
    + absolute_left_tails + ['num_pvals2'] + absolute_right_tails
    + relative_left_tails + relative_right_tails
    + log_relative_left_tails + log_relative_right_tails
)
print(atributes)
# Extracted_values = namedtuple('Extracted_values', atributes)

['bat', 'subb', 'test', 'id', 'num_pvals', 'unique', 'ratio', 'min', 'max', 'KS', 'log_KS', 'chi2_1.0', 'chi2_2.0', 'chi2_3.0', 'chi2_4.0', 'log_chi2_1.0', 'log_chi2_2.0', 'log_chi2_3.0', 'log_chi2_4.0', 'L_9', 'L_8', 'L_7', 'L_6', 'L_5', 'L_4', 'L_3', 'L_2', 'L_1', 'num_pvals2', 'R_1', 'R_2', 'R_3', 'R_4', 'R_5', 'R_6', 'R_7', 'R_8', 'R_9', 'l_9', 'l_8', 'l_7', 'l_6', 'l_5', 'l_4', 'l_3', 'l_2', 'l_1', 'r_1', 'r_2', 'r_3', 'r_4', 'r_5', 'r_6', 'r_7', 'r_8', 'r_9', 'log_l_9', 'log_l_8', 'log_l_7', 'log_l_6', 'log_l_5', 'log_l_4', 'log_l_3', 'log_l_2', 'log_l_1', 'log_r_1', 'log_r_2', 'log_r_3', 'log_r_4', 'log_r_5', 'log_r_6', 'log_r_7', 'log_r_8', 'log_r_9']


In [3]:
# Build one summary row per first-level p-value file and export the table.
# The values of each row are appended in the order of the `atributes` header:
# identification fields, counts, KS and chi2 p-values with their logs,
# absolute tail frequencies, relative tail frequencies with their logs.
rows = []
for path in results_traverse(pvals_path):
    # Printed before reading so that a failure can be attributed to a file.
    print(path)
    pvals = read_pvalues(path)
    if len(pvals) == 0:
        # min/max and the num_pvals/unique ratio are undefined for an empty
        # file; skip it instead of aborting the run before anything is saved.
        print(f"skipping empty file: {path}")
        continue

    pvals_sorted = sorted(pvals)
    names = extract_from_path(path)
    row = list(names)

    num_pvals = len(pvals)
    unique = len(set(pvals_sorted))
    ratio = round(num_pvals / unique)
    min_pval, max_pval = pvals_sorted[0], pvals_sorted[-1]
    row += [num_pvals, unique, ratio, min_pval, max_pval]

    KS_pvalue = KS_scipy(pvals_sorted).pvalue
    row += [KS_pvalue, custom_log(KS_pvalue, round_to=1)]

    chi2_pvals = [None] * len(chi2_num_bins)
    for i, num_bins in enumerate(chi2_num_bins):
        # NOTE(review): num_pvals / num_bins is never negative, so this guard
        # always passes; a minimum number of p-values per bin was probably
        # intended — confirm the threshold before tightening it.
        if num_pvals / num_bins >= 0:
            hist = histogram_sorted(pvals_sorted, num_bins=num_bins, domain=(0, 1))
            chi2_pvals[i] = chisquare_scipy(f_obs=list(hist.values())).pvalue
    chi2_pvals_logs = [custom_log(chi2pval, round_to=1) for chi2pval in chi2_pvals]
    row += chi2_pvals + chi2_pvals_logs

    # Tail frequencies on [0, 10^-e] (e = 9..1) and on [1 - 10^-e, 1] (e = 1..9).
    # These assignments rebind the module-level names `absolute_left_tails`
    # and `absolute_right_tails`.
    absolute_left_tails = [bin_frequency(pvals_sorted, interval=(0, 10**(-e)), interval_type="[]")
                           for e in exps_backward]
    absolute_right_tails = [bin_frequency(pvals_sorted, interval=(1 - 10**(-e), 1), interval_type="[]")
                            for e in exps_forward]
    absolute_tails = absolute_left_tails + absolute_right_tails
    row += absolute_left_tails + [num_pvals] + absolute_right_tails

    # Each tail frequency is rescaled by 10^e / num_pvals, i.e. divided by
    # num_pvals times the interval width (assuming bin_frequency returns a
    # count — TODO confirm).
    relative_tails = [round(abs_freq * 10**exp / num_pvals, 1)
                      for abs_freq, exp in zip(absolute_tails, exps_backward + exps_forward)]
    row += relative_tails
    relative_tails_logs = [custom_log(relative_tail, round_to=1) for relative_tail in relative_tails]
    row += relative_tails_logs
    rows.append(row)

# Export once, after all files have been processed.
data_to_csv(header=atributes, rows=rows, filename_csv='../../../data/excel/1st_extracted.csv',
            filename_table='../../../data/table/1st_extracted.txt')

res = {'header': atributes, 'rows': rows}
# The context manager guarantees the JSON file is flushed and closed.
with open('../../../data/json/1st_extracted.json', 'w') as json_file:
    json.dump(res, json_file)

/mnt/d/Data/batteries_testing/1st/dieharder/Dieharder(0) Diehard Birthdays Test.pval
/mnt/d/Data/batteries_testing/1st/dieharder/Dieharder(1) Diehard OPERM5 Test.pval
/mnt/d/Data/batteries_testing/1st/dieharder/Dieharder(10) Diehard Parking Lot Test.pval
/mnt/d/Data/batteries_testing/1st/dieharder/Dieharder(100) STS Monobit Test.pval
/mnt/d/Data/batteries_testing/1st/dieharder/Dieharder(101) STS Runs Test.pval
/mnt/d/Data/batteries_testing/1st/dieharder/Dieharder(102) STS Serial Test (Generalized) Subtest 1.pval
/mnt/d/Data/batteries_testing/1st/dieharder/Dieharder(102) STS Serial Test (Generalized) Subtest 10.pval
/mnt/d/Data/batteries_testing/1st/dieharder/Dieharder(102) STS Serial Test (Generalized) Subtest 11.pval
/mnt/d/Data/batteries_testing/1st/dieharder/Dieharder(102) STS Serial Test (Generalized) Subtest 12.pval
/mnt/d/Data/batteries_testing/1st/dieharder/Dieharder(102) STS Serial Test (Generalized) Subtest 13.pval
/mnt/d/Data/batteries_testing/1st/dieharder/Dieharder(102) STS