In [2]:
from fdr_hacking.data_generation import *
import numpy as np

In [9]:
def proportion_above_threshold(array, threshold):
   return np.sum(array > threshold) / array.size

def proportion_below_threshold(array, threshold):
    return np.sum(array < threshold) / array.size

realworld_data_path = "../data/realworld_methyl_beta.h5"
realworld_data = load_realworld_data(file_path=realworld_data_path)
base_config = {
    'n_sites': 10000,
    'n_observations': 100,
    'dependencies': 1,
    'bin_size_ratio': 0.3,
    'correlation_strength': "high"
}
high_corr_config = base_config.copy()
high_corr_smaller_bin_config = base_config.copy()
high_corr_smaller_bin_config['bin_size_ratio'] = 0.2
medium_corr_config = base_config.copy()
medium_corr_config['correlation_strength'] = "medium"
medium_corr_smaller_bin_config = medium_corr_config.copy()
medium_corr_smaller_bin_config['bin_size_ratio'] = 0.2
config_dicts = [high_corr_config, high_corr_smaller_bin_config, medium_corr_config, medium_corr_smaller_bin_config]

In [15]:
n_replicates = 3
results_list = []
for config in config_dicts:
    if config['correlation_strength'] == 'medium':
        corr_coef_dist = [(-0.6, -0.4), (-0.1, 0.1), (0.4, 0.6)]
    elif config['correlation_strength'] == 'high':
        corr_coef_dist = [(-0.85, -0.7), (-0.1, 0.1), (0.7, 0.85)]
    sub_results_dict = {"correlation_strength": config['correlation_strength'], "bin_size_ratio": config['bin_size_ratio'],
                    'above_threshold': np.array([]), 'below_threshold': np.array([])}
    for i  in range(n_replicates):
        simulated_data = simulate_methyl_data(realworld_data, config['n_sites'], config['n_observations'],
                                                      config['dependencies'],
                                                      int(config['bin_size_ratio'] * config['n_sites']), corr_coef_dist)
        corr_mat = determine_correlation_matrix(simulated_data)
        corr_mat = np.triu(corr_mat, k=1)
        sub_results_dict['above_threshold'] = np.append(sub_results_dict['above_threshold'], proportion_above_threshold(corr_mat, 0.3))
        sub_results_dict['below_threshold'] = np.append(sub_results_dict['below_threshold'], proportion_below_threshold(corr_mat, -0.3))
    results_list.append(sub_results_dict)
sub_results_dict = {"correlation_strength": "not_applicable", "bin_size_ratio": "not_applicable",
                    'above_threshold': np.array([]), 'below_threshold': np.array([])}
for i in range(n_replicates):
    semi_real_world_data = sample_realworld_methyl_val(n_sites=10000, realworld_data=realworld_data)
    semi_real_world_data = beta_to_m(methyl_beta_values=semi_real_world_data)
    np.random.shuffle(semi_real_world_data)
    corr_mat = determine_correlation_matrix(semi_real_world_data)
    corr_mat = np.triu(corr_mat, k=1)
    sub_results_dict['above_threshold'] = np.append(sub_results_dict['above_threshold'], proportion_above_threshold(corr_mat, 0.3))
    sub_results_dict['below_threshold'] = np.append(sub_results_dict['below_threshold'], proportion_below_threshold(corr_mat, -0.3))
results_list.append(sub_results_dict)

In [16]:
for every_dict in results_list:
    every_dict['above_threshold_mean'] = every_dict['above_threshold'].mean()
    every_dict['above_threshold_std'] = every_dict['above_threshold'].std()
    every_dict['below_threshold_mean'] = every_dict['below_threshold'].mean()
    every_dict['below_threshold_std'] = every_dict['below_threshold'].std()
    del every_dict['above_threshold']
    del every_dict['below_threshold']
# make a dataframe of results_list by concatenating all the dicts in the list
results_df = pd.DataFrame(results_list)

{'correlation_strength': 'high', 'bin_size_ratio': 0.3, 'above_threshold': array([0.09031837, 0.09028346, 0.09032634, 0.09038125, 0.09034408,
       0.09035642, 0.09012616, 0.09038359, 0.09022575, 0.09012184]), 'below_threshold': array([0.00046422, 0.00043147, 0.00042619, 0.00044883, 0.00043371,
       0.00047908, 0.00042624, 0.00046024, 0.00047068, 0.00044903]), 'above_threshold_mean': 0.090286726, 'above_threshold_std': 9.259176013015581e-05, 'below_threshold_mean': 0.0004489689999999999, 'below_threshold_std': 1.8199543098660475e-05}
{'correlation_strength': 'high', 'bin_size_ratio': 0.2, 'above_threshold': array([0.06030178, 0.06042811, 0.06023728, 0.06037951, 0.06022542,
       0.06029617, 0.06012848, 0.06031179, 0.0603552 , 0.06127716]), 'below_threshold': array([0.0004933 , 0.00045266, 0.00067724, 0.000714  , 0.00349072,
       0.00051945, 0.00053265, 0.0004669 , 0.00043967, 0.0003948 ]), 'above_threshold_mean': 0.06039409, 'above_threshold_std': 0.00030515002267737036, 'below_t