In [None]:
# NOTE(review): every name used unqualified in this notebook (np, pd, os, pickle, plt, sns, kstwo,
# gridsearch, combine_pickles, add_cdfs, round_to_sigfigs, ...) is presumably supplied by these two
# star imports — confirm against utilities.py / plot_utilities.py.
from utilities import *
from plot_utilities import *

# Seed NumPy's global RNG so the np.random.permutation calls further down are repeatable.
np.random.seed(42)

In [None]:
# The relative paths used below assume that the current working directory is "testing-framework".
# The path displayed by this cell should therefore end in "testing-framework".
os.getcwd()

In [None]:
# Load the Panoptic data dictionary (the file name suggests it is already normalized).
data_dict = pd.read_pickle('../data/Panoptic Agriculture/Transformed Dataset/Panoptic_Data_Dict_Normalized.pickle')
# One-off generation of the per-layer arrays for layers 2-8. Kept commented out because the
# result is cached in 'panoptic/obs_x_dict.pickle' and simply reloaded below.
# obs_x_dict = dict()
# for layer in np.arange(2, 9):
#     obs_x_dict[layer] = create_obs_x(data_dict, layer)
# pd.to_pickle(obs_x_dict, 'panoptic/obs_x_dict.pickle')
obs_x_dict = pd.read_pickle('panoptic/obs_x_dict.pickle')
# Displays the whole dictionary as the cell output.
obs_x_dict

In [None]:
# Large grid search CDFs already computed (r = 0 to 8, eta = 0 to 3.9)
# Dict union of the two pickle sets; on duplicate keys the right-hand ('mtlb_10000') entry wins.
all_cdfs = combine_pickles('scipy_10000') | combine_pickles('mtlb_10000') 
# One row per ((r, eta), cdf) item, in sorted order of the items.
all_cdfs_df = pd.DataFrame({'(r,eta),cdf' : sorted(all_cdfs.items())})
# `.str[k]` indexes element-wise into the tuples: [0][0] -> r, [0][1] -> eta, [1] -> cdf.
all_cdfs_df['r'] = pd.Series(all_cdfs_df["(r,eta),cdf"].str[0].str[0])
all_cdfs_df['eta'] = pd.Series(all_cdfs_df["(r,eta),cdf"].str[0].str[1])
all_cdfs_df['cdf'] = pd.Series(all_cdfs_df["(r,eta),cdf"].str[1])

create_scatter_plots_log_eta(all_cdfs_df)

In [None]:
# Experimental cell demoing all the plots
# NOTE: `layer`, `obs_x` and `df` assigned here are read again by later cells.
layer = 2
obs_x = obs_x_dict[layer]
all_ksstats, best_param, min_stat = gridsearch(obs_x, all_cdfs)
df = all_cdfs_df.copy()
total_samples = obs_x.size
# Assigned positionally: assumes gridsearch returns the statistics in the same (sorted) order
# as the rows of all_cdfs_df — TODO confirm against gridsearch.
df['kstest_stat'] = all_ksstats
# p-value = survival function of the two-sided KS distribution for n = total_samples.
df['kstest_pval'] = kstwo(n=total_samples).sf(all_ksstats)
print(f"Best parameters {(best_param)} with KS-test Statistic {np.round(min_stat, 4)} and pvalue {kstwo(n=total_samples).sf(min_stat)}, layer {layer} with num_samples={total_samples}")
distance, location = visualize_cdfs(obs_x, best_param[0], best_param[1], 10000, all_cdfs)
create_scatter_plot(df, 'kstest_stat')
create_scatter_plot(df, 'kstest_pval')

In [None]:
# Restrict the KS-statistic grid to 0.4 < r < 1 and eta >= 0.3, then draw labelled contour lines.
# The mask is built entirely from `df`: referencing `df0` inside its own definition raises
# NameError on a fresh kernel.
df0 = df[(df['r'] > 0.4) & (df['eta'] >= 0.3) & (df['r'] < 1)]
x = df0['r'].to_numpy()
y = df0['eta'].to_numpy()
z = df0['kstest_stat'].to_numpy()
X, Y = np.meshgrid(df0['r'].unique(), df0['eta'].unique())

dims_r = df0['r'].unique().size
dims_eta = df0['eta'].unique().size
# The reshape below is only meaningful when df0 is a complete r-by-eta grid whose rows are ordered
# by r first and eta second (df is built from the sorted (r, eta) keys).
assert len(df0) == dims_r * dims_eta, "df0 is not a complete (r, eta) grid; cannot reshape for contouring"
grid_shape = (dims_r, dims_eta)

fig, ax = plt.subplots()
# Contour levels: 0.04, 0.06, ... in steps of 0.02, with 0.3 appended as the last level.
CS = ax.contour(x.reshape(grid_shape), y.reshape(grid_shape), z.reshape(grid_shape), np.append(np.arange(0.04, 0.3, 0.02), 0.3), cmap =  'viridis')
ax.clabel(CS, CS.levels, inline=True, fontsize=10)
ax.set_xlabel('r')
ax.set_ylabel('eta')
create_contour_plot(df0,'kstest_stat')

In [None]:
# (parameter, cdf) pairs in sorted order, followed by the subset whose second key component (eta) is 0.
splines = sorted(all_cdfs.items())
[entry for entry in splines if entry[0][1] == 0]


In [None]:
# Positions, within the sorted parameter list, of the parameter pairs whose second component is 0.
np.array([position for position, key in enumerate(sorted(all_cdfs)) if key[1] == 0])

In [None]:
# For layers 2-4, record the best-fitting (r, eta) over the whole grid together with the best fit
# among the grid points whose eta is zero.
sorted_params = sorted(all_cdfs)
# Positions (within sorted_params) of the parameter pairs with eta numerically equal to 0.
idx_eta_0 = [pos for pos, param in enumerate(sorted_params) if np.isclose(param[1], 0, atol = 1e-40)]
if not idx_eta_0:
    raise ValueError("all_cdfs contains no parameter pair with eta == 0")

large_grid_records = []
for layer in np.arange(2, 5):
    sample = obs_x_dict[layer]
    ksstats, best_param, min_stat = gridsearch(sample, all_cdfs)
    # assumes ksstats is ordered like sorted(all_cdfs), so idx_eta_0 indexes it directly —
    # TODO confirm against gridsearch.
    ksstats_eta_0 = [ksstats[pos] for pos in idx_eta_0]
    # Map the argmin within the eta == 0 subset back to a position in the full sorted list.
    idx_min_ksstats_eta_0 = idx_eta_0[int(np.argmin(ksstats_eta_0))]
    large_grid_records.append({
        'layer': layer,
        'ksstats': ksstats,
        'best_param': best_param,
        'kstest_stat': min_stat,
        'best_param_eta_0': sorted_params[idx_min_ksstats_eta_0],
        'kstest_stat_eta_0': ksstats[idx_min_ksstats_eta_0],
    })

# Build the frame once from the collected records instead of enlarging it row by row.
large_grid_df = pd.DataFrame(large_grid_records, columns = ['layer', 'ksstats', 'best_param', 'kstest_stat', 'best_param_eta_0', 'kstest_stat_eta_0'])

large_grid_df

In [None]:
# Given a p-value of 0.05 (or 0.1) and the number of samples in a layer, what is the corresponding
# KS-test statistic? isf(p) is the statistic whose survival-function value equals p.
cutoff_records = []
for layer in np.arange(2, 9):
    num_points = obs_x_dict[layer].size
    ks_dist = kstwo(n=num_points)  # frozen once per layer and reused for both cutoffs
    cutoff_records.append((layer, num_points, ks_dist.isf(0.05), ks_dist.isf(0.1)))

# Build the frame in one go so the columns get numeric dtypes rather than object.
cutoffs_df = pd.DataFrame(cutoff_records, columns = ['layer', 'num_samples', 'kstest_stat_0.05_cutoff', 'kstest_stat_0.1_cutoff'])
cutoffs_df

In [None]:
# NOTE(review): find_n_fixed_pval_stat is defined in the cell after this one, so on a fresh
# kernel that cell has to be run before this one.
best_params_df = pd.read_csv('panoptic/CSVs/best_params_df_fine_grid.csv')
# Columns are addressed by position: row.iloc[3] is passed as the KS statistic and row.iloc[1] as
# the starting sample count — verify these positions against the CSV's column order.
best_params_df['n_0.05'] = best_params_df.apply(lambda row : find_n_fixed_pval_stat(row.iloc[3], row.iloc[1]), axis = 1)
best_params_df.head(8)

In [None]:
def find_n_fixed_pval_stat(ksstat : float, n : int, cutoff= 0.05, atol = 0.01, max_n = 10**15):
    '''
    Find a sample size at which the two-sided KS-test p-value of `ksstat` is close to `cutoff`.

    The p-value `kstwo(n).sf(ksstat)` shrinks as n grows, so the search first brackets the
    cutoff by repeatedly doubling or halving n, then bisects over the integers inside the bracket.
    Unlike a halve / multiply-by-1.5 search, this always terminates (such a search can cycle, get
    stuck at n == 1, or reach n == 0 where the p-value is NaN).

    Parameters:
        ksstat: KS-test statistic that is held fixed.
        n: starting guess for the sample size (values below 1 are treated as 1).
        cutoff: target p-value.
        atol: absolute tolerance passed to np.isclose when comparing a p-value with `cutoff`.
        max_n: largest sample size the upward search may try.

    Returns:
        int: the first sample size visited whose p-value is within `atol` of `cutoff`; if no
        integer sample size inside the final bracket is that close, the bracket end whose p-value
        is nearer to `cutoff`. Returns 1 if the p-value is still below `cutoff` at n == 1.

    Raises:
        ValueError: if the p-value is NaN, or is still above `cutoff` at `max_n`.
    '''
    def pval(num_samples):
        return float(kstwo(num_samples).sf(ksstat))

    def close_enough(p):
        return bool(np.isclose(p, cutoff, atol=atol))

    n = max(int(n), 1)
    curr_pval = pval(n)
    if np.isnan(curr_pval):
        raise ValueError(f"p-value is NaN for ksstat={ksstat!r}, n={n!r}")
    if close_enough(curr_pval):
        return n

    # Invariant aimed for: pval(lo) >= cutoff >= pval(hi) with lo <= hi.
    lo = hi = n
    if curr_pval > cutoff:
        # Not significant enough yet: grow n until the p-value drops to the cutoff.
        while True:
            if hi >= max_n:
                raise ValueError(f"p-value still above cutoff={cutoff} at n={hi}; ksstat={ksstat!r} is too small")
            lo, hi = hi, min(hi * 2, max_n)
            curr_pval = pval(hi)
            if close_enough(curr_pval):
                return hi
            if curr_pval <= cutoff:
                break
    else:
        # Too significant: shrink n until the p-value rises to the cutoff.
        while True:
            if lo == 1:
                return 1
            hi, lo = lo, lo // 2
            curr_pval = pval(lo)
            if close_enough(curr_pval):
                return lo
            if curr_pval >= cutoff:
                break

    # Integer bisection; the bracket width shrinks on every pass, so this loop is finite.
    while hi - lo > 1:
        mid = (lo + hi) // 2
        curr_pval = pval(mid)
        if close_enough(curr_pval):
            return mid
        if curr_pval > cutoff:
            lo = mid
        else:
            hi = mid

    # No integer n lands within the tolerance: return the closer end of the final bracket.
    return min((lo, hi), key=lambda m: abs(pval(m) - cutoff))

In [None]:
def coord_descent(sample, initial_param, r_depth, eta_depth, layer, completed_r_depth = 1, completed_eta_depth = 1):
    '''
    Refine an initial guess `initial_param` = (r, eta) one decimal place at a time: first along r
    with eta held at its initial value, then along eta with r held at the refined value.

    For every depth d from `completed_r_depth` up to `r_depth - 1`, r is scanned over
    np.arange(r_0 - 10**-d, r_0 + 10**-d, 10**-(d+1)); CDFs for those points are added with
    `add_cdfs` under the prefix f'layer{layer}_', everything stored under f'layer{layer}_10000'
    is re-read with `combine_pickles`, and r_0 becomes the first component of the best parameter
    reported by `gridsearch(sample, ...)`. The eta loop does the same with `eta_depth` and
    `completed_eta_depth`, keeping the second component of the best parameter.

    By default the initial guess is assumed to be correct to 1 decimal place.

    NOTE(review): gridsearch is run over every CDF stored for the layer, not only the points
    added in the current step, so the reported best parameter may lie off the line being scanned;
    only one of its two components is kept. Confirm this is intended.

    Example usage:
    `coord_descent(obs_x_dict[4], (0.8, 3), 3, 2, 4)` will search through
    r = arange(0.70, 0.90, 0.01), eta = 3. Suppose best value is r=0.80
    r = arange(0.790, 0.810, 0.001), eta = 3. Suppose best value is r=0.803 (3 decimals)
    Then
    r = 0.803, eta = arange(2.9, 3.1, 0.01). Suppose best value is eta=3.01 (2 decimals)

    returns (0.803, 3.01)
    '''
    r_0, eta_0 = initial_param

    # Pass 1: refine r, one extra decimal place per iteration, with eta fixed at its initial value.
    for d in np.arange(completed_r_depth, r_depth):
            
        # Window of half-width 10**-d around the current r, sampled every 10**-(d+1);
        # np.arange excludes the stop value and its float results may carry rounding noise.
        r_range = np.arange(r_0 - 10.0**(-d), r_0 + 10.0**(-d), 10.0**(-d-1)) 
        eta_range = [eta_0]
        print(r_range, eta_range, r_0)
        # NOTE(review): the meaning of the `10000` and `True` arguments is defined by add_cdfs — confirm there.
        add_cdfs(r_range, eta_range, 10000, True, f'layer{layer}_')
        layer_cdfs = combine_pickles(f'layer{layer}_10000')
        ksstats, best_param, min_stat = gridsearch(sample, layer_cdfs)
        r_0 = best_param[0]

    # Pass 2: refine eta the same way, with r fixed at the value found above.
    for d in np.arange(completed_eta_depth, eta_depth):
            
        r_range = [r_0]
        eta_range = np.arange(eta_0 - 10.0**(-d), eta_0 + 10.0**(-d), 10.0**(-d-1)) 
        print(r_range, eta_range, eta_0)
        add_cdfs(r_range, eta_range, 10000, True, f'layer{layer}_')
        layer_cdfs = combine_pickles(f'layer{layer}_10000')
        ksstats, best_param, min_stat = gridsearch(sample, layer_cdfs)
        eta_0 = best_param[1]

    return (r_0, eta_0)


In [None]:
def cdf_dict_to_many_pickle(large_dict, folder_path, items_per = 20):
    '''
    Split `large_dict` into several pickle files of at most `items_per` entries each.

    Keys are processed in sorted order and are expected to be (r, eta) pairs. Each chunk is
    written to `folder_path` as '{min_r}-{max_r}_{min_eta}-{max_eta}.pickle', where the bounds are
    the chunk's smallest/largest r and eta passed through `round_to_sigfigs`. The last chunk may
    hold fewer than `items_per` entries. If two chunks would produce the same file name, the later
    one gets a numeric suffix so that no chunk is overwritten.

    Parameters:
        large_dict: dictionary keyed by (r, eta).
        folder_path: destination directory; created if it does not exist.
        items_per: maximum number of entries per pickle file (must be >= 1).

    Raises:
        ValueError: if `items_per` is smaller than 1.
    '''
    if items_per < 1:
        raise ValueError("items_per must be a positive integer")

    os.makedirs(folder_path, exist_ok=True)
    dict_keys = sorted(large_dict)
    used_names = set()

    for start in range(0, len(dict_keys), items_per):
        chunk_keys = dict_keys[start:start + items_per]
        small_dict = {k: large_dict[k] for k in chunk_keys}

        r_values = [k[0] for k in chunk_keys]
        eta_values = [k[1] for k in chunk_keys]
        min_r = round_to_sigfigs(min(r_values))
        max_r = round_to_sigfigs(max(r_values))
        min_eta = round_to_sigfigs(min(eta_values))
        max_eta = round_to_sigfigs(max(eta_values))

        base_name = f'{min_r}-{max_r}_{min_eta}-{max_eta}'
        file_name = f'{base_name}.pickle'
        # Rounded bounds can coincide for neighbouring chunks; disambiguate instead of overwriting.
        suffix = 1
        while file_name in used_names:
            file_name = f'{base_name}_{suffix}.pickle'
            suffix += 1
        used_names.add(file_name)

        with open(os.path.join(folder_path, file_name), 'wb') as handle:
            pickle.dump(small_dict, handle)
            

In [None]:
# NOTE(review): `big_dict` is only assigned in the following cell, so this cell fails on a fresh kernel.
big_dict

In [None]:
# Load the layer-2 CDF dictionary from a single pickle file.
# NOTE: pickle.load can execute arbitrary code; only use it on files from a trusted source.
with open('CDFs/layer2_10000/layer2_10000.pickle', 'rb') as handle:
    big_dict = pickle.load(handle)
# Displays the whole dictionary as the cell output.
big_dict

In [None]:
# Split big_dict into several smaller pickle files under 'CDFs/testing_many_pickle'.
cdf_dict_to_many_pickle(big_dict, 'CDFs/testing_many_pickle')

In [None]:
# Refine the layer-4 guess (r, eta) = (0.8, 3) with r_depth=3 and eta_depth=2.
coord_descent(obs_x_dict[4], (0.8, 3), 3, 2, 4)

In [None]:
# Grid search of the layer-4 sample over every CDF stored under 'layer4_10000'.
gridsearch(obs_x_dict[4], combine_pickles('layer4_10000'))

In [None]:
%%time
# Creates the validation dataframes (the generating code below is currently commented out).
# For now, it caps out at 6; for layer 7 and up it just defaults to 2.9, 0.

# for layer in range(6, 8):
#     obs_x = create_obs_x(data_dict, layer)
#     df = make_layer_df(obs_x, all_cdfs_df)
#     total_samples = obs_x.size
#     all_num_samples = np.sort(np.append(5*10**np.arange(3.0, np.floor(np.log10(total_samples))), 10**np.arange(3.0, np.ceil(np.log10(total_samples)))))
#     print(list(all_num_samples))
#     np.random.seed(42)
#     x = obs_x[np.random.permutation(total_samples)]
#     val_df = pd.concat([val_df_fixed_num(x, n, all_cdfs_df) for n in all_num_samples])
#     val_df.to_csv(f'panoptic/CSVs/val_df{layer}_{cdfs_name}.csv')
#     val_df.value_counts(['r', 'eta'])

In [None]:
# Read val_df4.csv and print the ten most frequent (r, eta) combinations.
val_df = pd.read_csv(f'panoptic/CSVs/val_df{4}.csv', index_col='Unnamed: 0')
print(val_df.value_counts(['r', 'eta']).iloc[:10])

In [None]:
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(16, 6))  # Create a figure with two subplots

# NOTE(review): the range is built from whatever `layer` an earlier cell left behind, so the loop
# runs exactly once, for that layer.
for layer in np.arange(layer, layer+1):
    # NOTE(review): with the reassignment below commented out, `obs_x` is whatever an earlier cell
    # left behind and may not belong to `layer`.
    #obs_x = obs_x_dict[layer]
    #df = df_dict[layer]
    total_samples = obs_x.size
    # `x` (a shuffled copy of obs_x) is not used again in this cell.
    x = obs_x[np.random.permutation(total_samples)]
    # NOTE(review): `cdfs_name` is not assigned anywhere in the visible notebook (it appears only in
    # commented-out code) — confirm where it comes from.
    val_df = pd.read_csv(f'panoptic/CSVs/val_df{layer}_{cdfs_name}.csv', index_col='Unnamed: 0')

    # Scatter plot
    sns.scatterplot(val_df.drop(['kstest_stat', 'kstest_pval'], axis=1), x='r', y='eta', palette='bright', hue='num_samples', alpha=0.4, ax=ax1)
    # Label-based lookup: assumes best_params_df's index labels are layer numbers — TODO confirm.
    r, eta = best_params_df.loc[layer]['r'], best_params_df.loc[layer]['eta']
    ax1.scatter(x=r, y=eta, marker="*", label='all_data', s=60, color='xkcd:shamrock green', alpha=0.7)
    ax1.set_title(f'KS-Test Statistic minimizing parameters for subsets of layer {layer} data')
    print(r, eta)
    ax1.legend()

    # KDE plot
    sns.kdeplot(val_df.drop(['kstest_stat', 'kstest_pval'], axis=1), x='r', y='eta', palette='bright', hue='num_samples', alpha=0.4, ax=ax2)

plt.tight_layout()
plt.show()

In [None]:
# Re-seed so the permutation drawn below is repeatable.
np.random.seed(42)

# TODO: Put plots side by side
# NOTE(review): as in the previous cell, the loop runs once, for the `layer` left by an earlier cell.
for layer in np.arange(layer, layer+1):
    obs_x = obs_x_dict[layer]
    # NOTE(review): `df_dict` is not assigned anywhere in the visible notebook, and `df` is not
    # used again in this cell — confirm where df_dict comes from.
    df = df_dict[layer]
    total_samples = obs_x.size
    # `x` (a shuffled copy of obs_x) is not used again in this cell.
    x = obs_x[np.random.permutation(total_samples)]
    val_df = pd.read_csv(f'panoptic/CSVs/val_df{layer}.csv', index_col='Unnamed: 0')
    # Scatter and KDE are both drawn on the current pyplot axes (no `ax=` is passed).
    sns.scatterplot(val_df.drop(['kstest_stat', 'kstest_pval'], axis = 1), x = 'r', y = 'eta', palette = 'bright', hue = 'num_samples', alpha = 0.4)
    # Label-based lookup: assumes best_params_df's index labels are layer numbers — TODO confirm.
    r, eta, = best_params_df.loc[layer]['r'], best_params_df.loc[layer]['eta']
    plt.scatter(x = r, y = eta, marker="*", label = 'all_data', s = 60, color = 'xkcd:shamrock green', alpha = 0.7)
    plt.title(f'KS-Test Statistic minimizing parameters for subsets of layer {layer} data')
    print(r, eta)
    plt.legend()
    
    sns.kdeplot(val_df.drop(['kstest_stat', 'kstest_pval'], axis = 1), x = 'r', y = 'eta', palette = 'bright', hue = 'num_samples', alpha = 0.4)
    
    

In [None]:
# Filled 2-D KDE of (r, eta) for the rows of val_df6.csv with num_samples == 100; the star marks
# the (r, eta) stored in best_params_df for this layer.
layer = 6
val_df = pd.read_csv(f'panoptic/CSVs/val_df{layer}.csv', index_col='Unnamed: 0')
mask = val_df['num_samples'] == 100
kde_input = val_df.drop(columns=['kstest_pval'])[mask]
sns.kdeplot(kde_input, x = 'r', y = 'eta', fill=True)
layer_best = best_params_df.loc[layer]
r, eta = layer_best['r'], layer_best['eta']
plt.scatter(x = r, y = eta, marker="*", label = 'all_data', s = 60, color = 'xkcd:shamrock green', alpha = 0.7)
plt.title('Validation')