In [230]:
import numpy as np
import pandas as pd
from bokeh.plotting import figure
from bokeh.io import show, output_notebook
from statsmodels.stats.proportion import proportion_confint

In [231]:
num_agents_per_algo = 100000  # this is "n" in the draft
significance_level = 0.1 # significance level = 1 - confidence

First we generate some fake performance data $R_{\phi_A}$ for $\phi_A$

In [232]:
R_a = np.random.normal(100, 10, num_agents_per_algo)

Then some fake shift that's mostly positively correlated to $R_{\phi_A}$

In [233]:
def corr_info(x, y):
    corr = np.corrcoef(x, y)[0][1]
    print(f"corr: {corr}")

In [234]:
shift_factor = 0.3  # how significant the shift is, 0 means no shift, 1 means a lot of shift and a lot of correlation
pseudo_cov_shift = (R_a - R_a.mean()) * np.random.uniform(-0.2, 1, num_agents_per_algo) * shift_factor
corr_info(R_a, pseudo_cov_shift)

corr: 0.7590062606626572


We assume there's some covariance introduced by the controlled seeds, so let's add the fake shift to $R_{\phi_B, c}$.

In [235]:
R_b_c = np.random.normal(100, 10, num_agents_per_algo) + pseudo_cov_shift
R_b_uc = np.random.normal(100, 10, num_agents_per_algo)

Since we added some correlated noise to $R_{\phi_B, c}$, it should be positively correlated to $R_{\phi_A}$. On the other hand, $R_{\phi_B, uc}$ should have a negligible correlation and could be either positive or negative.

In [236]:
print("R_a vs R_b_c")
corr_info(R_a, R_b_c)
print("\nR_a vs R_b_uc")
corr_info(R_a, R_b_uc)

R_a vs R_b_c
corr: 0.11784198779878644

R_a vs R_b_uc
corr: -0.00145457799286241


In [237]:
delta_c = R_a - R_b_c
delta_uc = R_a - R_b_uc

For the first part analysis, we simply compare $\mathrm{Var}(\Delta_c)$ and $\mathrm{Var}(\Delta_{uc})$. If controlled seeds reduce effect size, then $\mathrm{Var}(\Delta_c)$ should be smaller than $\mathrm{Var}(\Delta_{uc})$.

In [238]:
var_c = np.var(delta_c)
var_uc = np.var(delta_uc)
print(f'var_c: {var_c}')
print(f'var_uc: {var_uc}')
print(f'var_c < var_uc ? {var_c < var_uc}')

var_c: 178.69050689652946
var_uc: 199.9028394891771
var_c < var_uc ? True


For the second part analysis, we calculate the binomial confidence interval.

In [239]:
num_success = np.sum((np.abs(delta_c) < np.abs(delta_uc)))  # this is "k" in the draft

In [240]:
print('num_success:', num_success)
print('num_agents_per_algo:', num_agents_per_algo)
interval = proportion_confint(num_success, num_agents_per_algo)
print('interval:', interval)

num_success: 52111
num_agents_per_algo: 100000
interval: (0.5180137880760319, 0.524206211923968)


We accept the null hypothesis if 0.5 falls in the range of the interval, and reject it otherwise.

In [241]:
if interval[0] < 0.5 < interval[1]:
    print('We fail to reject the null hypothesis.')
    print('We conclude that controlled seeds do not have a significant impact on effect size.')
else:
    print('We reject the null hypothesis.')
    print(f'We are {(1 - significance_level) * 100}% confident that controlled seeds do have a significant impact on effect size.')

We reject the null hypothesis.
We are 90.0% confident that controlled seeds do have a significant impact on effect size.


We plot out the distribution of deltas and see if they are different (of course they are).

In [246]:
hist_range = (0, 50)
num_bins = 25
bin_width = (hist_range[1] - hist_range[0]) / num_bins
c_hist_vals, c_hist_edges = np.histogram(np.abs(delta_c), bins=num_bins, range=hist_range)
c_hist_left = c_hist_edges[:-1]
c_hist_right = c_hist_edges[1:] - bin_width * 0.5
uc_hist_vals, uc_hist_edges = np.histogram(np.abs(delta_uc), bins=num_bins, range=hist_range)
uc_hist_left = uc_hist_edges[:-1] + bin_width * 0.5
uc_hist_right = uc_hist_edges[1:]

In [247]:
p = figure(plot_height=600, plot_width=800)
p.quad(bottom=0, top=c_hist_vals, left=c_hist_left, right=c_hist_right,
       fill_color='red', alpha=0.5)
p.quad(bottom=0, top=uc_hist_vals, left=uc_hist_left, right=uc_hist_right,
       fill_color='blue', alpha=0.5)
output_notebook()
show(p)