In [None]:
# imports
import pandas as pd
import numpy as np
from scipy.stats import binom

In [None]:
# helpers
def remove_nan(df, i):
    """Return the non-null entries of row ``i`` of ``df`` as a list of ints.

    The data frame is a parallelogram with NaN values as padding, so each row
    is stripped of its nulls before conversion. Every remaining value goes
    through ``float`` and then ``int``, which means numeric strings such as
    ``'3.0'`` are accepted and any fractional part is truncated.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to read from.
    i : int
        Positional (``iloc``) index of the row.

    Returns
    -------
    list of int
        The row's non-null values, in column order.
    """
    row = df.iloc[i]
    present = row[row.notnull()]
    return [int(float(value)) for value in present.tolist()]
def out_string(p_val, alpha, thresh):
    """Format one test result as a three-line, newline-terminated report.

    Parameters
    ----------
    p_val : float
        p-value of the test; shown rounded to 5 decimals.
    alpha : float
        Significance level; the result counts as significant only when
        ``p_val`` is strictly below it.
    thresh : object
        Threshold the test was run for; interpolated as-is.

    Returns
    -------
    str
        The report text.
    """
    verdict = 'yes' if p_val < alpha else 'no'
    return (
        f"Significant: {verdict}\n"
        f"p-value:     {round(p_val, 5)}\n"
        f"Threshhold:  {thresh}\n"
    )

In [None]:
# Load the constructed data.csv and reduce it to the usable samples.
# NOTE(review): the `treatment != -1` filter assumes the treatment column is
# parsed as numeric; if it were read as strings, -1 would never match - confirm.
df = (
    pd.read_csv('data.csv')
    .iloc[:150 * 7, 1:16]                                   # first 150*7 rows; treatment and date columns only
    .loc[lambda frame: (frame != 'forbidden').all(axis=1)]  # drop samples that broke due to lacking upvote rights
    .loc[lambda frame: frame['treatment'] != -1]            # drop samples deleted before we could upvote
)

In [None]:
# Convert the pandas data frame to a numpy array: one row of ints per sample,
# with the NaN padding stripped. Column 0 is the treatment flag.
M = np.array([remove_nan(df, row) for row in range(len(df))])

# Treatment samples (flag == 1): drop the flag column and subtract 2,
# i.e. remove our upvote and the author's upvote.
T = M[M[:, 0] == 1][:, 1:] - 2

# Control samples (flag == 0): drop the flag column and subtract 1,
# i.e. remove the author's upvote.
C = M[M[:, 0] == 0][:, 1:] - 1

# Optional balancing step, currently disabled: draw as many control samples
# as there are treatment samples, with a fixed seed so every run picks the
# same ones.
# np.random.seed(42)
# C = C[np.random.choice(C.shape[0], T.shape[0], replace=0), :]

In [None]:
# Test parameters.
threshs = [10 ** exponent for exponent in range(4)]  # the four orders of magnitude our hypothesis looks at: 1, 10, 100, 1000
# days  = 6                                          # (disabled) test on multiple days
n = len(T)                                           # number of treatment samples
alpha = 0.05 / len(threshs)                          # significance level with Bonferroni correction (one test per threshold)

In [None]:
# One-sided binomial test per threshold: do more treatment samples end above
# the threshold than the control rate would predict?
# for i in range(days):                       # (disabled) repeat for every day after the first; result was the same for all days
for thresh in threshs:                        # for the four upvote orders of magnitude in focus
    r = np.sum(T[:, -1] > thresh)              # number of treatment samples above the threshold (last column)
    p = np.sum(C[:, -1] > thresh) / C.shape[0] # probability of being above the threshold given no treatment (control fraction)
    # p-value = P(X >= r) for X ~ Binomial(n, p): the chance of seeing at least
    # the observed count under the control rate. binom.sf(k) is P(X > k), so
    # r - 1 is passed to include the observed count itself; 1 - cdf(r) would
    # give P(X > r) and understate the p-value. sf also avoids the precision
    # loss of 1 - cdf for very small tails.
    p_val = binom.sf(r - 1, n, p)
    print(out_string(p_val, alpha, thresh))   # print a short text report of the result