In [81]:
import numpy as np
from scipy import stats
import pandas as pd
import statsmodels.api as sm

In [100]:
def get_synthetic_data(n_control, n_treatment, p_0 = 0.2 , p_1 = 0.05, c_1 = 0.05, c_2 = 0.05, c_3 = 0.1, T = 30, r_low = 0.05, r_high = 0.1, r_effect = 0.05, rand_seed = 0):
    np.random.seed(seed=rand_seed)
    assignment = np.repeat(["control", "treatment"], (n_control, n_treatment))
    n = n_control + n_treatment
    U = stats.binom.rvs(1, p_0, size=n)
    x_1 = np.array([stats.uniform.rvs(loc=0, scale=0.25+0.75*u, size=1) for u in U]).ravel()  # if U=1 , upper bound is 1 , otherwise 0.25
    x_2 = stats.uniform.rvs(loc=0, scale=1, size=n)
    p_s = p_1 + c_1 * (x_1 - (3*p_0 + 1) / 8 ) + c_2 * (x_2 - 0.5) # c_1 and c_2 are multiplied to the centered version of x1 and x2 to make p_1 the mean of p_s
    S = stats.binom.rvs(1, p_s, size=n)
    
    df = pd.DataFrame(dict(assignment = assignment, x_1 = x_1, x_2 = x_2, p_s = p_s, U = U, S = S, D = S * np.array(assignment == "treatment").astype('int')))
    df['r'] = np.where(U==1, r_high, r_low) + c_3*(x_2 - 0.5) + r_effect * df['D'].values
    df['Y'] = np.array([stats.binom.rvs(T, r, size=1) for r in df['r'].values]).ravel()
    
    return df

In [101]:
df = get_synthetic_data(n_control=25000, n_treatment=75000, rand_seed = 0)
df.describe()

Unnamed: 0,x_1,x_2,p_s,U,S,D,r,Y
count,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0,100000.0
mean,0.201654,0.497864,0.049976,0.20041,0.0503,0.038,0.061707,1.85063
std,0.210122,0.288784,0.017837,0.400309,0.218564,0.191197,0.036966,1.706988
min,3e-06,3e-06,0.015005,0.0,0.0,0.0,2e-06,0.0
25%,0.074316,0.246476,0.035702,0.0,0.0,0.0,0.031789,0.0
50%,0.147847,0.49786,0.04945,0.0,0.0,0.0,0.060994,2.0
75%,0.22088,0.747647,0.062495,0.0,0.0,0.0,0.086611,3.0
max,0.99996,0.999994,0.114855,1.0,1.0,1.0,0.199892,14.0


In [104]:
# ITT
print('Est. ITT: {}'.format(df.loc[df['assignment']=='treatment', 'Y'].mean() - df.loc[df['assignment']=='control', 'Y'].mean()))

Est. ITT: 0.0750266666666668


In [51]:
# True S.E.
n_resample = 1000
delta_y_bar_boot = dict(naive=np.zeros(n_resample), trig_dilu=np.zeros(n_resample), cuped2=np.zeros(n_resample), cuped1=np.zeros(n_resample))
np.random.seed(seed=0)
n = len(df)
for i in range(n_resample):
    df_samp = df.iloc[np.random.choice(n, size=n, replace=True), :]
    n_c, n_t = np.sum(df_samp['assignment']=='control'), np.sum(df_samp['assignment']=='treatment')
    delta_y_bar = df_samp.loc[df_samp['assignment']=='treatment', 'Y'].mean() - df_samp.loc[df_samp['assignment']=='control', 'Y'].mean()
    
    # naive
    delta_y_bar_boot['naive'][i] = delta_y_bar
    
    # trigger-dilute
    TR = np.mean(df_samp['S']==1)
    delta_y1_bar = df_samp.loc[(df_samp['assignment']=='treatment') & (df_samp['S']==1), 'Y'].mean() - df_samp.loc[(df_samp['assignment']=='control') & (df_samp['S']==1), 'Y'].mean()
    delta_y_bar_boot['trig_dilu'][i] = delta_y1_bar*TR
    
    # two-sided
    Y_t = df_samp.loc[(df_samp['assignment']=='treatment'),'Y'].values
    Y_c = df_samp.loc[(df_samp['assignment']=='control'),'Y'].values
    S_t = df_samp.loc[(df_samp['assignment']=='treatment'),'S'].values
    S_c = df_samp.loc[(df_samp['assignment']=='control'),'S'].values
    
    #********** calculate tau_0 and theta using bootstrap sample******************
    
    tau_0 = df_samp.loc[(df_samp['assignment']=='treatment') & (df_samp['S']==0),'Y'].mean() - df_samp.loc[(df_samp['assignment']=='control') & (df_samp['S']==0),'Y'].mean()
    # theta = (1/n_t**2*np.cov(Y_t,S_t)[0,1] + 1/n_c**2*np.cov(Y_c,S_c)[0,1]) / (1/n_t**2*np.var(S_t) + 1/n_c**2*np.var(S_c))
    theta = np.cov(Y_c,S_c)[0,1] / np.var(S_c)
    
    delta_y_bar_boot['cuped2'][i] = delta_y_bar - theta*tau_0
    
    
    
    # one-sided
    log_reg = sm.Logit(df_samp.loc[df_samp['assignment']=='treatment','D'].values, df_samp.loc[df_samp['assignment']=='treatment',['x_1','x_2']].values).fit(disp=0)
    w = log_reg.predict(df_samp.loc[df_samp['assignment']=='control',['x_1','x_2']].values)
    
    tau_0 = df_samp.loc[(df_samp['assignment']=='treatment') & (df_samp['D']==0),'Y'].mean() - np.sum((1-w)*df_samp.loc[df_samp['assignment']=='control','Y'].values)/np.sum((1-w))
    #theta = np.cov(Y_t,S_t)[0,1] / np.var(S_t)
    
    delta_y_bar_boot['cuped1'][i] = delta_y_bar - theta*tau_0      
    

est_mean = {k: np.round(np.mean(v),5) for k,v in delta_y_bar_boot.items()}
print(est_mean)
true_se = {k: np.round(np.std(v),5) for k,v in delta_y_bar_boot.items()}
print(true_se)

    # 简化用
    # tau_0 = 
    # theta = np.cov(Y_t-Y_c,S_t-S_c)[0,1] / np.var(S_t-S_c)
    # true_se_cuped = np.sqrt(true_se**2*(1-np.corrcoef(delta_y_bar, tau_0)[0,1]))

{'naive': 0.07553, 'trig_dilu': 0.07726, 'cuped2': 0.07636, 'cuped1': 0.11455}
{'naive': 0.01222, 'trig_dilu': 0.00326, 'cuped2': 0.00847, 'cuped1': 0.01025}


In [50]:
(1/n_t**2*np.cov(Y_t,S_t)[0,1] + 1/n_c**2*np.cov(Y_c,S_c)[0,1]) / (1/n_t**2*np.var(S_t) + 1/n_c**2*np.var(S_c))

0.49567329848805874

In [112]:
#naive
n_c, n_t = 25000, 75000
delta_y_bar = df.loc[df['assignment']=='treatment', 'Y'].mean() - df.loc[df['assignment']=='control', 'Y'].mean()
var_delta_y_bar = np.var(df.loc[df['assignment']=='treatment','Y'].values)/n_t + np.var(df.loc[df['assignment']=='control','Y'].values)/n_c
print('Naive method: Est. ITT = {:.5f}, Est. SE = {:.5f}'.format(delta_y_bar, np.sqrt(var_delta_y_bar)))

Naive method: Est. ITT = 0.07503, Est. SE = 0.01226


In [118]:
#trigger_dilute
n_c1, n_t1 = df.loc[(df['assignment']=='control'),'S'].sum(), df.loc[(df['assignment']=='treatment'),'S'].sum()
n = n_c+n_t
gamma = (n_t1+n_c1)/n  #triggering rate

delta_y1_bar = df.loc[(df['assignment']=='treatment') & (df['S']==1), 'Y'].mean() - df.loc[(df['assignment']=='control') & (df['S']==1), 'Y'].mean()
tau_d_hat = delta_y1_bar*gamma

var_tau_d_hat = ( np.var(df.loc[(df['assignment']=='treatment') & (df['S']==1) ,'Y'].values)/n_t1 + np.var(df.loc[(df['assignment']=='control') & (df['S']==1) ,'Y'].values)/n_c1 )*gamma**2 + gamma*(1-gamma)/n*(df.loc[(df['assignment']=='treatment') & (df['S']==1) ,'Y'].mean() - df.loc[(df['assignment']=='control') & (df['S']==1) ,'Y'].mean())**2
print('Tigger-dilute: Est. ITT = {:.5f}, Est. SE = {:.5f}'.format(tau_d_hat, np.sqrt(var_tau_d_hat)))

Tigger-dilute: Est. ITT = 0.07697, Est. SE = 0.00321


In [115]:
#CUPED two-sided trigger
n_resample = 1000
delta_y_bar_boot = np.zeros(n_resample)
delta_y0_bar_hat_boot = np.zeros(n_resample)

np.random.seed(seed=0)
for i in range(n_resample):    
    df_samp = df.iloc[np.random.choice(len(df),size=len(df),replace=True), :]
    delta_y0_bar_hat_boot[i] = df_samp.loc[(df_samp['assignment']=='treatment') & (df_samp['S']==0),'Y'].mean() - df_samp.loc[(df_samp['assignment']=='control') & (df_samp['S']==0),'Y'].mean()   
    delta_y_bar_boot[i] = df_samp.loc[df_samp['assignment']=='treatment','Y'].mean() - df_samp.loc[df_samp['assignment']=='control','Y'].mean()

theta = np.cov(delta_y_bar_boot, delta_y0_bar_hat_boot)[0,1]/np.var(delta_y0_bar_hat_boot)
delta_y0_bar_hat =  df.loc[(df['assignment']=='treatment') & (df['S']==0),'Y'].mean() - df.loc[(df['assignment']=='control') & (df['S']==0),'Y'].mean()
tau_trig2_hat = delta_y_bar - theta*delta_y0_bar_hat    
var_tau_trig2_hat =  np.var(delta_y_bar_boot) - np.cov(delta_y_bar_boot, delta_y0_bar_hat_boot)[0,1]**2/np.var(delta_y0_bar_hat_boot)

print('CUPED two-sided triggering: Est. ITT={:.5f}, Est. SE={:.5f}'.format(tau_trig2_hat, np.sqrt(var_tau_trig2_hat)))

CUPED two-sided trigger: Est. ITT=0.07803, Est. SE=0.00341


In [116]:
# CUPED one-sided trigger
n_resample = 1000
delta_y_bar_boot = np.zeros(n_resample)
tau_0_hat_boot = np.zeros(n_resample)

np.random.seed(seed=0)
for i in range(n_resample):
    df_samp = df.iloc[np.random.choice(len(df),size=len(df),replace=True), :]
    X_t = sm.add_constant(df_samp.loc[df_samp['assignment']=='treatment',['x_1','x_2']].values)
    X_c = sm.add_constant(df_samp.loc[df_samp['assignment']=='control',['x_1','x_2']].values)
    
    log_reg = sm.Logit(df_samp.loc[df_samp['assignment']=='treatment','D'].values, X_t).fit(disp=0)
    w = log_reg.predict(X_c)
    
    tau_0_hat_boot[i] = df_samp.loc[(df_samp['assignment']=='treatment') & (df_samp['D']==0),'Y'].mean() - np.sum((1-w)*df_samp.loc[df_samp['assignment']=='control','Y'].values)/np.sum((1-w))
    delta_y_bar_boot[i] = df_samp.loc[df_samp['assignment']=='treatment','Y'].mean() - df_samp.loc[df_samp['assignment']=='control','Y'].mean()
    
theta = np.cov(delta_y_bar_boot, tau_0_hat_boot)[0,1]/np.var(tau_0_hat_boot)

X_t = sm.add_constant(df.loc[df['assignment']=='treatment',['x_1','x_2']].values)
X_c = sm.add_constant(df.loc[df['assignment']=='control',['x_1','x_2']].values)
log_reg = sm.Logit(df.loc[df['assignment']=='treatment','D'].values, X_t).fit(disp=0)
w = log_reg.predict(X_c)
tau_0_hat = df.loc[(df['assignment']=='treatment') & (df['D']==0),'Y'].mean() - np.sum((1-w)*df.loc[df['assignment']=='control','Y'].values)/np.sum((1-w))
tau_trig1_hat = delta_y_bar - theta*tau_0_hat    
var_tau_trig1_hat =  np.var(delta_y_bar_boot) - np.cov(delta_y_bar_boot, tau_0_hat_boot)[0,1]**2/np.var(tau_0_hat_boot)

print('CUPED one-sided triggering: Est. ITT={:.5f}, Est. SE={:.5f}'.format(tau_trig1_hat, np.sqrt(var_tau_trig1_hat)))

CUPED one-sided triggering: Est. ITT=0.07803, Est. SE=0.00341
