In [1]:
from scipy.stats import norm
import pandas as pd
import numpy as np

## Объявим данные

Объявим дф с 3 стратами, где у всех отличается дисперсия и средние

In [2]:
np.random.seed(1123)
treatment_effect = 1

# Объявим дф с 3 стратами, где у всех отличается дисперсия и средние
def gen_data(treatment_effect = 0):

    stratum_1 = pd.DataFrame({"group": "stratum_1", "val": norm.rvs(size=12000, loc=15 + treatment_effect, scale=2)})
    stratum_2 = pd.DataFrame({"group": "stratum_2", "val": norm.rvs(size=6000, loc=20 + treatment_effect, scale=2.5)})
    stratum_3 = pd.DataFrame({"group": "stratum_3", "val": norm.rvs(size=2000, loc=30 + treatment_effect, scale=3)})

    return pd.concat([stratum_1, stratum_2, stratum_3])

df_control = gen_data()
df_control["variant"] = "Control"
df_control["indx"] = df_control.index

df_treatment = gen_data(treatment_effect)
df_treatment["variant"] = "Treatment"
df_treatment["indx"] = df_treatment.index

df_combined = pd.concat([df_control, df_treatment])


## Считаем средние и дисперсию по двум кейсам: средневзвешенно по стратам и без их учета

без страт для сравнения

In [3]:
normal_te = pd.DataFrame({
    "effect_estimate": np.mean(df_treatment.val - df_control.val),
    "effect_estimate_se": np.sqrt(np.var(df_treatment.val) / len(df_treatment.val) + np.var(df_control.val) / len(df_control.val)),
    "n": len(df_treatment.val) + len(df_control.val)
}, index=[0])
print(normal_te)

   effect_estimate  effect_estimate_se      n
0          0.99308            0.051115  40000


In [4]:
def get_effect_estimate_se(treatment, control):
    return np.sqrt(np.var(treatment) / len(treatment) + np.var(control) / len(control))

def get_effect_estimate(treatment, control):
    return np.mean(treatment - control)

по стратам

In [5]:
groups = {}
for k, group in df_combined.groupby(by="group"):
    cur_df = pd.DataFrame()
    for g, variant in group.groupby(by="variant"):
        cur_df = pd.concat([cur_df, variant])
    groups[k] = cur_df

In [6]:
effect_estimate_se = []
effect_estimate = []
n = []
stratums = []

for key, df in groups.items():
    control = df[df.variant=="Control"]
    treatment = df[df.variant=="Treatment"]
    effect_estimate_se.append(get_effect_estimate_se(treatment.val, control.val))
    effect_estimate.append(get_effect_estimate(treatment.val, control.val))
    n.append(len(df))
    stratums.append(key)
    
strat_te = pd.DataFrame({
    "group": stratums,
    "effect_estimate_se": effect_estimate_se,
    "effect_estimate": effect_estimate,
    "n": n
})

In [7]:
strat_te

Unnamed: 0,group,effect_estimate_se,effect_estimate,n
0,stratum_1,0.025724,1.01448,24000
1,stratum_2,0.045432,0.944277,12000
2,stratum_3,0.092958,1.011093,4000


Взвешенная оценка

In [8]:
strat_te["effect_estimate_se"] = strat_te["effect_estimate_se"]*strat_te["n"]/np.sum(strat_te["n"])
strat_te["effect_estimate"] = strat_te["effect_estimate"]*strat_te["n"]/np.sum(strat_te["n"])

In [9]:
strat_te = strat_te[["effect_estimate_se","effect_estimate","n"]].apply("sum")
print(strat_te)

effect_estimate_se        0.03836
effect_estimate           0.99308
n                     40000.00000
dtype: float64


## Результаты

In [10]:
res = (1-(strat_te.effect_estimate_se/normal_te.effect_estimate_se))*100 
print(f"Сокращение дисперсии на {res[0]}%")

Сокращение дисперсии на 24.954630073771632%
