### 0. 系统信息

In [1]:
import platform
import sys
# Record the host platform and interpreter version so the results below can be
# tied to a concrete environment. Only the first line of sys.version is shown.
print(f'system info: {platform.platform()}')
print('python version: ' + sys.version.partition('\n')[0])

system info: Darwin-18.6.0-x86_64-i386-64bit
python version: 3.7.4 (default, Aug 13 2019, 15:17:50) 


In [2]:
import numpy as np 
import pandas as pd 
import scipy
import statsmodels

# Record the versions of the numerical libraries this notebook was run with.
print(f'numpy version: {np.__version__}')
print(f'pandas version: {pd.__version__}')
print(f'scipy version: {scipy.__version__}')
print(f'statsmodels version: {statsmodels.__version__}')

numpy version: 1.18.1
pandas version: 1.0.1
scipy version: 1.4.1
statsmodels version: 0.11.0


### 1. 导入相关包

In [3]:
import numpy as np 
import pandas as pd 
from statsmodels.regression.linear_model import OLS, GLS 
import statsmodels.formula.api as smf
import statsmodels.api as sm

### 2. 导入数据
由于使用的数据比较少，所以直接在下面的代码中定义，最终将重复了6次的数据存放在 `df_data` 这个 DataFrame 中。`df_data` 的数据结构为144行3列，各列含义为：

| user_tag | gmv | exp |
|---|---|---|
| 用户标签（区组） | GMV | 实验组/对照组标签（treatment / control） |


`df_his_data` 存放的是利用对照组数据代替的用户历史信息，所以只有72行对照组数据。

In [4]:
# Raw experiment data as a 3 x 12 integer array, one column per user:
#   row 0 - user tag (block label): three tags, four users each
#   row 1 - GMV values used for the treatment arm
#   row 2 - GMV values used for the control arm
arr_data = np.array([
    np.repeat([0, 1, 2], 4),
    [53, 57, 55, 47, 53, 60, 47, 43, 53, 45, 49, 41],
    [51, 54, 53, 47, 52, 48, 50, 44, 48, 48, 44, 47],
])
print(pd.DataFrame(arr_data))

    0   1   2   3   4   5   6   7   8   9  10  11
0   0   0   0   0   1   1   1   1   2   2   2   2
1  53  57  55  47  53  60  47  43  53  45  49  41
2  51  54  53  47  52  48  50  44  48  48  44  47


In [5]:
# Reshape the raw array into long format: one row per user observation with
# the block label (user_tag), the outcome (gmv) and the experiment arm (exp).
# Row 0 of arr_data holds the user tags, row 1 the treatment GMV and row 2 the
# control GMV.
df_data_t = pd.DataFrame({'user_tag': arr_data[0], 'gmv': arr_data[1]}).assign(exp='treatment')
df_data_c = pd.DataFrame({'user_tag': arr_data[0], 'gmv': arr_data[2]}).assign(exp='control')

# ignore_index=True gives every row a unique label; without it the stacked
# frames keep their original 0..11 labels and the index is full of duplicates.
df_data_unit = pd.concat([df_data_t, df_data_c], ignore_index=True)
print(df_data_unit.shape)

# Number of times the base data set is replicated to enlarge the sample.
N_REPLICATES = 6
df_data = pd.concat([df_data_unit] * N_REPLICATES, ignore_index=True)

# Sanity check: replication must multiply the rows and keep the three columns.
assert df_data.shape == (len(df_data_unit) * N_REPLICATES, 3)
print(df_data.shape)

(24, 3)
(144, 3)


In [6]:
# Use the control-arm rows as a stand-in for historical data: keep only the
# outcome and the block label, and rename the outcome column to gmv_his.
df_his_data = (
    df_data
    .loc[df_data['exp'] == 'control', ['gmv', 'user_tag']]
    .rename(columns={'gmv': 'gmv_his'})
)

### 3. 实验设计中的历史数据验证

In [7]:
# One-way ANOVA of historical GMV on the user tag, to check whether the tag is
# a useful blocking factor (i.e. whether it explains variation in GMV).
model_block = smf.ols(formula='gmv_his ~ C(user_tag)', data=df_his_data)
results_block = model_block.fit()
df_anova = sm.stats.anova_lm(results_block, typ=1)

# The Residual row has no F statistic or p-value (NaN). na_rep renders those
# cells as '-' instead of 'nan' / 'nan%'. F keeps six decimals; the p-value is
# shown as a percentage.
format_dict = {'F': '{:.6f}'.format, 'PR(>F)': '{:,.2%}'.format}
df_anova.style.format(format_dict, na_rep='-')

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(user_tag),2.0,247.0,123.5,19.063758,0.00%
Residual,69.0,447.0,6.478261,,nan%


### 4. 利用ANOVA进行随机区组实验的数据分析

In [8]:
# Randomized block design: regress GMV on the experiment arm while including
# the user tag as a categorical blocking factor, then show the full OLS report.
model = smf.ols('gmv ~ C(exp) + C(user_tag)', df_data)
results = model.fit()
summary_table = results.summary()
print(summary_table)

                            OLS Regression Results                            
Dep. Variable:                    gmv   R-squared:                       0.244
Model:                            OLS   Adj. R-squared:                  0.228
Method:                 Least Squares   F-statistic:                     15.05
Date:                Sun, 09 Feb 2020   Prob (F-statistic):           1.53e-08
Time:                        17:13:07   Log-Likelihood:                -403.12
No. Observations:                 144   AIC:                             814.2
Df Residuals:                     140   BIC:                             826.1
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept              51.4167    

In [9]:
# Type I ANOVA table for the fitted model held in `results`.
df_anova = sm.stats.anova_lm(results, typ=1)

# The Residual row has no F statistic or p-value (NaN). na_rep renders those
# cells as '-' instead of 'nan' / 'nan%'. F keeps six decimals; the p-value is
# shown as a percentage.
format_dict = {'F': '{:.6f}'.format, 'PR(>F)': '{:,.2%}'.format}
df_anova.style.format(format_dict, na_rep='-')

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(exp),1.0,72.25,72.25,4.441273,3.69%
C(user_tag),2.0,662.0,331.0,20.346872,0.00%
Residual,140.0,2277.5,16.267857,,nan%


### 5. 如果不考虑区组，只是利用ANOVA的方法分析A/B实验

In [10]:
# Baseline for comparison: the same A/B analysis without the blocking factor,
# so the variation due to user_tag stays in the residual.
# NOTE: this rebinds `model` and `results`, replacing any earlier fit stored
# under those names.
model = smf.ols(formula='gmv ~ C(exp)', data=df_data)
results = model.fit()
df_anova = sm.stats.anova_lm(results, typ=1)

# The Residual row has no F statistic or p-value (NaN). na_rep renders those
# cells as '-' instead of 'nan' / 'nan%'. F keeps six decimals; the p-value is
# shown as a percentage.
format_dict = {'F': '{:.6f}'.format, 'PR(>F)': '{:,.2%}'.format}
df_anova.style.format(format_dict, na_rep='-')

Unnamed: 0,df,sum_sq,mean_sq,F,PR(>F)
C(exp),1.0,72.25,72.25,3.490219,6.38%
Residual,142.0,2939.5,20.700704,,nan%
