# RによるOVBの確認

**Rコード**

```R
# Load broom, which provides tidy() for turning lm fits into data frames.
# NOTE(review): enframe(), map(), mutate(), unnest() and %>% are assumed to be
# attached earlier (e.g. via the tidyverse) -- confirm against the full script.
library(broom)

# Model formulas used to check the omitted variable bias (OVB)
formula_vec <- c(
    spend ~ treatment + recency + channel, # model A (history omitted)
    spend ~ treatment + recency + channel + history, # model B (history included)
    history ~ treatment + channel + recency # model C (history as the outcome)
)

# Name the formulas reg_A, reg_B, reg_C
names(formula_vec) <- paste("reg", LETTERS[1:3], sep = "_")

# Turn the named formulas into a data frame (one row per model)
models <- formula_vec %>%
    enframe(name = "model_index", value = "formula")

# Fit every model on biased_data and tidy each fit in one pass
df_models <- models %>%
    mutate(model = map(.x = formula, .f = lm, data = biased_data)) %>%
    mutate(lm_result = map(.x = model, .f = tidy))

# Flatten the tidied results into one long table of coefficients
df_results <- df_models %>%
    mutate(formula = as.character(formula)) %>%
    select(formula, model_index, lm_result) %>%
    unnest(cols = c(lm_result))

# Extract the treatment coefficient of models A, B and C (in that row order)
treatment_coef <- df_results %>%
    filter(term == "treatment") %>%
    pull(estimate)

# Extract the history coefficient from model B
history_coef <- df_results %>%
    filter(
        model_index == "reg_B",
        term == "history"
    ) %>%
    pull(estimate)

# Check the OVB: the two quantities below are printed for comparison
OVB <- history_coef * treatment_coef[3]
coef_gap <- treatment_coef[1] - treatment_coef[2]

OVB # beta_2 * gamma_1
coef_gap # alpha_1 - beta_1
```

**Pythonコード**

In [1]:
# ライブラリーimport
import numpy as np
import pandas as pd
from scipy.stats import uniform
import statsmodels.formula.api as smf

In [2]:
# Build the dataset
df_email = pd.read_csv("http://www.minethatdata.com/Kevin_Hillstrom_MineThatData_E-MailAnalytics_DataMiningChallenge_2008.03.20.csv")

# Drop the women's e-mail segment and flag the men's e-mail recipients as treated
df_male = df_email.query("segment != 'Womens E-Mail'").copy()
df_male["treatment"] = np.where(df_male["segment"] == "Mens E-Mail", 1, 0)

obs_rate_c = 0.5
obs_rate_t = 0.5

# Rows matching any of these conditions get the reduced observation rate
is_downsampled = (
    (df_male["history"] > 300)
    | (df_male["recency"] < 6)
    | (df_male["channel"] == "Multichannel")
)

# NOTE(review): the control and treatment rates are assigned with the same
# condition and the same value, so the sampling below does not differ between
# the two groups -- confirm this is the intended way to bias the data.
df_biased_email = (
    df_male
    .assign(
        obs_rate_c=np.where(is_downsampled, obs_rate_c, 1),
        obs_rate_t=np.where(is_downsampled, obs_rate_t, 1),
        # fixed seed so the subsample is reproducible
        random_number=uniform.rvs(size=len(df_male), random_state=46),
    )
    .query("(treatment == 0 & random_number < obs_rate_c) | (treatment == 1 & random_number < obs_rate_t)")
)

In [3]:
# Formulas of the three regressions
model_list = [
    "spend ~ treatment + recency + channel",  # model A
    "spend ~ treatment + recency + channel + history",  # model B
    "history ~ treatment + channel + recency",  # model C
]
# Label each formula by its position in the list: reg_0, reg_1, reg_2
model_dict = dict(
    zip((f"reg_{idx}" for idx in range(len(model_list))), model_list)
)

# Fit each formula with OLS, then store the fitted results in a DataFrame
# that has one column per model label and a single row
fitted_models = [
    smf.ols(formula=formula, data=df_biased_email).fit()
    for formula in model_dict.values()
]
df_model = pd.DataFrame(data=fitted_models, index=list(model_dict)).T

In [4]:
# Collect the treatment coefficient of models A, B and C (column order of df_model)
treatment_coef = [
    df_model[label].iloc[0].params["treatment"] for label in df_model.columns
]

# Take the history coefficient from model B (stored under "reg_1")
model_b = df_model["reg_1"].iloc[0]
history_coef = model_b.params["history"]

In [5]:
# Check the OVB: history coefficient of model B times treatment coefficient of model C
OVB = history_coef * treatment_coef[2]
# Gap between the treatment coefficients of model A and model B
coef_gap = treatment_coef[0] - treatment_coef[1]

# Print both values, one per line, for comparison
print(OVB, coef_gap, sep="\n")

0.0015424165543142167
0.0015424165543151958


# 変数の選び方とモデルの評価

**Rコード**

```R
# Try a variable that should not be included: regress treatment on visit
# together with the other covariates, and tidy the fitted coefficients
cor_visit_treatment <- tidy(
    lm(
        treatment ~ visit + channel + recency + history,
        data = biased_data
    )
)
```

In [6]:
# Regress treatment on visit plus the other covariates
visit_treatment_model = smf.ols(
    formula="treatment ~ visit + channel + recency + history",
    data=df_biased_email,
)
cor_visit_treatment = visit_treatment_model.fit()

# Show the full regression table
print(cor_visit_treatment.summary())

                            OLS Regression Results                            
Dep. Variable:              treatment   R-squared:                       0.013
Model:                            OLS   Adj. R-squared:                  0.013
Method:                 Least Squares   F-statistic:                     76.64
Date:                Tue, 04 Oct 2022   Prob (F-statistic):           4.17e-80
Time:                        02:57:32   Log-Likelihood:                -21304.
No. Observations:               29616   AIC:                         4.262e+04
Df Residuals:                   29610   BIC:                         4.267e+04
Df Model:                           5                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept            0.4701      0.014  

共変量の影響を取り除いた状態での相関として0.1656という値が得られ，これは統計的に有意である．  
これらのことからOVBの値は大きくなると考えられるので，回帰モデルに $visit_i$ を追加したくなる．  
実際にサイト来訪を回帰モデルに入れてみる．

**Rコード**
```R
# Run the regression with visit added as a covariate and tidy the coefficients
bad_control_reg <- tidy(
    lm(
        spend ~ treatment + channel + recency + history + visit,
        data = biased_data
    )
)
```

In [7]:
# Regress spend on treatment and the covariates, with visit added
bad_control_model = smf.ols(
    formula="spend ~ treatment + channel + recency + history + visit",
    data=df_biased_email,
)
bad_control_reg = bad_control_model.fit()

# Show the full regression table
print(bad_control_reg.summary())

                            OLS Regression Results                            
Dep. Variable:                  spend   R-squared:                       0.030
Model:                            OLS   Adj. R-squared:                  0.030
Method:                 Least Squares   F-statistic:                     151.8
Date:                Tue, 04 Oct 2022   Prob (F-statistic):          1.79e-190
Time:                        02:57:32   Log-Likelihood:            -1.2173e+05
No. Observations:               29616   AIC:                         2.435e+05
Df Residuals:                   29609   BIC:                         2.435e+05
Df Model:                           6                                         
Covariance Type:            nonrobust                                         
                       coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------------
Intercept           -0.6590      0.414  