# 2.1.5 Rによるメールマーケティングデータの分析 (回帰編)

In [1]:
import random

import numpy as np
import pandas as pd

## データの準備

In [2]:
data_biased = pd.read_csv("data_biased_cibook.csv")
data_biased.shape

(31961, 13)

In [3]:
data_biased.head(3)

Unnamed: 0,recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend,treatment
0,3,1) $0 - $100,58.13,1,0,Urban,1,Web,No E-Mail,1,0,0.0,0
1,9,1) $0 - $100,29.99,0,1,Surburban,1,Phone,No E-Mail,0,0,0.0,0
2,2,1) $0 - $100,29.99,0,1,Urban,1,Phone,No E-Mail,0,0,0.0,0


## 回帰分析
- treatmentの推定結果は 0.8100 であり、その検定におけるp値も 0.000 < 0.05 と非常に小さい値のため、帰無仮説 (メール送信の効果はない) を棄却できる
- 従って、この値はメールを送信することで売上が平均 0.8 ほど増加するという解釈が可能

In [4]:
import statsmodels.formula.api as smf

  from pandas import Int64Index as NumericIndex


In [9]:
biased_reg = smf.ols(
    data=data_biased,
    formula="spend ~ treatment + history"
).fit()

In [10]:
biased_reg.summary()

0,1,2,3
Dep. Variable:,spend,R-squared:,0.001
Model:,OLS,Adj. R-squared:,0.001
Method:,Least Squares,F-statistic:,21.54
Date:,"Sat, 05 Feb 2022",Prob (F-statistic):,4.46e-10
Time:,17:18:14,Log-Likelihood:,-133150.0
No. Observations:,31961,AIC:,266300.0
Df Residuals:,31958,BIC:,266300.0
Df Model:,2,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.3798,0.139,2.735,0.006,0.108,0.652
treatment,0.8100,0.178,4.554,0.000,0.461,1.159
history,0.0013,0.000,3.771,0.000,0.001,0.002

0,1,2,3
Omnibus:,70697.854,Durbin-Watson:,1.982
Prob(Omnibus):,0.0,Jarque-Bera (JB):,348412425.135
Skew:,20.709,Prob(JB):,0.0
Kurtosis:,512.816,Cond. No.,804.0


In [14]:
# Coefficients に限定して出力
biased_reg.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.3798,0.139,2.735,0.006,0.108,0.652
treatment,0.8100,0.178,4.554,0.000,0.461,1.159
history,0.0013,0.000,3.771,0.000,0.001,0.002


# 2.2.1 共変量の追加による効果への作用

## RCTデータ
- RCTを行っているデータでは treatment の値は 0.7698 となり、介入群とコントロール群での購入額の平均の差と同様の結果となる

In [15]:
data = pd.read_csv("data_cibook.csv")
data.shape

(42613, 13)

In [16]:
reg = smf.ols(
    data=data,
    formula="spend ~ treatment"
).fit()

In [17]:
reg.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.6528,0.103,6.356,0.000,0.451,0.854
treatment,0.7698,0.145,5.300,0.000,0.485,1.055


## バイアスデータ
- バイアスを加えたデータでは treatment の値は 0.9381 となり、セレクションバイアスによって効果が過剰に推定されていると考えられる

In [18]:
reg_biased = smf.ols(
    data=data_biased,
    formula="spend ~ treatment"
).fit()

In [19]:
reg_biased.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.6334,0.122,5.212,0.000,0.395,0.872
treatment,0.9381,0.175,5.372,0.000,0.596,1.280


## バイアスデータに、共変量を加える
- バイアスデータに共変量を加えた結果 treatment の値は 0.7711 となり、RCTデータにおける結果に近づいた

In [22]:
reg_biased2 = smf.ols(
    data=data_biased,
    formula="spend ~ treatment + recency + channel + history"
).fit()

In [23]:
reg_biased2.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.4538,0.377,1.205,0.228,-0.285,1.192
channel[T.Phone],0.1863,0.309,0.604,0.546,-0.418,0.791
channel[T.Web],0.3783,0.308,1.229,0.219,-0.225,0.982
treatment,0.7711,0.180,4.284,0.000,0.418,1.124
recency,-0.0523,0.026,-2.017,0.044,-0.103,-0.001
history,0.0013,0.000,3.373,0.001,0.001,0.002


# 2.2.3 Rによる脱落変数バイアス (OVB) の確認

In [24]:
reg_a = smf.ols(
    data=data_biased,
    formula="spend ~ treatment + recency + channel"
).fit()

reg_b = smf.ols(
    data=data_biased,
    formula="spend ~ treatment + recency + channel + history"
).fit()

reg_c = smf.ols(
    data=data_biased,
    formula="history ~ treatment + recency + channel"
).fit()

In [25]:
reg_a.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,1.1654,0.312,3.733,0.000,0.554,1.777
channel[T.Phone],-0.1935,0.287,-0.674,0.500,-0.757,0.369
channel[T.Web],0.0004,0.287,0.001,0.999,-0.562,0.563
treatment,0.8425,0.179,4.712,0.000,0.492,1.193
recency,-0.0697,0.025,-2.743,0.006,-0.120,-0.020


In [26]:
reg_b.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.4538,0.377,1.205,0.228,-0.285,1.192
channel[T.Phone],0.1863,0.309,0.604,0.546,-0.418,0.791
channel[T.Web],0.3783,0.308,1.229,0.219,-0.225,0.982
treatment,0.7711,0.180,4.284,0.000,0.418,1.124
recency,-0.0523,0.026,-2.017,0.044,-0.103,-0.001
history,0.0013,0.000,3.373,0.001,0.001,0.002


In [27]:
reg_c.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,551.6269,4.565,120.843,0.000,542.680,560.574
channel[T.Phone],-294.4489,4.200,-70.102,0.000,-302.682,-286.216
channel[T.Web],-292.9166,4.194,-69.845,0.000,-301.137,-284.696
treatment,55.3422,2.614,21.169,0.000,50.218,60.466
recency,-13.4940,0.372,-36.314,0.000,-14.222,-12.766


In [31]:
gamma_1 = 55.3422  # モデルCの treatment の coeffficient
beta_1 = 0.7711  # モデルBの treatment の coeffficient
beta_4 = 0.0013  # モデルBの history の coeffficient
alpha_1 = 0.8425  # モデルAの treatment の coeffficient

In [32]:
ovb = gamma_1 * beta_4
ovb

0.07194486

In [34]:
# OVBを含まない場合 (モデルB) と含む場合 (モデルA) の treatment の coeffficient の差
coef_gap = alpha_1 - beta_1
coef_gap

0.07140000000000002

- 上記より、推定される効果の差がOVBの式の結果と一致することがわかった
- つまり、共変量を追加することで推定される効果の値に変化が生じるのは、共変量を追加したことによってOVBが消失したことに由来していることがわかる

# 2.2.7 Post treatment bias

treatmentに対して、共変量(channel, recency, history)の影響を取り除いた状態での相関が 0.1588 という値が有意な結果として得られる

In [5]:
reg = smf.ols(
    data=data_biased,
    formula="treatment ~  visit + channel + recency + history"
).fit()

In [6]:
reg.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,0.6074,0.011,54.413,0.000,0.586,0.629
channel[T.Phone],-0.1069,0.010,-11.240,0.000,-0.125,-0.088
channel[T.Web],-0.1105,0.009,-11.644,0.000,-0.129,-0.092
visit,0.1588,0.008,20.615,0.000,0.144,0.174
recency,-0.0187,0.001,-23.554,0.000,-0.020,-0.017
history,0.0002,1.17e-05,20.039,0.000,0.000,0.000


回帰モデルに visit を追加すると、 メール配信の効果は 0.1585 へと大きく低下し、実験結果と乖離する

In [7]:
reg = smf.ols(
    data=data_biased,
    formula="spend ~  treatment + channel + recency + history + visit"
).fit()
reg.summary().tables[1]

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-0.4448,0.373,-1.193,0.233,-1.176,0.286
channel[T.Phone],0.2759,0.304,0.907,0.365,-0.321,0.872
channel[T.Web],0.2476,0.304,0.815,0.415,-0.348,0.843
treatment,0.1585,0.179,0.887,0.375,-0.192,0.509
recency,-0.0065,0.026,-0.252,0.801,-0.057,0.044
history,0.0008,0.000,2.032,0.042,2.71e-05,0.002
visit,7.4120,0.248,29.923,0.000,6.927,7.898
