# Rによるメールマーケティングの効果の検証

## RCTされているデータの場合

In [1]:
# パッケージのインストール
install.packages("tidyverse")


The downloaded binary packages are in
	/var/folders/rr/fb4bmjkj789czq2w91y81l480000gn/T//RtmpZOzec6/downloaded_packages


In [2]:
library("tidyverse")

─ [1mAttaching packages[22m ──────────────────── tidyverse 1.3.1 ─

[32m✔[39m [34mggplot2[39m 3.3.3     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.1.0     [32m✔[39m [34mdplyr  [39m 1.0.5
[32m✔[39m [34mtidyr  [39m 1.1.3     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.4.0     [32m✔[39m [34mforcats[39m 0.5.1

─ [1mConflicts[22m ───────────────────── tidyverse_conflicts() ─
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()



In [3]:
email_data <- read_csv("http://www.minethatdata.com/Kevin_Hillstrom_MineThatData_E-MailAnalytics_DataMiningChallenge_2008.03.20.csv")


[36m─[39m [1m[1mColumn specification[1m[22m [36m────────────────────────────[39m
cols(
  recency = [32mcol_double()[39m,
  history_segment = [31mcol_character()[39m,
  history = [32mcol_double()[39m,
  mens = [32mcol_double()[39m,
  womens = [32mcol_double()[39m,
  zip_code = [31mcol_character()[39m,
  newbie = [32mcol_double()[39m,
  channel = [31mcol_character()[39m,
  segment = [31mcol_character()[39m,
  visit = [32mcol_double()[39m,
  conversion = [32mcol_double()[39m,
  spend = [32mcol_double()[39m
)




In [4]:
male_df <- email_data %>% filter(segment != "Womens E-Mail") %>%
mutate(treatment = if_else(segment == "Mens E-Mail", 1, 0))

In [5]:
male_df %>% head(5)

recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend,treatment
<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>
6,3) $200 - $350,329.08,1,1,Rural,1,Web,No E-Mail,0,0,0,0
9,5) $500 - $750,675.83,1,0,Rural,1,Web,Mens E-Mail,0,0,0,1
9,5) $500 - $750,675.07,1,1,Rural,1,Phone,Mens E-Mail,0,0,0,1
2,2) $100 - $200,101.64,0,1,Urban,0,Web,Mens E-Mail,1,0,0,1
4,3) $200 - $350,241.42,0,1,Rural,1,Multichannel,No E-Mail,0,0,0,0


In [6]:
summary_by_segment <- male_df %>%
group_by(treatment) %>%
summarise(
    conversion_rate = mean(conversion),
    spend_mean = mean(spend),
    count = n()
         )

In [7]:
summary_by_segment

treatment,conversion_rate,spend_mean,count
<dbl>,<dbl>,<dbl>,<int>
0,0.005726087,0.6527894,21306
1,0.012531093,1.4226165,21307


In [8]:
mens_mail <- male_df %>%
    filter(treatment == 1) %>%
    pull(spend)

no_mail <- male_df %>%
    filter(treatment == 0) %>%
    pull(spend)

In [9]:
# 平均の差を検定
rct_ttest <- t.test(mens_mail, no_mail, var.equal = TRUE)

In [10]:
rct_ttest


	Two Sample t-test

data:  mens_mail and no_mail
t = 5.3001, df = 42611, p-value = 1.163e-07
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 0.4851384 1.0545160
sample estimates:
mean of x mean of y 
1.4226165 0.6527894 


しっかりとRCTされている場合はP値も小さいので統計的に有意な差があると言える。

## バイアスのあるデータの効果検証

In [11]:
set.seed(1)

In [25]:
obs_rate_c <- 0.5
obs_rate_t <- 0.5
biased_data <- male_df %>% 
    mutate(
        obs_rate_c = if_else(
            (history > 300) | (recency < 6) | (channel == "Multichannel"),
            obs_rate_c, 1
        ),
        obs_rate_t = if_else(
            (history > 300) | (recency < 6) | (channel == "Multichannel"),
            1, obs_rate_t
        ),
        random_number = runif(n = NROW(male_df))
    ) %>%
    filter(
        (treatment == 0 & random_number < obs_rate_c) | (treatment == 1 & random_number < obs_rate_t)
          )

In [26]:
biased_data %>% head(5)

recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend,treatment,obs_rate_c,obs_rate_t,random_number
<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>,<dbl>
6,3) $200 - $350,329.08,1,1,Rural,1,Web,No E-Mail,0,0,0,0,0.5,1,0.3974271
9,5) $500 - $750,675.83,1,0,Rural,1,Web,Mens E-Mail,0,0,0,1,0.5,1,0.817395
9,5) $500 - $750,675.07,1,1,Rural,1,Phone,Mens E-Mail,0,0,0,1,0.5,1,0.7874897
2,2) $100 - $200,101.64,0,1,Urban,0,Web,Mens E-Mail,1,0,0,1,0.5,1,0.6273674
5,1) $0 - $100,29.99,1,0,Surburban,0,Phone,Mens E-Mail,0,0,0,1,0.5,1,0.3254771


In [27]:
summary_by_segment_biased <- biased_data %>%
    group_by(treatment) %>% 
    summarise(conversion_rate = mean(conversion),
              spend_mean = mean(spend),
              count = n()
             )
summary_by_segment_biased

treatment,conversion_rate,spend_mean,count
<dbl>,<dbl>,<dbl>,<int>
0,0.005084056,0.5893892,14752
1,0.013014298,1.4850913,17135


In [28]:
mens_mail_biased <- biased_data %>%
    filter(treatment == 1) %>%
    pull(spend)

no_mail_biased <- biased_data %>%
    filter(treatment == 0) %>%
    pull(spend)

rct_ttest_biased <- t.test(mens_mail_biased, no_mail_biased, var.equal = T)
rct_ttest_biased


	Two Sample t-test

data:  mens_mail_biased and no_mail_biased
t = 5.2814, df = 31885, p-value = 1.29e-07
alternative hypothesis: true difference in means is not equal to 0
95 percent confidence interval:
 0.5632905 1.2281137
sample estimates:
mean of x mean of y 
1.4850913 0.5893892 


## 回帰分析の導入
$$
Spend_i = \beta_0 + \beta_{treatment}treatment_{i} + \beta_{history}history_{i}
$$

In [29]:
biased_reg <- lm(data = biased_data, formula = spend ~ treatment + history)

In [30]:
summary(biased_reg)


Call:
lm(formula = spend ~ treatment + history, data = biased_data)

Residuals:
   Min     1Q Median     3Q    Max 
 -5.39  -1.41  -1.16  -0.50 497.81 

Coefficients:
             Estimate Std. Error t value Pr(>|t|)    
(Intercept) 0.3168726  0.1416113   2.238   0.0253 *  
treatment   0.8015271  0.1711683   4.683 2.84e-06 ***
history     0.0013288  0.0003309   4.016 5.94e-05 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 15.1 on 31884 degrees of freedom
Multiple R-squared:  0.001379,	Adjusted R-squared:  0.001316 
F-statistic: 22.02 on 2 and 31884 DF,  p-value: 2.786e-10


In [31]:
# Coefficients以外の情報を気にしないためtidyを利用する
library("broom")

In [32]:
biased_reg_coef <- tidy(biased_reg)

In [33]:
biased_reg_coef

term,estimate,std.error,statistic,p.value
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
(Intercept),0.316872573,0.1416113104,2.237622,0.0252526
treatment,0.80152714,0.1711683204,4.682684,2.84308e-06
history,0.001328774,0.0003308897,4.015762,5.938966e-05


効果検証のための回帰分析は$\beta_{treatment}$以外の推定結果には興味はないため、介入効果を示すパラメータ以外は無視することになる。

## 回帰分析におけるバイアス

In [34]:
# RCTデータでの単回帰
rct_reg <- lm(data = male_df, formula = spend ~ treatment)
rct_reg_coef <- summary(rct_reg) %>% tidy()

# バイアスのあるデータでの単回帰
nonrct_reg <- lm(data = biased_data, formula = spend ~ treatment)
nonrct_reg_coef <- summary(nonrct_reg) %>% tidy()

In [35]:
rct_reg_coef

term,estimate,std.error,statistic,p.value
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
(Intercept),0.6527894,0.102707,6.355841,2.093808e-10
treatment,0.7698272,0.1452479,5.30009,1.163201e-07


In [36]:
nonrct_reg_coef

term,estimate,std.error,statistic,p.value
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
(Intercept),0.5893892,0.1243217,4.740841,2.137509e-06
treatment,0.8957021,0.1695944,5.281436,1.290146e-07


共変量がない場合、バイアスが加味されており高く評価されてしまった。
次に共変量Xをモデルに加えてみる。以下の式にしてみるとどうなるか。
$$
Spend_i = \beta_0 + \beta_{treatment}treatment_i + \beta_{recency}recency_i + \beta_{channel}channel_i + \beta_{history}history_i + u_i
$$

In [37]:
# バイアスのあるデータでの単回帰
nonrct_mreg <- lm(data = biased_data, formula = spend ~ treatment + recency + channel + history)
nonrct_mreg_coef <- summary(nonrct_mreg) %>% tidy()
nonrct_mreg_coef

term,estimate,std.error,statistic,p.value
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
(Intercept),0.470266673,0.3702582964,1.27010435,0.2040567
treatment,0.738858871,0.1751833668,4.21763141,2.475628e-05
recency,-0.046354508,0.0254244602,-1.82322486,0.0682787
channelPhone,0.028505964,0.297634668,0.09577501,0.9236999
channelWeb,0.329227001,0.2969198519,1.10880764,0.2675215
history,0.001283275,0.0003688548,3.47907774,0.0005038141


treatmentの数字が小さくなったため、共変量によりバイアスの影響が少なくなった。

「セレクションバイアスの影響をより小さくするために、どのような共変量をモデルに追加すべきか」
=> 「目的変数Yと介入変数Zに対して相関のある変数を加えるべき」