In [1]:
# Install tidyverse (which bundles dplyr, used for data manipulation) and
# broom, but only when they are not already available, so that re-running the
# notebook does not reinstall both packages on every execution.
for (pkg in c("tidyverse", "broom")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg)
  }
}

# Attach the libraries
library("tidyverse")

# broom::tidy() returns the coefficient table of a fitted regression
# as a data frame
library("broom")

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

Installing package into ‘/usr/local/lib/R/site-library’
(as ‘lib’ is unspecified)

── [1mAttaching core tidyverse packages[22m ──────────────────────── tidyverse 2.0.0 ──
[32m✔[39m [34mdplyr    [39m 1.1.2     [32m✔[39m [34mreadr    [39m 2.1.4
[32m✔[39m [34mforcats  [39m 1.0.0     [32m✔[39m [34mstringr  [39m 1.5.0
[32m✔[39m [34mggplot2  [39m 3.4.2     [32m✔[39m [34mtibble   [39m 3.2.1
[32m✔[39m [34mlubridate[39m 1.9.2     [32m✔[39m [34mtidyr    [39m 1.3.0
[32m✔[39m [34mpurrr    [39m 1.0.2     
── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m    masks [34mstats[39m::lag()
[36mℹ[39m Use the conflicted package ([3m[34m<http://conflicted.r-lib.org/>[39m[23m) to force all conflicts to become errors


In [2]:
# Read the Kevin Hillstrom MineThatData e-mail analytics CSV straight from
# its URL and keep the parsed table as `email_data`.
email_data <- read_csv(
  file = "http://www.minethatdata.com/Kevin_Hillstrom_MineThatData_E-MailAnalytics_DataMiningChallenge_2008.03.20.csv"
)

[1mRows: [22m[34m64000[39m [1mColumns: [22m[34m12[39m
[36m──[39m [1mColumn specification[22m [36m────────────────────────────────────────────────────────[39m
[1mDelimiter:[22m ","
[31mchr[39m (4): history_segment, zip_code, channel, segment
[32mdbl[39m (8): recency, history, mens, womens, newbie, visit, conversion, spend

[36mℹ[39m Use `spec()` to retrieve the full column specification for this data.
[36mℹ[39m Specify the column types or set `show_col_types = FALSE` to quiet this message.


In [3]:
# Check the data type; typeof() reports the object's low-level storage type
# (not its class)
typeof(email_data)

# Show the number of rows and columns
c(nrow(email_data), ncol(email_data))

In [4]:
# Preview only the first rows instead of rendering the whole table
# (read_csv reported 64000 rows for this file).
head(email_data, n = 10)

recency,history_segment,history,mens,womens,zip_code,newbie,channel,segment,visit,conversion,spend
<dbl>,<chr>,<dbl>,<dbl>,<dbl>,<chr>,<dbl>,<chr>,<chr>,<dbl>,<dbl>,<dbl>
10,2) $100 - $200,142.44,1,0,Surburban,0,Phone,Womens E-Mail,0,0,0
6,3) $200 - $350,329.08,1,1,Rural,1,Web,No E-Mail,0,0,0
7,2) $100 - $200,180.65,0,1,Surburban,1,Web,Womens E-Mail,0,0,0
9,5) $500 - $750,675.83,1,0,Rural,1,Web,Mens E-Mail,0,0,0
2,1) $0 - $100,45.34,1,0,Urban,0,Web,Womens E-Mail,0,0,0
6,2) $100 - $200,134.83,0,1,Surburban,0,Phone,Womens E-Mail,1,0,0
9,3) $200 - $350,280.20,1,0,Surburban,1,Phone,Womens E-Mail,0,0,0
9,1) $0 - $100,46.42,0,1,Urban,0,Phone,Womens E-Mail,0,0,0
9,5) $500 - $750,675.07,1,1,Rural,1,Phone,Mens E-Mail,0,0,0
10,1) $0 - $100,32.84,0,1,Urban,1,Web,Womens E-Mail,0,0,0


In [5]:
# Drop the rows that received the women's e-mail, then add a `treatment`
# column: 1 when the men's e-mail was sent, 0 otherwise.
male_df <- email_data %>%
  filter(!(segment == "Womens E-Mail")) %>%
  mutate(treatment = as.numeric(segment == "Mens E-Mail"))

# Rows and columns remaining after the filter
dim(male_df)

In [6]:
# (5) Build a data set with selection bias
## Fix the seed so the random thinning below is reproducible
set.seed(1)

## Keep only half of the rows that fall under the thinning rule
obs_rate_c <- 0.5
obs_rate_t <- 0.5

## Create the biased data
### A row is flagged when last year's purchase amount is above 300, OR the
### recency value is below 6, OR the contact channel is "Multichannel".
###   - control rows (treatment == 0): flagged rows are kept with probability
###     obs_rate_c, unflagged rows are always kept
###   - treated rows (treatment == 1): flagged rows are always kept, unflagged
###     rows are kept with probability obs_rate_t
### The treated group therefore over-represents flagged customers
### (presumably those with higher purchase intent).
biased_data <- male_df %>%
  mutate(
    # Temporary flag so the rule is written only once; dropped at the end.
    likely_buyer = (history > 300) | (recency < 6) |
      (channel == "Multichannel"),
    # `.env$` reads the global rates explicitly, because the new columns
    # deliberately reuse the same names.
    obs_rate_c = ifelse(likely_buyer, .env$obs_rate_c, 1),
    obs_rate_t = ifelse(likely_buyer, 1, .env$obs_rate_t),
    # One uniform draw per row of the frame being mutated
    random_number = runif(n = n())
  ) %>%
  filter( (treatment == 0 & random_number < obs_rate_c) |
            (treatment == 1 & random_number < obs_rate_t) ) %>%
  select(-likely_buyer)

In [7]:
## Simple regression of spend on the treatment indicator, using the RCT data
rct_reg <- lm(spend ~ treatment, data = male_df)

In [8]:
# Full regression report for the RCT model
summary(object = rct_reg)


Call:
lm(formula = spend ~ treatment, data = male_df)

Residuals:
   Min     1Q Median     3Q    Max 
 -1.42  -1.42  -0.65  -0.65 498.35 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   0.6528     0.1027   6.356 2.09e-10 ***
treatment     0.7698     0.1452   5.300 1.16e-07 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 14.99 on 42611 degrees of freedom
Multiple R-squared:  0.0006588,	Adjusted R-squared:  0.0006354 
F-statistic: 28.09 on 1 and 42611 DF,  p-value: 1.163e-07


In [9]:
# Coefficient table of the RCT regression as a data frame
rct_reg_coef <- rct_reg %>% tidy()
rct_reg_coef

term,estimate,std.error,statistic,p.value
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
(Intercept),0.6527894,0.102707,6.355841,2.093808e-10
treatment,0.7698272,0.1452479,5.30009,1.163201e-07


In [10]:
## Simple regression of spend on the treatment indicator, using the biased data
nonrct_reg <- lm(spend ~ treatment, data = biased_data)

In [11]:
## The RCT estimate is closer than this biased-data estimate to the result
## of the difference-in-means analysis (0.767, a figure from an analysis
## not shown in this part of the notebook).
nonrct_reg_coefficient <- nonrct_reg %>% tidy()
nonrct_reg_coefficient

term,estimate,std.error,statistic,p.value
<chr>,<dbl>,<dbl>,<dbl>,<dbl>
(Intercept),0.5483062,0.126891,4.321081,1.557365e-05
treatment,0.9794465,0.172717,5.670817,1.433467e-08


In [12]:
# Check the significance-test results on the RCT data.
summary(object = rct_reg)


Call:
lm(formula = spend ~ treatment, data = male_df)

Residuals:
   Min     1Q Median     3Q    Max 
 -1.42  -1.42  -0.65  -0.65 498.35 

Coefficients:
            Estimate Std. Error t value Pr(>|t|)    
(Intercept)   0.6528     0.1027   6.356 2.09e-10 ***
treatment     0.7698     0.1452   5.300 1.16e-07 ***
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 14.99 on 42611 degrees of freedom
Multiple R-squared:  0.0006588,	Adjusted R-squared:  0.0006354 
F-statistic: 28.09 on 1 and 42611 DF,  p-value: 1.163e-07


In [13]:
# Number of rows (observations) and columns in the RCT data
c(nrow(male_df), ncol(male_df))

In [14]:
# treatmentの偏回帰係数: 0.7698で、そのt値は5.300、またこの回帰分析の自由度はn-(k+1)=n-2=42613-2=42611
# 推定値の標準誤差