# German Wikipedia Power Analysis

Using the registration, activation, and retention statistics gathered for 2020–21, we do a simplified power analysis for German Wikipedia. We use the `pwr` package in R for these estimates.

Note that our statistics include about three months during which some proportion of users got some variation of the Growth Features, because German Wikipedia has had the features since August/September 2021. Since we're using 12-month averages, I think the estimates are still reasonable.

In [5]:
library(tidyverse)
library(pwr)
library(data.table)

── [1mAttaching packages[22m ─────────────────────────────────────── tidyverse 1.3.0 ──

[32m✔[39m [34mggplot2[39m 3.3.2     [32m✔[39m [34mpurrr  [39m 0.3.4
[32m✔[39m [34mtibble [39m 3.0.4     [32m✔[39m [34mdplyr  [39m 1.0.2
[32m✔[39m [34mtidyr  [39m 1.1.2     [32m✔[39m [34mstringr[39m 1.4.0
[32m✔[39m [34mreadr  [39m 1.4.0     [32m✔[39m [34mforcats[39m 0.5.0

── [1mConflicts[22m ────────────────────────────────────────── tidyverse_conflicts() ──
[31m✖[39m [34mdplyr[39m::[32mbetween()[39m   masks [34mdata.table[39m::between()
[31m✖[39m [34mdplyr[39m::[32mfilter()[39m    masks [34mstats[39m::filter()
[31m✖[39m [34mdplyr[39m::[32mfirst()[39m     masks [34mdata.table[39m::first()
[31m✖[39m [34mdplyr[39m::[32mlag()[39m       masks [34mstats[39m::lag()
[31m✖[39m [34mdplyr[39m::[32mlast()[39m      masks [34mdata.table[39m::last()
[31m✖[39m [34mpurrr[39m::[32mtranspose()[39m masks [34mdata.table[39m::transpose(

In [2]:
## Effect sizes we estimate for, expressed as multipliers of the base proportion.

## Activation: increases of 2% up to 30%.
activation_deltas <- c(
    1.02, 1.05, 1.1, 1.2, 1.3
)

## Retention: decreases of 30% down to 2%, no change, and increases of 2% up to 30%.
retention_deltas <- c(
    0.7, 0.8, 0.9, 0.95, 0.98,
    1.0,
    1.02, 1.05, 1.1, 1.2, 1.3
)

In [6]:
calc_edit_power <- function(base_proportion, proportion_deltas, N_control,
                            sig.level = 0.05, power = 0.8) {
  ## Given a base proportion and `N_control` users in a control group, calculate
  ## for each delta (multiplier applied to the base proportion) the number of
  ## participants needed in the treatment group to achieve a significance level
  ## of `sig.level` and a power of `power`, using a two-sided two-proportion
  ## test with unequal group sizes (`pwr.2p2n.test`).
  ##
  ## Returns a tibble with one row per delta and the columns `delta` and
  ## `n_treatment`. `n_treatment` is always a double column; it is NA for every
  ## delta where `pwr.2p2n.test` fails to find a solution, and a warning with
  ## the underlying error message is raised for each such delta so the gap is
  ## not silent. An empty `proportion_deltas` gives a zero-row tibble that still
  ## has both columns.

  ## Number of treatment group users for a single delta, or NA_real_ on failure.
  solve_n_treatment <- function(prop_delta) {
    tryCatch(
      pwr.2p2n.test(
        h = ES.h(
          p1 = base_proportion,
          p2 = base_proportion * prop_delta
        ),
        n1 = N_control,
        sig.level = sig.level,
        power = power,
        alternative = "two.sided"
      )$n2,
      error = function(e) {
        warning(
          "No treatment group size found for delta = ", prop_delta, ": ",
          conditionMessage(e),
          call. = FALSE
        )
        NA_real_
      }
    )
  }

  ## vapply() instead of growing a vector in a loop: the result has a fixed
  ## type and length even when there are zero deltas or every delta fails.
  tibble(
    delta = proportion_deltas,
    n_treatment = vapply(
      proportion_deltas, solve_n_treatment, numeric(1),
      USE.NAMES = FALSE
    )
  )
}

In [20]:
calc_power_5050 <- function(edit_base, edit_deltas,
                            sig.level = 0.05, power = 0.8) {
  ## Given an average proportion of registrations who make an edit during a
  ## given time span (`edit_base`) and a set of changes in said proportion
  ## (`edit_deltas`, multipliers applied to `edit_base`), calculate the number
  ## of participants needed to run an experiment with a significance level of
  ## `sig.level` and a power of `power`, using a two-sided `pwr.p.test`.
  ##
  ## Returns a data.table with one row per delta and the columns `delta` and
  ## `n`. `n` is always a double column; it is NA for every delta where
  ## `pwr.p.test` fails, and a warning with the underlying error message is
  ## raised for each such delta so the gap is not silent. An empty
  ## `edit_deltas` gives a zero-row table that still has both columns.
  ##
  ## NOTE(review): `pwr.p.test` is pwr's one-sample proportion test, so `n` is
  ## the size of a single sample. For a two-group 50/50 experiment,
  ## `pwr.2p.test` (which returns n per group) may be what is intended --
  ## confirm before relying on these numbers.

  ## Sample size for a single delta, or NA_real_ on failure.
  solve_n <- function(edit_delta) {
    tryCatch(
      pwr.p.test(
        h = ES.h(
          p1 = edit_base,
          p2 = edit_base * edit_delta
        ),
        sig.level = sig.level,
        power = power,
        alternative = "two.sided"
      )$n,
      error = function(e) {
        warning(
          "No sample size found for delta = ", edit_delta, ": ",
          conditionMessage(e),
          call. = FALSE
        )
        NA_real_
      }
    )
  }

  ## vapply() instead of growing a vector in a loop: the result has a fixed
  ## type and length even when there are zero deltas or every delta fails.
  data.table(
    delta = edit_deltas,
    n = vapply(edit_deltas, solve_n, numeric(1), USE.NAMES = FALSE)
  )
}

In [17]:
## Configuration for German Wikipedia

# Registrations per month.
registration_rate <- 8057

# Mobile rates are used because they are the lower of the two platforms
# (desktop: 43.2% activation, 18.8% retention).
activation_rate <- 0.41
retention_rate <- 0.107

# Monthly registrations per group, assuming 20% of users are assigned to the
# Control group and the remaining 80% to treatment.
control_group_registrations <- registration_rate * 0.2
treatment_group_registrations <- registration_rate * 0.8

# Monthly activated users per group.
control_group_activations <- activation_rate * control_group_registrations
treatment_group_activations <- activation_rate * treatment_group_registrations

In [12]:
## Treatment group sizes needed per activation delta, given one month of
## Control group registrations.
activation_counts <- calc_edit_power(
    base_proportion = activation_rate,
    proportion_deltas = activation_deltas,
    N_control = control_group_registrations
)

In [16]:
## Express the required treatment group size as months (and days, using
## 30-day months) of treatment group registrations.
activation_counts %>%
    mutate(n_months = n_treatment / treatment_group_registrations) %>%
    mutate(n_days = ceiling(30 * n_months))

delta,n_treatment,n_months,n_days
<dbl>,<dbl>,<dbl>,<dbl>
1.02,,,
1.05,,,
1.1,3944.5589,0.61197699,19.0
1.2,351.1877,0.05448488,2.0
1.3,139.7709,0.0216847,1.0


In [18]:
## Treatment group sizes needed per retention delta, given one month of
## activated users in the Control group.
retention_counts <- calc_edit_power(
    base_proportion = retention_rate,
    proportion_deltas = retention_deltas,
    N_control = control_group_activations
)

In [19]:
## Express the required treatment group size as months (and days, using
## 30-day months) of activated users in the treatment group.
retention_counts %>%
    mutate(n_months = n_treatment / treatment_group_activations) %>%
    mutate(n_days = ceiling(30 * n_months))

delta,n_treatment,n_months,n_days
<dbl>,<dbl>,<dbl>,<dbl>
0.7,11613.99,4.394751,132.0
0.8,,,
0.9,,,
0.95,,,
0.98,,,
1.0,,,
1.02,,,
1.05,,,
1.1,,,
1.2,,,


In [21]:
## Sample sizes per activation delta from the 50/50 calculation.
activation_counts_5050 <- calc_power_5050(
    edit_base = activation_rate,
    edit_deltas = activation_deltas
)

In [23]:
## Express the required sample size as months (and days, using 30-day months)
## of all registrations.
activation_counts_5050 %>%
    mutate(n_months = n / registration_rate) %>%
    mutate(n_days = ceiling(30 * n_months))

delta,n,n_months,n_days
<dbl>,<dbl>,<dbl>,<dbl>
1.02,28320.2243,3.51498378,106
1.05,4549.6715,0.56468555,17
1.1,1144.0441,0.14199381,5
1.2,288.3458,0.03578823,2
1.3,128.615,0.01596313,1


In [24]:
## Sample sizes per retention delta from the 50/50 calculation.
retention_counts_5050 <- calc_power_5050(
    edit_base = retention_rate,
    edit_deltas = retention_deltas
)

In [25]:
## Express the required sample size as months (and days, using 30-day months)
## of activated users, i.e. registrations scaled by the activation rate.
retention_counts_5050 %>%
    mutate(n_months = n / (registration_rate * activation_rate)) %>%
    mutate(n_days = ceiling(30 * n_months))

delta,n,n_months,n_days
<dbl>,<dbl>,<dbl>,<dbl>
0.7,625.1137,0.1892352,6.0
0.8,1487.179,0.4502006,14.0
0.9,6256.188,1.8938805,57.0
0.95,25619.5246,7.7555722,233.0
0.98,162315.2034,49.1362467,1475.0
1.0,,,
1.02,165198.0996,50.0089604,1501.0
1.05,26772.8731,8.1047152,244.0
1.1,6833.203,2.0685551,63.0
1.2,1776.3793,0.5377476,17.0
