# Preparation

In [13]:
import datetime as dt
import re
import json
import time
import pandas as pd
import numpy as np

In [14]:
# Fixed, arbitrary seed passed to every sampling call so that re-running the
# notebook reproduces exactly the same sample
RANDOM_STATE = 13440

# Sampling frame of users; later cells group it by `project_group` and `edit_bin`
frame = pd.read_parquet("secrets/sampling-frame.parquet")

# Target sample sizes, indexed by project group and cast to whole numbers
strata_targets = (
    pd.read_table("definitions/strata-sample-targets.tsv")
    .set_index("project_group")
    .astype(int)
)

# Set sample sizes

In [15]:
# Stratify sample: count the users in the frame for every
# (project_group, edit_bin) stratum, with project groups as rows and edit bins
# as columns.
# `observed=False` is spelled out because `edit_bin` is categorical: pandas has
# deprecated relying on the implicit default for categorical group keys, and
# pinning it keeps the current behaviour on newer pandas versions.
strata_sizes = (
    frame
    .groupby(["project_group", "edit_bin"], observed=False)
    .size()
    .unstack()
)

# `strata_targets` has string column names, but `strata_sizes` has categoricals.
# NOTE: the labels are replaced by position, so the columns of the targets file
# must be listed in the same order as the `edit_bin` columns of `strata_sizes`.
strata_targets.columns = strata_sizes.columns

In [16]:
# The targets must not be larger than the strata, or the sampling will hit an error.
# Boolean table: True wherever a target exceeds the number of users available.
excessive_targets = (strata_targets > strata_sizes)

# Collect the offending (project_group, edit_bin) pairs so that a failure says
# which strata need their targets lowered instead of raising a bare AssertionError.
oversized_strata = [
    (project_group, edit_bin)
    for project_group, flags in excessive_targets.iterrows()
    for edit_bin, is_excessive in flags.items()
    if is_excessive
]
assert not oversized_strata, (
    f"Sample targets exceed the stratum size for: {oversized_strata}"
)

**Staggered sampling:** In 2022, we are exploring a survey distribution mechanism that is likely to improve the response rate. Because the sampling target for each stratum is calculated from that stratum's response rate (so that we eventually meet the required sample size), a higher response rate means that fewer editors need to be targeted. For the initial distribution of the survey, we will therefore target only 75% of the actual targets for the three large wikis: Wikimedia Commons, German Wikipedia, and English Wikipedia. For the follow-up sample, we will calculate the targets from the response rate of the initial 2022 sample rather than from the 2021 response rate. This will also help avoid saturating communities with calls for survey participation.

In [17]:
# Wikis whose first-wave targets are reduced, and the share of the full target they get
stagger_wikis = ['commons', 'dewiki', 'enwiki']
STAGGER_FRACTION = 0.75

# Fail loudly (as a label lookup would) if a staggered wiki is not in the targets table
missing_wikis = [wiki for wiki in stagger_wikis if wiki not in strata_targets.index]
if missing_wikis:
    raise KeyError(f"Staggered wikis missing from the targets table: {missing_wikis}")

# Per-project multiplier: STAGGER_FRACTION for the staggered wikis, 1 for all others.
# Scaling through a float multiplier avoids writing fractional values into the
# integer columns of a copy, which newer pandas versions warn about or reject.
stagger_factor = pd.Series(1.0, index=strata_targets.index)
stagger_factor[strata_targets.index.isin(stagger_wikis)] = STAGGER_FRACTION

# Round up so that a reduced target is never rounded down to fewer users
staggered_strata_targets = (
    np.ceil(strata_targets.mul(stagger_factor, axis=0))
    .astype(int)
)
staggered_strata_targets

edit_bin,10-29,30-149,150-599,600-1199,1200+
project_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
arwiki,28,60,56,34,98
asia_wps,12,20,82,49,132
cee_wps,119,528,614,299,946
commons,97,367,402,194,978
dewiki,53,381,474,255,482
enwiki,443,1035,641,273,477
eswiki,68,328,370,151,358
frwiki,103,569,589,223,641
itwiki,44,241,252,104,328
jawiki,74,534,437,141,227


In [20]:
# Strata with fewer than 10 users get a target of 0, because de-anonymizing
# the data would be too easy for such small groups.
# NOTE(review): from this cell on, `strata_sizes` holds the sample targets rather
# than the population counts, so re-running this cell on its own would filter
# on the targets instead; restart and run all when re-executing.
is_large_enough = strata_sizes >= 10
strata_sizes = staggered_strata_targets.where(is_large_enough, other=0)

In [21]:
# Reshape the wide targets table into one row per stratum
strata = (
    strata_sizes
    .unstack()
    # Name the values before resetting the index so the column is labelled directly
    .rename("sample_size")
    .reset_index()
    # Ensure columns are ordered in the expected way
    .loc[:, ["project_group", "edit_bin", "sample_size"]]
)

strata.head()

Unnamed: 0,project_group,edit_bin,sample_size
0,arwiki,10-29,28
1,asia_wps,10-29,12
2,cee_wps,10-29,119
3,commons,10-29,97
4,dewiki,10-29,53


# Pull samples

In [22]:
def sample_stratum(project_group, edit_bin, sample_size):
    """
    Draw a random sample of users from a single stratum of the sampling frame.

    The stratum is made up of the rows of the module-level `frame` whose
    `project_group` and `edit_bin` columns equal the given values. Sampling is
    seeded with the module-level `RANDOM_STATE`, so repeated calls with the
    same arguments return the same rows.

    project_group: value to match in the `project_group` column
    edit_bin: value to match in the `edit_bin` column
    sample_size: number of rows to draw from the stratum

    returns the sampled users as a data frame
    """
    in_project_group = frame["project_group"] == project_group
    in_edit_bin = frame["edit_bin"] == edit_bin
    stratum_users = frame[in_project_group & in_edit_bin]
    return stratum_users.sample(n=sample_size, random_state=RANDOM_STATE)

# Draw one sample per stratum row, then stack all samples into a single table
stratum_samples = [
    sample_stratum(row.project_group, row.edit_bin, row.sample_size)
    for row in strata.itertuples(index=False)
]

sampled_users = pd.concat(stratum_samples)

# Save samples table

In [23]:
# Write the combined sample as a tab-separated file
sampled_users.to_csv(path_or_buf="secrets/sampled_users.tsv", sep="\t")