# Preparation

In [2]:
import datetime as dt
import re
import json
import time
import pandas as pd
import numpy as np

In [3]:
# Fixed seed (the value itself is arbitrary) so that every draw is deterministic and can be re-run
RANDOM_STATE = 788319

# The sampling frame that the strata are drawn from
frame = pd.read_parquet("secrets/sampling-frame.parquet")

# Previously sampled users, keyed by user name
previous_sample = (
    pd.read_table("secrets/sampled_users.tsv")
    .set_index("user_name")
)

# Sample targets per stratum, keyed by project group
original_strata_targets = (
    pd.read_table("definitions/strata-sample-targets.tsv")
    .set_index("project_group")
)

The sampling targets (for both the actual and the follow-up sample) depend on the response rates of the respective wikis. For enwiki, dewiki, and commonswiki, we only sample 75% of the actual target in anticipation of higher response rates (due to a new distribution mechanism). The follow-up sample is only required for enwiki, where we will sample the remaining 25% of the actual sampling targets.

In [35]:
# Start from a table shaped like the original targets, with every target zeroed
strata_targets = original_strata_targets.copy()
strata_targets.iloc[:] = 0.0

# Only enwiki receives a follow-up target: a quarter of its original target, rounded
enwiki_followup = (original_strata_targets.loc["enwiki", :] * 0.25).round()
strata_targets.loc["enwiki", :] = enwiki_followup
strata_targets

Unnamed: 0_level_0,10-29,30-149,150-599,600-1199,1200+
project_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
arwiki,0,0,0,0,0
asia_wps,0,0,0,0,0
cee_wps,0,0,0,0,0
commons,0,0,0,0,0
dewiki,0,0,0,0,0
enwiki,148,345,214,91,159
eswiki,0,0,0,0,0
frwiki,0,0,0,0,0
itwiki,0,0,0,0,0
jawiki,0,0,0,0,0


# Remove previously sampled users from frame

In [6]:
# Remove the previously sampled users so they cannot be drawn again.
# `errors="ignore"` keeps this cell safe to re-run: `frame` is rebound to its own
# filtered result, so on a second execution the labels are already gone and a plain
# `drop` would raise a KeyError. The same holds for any previously sampled user that
# is not present in the current frame.
frame = frame.drop(previous_sample.index, errors="ignore")

# Set sample sizes

In [48]:
# Stratify sample: number of users available per (project_group, edit_bin) stratum
strata_sizes = frame.groupby(["project_group", "edit_bin"]).size().unstack()

# `strata_targets` has string labels, but `strata_sizes` has categoricals.
# Line the targets up with `strata_sizes` by label *before* adopting its labels:
# assigning `.index` / `.columns` on its own is purely positional and would silently
# attach targets to the wrong stratum if the two tables were ordered differently.
# A stratum that has no entry in the targets raises a KeyError here instead.
aligned_targets = strata_targets.copy()
aligned_targets.index = aligned_targets.index.astype(str)
aligned_targets.columns = aligned_targets.columns.astype(str)
aligned_targets = aligned_targets.loc[
    strata_sizes.index.astype(str), strata_sizes.columns.astype(str)
].copy()

# Now that the order is guaranteed to match, take over the categorical labels
aligned_targets.index = strata_sizes.index
aligned_targets.columns = strata_sizes.columns
strata_targets = aligned_targets

In [49]:
# Cap each target at the number of users available in its stratum,
# reporting every stratum where the cap was applied
strata_cells = [
    (group, bin_label)
    for group in strata_targets.index
    for bin_label in strata_targets.columns
]
for group, bin_label in strata_cells:
    available = strata_sizes.loc[group, bin_label]
    if strata_targets.loc[group, bin_label] > available:
        strata_targets.loc[group, bin_label] = available
        print(f'{group} ({bin_label}) capped at {available}')

enwiki (10-29) capped at 147


In [50]:
# Sampling would hit an error if a target were larger than its stratum,
# so verify up front that no target exceeds the stratum size
excessive_targets = strata_targets > strata_sizes
assert not excessive_targets.any(axis=None)

In [51]:
# Set target to 0 for any strata with fewer than 10 users,
# to prevent cases where de-anonymization of data would be too easy.
#
# The threshold is checked against sizes recomputed from `frame` rather than against
# the existing `strata_sizes`: this cell rebinds `strata_sizes` to the adjusted
# *targets* (the name the next cell reads), so re-running it would otherwise compare
# the targets with themselves and zero out every target below 10.
stratum_populations = frame.groupby(["project_group", "edit_bin"]).size().unstack()
strata_sizes = strata_targets.where(stratum_populations >= 10, other=0)

In [52]:
# Reshape the wide table into long form: one row per (edit_bin, project_group) pair
long_targets = strata_sizes.unstack().reset_index()
long_targets = long_targets.rename(columns={0: "sample_size"})

# Ensure columns are ordered in the expected way
# (the sampling loop unpacks each row positionally)
strata = long_targets.loc[:, ["project_group", "edit_bin", "sample_size"]]

strata.head()

Unnamed: 0,project_group,edit_bin,sample_size
0,arwiki,10-29,0
1,asia_wps,10-29,0
2,cee_wps,10-29,0
3,commons,10-29,0
4,dewiki,10-29,0


# Pull samples

In [55]:
def sample_stratum(project_group, edit_bin, sample_size):
    """
    Draw a reproducible random sample from one stratum of the sampling frame.

    Reads the notebook-level `frame` (the sampling frame) and `RANDOM_STATE`
    (the seed), so repeated calls with the same arguments return the same rows.

    Parameters
    ----------
    project_group, edit_bin
        Labels of the stratum; rows of `frame` matching both are eligible.
    sample_size
        Number of rows to draw. Whole-number floats (e.g. ``147.0``, as produced
        by float arithmetic on the targets) are accepted and converted to ``int``.

    Returns
    -------
    pandas.DataFrame
        The sampled users as a data frame (empty when `sample_size` is 0).

    Raises
    ------
    ValueError
        If `sample_size` is not a whole number.
    """
    # `DataFrame.sample` lets a float such as 147.0 through its own validation but
    # then fails inside NumPy, so normalise to a plain int up front.
    n = int(sample_size)
    if n != sample_size:
        raise ValueError(f"sample_size must be a whole number, got {sample_size!r}")

    stratum = frame.query("project_group == @project_group & edit_bin == @edit_bin")
    return stratum.sample(n=n, random_state=RANDOM_STATE)

# Draw from every stratum; each row of `strata` supplies the positional arguments
# (project_group, edit_bin, sample_size) for `sample_stratum`
stratum_samples = [
    sample_stratum(*stratum_spec)
    for stratum_spec in strata.itertuples(index=False)
]

# Combine the per-stratum draws into a single table
sampled_users = pd.concat(stratum_samples)

In [56]:
# Overview of the drawn sample: number of entries plus each column's non-null count and dtype
sampled_users.info()

<class 'pandas.core.frame.DataFrame'>
Index: 956 entries, Jerzyewaz to Bungle
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   email          956 non-null    object  
 1   home_project   956 non-null    category
 2   global_edits   956 non-null    float64 
 3   project_group  956 non-null    category
 4   edit_bin       956 non-null    category
dtypes: category(3), float64(1), object(1)
memory usage: 46.4+ KB


# Save samples table

In [57]:
# Write the follow-up sample as a tab-separated file (the index is written as well, per the pandas default)
sampled_users.to_csv("secrets/followup-sampled-users.tsv", sep="\t")