# Preparation

In [25]:
import datetime as dt
import re
import json
import time
import pandas as pd
import numpy as np

In [36]:
# Fixed, arbitrary seed handed to every sampling call, so that re-running the
# notebook draws exactly the same users
RANDOM_STATE = 13440

# The sampling frame: the table of users the samples are drawn from
frame = pd.read_parquet("secrets/sampling-frame.parquet")

# Target sample sizes, indexed by project group
strata_targets = (
  pd.read_csv("definitions/strata-sample-targets.tsv", sep="\t")
  .set_index("project_group")
)

# Set sample sizes

In [38]:
# Stratify sample: count the rows of the sampling frame in every
# (project_group, edit_bin) stratum, with one row per project group
# and one column per edit bin
strata_sizes = frame.groupby(["project_group", "edit_bin"]).size().unstack()

# `strata_targets` has string column names, but `strata_sizes` has categoricals.
# The relabelling below is purely positional, so first check that both tables list
# the edit bins in the same order; otherwise targets would silently be attached to
# the wrong bins.
target_bins = [str(label) for label in strata_targets.columns]
size_bins = [str(label) for label in strata_sizes.columns]
assert target_bins == size_bins, (
  "edit bins in the targets file do not line up with the bins in the sampling frame"
)
strata_targets.columns = strata_sizes.columns

In [39]:
# The targets must not be larger than the strata, or the sampling will hit an error
# (`DataFrame.sample` draws without replacement by default)
excessive_targets = (strata_targets > strata_sizes)
assert not excessive_targets.any(axis=None), (
  "at least one sample target is larger than the stratum it would be drawn from"
)

In [40]:
# Set target to 0 for any strata with fewer than MIN_STRATUM_SIZE users,
# to prevent cases where de-anonymization of data would be too easy
MIN_STRATUM_SIZE = 10

# The stratum populations are recomputed from `frame` instead of being read from
# `strata_sizes`, because this cell rebinds `strata_sizes` to the adjusted targets:
# re-running the cell would otherwise threshold the targets themselves rather than
# the populations.
stratum_populations = frame.groupby(["project_group", "edit_bin"]).size().unstack()

# NOTE(review): from here on `strata_sizes` holds *sample* sizes, not stratum
# populations; the name is kept because the following cells read it.
strata_sizes = strata_targets.where(stratum_populations >= MIN_STRATUM_SIZE, other=0)

In [41]:
# Reshape the wide table (project groups as rows, edit bins as columns) into a long
# table with one row per stratum and its sample size
strata = (
  strata_sizes
  .unstack()
  # Naming the series makes `reset_index` emit the values as "sample_size"
  .rename("sample_size")
  .reset_index()
  # Fix the column order, since the rows are later unpacked positionally
  .loc[:, ["project_group", "edit_bin", "sample_size"]]
)

strata.head()

Unnamed: 0,project_group,edit_bin,sample_size
0,arwiki,10-29,24
1,asia_wps,10-29,10
2,cee_wps,10-29,83
3,commons,10-29,163
4,dewiki,10-29,80


# Pull samples

In [45]:
def sample_stratum(project_group, edit_bin, sample_size, population=None, random_state=None):
  """
  Draw a random sample of users from a single stratum.

  project_group, edit_bin: values identifying the stratum; only rows whose
    `project_group` and `edit_bin` columns both match are eligible
  sample_size: number of rows to draw; sampling is without replacement, so this
    must not exceed the number of eligible rows
  population: data frame to sample from; when omitted, the notebook-level
    `frame` is used
  random_state: seed passed to `DataFrame.sample`; when omitted, the
    notebook-level `RANDOM_STATE` is used so repeated runs draw the same rows

  returns the sampled users as a data frame
  """
  # Fall back to the notebook globals so existing three-argument calls keep working
  if population is None:
    population = frame
  if random_state is None:
    random_state = RANDOM_STATE

  # `@name` makes `query` read the local variable rather than the column of the same name
  stratum = population.query("project_group == @project_group & edit_bin == @edit_bin")
  sample = stratum.sample(n=sample_size, random_state=random_state)
  return sample

# Draw the sample for every stratum; each row of `strata` supplies the three
# arguments of `sample_stratum` in order
f = [
  sample_stratum(project_group, edit_bin, sample_size)
  for project_group, edit_bin, sample_size in strata.itertuples(index=False, name=None)
]

# Stack the per-stratum samples into a single table of sampled users
sampled_users = pd.concat(f)

# Save samples table

In [50]:
# Write the sampled users as a tab-separated file (the frame's index is written as well)
sampled_users.to_csv(
  path_or_buf="secrets/sampled_users.tsv",
  sep="\t",
)