# Preparation

In [2]:
import datetime as dt
import re
import json
import time
import pandas as pd
import numpy as np

In [12]:
# Fixed seed for the random draws below, so that re-running the notebook
# reproduces the same sample. The value itself is arbitrary.
RANDOM_STATE = 788319

# The frame to sample from
frame = pd.read_parquet("secrets/sampling-frame.parquet")

# Users who were already sampled, keyed by user name
previous_sample = pd.read_table("secrets/sampled-users.tsv")
previous_sample = previous_sample.set_index("user_name")

# Sample targets, keyed by project group
strata_targets = pd.read_table("definitions/strata-followup-sample-targets.tsv")
strata_targets = strata_targets.set_index("project_group")

# Remove previously sampled users from frame

In [13]:
# Take the previously sampled users out of the frame so they cannot be drawn again.
# `errors="ignore"` skips previously sampled users that are not in the frame;
# without it `drop` raises a KeyError for them, which also made this cell fail
# when re-run (the second time round the users have already been removed).
frame = frame.drop(previous_sample.index, errors="ignore")

# Set sample sizes

In [22]:
# Stratify sample: count the users in each stratum, with one row per
# project group and one column per edit bin
strata_sizes = frame.groupby(["project_group", "edit_bin"]).size().unstack()

# `strata_targets` has string column names, but `strata_sizes` has categoricals.
# Relabel the *columns* (not the index): this also gives the column axis of
# `strata_targets` the name `edit_bin`, which is what turns into the `edit_bin`
# column when the targets are unstacked into one row per stratum later on.
# The relabelling is positional, so first check that both tables list the same
# edit bins in the same order.
assert list(strata_targets.columns) == list(strata_sizes.columns.astype(str)), (
  "strata_targets and strata_sizes do not list the same edit bins in the same order"
)
strata_targets.columns = strata_sizes.columns

In [25]:
# The targets must not be larger than the strata, or the sampling will hit an error
excessive_targets = (strata_targets > strata_sizes)

# On failure, list the (edit_bin, project_group) pairs whose target is too large
assert not excessive_targets.any(axis=None), (
  "Sample targets exceed the stratum size for: "
  f"{excessive_targets.unstack().loc[lambda flagged: flagged].index.tolist()}"
)

In [26]:
# Strata with fewer than 10 users get a target of 0, because with so few
# people in a stratum de-anonymization of data would be too easy.
# NOTE(review): this rebinds `strata_sizes`. From here on it holds the sample
# sizes to draw rather than the stratum counts, so re-running this cell on its
# own would apply the threshold to the targets instead of the counts.
has_enough_users = strata_sizes >= 10
strata_sizes = strata_targets.where(has_enough_users, other=0)

In [27]:
# Reshape the wide table of sample sizes into one row per stratum
long_sizes = strata_sizes.unstack().reset_index()
long_sizes = long_sizes.rename(columns={0: "sample_size"})

# Fix the column order explicitly, since the rows are unpacked positionally
# when they are passed to `sample_stratum`
strata = long_sizes[["project_group", "edit_bin", "sample_size"]]

strata.head()

Unnamed: 0,project_group,edit_bin,sample_size
0,arwiki,10-29,0
1,asia_wps,10-29,0
2,cee_wps,10-29,25
3,commons,10-29,0
4,dewiki,10-29,0


# Pull samples

In [28]:
def sample_stratum(project_group, edit_bin, sample_size):
  """
  Draw a random sample of users from one stratum of the module-level `frame`.

  The stratum consists of the rows of `frame` whose `project_group` and
  `edit_bin` equal the given values. `sample_size` rows are drawn from it,
  seeded with `RANDOM_STATE`, and returned as a data frame.
  """
  in_stratum = (frame["project_group"] == project_group) & (frame["edit_bin"] == edit_bin)
  return frame[in_stratum].sample(n=sample_size, random_state=RANDOM_STATE)

# Sample every stratum in turn, then combine the per-stratum samples into one table
stratum_samples = [
  sample_stratum(project_group, edit_bin, sample_size)
  for project_group, edit_bin, sample_size in strata.itertuples(index=False)
]

sampled_users = pd.concat(stratum_samples)

In [29]:
# Summarise the combined sample: number of rows, columns, dtypes and non-null counts
sampled_users.info()

<class 'pandas.core.frame.DataFrame'>
Index: 9476 entries, Dimagp to Cedric tsan cantonais
Data columns (total 5 columns):
 #   Column         Non-Null Count  Dtype   
---  ------         --------------  -----   
 0   email          9476 non-null   object  
 1   home_project   9476 non-null   category
 2   global_edits   9476 non-null   float64 
 3   project_group  9476 non-null   category
 4   edit_bin       9476 non-null   category
dtypes: category(3), float64(1), object(1)
memory usage: 273.0+ KB


# Save samples table

In [32]:
# Write the sample as a tab-separated file; the index is written out as the first column
sampled_users.to_csv("secrets/followup-sampled-users.tsv", sep="\t")