# Obtain samples of active editors

In [6]:
import datetime as dt
import re
import json
import time
import pandas as pd
import numpy as np


In [7]:
# Load the sampling frame from its tab-separated file; the rest of the
# notebook bins, stratifies and samples from this table.
frame = pd.read_table("data/interim/sampling-frame.tsv")

# Set sample sizes

In [8]:
# Per-project-group sample targets, indexed by project group so they can be
# aligned against per-group tables when the strata sizes are computed.
targets = pd.read_table(
    "data/raw/project-group-sample-targets.tsv",
    index_col="proj_group",
)
targets.head()

Unnamed: 0_level_0,sample_target
proj_group,Unnamed: 1_level_1
enwiki,912
arwiki,2716
zhwiki,271
frwiki,337
eswiki,1409


In [9]:
# Bin editors by their global edit count.
#
# The bins are closed on the left and open on the right (right=False), so the
# top edge has to be strictly greater than the highest edit count; otherwise
# the most active editor would fall outside every bin and get a NaN bin.
# Rounding to the next multiple of a hundred thousand *above* the maximum
# still gives a pretty edge for the top bin, and also covers the case where
# the maximum is itself an exact multiple of a hundred thousand.
top_edge = (int(frame["global_edits"].max()) // 100000 + 1) * 100000
edit_bins = [10, 30, 150, 600, 1200, top_edge]

# NOTE(review): editors with fewer than 10 global edits fall below the first
# edge and get a NaN bin - presumably the frame only contains editors with at
# least 10 edits; confirm against the query that builds the sampling frame.
frame["binned_edits"] = pd.cut(frame["global_edits"], edit_bins, right=False)

In [10]:
# Stratify the sample: split each project group's target across the edit bins
# in proportion to how the group's editors are distributed over those bins.

# Number of editors in every (project group, edit bin) stratum, one row per
# group and one column per bin.
strata_sizes = (
    frame
    .groupby(["project_group", "binned_edits"])
    .size()
    # A bin with no editors in a group is an empty stratum, not a missing one;
    # without the fill it would become NaN and break the integer conversion below
    .unstack(fill_value=0)
)

group_sizes = frame.groupby(["project_group"]).size()
# Share of each group's editors that falls in each bin (fractions, not percents)
strata_group_pcts = strata_sizes.divide(group_sizes, axis=0)

strata_samples = (
    strata_group_pcts
    # targets has a single column, so squeeze() turns it into a Series that is
    # aligned with the project groups on the row index
    .multiply(targets.squeeze(), axis=0)
    # Make sure we don't target fewer than 20 users per bin for privacy reasons
    .clip(lower=20)
    # Round up
    .pipe(np.ceil)
    # Whole-number sample sizes; astype replaces the element-wise
    # applymap(int), which is deprecated as of pandas 2.1
    .astype(int)
    # We can't sample more users than the bin actually contains
    .clip(upper=strata_sizes)
)

In [11]:
strata_samples.head()

binned_edits,"[10, 30)","[30, 150)","[150, 600)","[600, 1200)","[1200, 700000)"
arwiki,31,97,47,21,40
asia_wps,41,174,138,51,101
cee_wps,45,185,190,85,215
commons,100,330,239,122,471
dewiki,20,20,20,20,20


In [13]:
# Reshape the wide group-by-bin table of sample sizes into long form:
# one row per (project group, edit bin) stratum with its sample size.
strata = (
    strata_samples
    .stack()
    .rename("sample_size")
    .rename_axis(["project_group", "edit_bin"])
    .reset_index()
)

strata.head()

Unnamed: 0,project_group,edit_bin,sample_size
0,arwiki,"[10, 30)",31
1,arwiki,"[30, 150)",97
2,arwiki,"[150, 600)",47
3,arwiki,"[600, 1200)",21
4,arwiki,"[1200, 700000)",40


In [14]:
# Strata whose sample size is still below the minimum of 20 users are left out
# of the sample entirely (size set to 0), because of re-identification risk.
# In the 2019 edition, this is 3 groups
too_small = strata["sample_size"] < 20
idx = strata.index[too_small]

strata.loc[idx, "sample_size"] = 0

In [15]:
# No users are sampled for the dewiki group, so every dewiki stratum gets a
# sample size of 0.
is_dewiki = strata["project_group"].eq("dewiki")
idx = strata.index[is_dewiki]

strata.loc[idx, "sample_size"] = 0

# Pull samples

In [18]:
def sample_stratum(stratum_query, sample_size, random_state=134):
    """
    Draw a reproducible random sample of users from one stratum of the
    sampling frame.

    Parameters
    ----------
    stratum_query : str
        Expression handed to ``frame.query`` to select the rows of the stratum.
    sample_size : int
        Number of users to draw, without replacement. Zero yields an empty list.
    random_state : int, optional
        Seed passed to ``DataFrame.sample``. The default (134) is the seed this
        notebook has always used, so existing calls draw the same users.

    Returns
    -------
    list of namedtuple
        One ``(user, home_proj, user_email, email_verification_date)`` tuple
        per sampled user.

    Raises
    ------
    ValueError
        If ``sample_size`` is larger than the number of users in the stratum.
    """
    # Nothing to draw for this stratum, so there is no need to query the frame
    if sample_size == 0:
        return []

    stratum = frame.query(stratum_query)
    stratum_size = len(stratum)

    # Fail with a message that names the stratum instead of the generic
    # pandas error about sampling more rows than the population holds
    if sample_size > stratum_size:
        raise ValueError(
            "Cannot sample {n} users from a stratum of {size} users ({query})".format(
                n=sample_size, size=stratum_size, query=stratum_query
            )
        )

    sample = stratum.sample(n=sample_size, random_state=random_state)[
        ["user", "home_proj", "user_email", "email_verification_date"]
    ]

    return list(sample.itertuples(index=False, name="user"))



In [19]:
def sample_row(row):
    """
    Sample the users for one row of the strata table.

    Builds the query that selects the row's stratum (its project group and
    edit bin) and passes it, together with the row's sample size, to
    sample_stratum. Returns whatever sample_stratum returns.
    """
    template = "project_group == '{pg}' & binned_edits == '{eb}'"
    stratum_query = template.format(pg=row.project_group, eb=row.edit_bin)
    return sample_stratum(stratum_query, row.sample_size)

In [20]:
# Convert the binned_edits column of the frame from the intervals produced by
# pd.cut to their string form, so that the string comparison in the query
# built by sample_row (binned_edits == '<edit bin>') can match it
frame['binned_edits'] = frame['binned_edits'].astype(str)

In [21]:
# Draw the sample for every stratum; each cell of the new column holds the
# list of user tuples returned for that stratum.
strata["sampled_users"] = [
    sample_row(stratum) for stratum in strata.itertuples(index=False)
]

In [22]:
# Sanity check: does the number of users drawn for each stratum match the
# sample size that was requested for it? Only True should appear below.
drawn_counts = strata["sampled_users"].map(len)
strata["sample_size"].eq(drawn_counts).value_counts()

True    110
dtype: int64

# Create table of all sampled users 

In [23]:
# Convert table to desired format: flatten the per-stratum lists of sampled
# users into one long table with one row per sampled user, tagged with the
# project group and edit bin of the stratum the user was drawn from.
#
# The rows are collected first and the frame is built in a single call:
# DataFrame.append was removed in pandas 2.0, and appending one row at a time
# copies the whole table on every iteration.
sampled_user_columns = [
    "user_name",
    "home_project",
    "email_address",
    "email_verification_date",
    "project_group",
    "edit_bin",
]

sampled_user_rows = [
    [user[0], user[1], user[2], user[3], project_group, edit_bin]
    for project_group, edit_bin, users in zip(
        strata["project_group"], strata["edit_bin"], strata["sampled_users"]
    )
    for user in users
]

# dtype=object keeps every column as plain Python objects, which is what the
# row-by-row construction produced
sampled_users = pd.DataFrame(
    sampled_user_rows, columns=sampled_user_columns, dtype=object
)
              

In [24]:
# Total number of rows in the sampled users table; compared with the number
# of distinct user names to make sure there are no duplicates.
sampled_users_len = sampled_users.shape[0]

In [25]:
# True when the number of distinct user names equals the number of rows,
# i.e. no user name was sampled more than once.
sampled_users["user_name"].nunique() == sampled_users_len

True

# Save samples table

In [26]:
# Write the sampled users as a tab-separated file without the row index.
# The table includes an email_address column, so treat the output file as
# personal data.
sampled_users.to_csv(
    "data/interim/sampled_users.tsv",
    sep="\t",
    index=False,
)