# Obtain samples of active editors

In [1]:
import datetime as dt
import re
import json
import time
import pandas as pd
import numpy as np


In [2]:
# Load the tab-separated sampling frame used by the stratification and sampling cells below
frame = pd.read_table("data/interim/sampling-frame.tsv")

# Set sample sizes

In [3]:
# Sample target for each project group, indexed by project group
targets = pd.read_table(
    "data/raw/project-group-sample-targets.tsv",
    index_col="proj_group",
)
targets.head()

Unnamed: 0_level_0,sample_target
proj_group,Unnamed: 1_level_1
enwiki,900
arwiki,675
zhwiki,450
frwiki,450
eswiki,1125


In [4]:
# Pick a pretty top edge for the last bin: the next multiple of one hundred
# thousand strictly above the highest edit count. The bins are closed on the
# left and open on the right (right=False), so the top edge has to exceed the
# maximum; rounding up with ceil would leave a maximum that is an exact
# multiple of 100,000 outside every bin.
top_edge = (int(frame["global_edits"].max()) // 100000 + 1) * 100000
edit_bins = [10, 30, 150, 600, 1200, top_edge]
# Edit counts below the first edge (10) fall outside the bins and become NaN
frame["binned_edits"] = pd.cut(frame["global_edits"], edit_bins, right=False)

In [5]:
# Number of users in each stratum: one row per project group, one column per edit bin
strata_sizes = frame.groupby(["project_group", "binned_edits"]).size().unstack()

# Fraction of each project group's users that falls into each edit bin
group_sizes = frame.groupby(["project_group"]).size()
strata_group_pcts = strata_sizes.divide(group_sizes, axis=0)

strata_samples = (
    strata_group_pcts
    # Split each group's sample target across its edit bins proportionally
    .multiply(targets.squeeze(), axis=0)
    # Make sure we don't target fewer than 20 users per bin for privacy reasons
    .clip(lower=20)
    # Round up
    .pipe(np.ceil)
    # Vectorised integer cast (DataFrame.applymap is deprecated in recent pandas)
    .astype(int)
    # We can't sample more users than the bin actually contains
    .clip(upper=strata_sizes)
)

In [20]:
# Show the number of users to sample from each project group (rows) and edit bin (columns)
strata_samples

binned_edits,"[10, 30)","[30, 150)","[150, 600)","[600, 1200)","[1200, 2600000)"
arwiki,50,116,99,40,124
asia_wps,29,106,110,59,149
cee_wps,55,256,267,130,346
commons,93,337,334,191,847
dewiki,20,105,130,61,138
enwiki,56,291,263,97,196
eswiki,91,335,315,135,251
frwiki,26,126,118,53,129
itwiki,29,124,123,51,125
jawiki,35,167,142,50,59


In [6]:
# Reshape the wide table into long form: one row per (project group, edit bin) pair
strata = strata_samples.stack().reset_index()
strata.columns = ["project_group", "edit_bin", "sample_size"]

strata.head()

Unnamed: 0,project_group,edit_bin,sample_size
0,arwiki,"[10, 30)",50
1,arwiki,"[30, 150)",116
2,arwiki,"[150, 600)",99
3,arwiki,"[600, 1200)",40
4,arwiki,"[1200, 2600000)",124


In [7]:
# Strata where we couldn't reach the minimum of 20 users carry a
# re-identification risk, so we sample nothing from them.
# In the 2019 edition, this is 3 of them.
below_minimum = strata["sample_size"] < 20
strata.loc[below_minimum, "sample_size"] = 0

# Pull samples

In [8]:
def sample_stratum(stratum_query, sample_size):
    """
    Draw a reproducible random sample of users from one stratum of `frame`.

    stratum_query: a `DataFrame.query` expression selecting the stratum's rows
    sample_size: the number of users to draw; `DataFrame.sample` raises a
        ValueError if this exceeds the number of rows in the stratum

    returns the sampled users as a list of named tuples with the fields
    (user, home_proj, user_email, email_verification_date)
    """
    stratum = frame.query(stratum_query)

    # The fixed random_state makes a re-run pick the same users
    sample = stratum.sample(n = sample_size, random_state = 123)[
        ["user", "home_proj", "user_email", "email_verification_date"]
    ]

    return list(sample.itertuples(index = False, name = "user"))

In [9]:
def sample_row(row):
    """
    Sample the users for one row of the strata table.

    `row` must expose `project_group`, `edit_bin` and `sample_size`. The first
    two are formatted into a query selecting the stratum, which is passed to
    `sample_stratum` together with the sample size.

    returns whatever `sample_stratum` returns for that stratum
    """
    template = "project_group == '{pg}' & binned_edits == '{eb}'"
    stratum_query = template.format(pg = row.project_group, eb = row.edit_bin)

    return sample_stratum(stratum_query, row.sample_size)

In [10]:
# Draw the sample for every stratum: each entry of the new column holds
# the value sample_row returns for that row of the strata table
strata["sampled_users"] = strata.apply(sample_row, axis = 1)

In [11]:
# Sanity check: does every stratum hold exactly as many sampled users as requested?
strata["sampled_users"].map(len).eq(strata["sample_size"]).value_counts()

True    110
dtype: int64

# Create table of all sampled users 

In [12]:
# Flatten the per-stratum lists of sampled users into one table with a row per user.
# All records are collected first and the frame is built once: appending to a
# DataFrame inside the loop copies the whole frame on every iteration, and
# DataFrame.append no longer exists in pandas 2.0.
sampled_user_columns = ["user_name","home_project", "email_address", "email_verification_date", "project_group", "edit_bin"]

sampled_user_records = [
    # user holds (user, home_proj, user_email, email_verification_date), in that order
    (user[0], user[1], user[2], user[3], stratum.project_group, stratum.edit_bin)
    for stratum in strata.itertuples(index = False)
    for user in stratum.sampled_users
]

sampled_users = pd.DataFrame(sampled_user_records, columns = sampled_user_columns)
              

In [13]:
## Load the Program and Dashboard event leaders groups, which get appended to the sample
## Note: Output hidden due to inclusion of private info
pe_users = pd.read_table("data/interim/pe_users_table.tsv")

In [None]:
# Reindex the pe_users table so its columns match the sampled users table.
# reindex returns a new frame instead of modifying pe_users, so the result
# has to be assigned back for the change to take effect.
## Note: The table is not displayed due to inclusion of private info
column_titles = ['user_name', 'home_project', 'email_address', 'email_verification_date', 'project_group']
pe_users = pe_users.reindex(columns=column_titles)


In [15]:
# Stack the event leaders under the sampled editors. pd.concat is used because
# DataFrame.append was deprecated and then removed in pandas 2.0; columns
# missing from one of the frames are filled with NaN in the result.
sampled_users_all = pd.concat([sampled_users, pe_users], ignore_index=True)

In [16]:
# Record the number of rows in the combined table, to check that the data frames were combined correctly
sampled_users_all_len = len(sampled_users_all.index)

In [17]:
# Each user should appear only once: the distinct user names must account for every row
sampled_users_all["user_name"].nunique() == sampled_users_all_len

True

In [18]:
# Do we have the users we want? (1325 PE leaders + 14965 active editors from the sample = 16290)
sampled_users_all["user_name"].nunique() == 1325 + 14965

True

# Save samples table

In [19]:
# Write the combined sample to a tab-separated file, leaving out the row index.
# Note: the table has an email_address column, so the file includes private info.
sampled_users_all.to_csv("data/interim/sampled_users_all.tsv", sep = "\t", index = False)