In [176]:
import pywikibot as pwb
import datetime as dt

In [79]:
frame = pd.read_csv("sampling-frame.tsv", sep = "\t")
frame.head()

Unnamed: 0,user,home_proj,global_edits,proj_group,proj_domain
0,! Bikkit !,dewiki,219.0,dewiki,de.wikipedia.org
1,!NewLondon31,jawiki,46.0,jawiki,ja.wikipedia.org
2,!Silent,ptwiki,20688.8,ptwiki,pt.wikipedia.org
3,"""Colorado Campeão""!",ptwiki,417.0,ptwiki,pt.wikipedia.org
4,"""quasi"" tuttologo",itwiki,117.0,itwiki,it.wikipedia.org


# Setting sample sizes

In [80]:
targets = pd.read_table("project-group-sample-targets.tsv").set_index("proj_group")
targets.head()

Unnamed: 0_level_0,sample_target
proj_group,Unnamed: 1_level_1
enwiki,900
arwiki,675
zhwiki,450
frwiki,450
eswiki,1125


In [82]:
# Round the highest edit count up to the nearest hundred thousand,
# in order to make a pretty edge for the top bin
top_edge = int(np.ceil(frame["global_edits"].max() / 100000) * 100000)

edit_bins = [10, 30, 150, 600, 1200, 3500, top_edge]

frame["binned_edits"] = pd.cut(frame["global_edits"], edit_bins, right=False)

In [84]:
strata_sizes = frame.groupby(["proj_group", "binned_edits"]).size().unstack()

In [85]:
group_sizes = pd.DataFrame(frame.groupby(["proj_group"]).count()["user"])

In [86]:
strata_group_pcts = np.divide(strata_sizes, group_sizes)

In [87]:
strata_targets = np.ceil(
    # Make sure we don't target fewer than 20 users per bin for privacy reasons
    np.maximum(
        np.multiply(strata_group_pcts, targets),
        20
    )
).apply(np.int64)

In [88]:
strata_samples = np.minimum(strata_targets, strata_sizes)

In [89]:
strata_samples

binned_edits,"[10, 30)","[30, 150)","[150, 600)","[600, 1200)","[1200, 3500)","[3500, 1100000)"
proj_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
arwiki,26,94,96,56,68,72
asia_wps,35,165,182,90,108,98
cee_wps,22,106,121,53,76,75
commons,20,53,84,51,88,166
dewiki,42,289,322,142,183,150
enwiki,28,143,133,49,54,46
eswiki,31,127,131,55,60,49
frwiki,65,298,303,137,166,159
itwiki,23,140,125,55,56,54
jawiki,30,172,143,47,44,20


# Code below not yet updated for 2018

In [230]:
strata = pd.DataFrame(
    strata_samples.stack()
).reset_index()

strata.columns = ["proj_group", "edit_bin", "sample_size"]

strata.head()

Unnamed: 0,proj_group,edit_bin,sample_size
0,arwiki,"[10, 30)",26
1,arwiki,"[30, 150)",94
2,arwiki,"[150, 600)",96
3,arwiki,"[600, 1200)",56
4,arwiki,"[1200, 3500)",68


In [248]:
# Ignore any groups where we couldn't sample the minimum 20 users, because of re-identification risk
# In the 2018 edition, this is only 1 group
idx = strata.index[strata["sample_size"] < 20]

strata.loc[idx, "sample_size"] = 0

In [251]:
def sample_stratum(stratum_query, sample_size):
    """
    returns a list of the sampled users 
    """ 
    stratum = frame.query(stratum_query)
    stratum_size = len(stratum)
    
    sample = stratum.sample(n = sample_size)[["user", "home_proj"]]

    sample_list = [
        i for i in sample.itertuples(index = False, name = "user")
    ]

    return sample_list

In [252]:
results = []

for row in strata.itertuples(index = False):
    query = "proj_group == '{pg}' & binned_edits == '{eb}'".format(
        pg = row.proj_group,
        eb = row.edit_bin
    )
    
    sample = sample_stratum(query, row.sample_size)
    
    results.append(sample)

strata["sampled_users"] = results

In [255]:
# Do all our strata have the correct number of sampled users?
(strata["sample_size"] == strata["sampled_users"].map(len)).value_counts()

True    108
dtype: int64

# Test Pywikibot access 

In [177]:
# Pywikibot automatically authenticates using your user-config.py file
# See https://www.mediawiki.org/wiki/Manual:Pywikibot/user-config.py

site = pwb.Site("meta", "meta")

username = site.user()

page = pwb.Page(
    site, 
    "User:{user}/Pywikibot test".format(user = username)
)

# If the page already exists, this will overwrite it
page.text = ("Pywikibot saved this page at " + str(dt.datetime.now()) + ".")
page.save(u"Test Pywikibot access", minor = False)

Page [[User:Neil P. Quinn-WMF/Pywikibot test]] saved


# Upload MassMessage lists

In [None]:
for row in strata.itertuples(index = False):
    # No need to upload a list if the group is being ignored
    if len(row.sampled_users) > 0:
        text = ""
        for user in row.sampled_users:
            text += "* {{{{target | user = {} | site = {}}}}}\n".format(user[0], lookup_domain(user[1]))
        

        site = pwb.Site()
        page = pwb.Page(site, "Community Engagement Insights/MassMessages/Lists/2018/" + row.code)
        page.text = text
        page.save(u"Upload a target list for the 2018 editor survey", minor = False)