In [308]:
import pywikibot as pwb
import datetime as dt
import re

In [285]:
frame = pd.read_csv("sampling-frame.tsv", sep = "\t")
frame.head()

Unnamed: 0,user,home_proj,global_edits,proj_group,proj_domain
0,! Bikkit !,dewiki,219.0,dewiki,de.wikipedia.org
1,!NewLondon31,jawiki,46.0,jawiki,ja.wikipedia.org
2,!Silent,ptwiki,20688.8,ptwiki,pt.wikipedia.org
3,"""Colorado Campeão""!",ptwiki,417.0,ptwiki,pt.wikipedia.org
4,"""quasi"" tuttologo",itwiki,117.0,itwiki,it.wikipedia.org


# Set sample sizes

In [286]:
targets = pd.read_table("project-group-sample-targets.tsv").set_index("proj_group")
targets.head()

Unnamed: 0_level_0,sample_target
proj_group,Unnamed: 1_level_1
enwiki,900
arwiki,675
zhwiki,450
frwiki,450
eswiki,1125


In [287]:
# Round the highest edit count up to the nearest hundred thousand,
# in order to make a pretty edge for the top bin
top_edge = int(np.ceil(frame["global_edits"].max() / 100000) * 100000)

edit_bins = [10, 30, 150, 600, 1200, 3500, top_edge]

frame["binned_edits"] = pd.cut(frame["global_edits"], edit_bins, right=False)

In [288]:
strata_sizes = frame.groupby(["proj_group", "binned_edits"]).size().unstack()

In [289]:
group_sizes = pd.DataFrame(frame.groupby(["proj_group"]).count()["user"])

In [290]:
strata_group_pcts = np.divide(strata_sizes, group_sizes)

In [291]:
strata_targets = np.ceil(
    # Make sure we don't target fewer than 20 users per bin for privacy reasons
    np.maximum(
        np.multiply(strata_group_pcts, targets),
        20
    )
).apply(np.int64)

In [292]:
strata_samples = np.minimum(strata_targets, strata_sizes)

In [293]:
strata_samples

binned_edits,"[10, 30)","[30, 150)","[150, 600)","[600, 1200)","[1200, 3500)","[3500, 1100000)"
proj_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
arwiki,26,94,96,56,68,72
asia_wps,35,165,182,90,108,98
cee_wps,22,106,121,53,76,75
commons,20,53,84,51,88,166
dewiki,42,289,322,142,183,150
enwiki,28,143,133,49,54,46
eswiki,31,127,131,55,60,49
frwiki,65,298,303,137,166,159
itwiki,23,140,125,55,56,54
jawiki,30,172,143,47,44,20


In [294]:
strata = pd.DataFrame(
    strata_samples.stack()
).reset_index()

strata.columns = ["proj_group", "edit_bin", "sample_size"]

strata.head()

Unnamed: 0,proj_group,edit_bin,sample_size
0,arwiki,"[10, 30)",26
1,arwiki,"[30, 150)",94
2,arwiki,"[150, 600)",96
3,arwiki,"[600, 1200)",56
4,arwiki,"[1200, 3500)",68


In [295]:
# Ignore any groups where we couldn't sample the minimum 20 users, because of re-identification risk
# In the 2018 edition, this is only 1 group
idx = strata.index[strata["sample_size"] < 20]

strata.loc[idx, "sample_size"] = 0

# Pull samples

In [296]:
def sample_stratum(stratum_query, sample_size):
    """
    returns the sampled users as a list of (user, proj_domain) tuples
    """ 
    stratum = frame.query(stratum_query)
    stratum_size = len(stratum)
    
    sample = stratum.sample(n = sample_size)[["user", "proj_domain"]]

    sample_list = [
        i for i in sample.itertuples(index = False, name = "user")
    ]

    return sample_list

In [297]:
def sample_row(row):
    query = "proj_group == '{pg}' & binned_edits == '{eb}'".format(
        pg = row.proj_group,
        eb = row.edit_bin
    )
    
    return sample_stratum(query, row.sample_size)

In [298]:
strata["sampled_users"] = strata.apply(sample_row, axis = 1)

In [299]:
strata.head()

Unnamed: 0,proj_group,edit_bin,sample_size,sampled_users
0,arwiki,"[10, 30)",26,"[(Benohamid, ar.wikipedia.org), (Aboutthereade..."
1,arwiki,"[30, 150)",94,"[(Ameen Rammal, ar.wikipedia.org), (Ahmed.m.ha..."
2,arwiki,"[150, 600)",96,"[(اعرف اكثر, ar.wikipedia.org), (Omar Alyahya,..."
3,arwiki,"[600, 1200)",56,"[(وضاح, ar.wikipedia.org), (Snopic, ar.wikiped..."
4,arwiki,"[1200, 3500)",68,"[(وهراني, ar.wikipedia.org), (Qrmoo3, ar.wikip..."


In [300]:
# Do all our strata have the correct number of sampled users?
(strata["sample_size"] == strata["sampled_users"].map(len)).value_counts()

True    108
dtype: int64

# Upload MassMessage lists

## Set up and test Pywikibot access 

In [321]:
# Pywikibot automatically authenticates using your user-config.py file
# See https://www.mediawiki.org/wiki/Manual:Pywikibot/user-config.py
pwb_site = pwb.Site("meta", "meta")

def save_page(title, text, comment):
    page = pwb.Page(pwb_site, title)
    page.text = text
    page.save(comment, minor = False)

In [322]:
my_username = pwb_site.user()
test_title = "User:{user}/Pywikibot test".format(user = my_username)
test_text = "Pywikibot saved this page at " + str(dt.datetime.now()) + "."
test_comment = "Test Pywikibot access"

# If the page already exists, this will overwrite it 
save_page(test_title, test_text, test_comment)

Page [[User:Neil P. Quinn-WMF/Pywikibot test]] saved


## Set up and test target list generation

In [303]:
def list_text(users):
    text = ""
    
    for user in users:
        text += "* {{{{target | user = {} | site = {}}}}}\n".format(user[0], user[1])
        
    return text

In [310]:
def page_title(row):
    title = "Community Engagement Insights/MassMessages/Lists/2018/{pg} - {eb}"
    
    # Mediawiki does not allow brackets in page titles
    sanitized_bin = re.sub("[\[\]\(\)]", "", row.edit_bin)

    return title.format(pg = row.proj_group, eb = sanitized_bin)

In [311]:
strata["page_title"] = strata.apply(page_title, axis = 1)
strata.head()

Unnamed: 0,proj_group,edit_bin,sample_size,sampled_users,page_title
0,arwiki,"[10, 30)",26,"[(Benohamid, ar.wikipedia.org), (Aboutthereade...",Community Engagement Insights/MassMessages/Lis...
1,arwiki,"[30, 150)",94,"[(Ameen Rammal, ar.wikipedia.org), (Ahmed.m.ha...",Community Engagement Insights/MassMessages/Lis...
2,arwiki,"[150, 600)",96,"[(اعرف اكثر, ar.wikipedia.org), (Omar Alyahya,...",Community Engagement Insights/MassMessages/Lis...
3,arwiki,"[600, 1200)",56,"[(وضاح, ar.wikipedia.org), (Snopic, ar.wikiped...",Community Engagement Insights/MassMessages/Lis...
4,arwiki,"[1200, 3500)",68,"[(وهراني, ar.wikipedia.org), (Qrmoo3, ar.wikip...",Community Engagement Insights/MassMessages/Lis...


In [312]:
for row in strata.sample(n = 4).itertuples(index = False):
    print(row.page_title)
    print(list_text(row.sampled_users))

Community Engagement Insights/MassMessages/Lists/2018/wikidata - 3500, 1100000
* {{target | user = Labant | site = www.wikidata.org}}
* {{target | user = Canley | site = www.wikidata.org}}
* {{target | user = &beer&love | site = www.wikidata.org}}
* {{target | user = FShbib | site = www.wikidata.org}}
* {{target | user = ShinePhantom | site = www.wikidata.org}}
* {{target | user = Vojtěch Dostál | site = www.wikidata.org}}
* {{target | user = Jon Harald Søby | site = www.wikidata.org}}
* {{target | user = Ahoerstemeier | site = www.wikidata.org}}
* {{target | user = ESM | site = www.wikidata.org}}
* {{target | user = Magnus Manske | site = www.wikidata.org}}
* {{target | user = Teolemon | site = www.wikidata.org}}
* {{target | user = YMS | site = www.wikidata.org}}
* {{target | user = Stevenliuyi | site = www.wikidata.org}}
* {{target | user = Jklamo | site = www.wikidata.org}}
* {{target | user = Aftabuzzaman | site = www.wikidata.org}}
* {{target | user = Pasleim | site = www.wikidat

## Upload the list pages

<div class="alert alert-block alert-warning">
    <p>Running the cell below will create a lot of pages on Meta (or overwrite them if they've already been created).</p>
    
    <p>Run it only if you want to do that (but, if you run it accidentally, no major harm done).</p>
</div>

In [None]:
for row in strata.itertuples(index = False):
    users = row.sampled_users
    
    # Only upload a list if the group has some sampled users
    if len(users) > 0:
        title = row.page_title
        text = list_text(users)
        comment = "Upload an invitation list for the 2018 [[Community Engagement Insights]] survey"
        
        # save_page(title, text, comment)

# Save our samples

In [315]:
strata.to_csv("./sampled-strata.tsv", sep = "\t", index = False)