In [28]:
# Importing Pywikibot may produce `Configuration variable ... is defined but unknown` warnings,
# but this is a false positive. It still authenticates properly.
import pywikibot as pwb
import datetime as dt
import re
import json
import time

In [2]:
SURVEY_BASE_URL = "https://wikimedia.qualtrics.com/jfe/form/SV_5ABs6WwrDHzAeLr?"
PREVIEW_BASE_URL = "https://wikimedia.qualtrics.com/jfe/preview/SV_5ABs6WwrDHzAeLr?Q_CHL=preview&"

LIST_BASE = "Community Engagement Insights/MassMessages/Lists/2018/"
TEST_LIST_BASE = "User:Neil P. Quinn/2018 CE Insights/"

In [3]:
frame = pd.read_csv("data/interim/sampling-frame.tsv", sep = "\t")
frame.head()

Unnamed: 0,user,home_proj,global_edits,proj_group,proj_domain
0,! Bikkit !,dewiki,219.0,dewiki,de.wikipedia.org
1,!NewLondon31,jawiki,46.0,jawiki,ja.wikipedia.org
2,!Silent,ptwiki,20688.8,ptwiki,pt.wikipedia.org
3,"""Colorado Campeão""!",ptwiki,417.0,ptwiki,pt.wikipedia.org
4,"""quasi"" tuttologo",itwiki,117.0,itwiki,it.wikipedia.org


# Set sample sizes

In [4]:
targets = pd.read_table("data/raw/project-group-sample-targets.tsv").set_index("proj_group")
targets.head()

Unnamed: 0_level_0,sample_target
proj_group,Unnamed: 1_level_1
enwiki,900
arwiki,675
zhwiki,450
frwiki,450
eswiki,1125


In [5]:
# Round the highest edit count up to the nearest hundred thousand,
# in order to make a pretty edge for the top bin
top_edge = int(np.ceil(frame["global_edits"].max() / 100000) * 100000)
edit_bins = [10, 30, 150, 600, 1200, 3500, top_edge]
frame["binned_edits"] = pd.cut(frame["global_edits"], edit_bins, right=False)

In [6]:
strata_sizes = frame.groupby(["proj_group", "binned_edits"]).size().unstack()
group_sizes = pd.DataFrame(frame.groupby(["proj_group"]).count()["user"])
strata_group_pcts = np.divide(strata_sizes, group_sizes)
strata_targets = np.ceil(
    # Make sure we don't target fewer than 20 users per bin for privacy reasons
    np.maximum(
        np.multiply(strata_group_pcts, targets),
        20
    )
).apply(np.int64)
strata_samples = np.minimum(strata_targets, strata_sizes)

In [7]:
strata_samples

binned_edits,"[10, 30)","[30, 150)","[150, 600)","[600, 1200)","[1200, 3500)","[3500, 1100000)"
proj_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
arwiki,26,94,96,56,68,72
asia_wps,35,165,182,90,108,98
cee_wps,22,106,121,53,76,75
commons,20,53,84,51,88,166
dewiki,42,289,322,142,183,150
enwiki,28,143,133,49,54,46
eswiki,31,127,131,55,60,49
frwiki,65,298,303,137,166,159
itwiki,23,140,125,55,56,54
jawiki,30,172,143,47,44,20


In [8]:
strata = pd.DataFrame(
    strata_samples.stack()
).reset_index()

strata.columns = ["proj_group", "edit_bin", "sample_size"]

strata.head()

Unnamed: 0,proj_group,edit_bin,sample_size
0,arwiki,"[10, 30)",26
1,arwiki,"[30, 150)",94
2,arwiki,"[150, 600)",96
3,arwiki,"[600, 1200)",56
4,arwiki,"[1200, 3500)",68


In [9]:
# Ignore any groups where we couldn't sample the minimum 20 users, because of re-identification risk
# In the 2018 edition, this is only 1 group
idx = strata.index[strata["sample_size"] < 20]

strata.loc[idx, "sample_size"] = 0

# Pull samples

In [10]:
def sample_stratum(stratum_query, sample_size):
    """
    returns the sampled users as a list of (user, proj_domain) tuples
    """ 
    stratum = frame.query(stratum_query)
    stratum_size = len(stratum)
    
    sample = stratum.sample(n = sample_size)[["user", "proj_domain"]]

    sample_list = [
        i for i in sample.itertuples(index = False, name = "user")
    ]

    return sample_list

In [11]:
def sample_row(row):
    query = "proj_group == '{pg}' & binned_edits == '{eb}'".format(
        pg = row.proj_group,
        eb = row.edit_bin
    )
    
    return sample_stratum(query, row.sample_size)

In [12]:
strata["sampled_users"] = strata.apply(sample_row, axis = 1)

In [13]:
strata.head()

Unnamed: 0,proj_group,edit_bin,sample_size,sampled_users
0,arwiki,"[10, 30)",26,"[(رائد عزمي حموده, ar.wikipedia.org), (عباس مج..."
1,arwiki,"[30, 150)",94,"[(BotScanner, ar.wikipedia.org), (فرح عماد الض..."
2,arwiki,"[150, 600)",96,"[(مروان عزب, ar.wikipedia.org), (Mjs 78, ar.wi..."
3,arwiki,"[600, 1200)",56,"[(Mr.mush3, ar.wikipedia.org), (جهاد كديس, ar...."
4,arwiki,"[1200, 3500)",68,"[(IamIRAQI, ar.wikipedia.org), (أميدامارو, ar...."


In [14]:
# Do all our strata have the correct number of sampled users?
(strata["sample_size"] == strata["sampled_users"].map(len)).value_counts()

True    108
dtype: int64

## Load corresponding test file

In [15]:
test_strata = pd.read_csv("data/raw/test-strata.tsv", sep = "\t")
test_strata["sampled_users"] = test_strata["sampled_users"].map(json.loads)
test_strata.head()

Unnamed: 0,proj_group,edit_bin,sampled_users
0,arwiki,"[10, 30)","[[Neil P. Quinn-WMF, ar.wikipedia.org], [EGalv..."
1,asia_wps,"[30, 150)","[[Neil P. Quinn-WMF, ko.wikipedia.org], [EGalv..."
2,cee_wps,"[150, 600)","[[Neil P. Quinn-WMF, uk.wikipedia.org], [EGalv..."
3,commons,"[600, 1200)","[[Neil P. Quinn-WMF, commons.wikimedia.org], [..."
4,dewiki,"[1200, 3500)","[[Neil P. Quinn-WMF, de.wikipedia.org], [EGalv..."


# Link building

In [16]:
edit_codes = {
    "[10, 30)": 1,
    "[30, 150)": 2,
    "[150, 600)": 3,
    "[600, 1200)": 4,
    "[1200, 3500)": 5,
    "[3500, 1100000)": 6
}

proj_codes = {
    "enwiki": "en",
    "arwiki": "ar",
    "zhwiki": "zh",
    "frwiki": "fr",
    "eswiki": "es",
    "ruwiki": "ru",
    "dewiki": "de",
    "ptwiki": "pt",
    "nlwiki": "nl",
    "itwiki": "it",
    "jawiki": "ja",
    "meaf_wps": "me",
    "cee_wps": "ce",
    "asia_wps": "as",
    "weur_wps": "we",
    "commons": "co",
    "wikidata": "dt",
    "other": "ot"
}

In [17]:
def audience(edit_bin):
    if edit_bin <= 3:
        return "AE"
    else:
        return "VAE"

def build_url(row, base_url):
    extra_params = "aud={aud}&prj={prj}&edc={edc}&prjedc={prj}{edc}"
    prj = proj_codes[row.proj_group]
    edc = edit_codes[row.edit_bin]
    
    return base_url + extra_params.format(
        aud = audience(edc),
        prj = prj,
        edc = edc,
    )

def survey_url(row):
    return build_url(row, SURVEY_BASE_URL)

def preview_url(row):
    return build_url(row, PREVIEW_BASE_URL)

In [18]:
strata["survey_url"] = strata.apply(survey_url, axis = 1)
strata["preview_url"] = strata.apply(preview_url, axis = 1)

## Create test links

In [20]:
test_strata["survey_url"] = test_strata.apply(survey_url, axis = 1)
test_strata["preview_url"] = test_strata.apply(preview_url, axis = 1)

# Upload MassMessage lists

## Set up and test Pywikibot access 

In [29]:
# Pywikibot automatically authenticates our user-config.py file
# See https://www.mediawiki.org/wiki/Manual:Pywikibot/user-config.py
pwb_site = pwb.Site("meta", "meta")

def save_page(title, text, comment):
    page = pwb.Page(pwb_site, title)
    page.text = text
    page.save(comment, minor = False)

In [30]:
my_username = pwb_site.user()

# Pywikibot sometimes takes time to authenticate even after finishing the cell above.
# Make sure it has finished before proceeding.
if my_username is None:
    raise ValueError("Pywikibot username is undefined.")

test_title = "User:{user}/Pywikibot test".format(user = my_username)
test_text = "Pywikibot saved this page at " + str(dt.datetime.now()) + "."
test_comment = "Test Pywikibot access"

# If the page already exists, this will overwrite it 
save_page(test_title, test_text, test_comment)

Page [[User:WMF Surveys/Pywikibot test]] saved


## Set up and test target list generation

In [31]:
def list_text(users):
    text = ""
    
    for user in users:
        text += "* {{{{target | user = {} | site = {}}}}}\n".format(user[0], user[1])
        
    return text

In [32]:
def page_title(row, title_base):
    
    title = title_base + "{pg}{eb}"

    return title.format(
        pg = proj_codes[row.proj_group], 
        eb = edit_codes[row.edit_bin]
    )

def real_page_title(row):
    return page_title(row, LIST_BASE)

def test_page_title(row):
    return page_title(row, TEST_LIST_BASE)

In [33]:
strata["page_title"] = strata.apply(real_page_title, axis = 1)
strata.head()

Unnamed: 0,proj_group,edit_bin,sample_size,sampled_users,survey_url,preview_url,page_title
0,arwiki,"[10, 30)",26,"[(رائد عزمي حموده, ar.wikipedia.org), (عباس مج...",https://wikimedia.qualtrics.com/jfe/form/SV_5A...,https://wikimedia.qualtrics.com/jfe/preview/SV...,Community Engagement Insights/MassMessages/Lis...
1,arwiki,"[30, 150)",94,"[(BotScanner, ar.wikipedia.org), (فرح عماد الض...",https://wikimedia.qualtrics.com/jfe/form/SV_5A...,https://wikimedia.qualtrics.com/jfe/preview/SV...,Community Engagement Insights/MassMessages/Lis...
2,arwiki,"[150, 600)",96,"[(مروان عزب, ar.wikipedia.org), (Mjs 78, ar.wi...",https://wikimedia.qualtrics.com/jfe/form/SV_5A...,https://wikimedia.qualtrics.com/jfe/preview/SV...,Community Engagement Insights/MassMessages/Lis...
3,arwiki,"[600, 1200)",56,"[(Mr.mush3, ar.wikipedia.org), (جهاد كديس, ar....",https://wikimedia.qualtrics.com/jfe/form/SV_5A...,https://wikimedia.qualtrics.com/jfe/preview/SV...,Community Engagement Insights/MassMessages/Lis...
4,arwiki,"[1200, 3500)",68,"[(IamIRAQI, ar.wikipedia.org), (أميدامارو, ar....",https://wikimedia.qualtrics.com/jfe/form/SV_5A...,https://wikimedia.qualtrics.com/jfe/preview/SV...,Community Engagement Insights/MassMessages/Lis...


In [34]:
for row in strata.sample(n = 4).itertuples(index = False):
    print(row.page_title)
    print(list_text(row.sampled_users))

Community Engagement Insights/MassMessages/Lists/2018/de5
* {{target | user = Distelfinck | site = de.wikipedia.org}}
* {{target | user = WhoisWhoME | site = de.wikipedia.org}}
* {{target | user = Enyavar | site = de.wikipedia.org}}
* {{target | user = ⵆ | site = de.wikipedia.org}}
* {{target | user = MSchnitzler2000 | site = de.wikipedia.org}}
* {{target | user = Roehrensee | site = de.wikipedia.org}}
* {{target | user = Wibramuc | site = de.wikipedia.org}}
* {{target | user = Nobody perfect | site = de.wikipedia.org}}
* {{target | user = Acky69 | site = de.wikipedia.org}}
* {{target | user = Johnny T | site = de.wikipedia.org}}
* {{target | user = Dr Lol | site = de.wikipedia.org}}
* {{target | user = Schroths | site = de.wikipedia.org}}
* {{target | user = BMK | site = de.wikipedia.org}}
* {{target | user = Vexillum | site = de.wikipedia.org}}
* {{target | user = Göte | site = de.wikipedia.org}}
* {{target | user = HubiB | site = de.wikipedia.org}}
* {{target | user = Pyaet | site =

## Upload the test list pages

In [35]:
test_strata["page_title"] = test_strata.apply(test_page_title, axis = 1)
test_strata.head()

Unnamed: 0,proj_group,edit_bin,sampled_users,survey_url,preview_url,page_title
0,arwiki,"[10, 30)","[[Neil P. Quinn-WMF, ar.wikipedia.org], [EGalv...",https://wikimedia.qualtrics.com/jfe/form/SV_5A...,https://wikimedia.qualtrics.com/jfe/preview/SV...,User:Neil P. Quinn/2018 CE Insights/ar1
1,asia_wps,"[30, 150)","[[Neil P. Quinn-WMF, ko.wikipedia.org], [EGalv...",https://wikimedia.qualtrics.com/jfe/form/SV_5A...,https://wikimedia.qualtrics.com/jfe/preview/SV...,User:Neil P. Quinn/2018 CE Insights/as2
2,cee_wps,"[150, 600)","[[Neil P. Quinn-WMF, uk.wikipedia.org], [EGalv...",https://wikimedia.qualtrics.com/jfe/form/SV_5A...,https://wikimedia.qualtrics.com/jfe/preview/SV...,User:Neil P. Quinn/2018 CE Insights/ce3
3,commons,"[600, 1200)","[[Neil P. Quinn-WMF, commons.wikimedia.org], [...",https://wikimedia.qualtrics.com/jfe/form/SV_5A...,https://wikimedia.qualtrics.com/jfe/preview/SV...,User:Neil P. Quinn/2018 CE Insights/co4
4,dewiki,"[1200, 3500)","[[Neil P. Quinn-WMF, de.wikipedia.org], [EGalv...",https://wikimedia.qualtrics.com/jfe/form/SV_5A...,https://wikimedia.qualtrics.com/jfe/preview/SV...,User:Neil P. Quinn/2018 CE Insights/de5


In [36]:
for row in test_strata.itertuples(index = False):
    users = row.sampled_users
    
    # Only upload a list if the group has some sampled users
    if len(users) > 0:
        title = row.page_title
        text = list_text(users)
        comment = "Upload a test invitation list for the 2018 [[Community Engagement Insights]] survey"
        
        save_page(title, text, comment)

Sleeping for 9.9 seconds, 2018-03-28 17:40:42
Page [[User:Neil P. Quinn/2018 CE Insights/ar1]] saved
Sleeping for 9.7 seconds, 2018-03-28 17:40:52
Page [[User:Neil P. Quinn/2018 CE Insights/as2]] saved
Sleeping for 9.7 seconds, 2018-03-28 17:41:02
Page [[User:Neil P. Quinn/2018 CE Insights/ce3]] saved
Sleeping for 9.7 seconds, 2018-03-28 17:41:12
Page [[User:Neil P. Quinn/2018 CE Insights/co4]] saved
Sleeping for 9.7 seconds, 2018-03-28 17:41:22
Page [[User:Neil P. Quinn/2018 CE Insights/de5]] saved
Sleeping for 9.7 seconds, 2018-03-28 17:41:32
Page [[User:Neil P. Quinn/2018 CE Insights/en6]] saved
Sleeping for 9.7 seconds, 2018-03-28 17:41:42
Page [[User:Neil P. Quinn/2018 CE Insights/es1]] saved
Sleeping for 9.7 seconds, 2018-03-28 17:41:52
Page [[User:Neil P. Quinn/2018 CE Insights/fr2]] saved
Sleeping for 9.7 seconds, 2018-03-28 17:42:02
Page [[User:Neil P. Quinn/2018 CE Insights/it3]] saved
Sleeping for 9.7 seconds, 2018-03-28 17:42:12
Page [[User:Neil P. Quinn/2018 CE Insights/ja

## Upload the list pages

<div class="alert alert-block alert-warning">
    <p>Running the cell below will create a lot of pages on Meta (or overwrite them if they've already been created).</p>
    
    <p>Run it only if you want to do that (but, if you run it accidentally, no major harm done).</p>
</div>

In [None]:
for row in strata.itertuples(index = False):
    users = row.sampled_users
    
    # Only upload a list if the group has some sampled users
    if len(users) > 0:
        title = row.page_title
        text = list_text(users)
        comment = "Upload an invitation list for the 2018 [[Community Engagement Insights]] survey"
        
        # Uncomment this to really create all the pages
        # save_page(title, text, comment)

# Save our samples

In [37]:
# Have to serialize our arrays properly so we can reuse them in another file
strata["sampled_users"] = strata["sampled_users"].map(json.dumps)
strata.to_csv("data/interim/strata.tsv", sep = "\t", index = False)

In [38]:
test_strata["sampled_users"] = test_strata["sampled_users"].map(json.dumps)
test_strata.to_csv("data/interim/test-strata.tsv", sep = "\t", index = False)