In [1]:
import pandas as pd
import pywikibot as pwb

In [2]:
# Load the sampling frame. Each row is one (user, wiki) pair and carries the
# user's home wiki and global edit count next to the edit count on that wiki.
frame_data = pd.read_csv("sampling_frame.tsv", sep="\t")
frame_data.head()

Unnamed: 0,user_name,home_wiki,global_edits,wiki,edits
0,! Bikkit !,dewiki,1261.6,dewiki,1170.0
1,! Bikkit !,dewiki,1261.6,enwiki,77.0
2,! Bikkit !,dewiki,1261.6,commonswiki,13.0
3,! Bikkit !,dewiki,1261.6,frwiki,1.0
4,! Bikkit !,dewiki,1261.6,wikidatawiki,0.6


In [3]:
# Reshape to one row per user and one column per wiki, holding that user's
# edit count on that wiki (NaN where the user has no row for the wiki).
#
# The arguments are passed by keyword because pandas 2.0 made the parameters
# of DataFrame.pivot keyword-only; the positional form raises a TypeError
# there. Naming `values` also produces flat, single-level columns directly,
# so no droplevel() step is needed afterwards.
per_wiki = frame_data.pivot(index = "user_name", columns = "wiki", values = "edits")
per_wiki.head()

wiki,abwiki,acewiki,adywiki,afwiki,afwikibooks,afwikiquote,afwiktionary,akwiki,alswiki,amwiki,...,zh_yuewiki,zhwiki,zhwikibooks,zhwikinews,zhwikiquote,zhwikisource,zhwikivoyage,zhwiktionary,zuwiki,zuwiktionary
user_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
! Bikkit !,,,,,,,,,,,...,,,,,,,,,,
!KrzysiekBu!,,,,,,,,,,,...,,,,,,,,,,
!Manihiki,,,,,,,,,,,...,,,,,,,,,,
!Silent,,,,,,,,,,,...,,,,,,,,,,
!minmi73!,,,,,,,,,,,...,,,,,,,,,,


In [4]:
# Collapse the frame to one summary row per user. first() keeps, for each
# column, the first non-null value seen within the user's group of rows.
user_columns = ["user_name", "home_wiki", "global_edits"]
grouped = frame_data[user_columns].groupby("user_name")
all_wikis = grouped.first().reset_index()
all_wikis.head()

Unnamed: 0,user_name,home_wiki,global_edits
0,! Bikkit !,dewiki,1261.6
1,!KrzysiekBu!,plwiki,2383.6
2,!Manihiki,itwikibooks,723.0
3,!Silent,ptwiki,31509.8
4,!minmi73!,jawiki,65.0


In [5]:
# Attach each user's per-wiki edit counts to their summary row. The user_name
# column is matched against per_wiki's index; "left" (the default join type)
# keeps every row of all_wikis.
frame = all_wikis.join(other = per_wiki, on = "user_name", how = "left")
frame.head()

Unnamed: 0,user_name,home_wiki,global_edits,abwiki,acewiki,adywiki,afwiki,afwikibooks,afwikiquote,afwiktionary,...,zh_yuewiki,zhwiki,zhwikibooks,zhwikinews,zhwikiquote,zhwikisource,zhwikivoyage,zhwiktionary,zuwiki,zuwiktionary
0,! Bikkit !,dewiki,1261.6,,,,,,,,...,,,,,,,,,,
1,!KrzysiekBu!,plwiki,2383.6,,,,,,,,...,,,,,,,,,,
2,!Manihiki,itwikibooks,723.0,,,,,,,,...,,,,,,,,,,
3,!Silent,ptwiki,31509.8,,,,,,,,...,,,,,,,,,,
4,!minmi73!,jawiki,65.0,,,,,,,,...,,,,,,,,,,


In [6]:
# Load the stratum definitions: one row per stratum, with its code, audience,
# home wiki (or wiki group), enwiki-activity flag and target sample sizes.
strata = pd.read_csv("strata_definitions.tsv", sep="\t")
strata.head()

Unnamed: 0,code,audience,home_wiki,also_active_enwiki,min_sample,sample,population
0,01-AEAR,AE,arwiki,0,150,225,
1,02-AEZH,AE,zhwiki,0,150,225,
2,03-AEFR,AE,frwiki,0,150,225,
3,04-AEES,AE,eswiki,0,375,563,
4,05-AERU,AE,ruwiki,0,150,225,


In [7]:
# Tidy data
# Trim stray whitespace from the home_wiki labels with the vectorized string
# accessor (unlike apply(str.strip), it passes missing values through instead
# of raising), then shorten the "other projects" label to "other".
strata["home_wiki"] = (
    strata["home_wiki"]
    .str.strip()
    .replace("other projects", "other")
)

In [8]:
# Table assigning every Wikipedia (by site key) to a group
wps_by_group = pd.read_csv("wikipedias_by_group.tsv", sep="\t")

# Fix one error in the data entry: ruwiki must carry its own group label
is_ruwiki = wps_by_group["site_key"] == "ruwiki"
wps_by_group.loc[is_ruwiki, "group"] = "ruwiki"
wps_by_group.head()

Unnamed: 0,language,site_key,group
0,Afar,aawiki,ssa
1,Abkhazian,abwiki,mena
2,Acehnese,acewiki,asia
3,Adyghe,adywiki,cee
4,Afrikaans,afwiki,ssa


In [9]:
# Map each group to its component wikis
regions = ["ssa", "mena", "asia", "cee", "weur"]

# Start with a placeholder for every home_wiki label used by the strata
wiki_groups = dict.fromkeys(set(strata["home_wiki"]))

# A regional label expands to every Wikipedia tagged with that region
for group in regions:
    in_region = wps_by_group["group"] == group
    wiki_groups[group] = set(wps_by_group.loc[in_region, "site_key"])

all_projects = set(frame_data["wiki"])

# "other" is whatever remains once the individually named labels and all of
# the regional wikis have been removed from the full list of projects
wiki_groups["other"] = all_projects.difference(
    wiki_groups.keys(),
    *(wiki_groups[group] for group in regions)
)

# Any label still without members is a single wiki that stands for itself
wiki_groups = {
    label: ({label} if members is None else members)
    for label, members in wiki_groups.items()
}

In [10]:
# Build, for every stratum, the DataFrame.query expression selecting its users
queries = []

for stratum in strata.itertuples(index = False):
    # Audience "AE" takes users below 600 global edits, every other audience
    # takes users at or above it
    if stratum.audience == "AE":
        clauses = ["global_edits < 600"]
    else:
        clauses = ["global_edits >= 600"]

    # Restrict to the wikis that make up the stratum's home-wiki group
    clauses.append("home_wiki.isin({})".format(list(wiki_groups[stratum.home_wiki])))

    # Outside enwiki itself, split on whether the user also has at least
    # 30 edits on enwiki (a missing enwiki count means "not active there")
    if stratum.home_wiki != "enwiki":
        clauses.append(
            "enwiki >= 30" if stratum.also_active_enwiki
            else "(enwiki < 30 | enwiki.isnull())"
        )

    queries.append(" & ".join(clauses))

strata["query"] = queries

# Show the first few queries as a sanity check
for query in strata["query"].head(5):
    print(query)

global_edits < 600 & home_wiki.isin(['arwiki']) & (enwiki < 30 | enwiki.isnull())
global_edits < 600 & home_wiki.isin(['zhwiki']) & (enwiki < 30 | enwiki.isnull())
global_edits < 600 & home_wiki.isin(['frwiki']) & (enwiki < 30 | enwiki.isnull())
global_edits < 600 & home_wiki.isin(['eswiki']) & (enwiki < 30 | enwiki.isnull())
global_edits < 600 & home_wiki.isin(['ruwiki']) & (enwiki < 30 | enwiki.isnull())


In [11]:
def sample_stratum(stratum_query, sample_size, random_state = None, population_frame = None):
    """
    Select the users belonging to one stratum and draw a random sample of them.

    Parameters
    ----------
    stratum_query : str
        Expression handed to DataFrame.query to select the stratum's users.
    sample_size : int
        Number of users to draw. 0 means "do not sample this stratum"; a value
        larger than the stratum returns every user in the stratum.
    random_state : optional
        Forwarded to DataFrame.sample. Pass a fixed seed to make the draw
        reproducible; the default (None) keeps the draw unseeded.
    population_frame : DataFrame, optional
        Frame to sample from. Defaults to the notebook-level `frame`. It must
        have "user_name" and "home_wiki" columns plus whatever columns the
        query refers to.

    Returns
    -------
    tuple
        (1) the population count for the stratum and (2) a list of
        (user_name, home_wiki) named tuples for the sampled users, or None
        when sample_size is 0.
    """
    source = frame if population_frame is None else population_frame
    stratum = source.query(stratum_query)
    stratum_size = len(stratum)

    if sample_size == 0:
        return (stratum_size, None)

    members = stratum[["user_name", "home_wiki"]]

    # Only draw when the stratum can supply the requested number of users;
    # otherwise the whole stratum is the sample.
    if sample_size <= stratum_size:
        members = members.sample(n = sample_size, random_state = random_state)

    sample_list = list(members.itertuples(index = False, name = "user"))

    return (stratum_size, sample_list)

In [12]:
# Sample every stratum, then record its population size and sampled users
outcomes = [
    sample_stratum(row.query, row.sample)
    for row in strata.itertuples(index = False)
]

strata["population"] = [size for size, _ in outcomes]
strata["sampled_users"] = [users for _, users in outcomes]

In [15]:
# Inspect the last strata together with their populations and samples
strata.tail(20)

Unnamed: 0,code,audience,home_wiki,also_active_enwiki,min_sample,sample,population,query,sampled_users
54,56-VAEWD,VAE,wikidatawiki,0,600,900,122,global_edits >= 600 & home_wiki.isin(['wikidat...,"[(1Or, wikidatawiki), (2%ɐ, wikidatawiki), (A ..."
55,57-VAEOP,VAE,other,0,1200,1800,1053,global_edits >= 600 & home_wiki.isin(['bgwikti...,"[(!Manihiki, itwikibooks), (*j*jac, frwikisour..."
56,58-VAEARe,VAE,arwiki,1,100,150,75,global_edits >= 600 & home_wiki.isin(['arwiki'...,"[(967Bytes, arwiki), (Abdelrhman 1990, arwiki)..."
57,59-VAEZHe,VAE,zhwiki,1,100,150,197,global_edits >= 600 & home_wiki.isin(['zhwiki'...,"[(Pedrotangtang, zhwiki), (Okstartnow, zhwiki)..."
58,60-VAEFRe,VAE,frwiki,1,100,150,264,global_edits >= 600 & home_wiki.isin(['frwiki'...,"[(Superbenjamin, frwiki), (Carlassimo, frwiki)..."
59,61-VAEESe,VAE,eswiki,1,100,150,242,global_edits >= 600 & home_wiki.isin(['eswiki'...,"[(Xxsugus, eswiki), (Miguel Palafox, eswiki), ..."
60,62-VAERUe,VAE,ruwiki,1,100,150,242,global_edits >= 600 & home_wiki.isin(['ruwiki'...,"[(Liquorkaru, ruwiki), (Vayvor, ruwiki), (Леон..."
61,63-VAEDEe,VAE,dewiki,1,100,150,358,global_edits >= 600 & home_wiki.isin(['dewiki'...,"[(Wiki-vr.mp, dewiki), (Kgfleischmann, dewiki)..."
62,64-VAEPTe,VAE,ptwiki,1,100,150,113,global_edits >= 600 & home_wiki.isin(['ptwiki'...,"[((Carlos Emanuel), ptwiki), (2016Começa, ptwi..."
63,65-VAENLe,VAE,nlwiki,1,100,150,97,global_edits >= 600 & home_wiki.isin(['nlwiki'...,"[(Agora, nlwiki), (Aiko, nlwiki), (Akadunzio, ..."


In [19]:
# Print every stratum's population count, one per line, in stratum order
for population_count in strata["population"]:
    print(population_count)

1126
2855
5518
5070
4297
6791
1983
1226
3115
5671
44207
3781
66
5675
4171
2488
5040
161
3384
31
82
172
184
95
217
69
51
72
53
123
6
189
186
132
309
19
156
145
435
1113
760
910
1559
280
285
552
746
6825
621
23
1364
591
697
1057
122
1053
75
197
264
242
242
358
113
97
142
72
211
11
426
349
270
851
106
288


The MassMessage target lists identify each user's project by its domain (e.g. `aa.wikipedia.org`) rather than by its site key (e.g. `aawiki`), so get a table mapping site keys to site domains:
```
select 
site_global_key as site_key,
substring(reverse(site_domain), 2) as site_domain
from enwiki.sites;
```

In [15]:
# Site-key -> domain table, indexed by its first column (the site key)
domains = pd.read_csv("site_domains.tsv", sep="\t", index_col = 0)

def lookup_domain(site_key):
    """
    Return the entry of the "site_domain" column for the given site key,
    e.g. "aawiki" -> "aa.wikipedia.org".
    """
    site_domains = domains["site_domain"]
    return site_domains.loc[site_key]

domains.head()

Unnamed: 0_level_0,site_domain
site_key,Unnamed: 1_level_1
aawiki,aa.wikipedia.org
aawiktionary,aa.wiktionary.org
aawikibooks,aa.wikibooks.org
abwiki,ab.wikipedia.org
abwiktionary,ab.wiktionary.org


In [16]:
# Publish one MassMessage target list per sampled stratum as a wiki page.
# NOTE: this cell writes to the wiki; re-running it saves the pages again.

# The site object does not depend on the stratum, so build it once up front
# instead of once per page. Site() with no arguments presumably resolves to
# the site configured for pywikibot - confirm against the local user config.
site = pwb.Site()

# One wikitext list item per user; the doubled braces render as literal
# "{{" and "}}" around the target template call.
target_line = "* {{{{target | user = {} | site = {}}}}}\n"

for row in strata.itertuples(index = False):
    # Strata without a sample (None or an empty list) get no page
    if not row.sampled_users:
        continue

    # user[0] is the user name, user[1] the home wiki's site key
    text = "".join(
        target_line.format(user[0], lookup_domain(user[1]))
        for user in row.sampled_users
    )

    page = pwb.Page(site, "Community Engagement Insights/MassMessages/Lists/2016/" + row.code)
    page.text = text
    page.save(u"Upload a target list for the 2016 Performance Survey", minor = False)

Sleeping for 9.6 seconds, 2017-01-04 15:04:56
Page [[Community Engagement Insights/MassMessages/Lists/2016/01-AEAR]] saved
Sleeping for 8.9 seconds, 2017-01-04 15:05:07
Page [[Community Engagement Insights/MassMessages/Lists/2016/02-AEZH]] saved
Sleeping for 8.7 seconds, 2017-01-04 15:05:17
Page [[Community Engagement Insights/MassMessages/Lists/2016/03-AEFR]] saved
Sleeping for 9.0 seconds, 2017-01-04 15:05:27
Page [[Community Engagement Insights/MassMessages/Lists/2016/04-AEES]] saved
Sleeping for 8.2 seconds, 2017-01-04 15:05:38
Page [[Community Engagement Insights/MassMessages/Lists/2016/05-AERU]] saved
Sleeping for 9.0 seconds, 2017-01-04 15:05:47
Page [[Community Engagement Insights/MassMessages/Lists/2016/06-AEDE]] saved
Sleeping for 8.9 seconds, 2017-01-04 15:05:57
Page [[Community Engagement Insights/MassMessages/Lists/2016/07-AEPT]] saved
Sleeping for 8.7 seconds, 2017-01-04 15:06:07
Page [[Community Engagement Insights/MassMessages/Lists/2016/08-AENL]] saved
Sleeping for 9.2