In [1]:
import pandas as pd
# Load the sampling frame (tab-separated) and preview its first rows.
frame_data = pd.read_csv("sampling_frame.tsv", sep="\t")
frame_data.head(5)

Unnamed: 0,user_name,home_wiki,global_edits,wiki,edits
0,! Bikkit !,dewiki,1658.6,dewiki,1435.0
1,! Bikkit !,dewiki,1658.6,enwiki,190.0
2,! Bikkit !,dewiki,1658.6,commonswiki,32.0
3,! Bikkit !,dewiki,1658.6,frwiki,1.0
4,! Bikkit !,dewiki,1658.6,wikidatawiki,0.6


In [2]:
# Reshape the long (user, wiki, edits) records into one row per user and one
# column per wiki. Users with no recorded edits on a wiki get NaN there.
# pivot() takes keyword arguments only in pandas 2.x; naming `values` also
# yields single-level columns directly, so no droplevel() is needed.
per_wiki = frame_data.pivot(index="user_name", columns="wiki", values="edits")
per_wiki.head()

wiki,abwiki,acewiki,adywiki,afwiki,afwikibooks,afwikiquote,afwiktionary,akwiki,alswiki,amwiki,...,zh_yuewiki,zhwiki,zhwikibooks,zhwikinews,zhwikiquote,zhwikisource,zhwikivoyage,zhwiktionary,zuwiki,zuwiktionary
user_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
! Bikkit !,,,,,,,,,,,...,,,,,,,,,,
!KrzysiekBu!,,,,,,,,,,,...,,,,,,,,,,
!Manihiki,,,,,,,,,,,...,,,,,,,,,,
!Silent,,,,,,,,,,,...,,,,,,,,,,
!minmi73!,,,,,,,,,,,...,,,,,,,,,,


In [3]:
# Collapse the per-wiki rows into one row per user, keeping the first
# non-null home_wiki and global_edits value seen for each user_name.
global_data = (
    frame_data[["user_name", "home_wiki", "global_edits"]]
    .groupby("user_name")
    .first()
    .reset_index()
)
global_data.head()

Unnamed: 0,user_name,home_wiki,global_edits
0,! Bikkit !,dewiki,1658.6
1,!KrzysiekBu!,plwiki,2463.6
2,!Manihiki,itwikibooks,648.0
3,!Silent,ptwiki,26321.5
4,!minmi73!,jawiki,64.0


In [4]:
# Attach each user's per-wiki edit counts (indexed by user_name) to their
# global record; every row of global_data is kept.
frame = global_data.join(other=per_wiki, on="user_name", how="left")
frame.head()

Unnamed: 0,user_name,home_wiki,global_edits,abwiki,acewiki,adywiki,afwiki,afwikibooks,afwikiquote,afwiktionary,...,zh_yuewiki,zhwiki,zhwikibooks,zhwikinews,zhwikiquote,zhwikisource,zhwikivoyage,zhwiktionary,zuwiki,zuwiktionary
0,! Bikkit !,dewiki,1658.6,,,,,,,,...,,,,,,,,,,
1,!KrzysiekBu!,plwiki,2463.6,,,,,,,,...,,,,,,,,,,
2,!Manihiki,itwikibooks,648.0,,,,,,,,...,,,,,,,,,,
3,!Silent,ptwiki,26321.5,,,,,,,,...,,,,,,,,,,
4,!minmi73!,jawiki,64.0,,,,,,,,...,,,,,,,,,,


In [5]:
# Load the strata definitions (tab-separated) and preview the first rows.
defs = pd.read_csv("strata_definitions.tsv", sep="\t")
defs.head(5)

Unnamed: 0,code,audience,home_wiki,also_active_enwiki,min_sample,sample,population
0,01-AEAR,AE,arwiki,0,150,225,
1,02-AEZH,AE,zhwiki,0,150,225,
2,03-AEFR,AE,frwiki,0,150,225,
3,04-AEES,AE,eswiki,0,375,563,
4,05-AERU,AE,ruwiki,0,150,225,


In [6]:
# Tidy data: trim stray whitespace around the home_wiki labels and shorten the
# "other projects" label to "other".
# The vectorised .str.strip() is used instead of apply(str.strip) so that a
# missing home_wiki value stays NaN rather than raising a TypeError.
defs["home_wiki"] = (
    defs["home_wiki"]
    .str.strip()
    .replace("other projects", "other")
)

In [7]:
# Load the Wikipedia-to-group mapping (tab-separated).
wps_by_group = pd.read_csv("wikipedias_by_group.tsv", sep="\t")

# Remove when fixed in data: ruwiki is forced into a group of its own.
is_ruwiki = wps_by_group["site_key"] == "ruwiki"
wps_by_group.loc[is_ruwiki, "group"] = "ruwiki"
wps_by_group.head()

Unnamed: 0,language,site_key,group
0,Afar,aawiki,ssa
1,Abkhazian,abwiki,mena
2,Acehnese,acewiki,asia
3,Adyghe,adywiki,cee
4,Afrikaans,afwiki,ssa


In [8]:
# Map each group to its component wikis
regions = ["ssa", "mena", "asia", "cee", "weur"]

# Every home_wiki label used by a stratum starts out unresolved (None).
wiki_groups = dict.fromkeys(set(defs["home_wiki"]))

# Regional groups: all site keys that wps_by_group assigns to that region.
for region in regions:
    region_members = wps_by_group.loc[wps_by_group["group"] == region, "site_key"]
    wiki_groups[region] = set(region_members)

all_projects = set(frame_data["wiki"])

# "other": every project that is neither a group label itself nor a member
# of one of the regional groups.
regional_wikis = set().union(*(wiki_groups[region] for region in regions))
wiki_groups["other"] = all_projects - wiki_groups.keys() - regional_wikis

# Any label still unresolved names a single wiki, so it maps to itself.
unresolved = [label for label, members in wiki_groups.items() if members is None]
for label in unresolved:
    wiki_groups[label] = {label}

In [9]:
# Build one DataFrame.query expression per stratum definition.
queries = []

for stratum in defs.itertuples(index = False):
    # Audience "AE" selects users below 600 global edits; every other
    # audience selects users at or above that threshold.
    if stratum.audience == "AE":
        edit_clause = "global_edits < 600"
    else:
        edit_clause = "global_edits >= 600"

    # Home wiki must be one of the wikis that make up the stratum's group.
    home_clause = "home_wiki.isin({})".format(list(wiki_groups[stratum.home_wiki]))
    clauses = [edit_clause, home_clause]

    # For non-enwiki strata, split on enwiki activity (30+ enwiki edits).
    if stratum.home_wiki != "enwiki":
        if stratum.also_active_enwiki:
            clauses.append("enwiki >= 30")
        else:
            clauses.append("(enwiki < 30 | enwiki.isnull())")

    queries.append(" & ".join(clauses))

defs["query"] = queries

defs.tail()

Unnamed: 0,code,audience,home_wiki,also_active_enwiki,min_sample,sample,population,query
69,72-VAEAIe,VAE,asia,1,0,0,,global_edits >= 600 & home_wiki.isin(['chwiki'...
70,73-VAEWEe,VAE,weur,1,0,0,,global_edits >= 600 & home_wiki.isin(['iswiki'...
71,74-VAECMe,VAE,commonswiki,1,0,0,,global_edits >= 600 & home_wiki.isin(['commons...
72,75-VAEWDe,VAE,wikidatawiki,1,0,0,,global_edits >= 600 & home_wiki.isin(['wikidat...
73,76-VAEOPe,VAE,other,1,0,0,,global_edits >= 600 & home_wiki.isin(['foundat...


In [10]:
def sample_stratum(stratum_query, sample_size, population = None, random_state = None):
    """
    Draw a simple random sample of users from one stratum.

    Parameters
    ----------
    stratum_query : str
        Expression passed to ``DataFrame.query`` to select the stratum's rows.
    sample_size : int
        Requested number of users; capped at the size of the stratum.
    population : pandas.DataFrame, optional
        Frame to sample from; it must have ``user_name`` and ``home_wiki``
        columns. Defaults to the notebook-level ``frame``.
    random_state : optional
        Forwarded to ``DataFrame.sample``; pass a fixed seed to make the draw
        reproducible. The default (None) keeps the unseeded behaviour.

    Returns
    -------
    tuple
        (1) the population count for that subgroup and (2) a list of the
        sampled users in MassMessage format ("user_name@home_wiki").
        When nothing is sampled (empty stratum or a requested size of 0) the
        second element is ``False`` rather than an empty list; this is kept
        for backward compatibility with existing callers.
    """
    if population is None:
        # Fall back to the sampling frame built earlier in the notebook.
        population = frame

    stratum = population.query(stratum_query)
    stratum_size = len(stratum)

    # Never ask for more users than the stratum contains.
    sample_size = min(sample_size, stratum_size)

    if sample_size == 0:
        return (stratum_size, False)

    sample = stratum.sample(n = sample_size, random_state = random_state)
    sample_list = [
        "{}@{}".format(user_name, home_wiki)
        for user_name, home_wiki in zip(sample["user_name"], sample["home_wiki"])
    ]

    return (stratum_size, sample_list)

In [11]:
# Sample every stratum in the row order of `defs`.
# results[0] collects the stratum population sizes, results[1] the sampled
# user lists.
stratum_sizes = []
stratum_samples = []

for stratum in defs.itertuples(index = False):
    stratum_size, stratum_sample = sample_stratum(stratum.query, stratum.sample)
    stratum_sizes.append(stratum_size)
    stratum_samples.append(stratum_sample)

results = [stratum_sizes, stratum_samples]

# Writing the sizes back to the definitions is left disabled, as before:
# defs.population = results[0]

for stratum_size in results[0]:
    print(stratum_size)

1077
2870
5539
5023
4268
6847
1995
1223
3146
5635
43806
3770
68
5652
4226
2485
5326
150
3348
31
87
185
185
100
218
66
46
72
47
120
4
198
198
129
326
20
149
147
428
1112
769
911
1537
278
291
551
751
6837
625
23
1375
589
692
1063
128
1033
76
189
269
247
243
365
103
96
142
70
221
10
422
323
266
845
99
299


In [None]:
# Print the population column of the strata definitions, one value per line.
for population_count in defs["population"]:
    print(population_count)

In [None]:
import pywikibot as pwb

In [None]:
# Fetch the text of "Main Page" from the site pywikibot resolves for
# ("test", "wikipedia") — presumably test.wikipedia.org; TODO confirm.
# NOTE(review): this presumably needs network access and a working pywikibot
# configuration — confirm before including it in a Run All.
site = pwb.Site("test", "wikipedia")
page = pwb.Page(site, u"Main Page")
page.text

In [None]:
# Count users with under 600 global edits whose home wiki is Wikidata and
# who have at least 30 edits on enwiki.
a = frame.query(
    "global_edits < 600 & home_wiki.isin(['wikidatawiki']) & enwiki >= 30"
)
a.shape[0]

In [None]:
# Print the full query string generated for stratum code 49-VAEEN.
matching_queries = defs.loc[defs["code"] == "49-VAEEN", "query"]
for stratum_query in matching_queries:
    print(stratum_query)

In [None]:
# Users with at least 30 edits on enwiki (rows with no enwiki count are excluded).
frame[frame["enwiki"] >= 30]

In [None]:
# Index labels of the rows in frame_test with under 600 global edits and enwiki
# as home wiki.
# NOTE(review): `frame_test` is only created in a later cell of this notebook,
# so on a fresh kernel run top-to-bottom this cell raises NameError. Move it
# below that cell (or query `frame`) once the intended target is confirmed.
frame_test.query("global_edits < 600 & home_wiki.isin(['enwiki'])").index.values

In [None]:
import copy
import numpy as np

# Coverage check: start from a copy of the sampling frame and remove each
# stratum's rows in turn. Each printed number is how many of the remaining
# rows that stratum's query matched; whatever is left at the end was not
# matched by any stratum query.
frame_test = frame.copy()
# np.nan is used because the np.NaN alias no longer exists in NumPy 2.0.
frame_test["group"] = np.nan

for row in defs.itertuples(index = False):
    group_rows = frame_test.query(row.query).index.values
    print(len(group_rows))
    frame_test = frame_test.drop(index = group_rows)

frame_test