In [1]:
import numpy as np
import pandas as pd
import pywikibot as pwb

In [3]:
# Load the tab-separated sampling frame and preview its first rows
frame = pd.read_table("sampling-frame.tsv")
frame.head()

Unnamed: 0,user,home_wiki,global_edits,enwiki_edits,active_months
0,! Bikkit !,dewiki,559.2,1.0,3
1,!NewLondon31,jawiki,46.0,1.0,2
2,!Silent,ptwiki,22349.8,2.0,3
3,"""Colorado Campeão""!",ptwiki,422.0,4.0,3
4,"""quasi"" tuttologo",itwiki,115.0,1.0,3


In [4]:
# Load the tab-separated stratum definitions and preview the first rows
strata_path = "strata_definitions.tsv"
strata = pd.read_table(strata_path)
strata.head()

Unnamed: 0,code,audience,home_wiki,also_active_enwiki,min_sample,sample,population
0,01-AEAR,AE,arwiki,0,150,225,
1,02-AEZH,AE,zhwiki,0,150,225,
2,03-AEFR,AE,frwiki,0,150,225,
3,04-AEES,AE,eswiki,0,375,563,
4,05-AERU,AE,ruwiki,0,150,225,


In [6]:
# Load the project-to-group table, then discard its human-readable name column
proj_groups = pd.read_table("project-groups.tsv")
proj_groups = proj_groups.drop("project_name", axis = "columns")

proj_groups.head()

Unnamed: 0,project_key,group
0,aawiki,ssa_wps
1,abwiki,mena_wps
2,acewiki,asia_wps
3,adywiki,cee_wps
4,afwiki,ssa_wps


In [7]:
# Attach each user's project group by matching home_wiki against project_key.
# validate = "many_to_one" makes pandas raise a MergeError if a project_key
# occurs more than once in proj_groups; without it, a duplicated key would
# silently duplicate users in the sampling frame.
frame = frame.merge(
    proj_groups,
    how = "left",
    left_on = "home_wiki",
    right_on = "project_key",
    validate = "many_to_one",
)
# project_key was only needed as the join key
frame = frame.drop("project_key", axis = 1)
frame.head()

Unnamed: 0,user,home_wiki,global_edits,enwiki_edits,active_months,group
0,! Bikkit !,dewiki,559.2,1.0,3,dewiki
1,!NewLondon31,jawiki,46.0,1.0,2,jawiki
2,!Silent,ptwiki,22349.8,2.0,3,ptwiki
3,"""Colorado Campeão""!",ptwiki,422.0,4.0,3,ptwiki
4,"""quasi"" tuttologo",itwiki,115.0,1.0,3,itwiki


In [16]:
# Users whose home wiki has no row in project-groups.tsv came out of the merge
# without a group; they all go into the "other" group
frame.loc[frame["group"].isnull(), "group"] = "other"

In [17]:
# Number of (non-null) users in each project group
users_per_group = frame.groupby("group")["user"].count()
users_per_group

group
arwiki        415
asia_wps     1851
cee_wps      3493
commons      2641
dewiki       3815
enwiki      18126
eswiki       2456
frwiki       3097
itwiki       1730
jawiki       3198
mena_wps     1621
nlwiki        734
other        1959
ptwiki        964
ruwiki       2340
ssa_wps        57
weur_wps     1784
wikidata      336
zhwiki       1879
Name: user, dtype: int64

In [18]:
# Are all our users in a group?
# (the per-group user counts must add up to the number of rows in the frame)
group_sizes = frame.groupby("group")["user"].count()
group_sizes.sum() == len(frame["user"])

True

In [19]:
# Are there 19 groups like we want?
expected_groups = 19
frame["group"].nunique() == expected_groups

True

In [20]:
# Top bin edge: the next multiple of one hundred thousand that is strictly
# above the highest edit count, in order to make a pretty edge for the top bin.
# The bins are closed on the left and open on the right (right=False), so an
# edge equal to the maximum would leave that user outside every bin. Plain
# rounding up (ceil) produces exactly that edge whenever the maximum is itself
# a multiple of 100,000, hence floor + 1.
top_edge = (int(np.floor(frame["global_edits"].max() / 100000)) + 1) * 100000

edit_bins = [0, 10, 30, 100, 600, 1000, 10000, top_edge]

frame["binned_edits"] = pd.cut(frame["global_edits"], edit_bins, right=False)

frame["binned_edits"].value_counts()

[100, 600)          18953
[1000, 10000)       13511
[30, 100)           10068
[600, 1000)          4529
[10, 30)             2867
[10000, 1200000)     2568
[0, 10)                 0
Name: binned_edits, dtype: int64

# Code below not yet updated for 2018

In [None]:
def build_stratum_query(stratum):
    """
    Build the DataFrame.query() expression that selects one stratum from `frame`.

    `stratum` is one row of `strata` (as yielded by itertuples); its `audience`,
    `home_wiki` and `also_active_enwiki` fields are read.
    """
    # Audience "AE" is selected by fewer than 600 global edits, every other
    # audience by 600 or more
    if stratum.audience == "AE":
        clauses = ["global_edits < 600"]
    else:
        clauses = ["global_edits >= 600"]

    # The previous version looked the wikis up in `wiki_groups`, which is not
    # defined anywhere in this notebook. `frame` now carries a `group` column
    # from the project-groups merge instead.
    # NOTE(review): assumes strata.home_wiki uses the same labels as
    # frame["group"] (e.g. "arwiki", "asia_wps") -- TODO confirm
    clauses.append("group == {!r}".format(stratum.home_wiki))

    # `frame` has no `enwiki` column; the English Wikipedia edit count is
    # called `enwiki_edits`
    if stratum.home_wiki != "enwiki":
        if stratum.also_active_enwiki:
            clauses.append("enwiki_edits >= 30")
        else:
            clauses.append("(enwiki_edits < 30 | enwiki_edits.isnull())")

    return " & ".join(clauses)


strata["query"] = [
    build_stratum_query(stratum) for stratum in strata.itertuples(index = False)
]

# Print rather than display, so the query strings are shown in full
for query in strata["query"][:5]:
    print(query)

In [None]:
def sample_stratum(stratum_query, sample_size, population=None, random_state=None):
    """
    Count the users in one stratum and draw a random sample of them.

    Parameters
    ----------
    stratum_query : str
        Expression passed to DataFrame.query() to select the stratum.
    sample_size : int
        Number of users to draw. 0 means no sample is drawn.
    population : DataFrame, optional
        Frame to select from; it needs "user" and "home_wiki" columns.
        Defaults to the notebook-level `frame`.
    random_state : optional
        Forwarded to DataFrame.sample(); pass a fixed value to make the
        draw reproducible.

    Returns
    -------
    tuple
        (1) the population count for that stratum and
        (2) the sampled users as a list of (user, home_wiki) named tuples,
        or None when sample_size is 0. When sample_size exceeds the
        population count, every user in the stratum is returned.
    """
    if population is None:
        population = frame

    stratum = population.query(stratum_query)
    stratum_size = len(stratum)

    if sample_size == 0:
        return (stratum_size, None)

    # The sampling frame names its user column "user" (not "user_name")
    members = stratum[["user", "home_wiki"]]

    # Only draw when the stratum is large enough; otherwise take everyone
    if sample_size <= stratum_size:
        members = members.sample(n = sample_size, random_state = random_state)

    sample_list = list(members.itertuples(index = False, name = "user"))

    return (stratum_size, sample_list)

In [None]:
# Sample every stratum once, keeping the (population count, sampled users)
# pair that sample_stratum returns for each row
outcomes = [
    sample_stratum(row.query, row.sample)
    for row in strata.itertuples(index = False)
]

strata["population"] = [size for size, _ in outcomes]
strata["sampled_users"] = [users for _, users in outcomes]

In [None]:
# Show the last 20 strata
strata.iloc[-20:]

In [None]:
# Site table, indexed by the first column of the file
domains = pd.read_table("site_domains.tsv", index_col = 0)


def lookup_domain(site_key):
    """Return the "site_domain" value of the `domains` row labelled `site_key`."""
    domain = domains.loc[site_key, "site_domain"]
    return domain


domains.head()

In [None]:
# Save one wiki page per stratum, listing that stratum's sampled users as
# {{target}} entries.
# NOTE(review): the page title and edit summary still say 2016, while the
# notebook marks this code as not yet updated for 2018 -- confirm the target
# path before running, since saving writes to the wiki.

# The site object does not depend on the stratum, so create it once
site = pwb.Site()

for row in strata.itertuples(index = False):
    # Strata without a sample (None or an empty list) get no page
    if not row.sampled_users:
        continue

    # user[0] is the user name, user[1] the home wiki key
    text = "".join(
        "* {{{{target | user = {} | site = {}}}}}\n".format(user[0], lookup_domain(user[1]))
        for user in row.sampled_users
    )

    page = pwb.Page(site, "Community Engagement Insights/MassMessages/Lists/2016/" + row.code)
    page.text = text
    page.save(u"Upload a target list for the 2016 Performance Survey", minor = False)