In [None]:
import pywikibot as pwb

# Frame preparation
(Should be moved to `sampling-frame.ipynb`)

In [1]:
frame = pd.read_csv("sampling-frame.tsv", sep = "\t")
frame.head()

Unnamed: 0,user,home_wiki,global_edits,enwiki_edits,active_months
0,! Bikkit !,dewiki,559.2,1.0,3
1,!NewLondon31,jawiki,46.0,1.0,2
2,!Silent,ptwiki,22349.8,2.0,3
3,"""Colorado Campeão""!",ptwiki,422.0,4.0,3
4,"""quasi"" tuttologo",itwiki,115.0,1.0,3


In [2]:
# Import without human-readable name column
proj_groups = pd.read_table("project-groups.tsv").drop("project_name", axis = 1)

proj_groups.head()

Unnamed: 0,project_key,project_group
0,aawiki,ssa_wps
1,abwiki,mena_wps
2,acewiki,asia_wps
3,adywiki,cee_wps
4,afwiki,ssa_wps


In [3]:
frame = frame.merge(proj_groups, how = "left", left_on = "home_wiki", right_on = "project_key" )
frame = frame.drop("project_key", axis = 1)
frame.head()

Unnamed: 0,user,home_wiki,global_edits,enwiki_edits,active_months,project_group
0,! Bikkit !,dewiki,559.2,1.0,3,dewiki
1,!NewLondon31,jawiki,46.0,1.0,2,jawiki
2,!Silent,ptwiki,22349.8,2.0,3,ptwiki
3,"""Colorado Campeão""!",ptwiki,422.0,4.0,3,ptwiki
4,"""quasi"" tuttologo",itwiki,115.0,1.0,3,itwiki


In [6]:
# Any project not in project-groups.tsv is in the "other" group
frame["project_group"] = frame["project_group"].fillna("other")

In [7]:
# Are all our users are in a group?
frame.groupby("project_group")["user"].count().sum() == len(frame["user"])

True

In [8]:
# Do we all 19 groups?
frame["project_group"].nunique() == 19

True

# Frame statistics
(Should be moved to `population-analysis.ipynb`)

In [9]:
frame["project_group"].value_counts()

enwiki      18126
dewiki       3815
cee_wps      3493
jawiki       3198
frwiki       3097
commons      2641
eswiki       2456
ruwiki       2340
other        1959
zhwiki       1879
asia_wps     1851
weur_wps     1784
itwiki       1730
mena_wps     1621
ptwiki        964
nlwiki        734
arwiki        415
wikidata      336
ssa_wps        57
Name: project_group, dtype: int64

In [10]:
# Round the highest edit count up to the nearest hundred thousand,
# in order to make a pretty edge for the top bin
top_edge = int(np.ceil(frame["global_edits"].max() / 100000) * 100000)

edit_bins = [10, 30, 100, 600, 6000, 12000, top_edge]

frame["binned_edits"] = pd.cut(frame["global_edits"], edit_bins, right=False)

pd.DataFrame(frame["binned_edits"].value_counts(sort = False))

Unnamed: 0,binned_edits
"[10, 30)",2867
"[30, 100)",10068
"[100, 600)",18953
"[600, 6000)",16182
"[6000, 12000)",2328
"[12000, 1200000)",2098


In [11]:
frame.groupby(["project_group", "binned_edits"]).size().unstack()

binned_edits,"[10, 30)","[30, 100)","[100, 600)","[600, 6000)","[6000, 12000)","[12000, 1200000)"
project_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
arwiki,35,59,126,152,22,21
asia_wps,90,322,635,647,90,67
cee_wps,184,574,1134,1252,209,140
commons,62,205,590,1036,261,487
dewiki,149,638,1398,1350,171,109
enwiki,1088,4095,7038,4833,597,475
eswiki,150,484,939,747,86,50
frwiki,177,609,1059,1001,147,104
itwiki,105,331,652,514,68,60
jawiki,261,824,1370,697,33,13


# Setting sample sizes

In [38]:
targets = pd.read_table("project-group-sample-targets.tsv").set_index("project_group")
targets.head()

Unnamed: 0_level_0,sample_target
project_group,Unnamed: 1_level_1
enwiki,900
arwiki,675
zhwiki,450
frwiki,450
eswiki,1125


In [58]:
strata_sizes = frame.groupby(["project_group", "binned_edits"]).size().unstack()

In [59]:
group_sizes = pd.DataFrame(frame.groupby(["project_group"]).count()["user"])

In [60]:
strata_group_pcts = np.divide(strata_sizes, group_sizes)

In [61]:
strata_targets = np.ceil(
    # Make sure we don't target fewer than 20 users per bin for privacy reasons
    np.maximum(
        np.multiply(strata_group_pcts, targets),
        20
    )
).apply(np.int64)

In [62]:
strata_samples = np.minimum(strata_targets, strata_sizes)

In [63]:
strata_samples

binned_edits,"[10, 30)","[30, 100)","[100, 600)","[600, 6000)","[6000, 12000)","[12000, 1200000)"
project_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
arwiki,35,59,126,152,22,21
asia_wps,33,118,232,236,33,25
cee_wps,24,74,147,162,27,20
commons,20,35,101,177,45,83
dewiki,44,189,413,399,51,33
enwiki,28,102,175,120,20,20
eswiki,28,89,173,137,20,20
frwiki,65,222,385,364,54,38
itwiki,28,87,170,134,20,20
jawiki,37,116,193,99,20,13


# Code below not yet updated for 2018

In [66]:
frame.query("project_group == 'arwiki' & binned_edits == '[10, 30)'")

Unnamed: 0,user,home_wiki,global_edits,enwiki_edits,active_months,project_group,binned_edits
164,1993 Ibrahim Al-Nofal,arwiki,17.0,0.0,2,arwiki,"[10, 30)"
858,Abdelàli Laarich,arwiki,16.3,0.0,2,arwiki,"[10, 30)"
861,Abdoakta,arwiki,28.0,0.0,2,arwiki,"[10, 30)"
926,Abood wiki,arwiki,18.0,0.0,2,arwiki,"[10, 30)"
935,Aboutthereader,arwiki,20.0,0.0,2,arwiki,"[10, 30)"
951,Abu Hashem,arwiki,28.0,6.0,2,arwiki,"[10, 30)"
1090,Adam faraj2,arwiki,19.0,0.0,3,arwiki,"[10, 30)"
2168,AliAziz,arwiki,17.0,0.0,2,arwiki,"[10, 30)"
3543,Arab world,arwiki,21.0,0.0,2,arwiki,"[10, 30)"
4102,Asim7m,arwiki,13.0,0.0,2,arwiki,"[10, 30)"


In [None]:
def sample_stratum(stratum_query, sample_size):
    """
    returns a tuple consisting of (1) the population count for that subgroup and
    (2) a list of the sampled users in MassMessage format 
    """ 
    stratum = frame.query(stratum_query)
    stratum_size = len(stratum)
    
    if sample_size == 0:
        sample = None
    elif sample_size > stratum_size:
        sample = stratum[["user_name", "home_wiki"]]
    else:
        sample = stratum.sample(n = sample_size)[["user_name", "home_wiki"]]
        
    if sample is None:
        sample_list = None
    else:
        sample_list = [
            i for i in sample.itertuples(index = False, name = "user")
        ]

    return (stratum_size, sample_list)

In [None]:
results = [[], []]

for row in strata.itertuples(index = False):
    size, sample = sample_stratum(row.query, row.sample)
    results[0].append(size)
    results[1].append(sample)
    
strata["population"] = results[0]
strata["sampled_users"] = results[1]

In [None]:
strata.tail(n = 20)

In [None]:
domains = pd.read_table("site_domains.tsv", index_col = 0)

def lookup_domain(site_key):
    return domains.loc[site_key, "site_domain"]

domains.head()

In [None]:
for row in strata.itertuples(index = False):
    if row.sampled_users:
        text = ""
        for user in row.sampled_users:
            text += "* {{{{target | user = {} | site = {}}}}}\n".format(user[0], lookup_domain(user[1]))
        site = pwb.Site()
        page = pwb.Page(site, "Community Engagement Insights/MassMessages/Lists/2016/" + row.code)
        page.text = text
        page.save(u"Upload a target list for the 2016 Performance Survey", minor = False)