In [1]:
import numpy as np
import pandas as pd
import pywikibot as pwb

# Frame preparation
(Should be moved to `sampling-frame.ipynb`)

In [3]:
# Load the tab-separated sampling frame and preview its first rows
frame = pd.read_csv("sampling-frame.tsv", delimiter="\t")
frame.head()

Unnamed: 0,user,home_wiki,global_edits,enwiki_edits,active_months
0,! Bikkit !,dewiki,559.2,1.0,3
1,!NewLondon31,jawiki,46.0,1.0,2
2,!Silent,ptwiki,22349.8,2.0,3
3,"""Colorado Campeão""!",ptwiki,422.0,4.0,3
4,"""quasi"" tuttologo",itwiki,115.0,1.0,3


In [6]:
# Load the project -> group lookup table, leaving out the human-readable
# project name column
proj_groups = pd.read_csv("project-groups.tsv", sep="\t").drop(["project_name"], axis="columns")

proj_groups.head()

Unnamed: 0,project_key,group
0,aawiki,ssa_wps
1,abwiki,mena_wps
2,acewiki,asia_wps
3,adywiki,cee_wps
4,afwiki,ssa_wps


In [7]:
# Attach each user's project group by looking up their home wiki.
# - Dropping any existing `group` column first keeps the cell re-runnable;
#   otherwise a second run would produce `group_x` / `group_y`.
# - `validate="many_to_one"` raises if a project key appears more than once in
#   `proj_groups`, which would otherwise silently duplicate users in the frame.
# Users whose home wiki has no entry in `proj_groups` keep a missing group.
frame = (
    frame
    .drop("group", axis=1, errors="ignore")
    .merge(
        proj_groups,
        how="left",
        left_on="home_wiki",
        right_on="project_key",
        validate="many_to_one",
    )
    .drop("project_key", axis=1)
)
frame.head()

Unnamed: 0,user,home_wiki,global_edits,enwiki_edits,active_months,group
0,! Bikkit !,dewiki,559.2,1.0,3,dewiki
1,!NewLondon31,jawiki,46.0,1.0,2,jawiki
2,!Silent,ptwiki,22349.8,2.0,3,ptwiki
3,"""Colorado Campeão""!",ptwiki,422.0,4.0,3,ptwiki
4,"""quasi"" tuttologo",itwiki,115.0,1.0,3,itwiki


In [16]:
# Users whose home wiki is not listed in project-groups.tsv have no group
# after the left merge; put them in the "other" group
frame.loc[frame["group"].isnull(), "group"] = "other"

In [18]:
# Are all our users in a group?
# `count` skips missing values and `groupby` skips missing keys, so the total
# falls short of the frame length if any user name or group is missing.
# Asserting (rather than displaying a boolean) stops a Run All on failure.
users_in_groups = frame.groupby("group")["user"].count().sum()
assert users_in_groups == len(frame), (
    "only {} of {} users are in a group".format(users_in_groups, len(frame))
)

True

In [19]:
# Do we have all 19 groups?
# NOTE(review): 19 is presumably the number of groups in project-groups.tsv
# plus "other" - update it if that file changes.
n_groups = frame["group"].nunique()
assert n_groups == 19, "expected 19 groups, found {}".format(n_groups)

True

# Frame statistics
(Should be moved to `population-analysis.ipynb`)

In [30]:
# Number of users in each group, largest group first
frame["group"].value_counts()

enwiki      18126
dewiki       3815
cee_wps      3493
jawiki       3198
frwiki       3097
commons      2641
eswiki       2456
ruwiki       2340
other        1959
zhwiki       1879
asia_wps     1851
weur_wps     1784
itwiki       1730
mena_wps     1621
ptwiki        964
nlwiki        734
arwiki        415
wikidata      336
ssa_wps        57
Name: group, dtype: int64

In [44]:
# Pick a pretty top edge for the last bin: the smallest multiple of one hundred
# thousand that is strictly greater than the highest edit count.
# It must be strictly greater because the bins are closed on the left only
# (`right=False`). A plain ceil() returns the maximum itself when that is
# already a multiple of 100,000, and the most active user would then fall
# outside every bin and get a missing `binned_edits`.
top_edge = int((np.floor(frame["global_edits"].max() / 100000) + 1) * 100000)

edit_bins = [10, 30, 100, 600, 6000, 12000, top_edge]

# Left-closed bins: [10, 30), [30, 100), ... Values below 10 fall outside all
# bins and are left missing.
frame["binned_edits"] = pd.cut(frame["global_edits"], edit_bins, right=False)

pd.DataFrame(frame["binned_edits"].value_counts(sort = False))

Unnamed: 0,binned_edits
"[10, 30)",2867
"[30, 100)",10068
"[100, 600)",18953
"[600, 6000)",16182
"[6000, 12000)",2328
"[12000, 1200000)",2098


In [41]:
# Population of every stratum: one row per group, one column per edit bin
frame.groupby(["group", "binned_edits"]).size().unstack(level="binned_edits")

binned_edits,"[10, 30)","[30, 100)","[100, 600)","[600, 6000)","[6000, 12000)","[12000, 1200000)"
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
arwiki,35,59,126,152,22,21
asia_wps,90,322,635,647,90,67
cee_wps,184,574,1134,1252,209,140
commons,62,205,590,1036,261,487
dewiki,149,638,1398,1350,171,109
enwiki,1088,4095,7038,4833,597,475
eswiki,150,484,939,747,86,50
frwiki,177,609,1059,1001,147,104
itwiki,105,331,652,514,68,60
jawiki,261,824,1370,697,33,13


# Code below not yet updated for 2018

In [88]:
# Preview only: show the first rows instead of dumping the whole frame
# (and every username in it) into the notebook output
frame.head(10)

Unnamed: 0,user,home_wiki,global_edits,enwiki_edits,active_months,group,binned_edits
0,! Bikkit !,dewiki,559.2,1.0,3,dewiki,"[100, 600)"
1,!NewLondon31,jawiki,46.0,1.0,2,jawiki,"[30, 100)"
2,!Silent,ptwiki,22349.8,2.0,3,ptwiki,"[12000, 1200000)"
3,"""Colorado Campeão""!",ptwiki,422.0,4.0,3,ptwiki,"[100, 600)"
4,"""quasi"" tuttologo",itwiki,115.0,1.0,3,itwiki,"[100, 600)"
5,$andlo17,itwiki,79.0,0.0,2,itwiki,"[30, 100)"
6,$uperFan32,enwiki,2627.0,2627.0,3,enwiki,"[600, 6000)"
7,%Pier%,itwiki,3315.3,0.0,3,itwiki,"[600, 6000)"
8,&beer&love,wikidatawiki,19484.3,1.0,3,wikidata,"[12000, 1200000)"
9,'Inyan,frwiki,2108.4,2.0,3,frwiki,"[600, 6000)"


In [85]:
# Load the tab-separated sample target for each project group
targets = pd.read_csv("project-sample-targets.tsv", sep="\t")
targets.head()

Unnamed: 0,project_group,sample_target
0,enwiki,900
1,arwiki,675
2,zhwiki,450
3,frwiki,450
4,eswiki,1125


In [None]:
# TODO: try computing the per-stratum targets from this one table instead of
# looping over project groups as the next cell does.
# The table holds the population of every (group, edit bin) stratum.
# NOTE(review): the next cell reassigns `strata_size` inside its loop, so this
# value is currently not used anywhere.
strata_size = frame.groupby(["group", "binned_edits"]).size().unstack()

In [135]:
# Build one record per stratum (project group x edit bin) with the number of
# users to sample from it.
rows_list = []

for row in targets.itertuples(index = False):
    proj_users = frame[frame["group"] == row.project_group]

    # Now we make targets for the actual strata (the intersections of project groups and edit bins)
    strata_size = proj_users.groupby("binned_edits").size()
    # Share of the group's users that falls in each edit bin
    strata_group_pct = strata_size / len(proj_users)
    # Split the group's target proportionally across its bins, but aim for at
    # least 20 users per stratum...
    strata_targets = np.maximum(np.ceil(strata_group_pct * row.sample_target), 20)
    # ...and never for more users than the stratum actually contains
    strata_samples = np.minimum(strata_targets, strata_size)

    # `Series.items()` rather than `Series.iteritems()`, which was removed in
    # pandas 2.0
    for edit_bin, size in strata_samples.items():
        rows_list.append({
                "home_proj": row.project_group,
                "edits": edit_bin,
                "sample": int(size)
            })

In [136]:
# One row per stratum: project group (`home_proj`), edit bin (`edits`) and the
# number of users to sample from it (`sample`)
strata = pd.DataFrame(rows_list)
strata

Unnamed: 0,edits,home_proj,sample
0,"[10, 30)",enwiki,55
1,"[30, 100)",enwiki,204
2,"[100, 600)",enwiki,350
3,"[600, 6000)",enwiki,240
4,"[6000, 12000)",enwiki,30
5,"[12000, 1200000)",enwiki,24
6,"[10, 30)",arwiki,35
7,"[30, 100)",arwiki,59
8,"[100, 600)",arwiki,126
9,"[600, 6000)",arwiki,152


In [138]:
# Sample sizes as a wide table: one row per project group, one column per edit bin
strata.set_index(["home_proj", "edits"]).unstack(level="edits").sort_index(axis="columns")

Unnamed: 0_level_0,sample,sample,sample,sample,sample,sample
edits,"[10, 30)","[100, 600)","[12000, 1200000)","[30, 100)","[600, 6000)","[6000, 12000)"
home_proj,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
arwiki,35,126,21,59,152,22
asia_wps,22,155,20,79,158,22
cee_wps,56,341,43,173,377,63
commons,43,403,332,140,707,178
dewiki,20,165,20,76,160,21
enwiki,55,350,24,204,240,30
eswiki,69,431,23,222,343,40
frwiki,26,154,20,89,146,22
itwiki,28,170,20,87,134,20
jawiki,37,193,13,116,99,20


In [None]:
def sample_stratum(stratum_query, sample_size, population=None, random_state=None):
    """
    Draw a simple random sample of users from one stratum.

    Parameters
    ----------
    stratum_query : str
        Expression passed to `DataFrame.query` to select the stratum's rows.
    sample_size : int
        Number of users to draw. 0 means the stratum is not sampled at all;
        a value larger than the stratum selects every user in it.
    population : pandas.DataFrame, optional
        Frame to sample from. It needs `user` and `home_wiki` columns plus
        whatever columns `stratum_query` refers to. Defaults to the
        notebook-level `frame`.
    random_state : optional
        Passed through to `DataFrame.sample`; set it to make the draw
        reproducible.

    Returns
    -------
    tuple
        (1) the population count for that subgroup and (2) a list of the
        sampled users as `(user, home_wiki)` named tuples, or None when
        `sample_size` is 0.
    """
    if population is None:
        population = frame

    stratum = population.query(stratum_query)
    stratum_size = len(stratum)

    if sample_size == 0:
        return (stratum_size, None)

    # The frame's name column is `user`, not `user_name`
    members = stratum[["user", "home_wiki"]]
    if sample_size <= stratum_size:
        members = members.sample(n = sample_size, random_state = random_state)
    # else: the target exceeds the stratum, so take everybody

    sample_list = list(members.itertuples(index = False, name = "user"))

    return (stratum_size, sample_list)

In [None]:
def _stratum_query(home_proj, edit_bin):
    """Build the `DataFrame.query` expression selecting one (project group, edit bin) stratum."""
    if hasattr(edit_bin, "left"):
        # Interval bin label: the bins were cut with `right=False`, so the left
        # edge belongs to the bin and the right edge does not
        return 'group == "{}" and global_edits >= {} and global_edits < {}'.format(
            home_proj, edit_bin.left, edit_bin.right
        )
    # NOTE(review): fallback for plain-string bin labels, which older pandas
    # releases presumably produce - confirm if this branch is ever needed
    return 'group == "{}" and binned_edits == "{}"'.format(home_proj, edit_bin)


# `strata` has no `query` column (only `edits`, `home_proj` and `sample`), so
# the query for each stratum is built from those columns here.
populations = []
sampled_users = []

for row in strata.itertuples(index = False):
    size, sample = sample_stratum(_stratum_query(row.home_proj, row.edits), row.sample)
    populations.append(size)
    sampled_users.append(sample)

strata["population"] = populations
strata["sampled_users"] = sampled_users

In [None]:
# Inspect the last 20 strata
strata.tail(20)

In [None]:
# Site lookup table; the file's first column becomes the index
domains = pd.read_csv("site_domains.tsv", sep="\t", index_col=0)

def lookup_domain(site_key):
    """Return the `site_domain` entry that `domains` holds for `site_key`."""
    domain = domains.loc[site_key, "site_domain"]
    return domain

domains.head()

In [None]:
# Save one wiki page per sampled stratum, listing its users as `target` template calls
for row in strata.itertuples(index = False):
    # Strata without a sample (None or an empty list) get no page
    if not row.sampled_users:
        continue

    # One line per sampled user: user[0] is the user name, user[1] the home
    # wiki key, which is translated to a domain
    text = "".join(
        "* {{{{target | user = {} | site = {}}}}}\n".format(user[0], lookup_domain(user[1]))
        for user in row.sampled_users
    )
    site = pwb.Site()
    # NOTE(review): the cells shown here never give `strata` a `code` column,
    # so `row.code` needs to be defined before this cell can run. The page
    # title and edit summary also still say 2016.
    page = pwb.Page(site, "Community Engagement Insights/MassMessages/Lists/2016/" + row.code)
    page.text = text
    page.save(u"Upload a target list for the 2016 Performance Survey", minor = False)