In [None]:
import pandas as pd
import pywikibot as pwb

In [1]:
# Load the sampling frame from its tab-separated file
frame_data = pd.read_csv("sampling_frame.tsv", sep = "\t")
frame_data.head()

Unnamed: 0,user_name,home_wiki,global_edits,wiki,edits
0,! Bikkit !,dewiki,1658.6,dewiki,1435.0
1,! Bikkit !,dewiki,1658.6,enwiki,190.0
2,! Bikkit !,dewiki,1658.6,commonswiki,32.0
3,! Bikkit !,dewiki,1658.6,frwiki,1.0
4,! Bikkit !,dewiki,1658.6,wikidatawiki,0.6


In [2]:
# Reshape to one row per user and one column per wiki, holding that user's
# edit count on the wiki (NaN where the frame has no row for that pair).
# pivot() is called with keyword arguments because pandas 2.0 and later no
# longer accept them positionally; naming `values` also produces flat
# wiki-named columns directly, so no droplevel() step is needed.
per_wiki = frame_data.pivot(index = "user_name", columns = "wiki", values = "edits")
per_wiki.head()

wiki,abwiki,acewiki,adywiki,afwiki,afwikibooks,afwikiquote,afwiktionary,akwiki,alswiki,amwiki,...,zh_yuewiki,zhwiki,zhwikibooks,zhwikinews,zhwikiquote,zhwikisource,zhwikivoyage,zhwiktionary,zuwiki,zuwiktionary
user_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
! Bikkit !,,,,,,,,,,,...,,,,,,,,,,
!KrzysiekBu!,,,,,,,,,,,...,,,,,,,,,,
!Manihiki,,,,,,,,,,,...,,,,,,,,,,
!Silent,,,,,,,,,,,...,,,,,,,,,,
!minmi73!,,,,,,,,,,,...,,,,,,,,,,


In [3]:
# Collapse the frame to one row per user, keeping the first value that
# groupby().first() reports for the home wiki and the global edit count
global_data = (
    frame_data[["user_name", "home_wiki", "global_edits"]]
    .groupby("user_name")
    .first()
    .reset_index()
)
global_data.head()

Unnamed: 0,user_name,home_wiki,global_edits
0,! Bikkit !,dewiki,1658.6
1,!KrzysiekBu!,plwiki,2463.6
2,!Manihiki,itwikibooks,648.0
3,!Silent,ptwiki,26321.5
4,!minmi73!,jawiki,64.0


In [4]:
# Attach the per-wiki edit columns to each user's global row; per_wiki is
# indexed by user name, so the join key is the user_name column
frame = global_data.join(per_wiki, on = "user_name", how = "left")
frame.head()

Unnamed: 0,user_name,home_wiki,global_edits,abwiki,acewiki,adywiki,afwiki,afwikibooks,afwikiquote,afwiktionary,...,zh_yuewiki,zhwiki,zhwikibooks,zhwikinews,zhwikiquote,zhwikisource,zhwikivoyage,zhwiktionary,zuwiki,zuwiktionary
0,! Bikkit !,dewiki,1658.6,,,,,,,,...,,,,,,,,,,
1,!KrzysiekBu!,plwiki,2463.6,,,,,,,,...,,,,,,,,,,
2,!Manihiki,itwikibooks,648.0,,,,,,,,...,,,,,,,,,,
3,!Silent,ptwiki,26321.5,,,,,,,,...,,,,,,,,,,
4,!minmi73!,jawiki,64.0,,,,,,,,...,,,,,,,,,,


In [5]:
# Load the strata definitions from their tab-separated file
defs = pd.read_csv("strata_definitions.tsv", sep = "\t")
defs.head()

Unnamed: 0,code,audience,home_wiki,also_active_enwiki,min_sample,sample,population
0,01-AEAR,AE,arwiki,0,150,225,
1,02-AEZH,AE,zhwiki,0,150,225,
2,03-AEFR,AE,frwiki,0,150,225,
3,04-AEES,AE,eswiki,0,375,563,
4,05-AERU,AE,ruwiki,0,150,225,


In [6]:
# Tidy data
# Strip stray whitespace with the vectorised string accessor: unlike
# apply(str.strip), it passes missing values through instead of raising
# a TypeError on NaN.
defs["home_wiki"] = defs["home_wiki"].str.strip()

# Rename the "other projects" label to "other", the key used for the
# catch-all group when the wiki groups are built below.
defs.loc[defs["home_wiki"] == "other projects", "home_wiki"] = "other"

In [7]:
# Load the table that assigns each Wikipedia (site_key) to a group
wps_by_group = pd.read_csv("wikipedias_by_group.tsv", sep = "\t")

# Remove when fixed in data: ruwiki must be its own group
is_ruwiki = wps_by_group["site_key"] == "ruwiki"
wps_by_group.loc[is_ruwiki, "group"] = "ruwiki"
wps_by_group.head()

Unnamed: 0,language,site_key,group
0,Afar,aawiki,ssa
1,Abkhazian,abwiki,mena
2,Acehnese,acewiki,asia
3,Adyghe,adywiki,cee
4,Afrikaans,afwiki,ssa


In [8]:
# Map each group to its component wikis
regions = ["ssa", "mena", "asia", "cee", "weur"]

# Every home_wiki value named in the strata definitions starts as a placeholder
wiki_groups = dict.fromkeys(set(defs["home_wiki"]))

# Regional groups: all the Wikipedias assigned to that region
for group in regions:
    in_region = wps_by_group["group"] == group
    wiki_groups[group] = set(wps_by_group.loc[in_region, "site_key"])

all_projects = set(frame_data["wiki"])

# "other" is whatever remains once the individually named wikis and the
# regional Wikipedias have been removed
remaining = all_projects - wiki_groups.keys()
for group in regions:
    remaining = remaining - wiki_groups[group]
wiki_groups["other"] = remaining

# Any key still unfilled is a single wiki that forms its own group
for key in list(wiki_groups):
    if wiki_groups[key] is None:
        wiki_groups[key] = {key}

In [9]:
# Build one DataFrame.query expression per stratum definition
queries = []

for row in defs.itertuples(index = False):
    # "AE" strata take users under 600 global edits; all other audiences
    # take users with 600 or more
    clauses = ["global_edits < 600" if row.audience == "AE" else "global_edits >= 600"]

    # Restrict to the home wikis that make up this stratum's group
    clauses.append("home_wiki.isin({})".format(list(wiki_groups[row.home_wiki])))

    # Non-enwiki strata are further split on activity at enwiki
    if row.home_wiki != "enwiki":
        clauses.append(
            "enwiki >= 30" if row.also_active_enwiki
            else "(enwiki < 30 | enwiki.isnull())"
        )

    queries.append(" & ".join(clauses))

defs["query"] = queries

defs.tail(n = 25)

Unnamed: 0,code,audience,home_wiki,also_active_enwiki,min_sample,sample,population,query
49,51-VAESSA,VAE,ssa,0,275,413,,global_edits >= 600 & home_wiki.isin(['afwiki'...
50,52-VAECE,VAE,cee,0,350,525,,global_edits >= 600 & home_wiki.isin(['srwiki'...
51,53-VAEAI,VAE,asia,0,150,225,,global_edits >= 600 & home_wiki.isin(['tawiki'...
52,54-VAEWE,VAE,weur,0,150,225,,global_edits >= 600 & home_wiki.isin(['vlswiki...
53,55-VAECM,VAE,commonswiki,0,600,900,,global_edits >= 600 & home_wiki.isin(['commons...
54,56-VAEWD,VAE,wikidatawiki,0,600,900,,global_edits >= 600 & home_wiki.isin(['wikidat...
55,57-VAEOP,VAE,other,0,1200,1800,,global_edits >= 600 & home_wiki.isin(['enwikin...
56,58-VAEARe,VAE,arwiki,1,100,150,,global_edits >= 600 & home_wiki.isin(['arwiki'...
57,59-VAEZHe,VAE,zhwiki,1,100,150,,global_edits >= 600 & home_wiki.isin(['zhwiki'...
58,60-VAEFRe,VAE,frwiki,1,100,150,,global_edits >= 600 & home_wiki.isin(['frwiki'...


In [10]:
def sample_stratum(stratum_query, sample_size, random_state = None, source_frame = None):
    """
    Draw a simple random sample of users from one stratum.

    Parameters
    ----------
    stratum_query : str
        Expression handed to DataFrame.query to select the stratum's rows.
    sample_size : int
        Number of users wanted; capped at the stratum's size.
    random_state : optional
        Forwarded to DataFrame.sample so a draw can be reproduced.
        The default (None) keeps the draw unseeded.
    source_frame : DataFrame, optional
        Frame to sample from; must have user_name and home_wiki columns.
        Defaults to the notebook-level `frame`.

    Returns
    -------
    tuple
        (1) the population count for that stratum and (2) a list of
        (user_name, home_wiki) named tuples for the sampled users. The list
        is empty (never False) when nothing can be sampled, so callers can
        always iterate over it.
    """
    if source_frame is None:
        source_frame = frame

    stratum = source_frame.query(stratum_query)
    stratum_size = len(stratum)

    # Never ask for more users than the stratum contains
    sample_size = min(sample_size, stratum_size)

    if sample_size == 0:
        return (stratum_size, [])

    sample = stratum.sample(n = sample_size, random_state = random_state)
    sample = sample[["user_name", "home_wiki"]]

    return (stratum_size, list(sample.itertuples(index = False, name = "user")))

In [11]:
# Sample every stratum once, then split the (population, users) pairs into
# the two columns stored on the definitions table
outcomes = [
    sample_stratum(row.query, row.sample)
    for row in defs.itertuples(index = False)
]

results = [
    [size for size, _ in outcomes],
    [sample for _, sample in outcomes],
]

defs["population"] = results[0]
defs["sampled_users"] = results[1]

In [12]:
# Show the strata definitions together with the population and sampled_users
# columns filled in by the sampling cell above
defs

Unnamed: 0,code,audience,home_wiki,also_active_enwiki,min_sample,sample,population,query,sampled_users
0,01-AEAR,AE,arwiki,0,150,225,1077,global_edits < 600 & home_wiki.isin(['arwiki']...,"[(علي سمسم, arwiki), (Mohamed.sa, arwiki), (ال..."
1,02-AEZH,AE,zhwiki,0,150,225,2870,global_edits < 600 & home_wiki.isin(['zhwiki']...,"[(草薙影虎, zhwiki), (Wfjlps, zhwiki), (望月的犀牛, zhw..."
2,03-AEFR,AE,frwiki,0,150,225,5539,global_edits < 600 & home_wiki.isin(['frwiki']...,"[(Steven.Auger, frwiki), (Alice GA, frwiki), (..."
3,04-AEES,AE,eswiki,0,375,563,5023,global_edits < 600 & home_wiki.isin(['eswiki']...,"[(Noé Aa., eswiki), (Lelahel1970, eswiki), (Pa..."
4,05-AERU,AE,ruwiki,0,150,225,4268,global_edits < 600 & home_wiki.isin(['ruwiki']...,"[(John-AndrewF, ruwiki), (Юрец460880, ruwiki),..."
5,06-AEDE,AE,dewiki,0,150,225,6847,global_edits < 600 & home_wiki.isin(['dewiki']...,"[(DresdnerFlo, dewiki), (Hardcorebambi, dewiki..."
6,07-AEPT,AE,ptwiki,0,375,563,1995,global_edits < 600 & home_wiki.isin(['ptwiki']...,"[(Skighat, ptwiki), (Bruna Araújo de Souza, pt..."
7,08-AENL,AE,nlwiki,0,150,225,1223,global_edits < 600 & home_wiki.isin(['nlwiki']...,"[(.marc., nlwiki), (Vdveen2, nlwiki), (Ron4, n..."
8,09-AEIT,AE,itwiki,0,150,225,3146,global_edits < 600 & home_wiki.isin(['itwiki']...,"[(Federico Soatto, itwiki), (RennyDJ, itwiki),..."
9,10-AEJA,AE,jawiki,0,150,225,5635,global_edits < 600 & home_wiki.isin(['jawiki']...,"[(土橋竜也, jawiki), (CottonLovely, jawiki), (Taka..."


The site domains are loaded below from `site_domains.tsv`, which holds the result of the following SQL query:
```
select 
site_global_key as site_key,
substring(reverse(site_domain), 2) as site_domain
from enwiki.sites;
```

In [24]:
# Site domains, indexed by the file's first column (the site key)
domains = pd.read_csv("site_domains.tsv", sep = "\t", index_col = 0)

def lookup_domain(site_key):
    """Return the site_domain entry of the `domains` row labelled site_key."""
    domain_column = domains["site_domain"]
    return domain_column.loc[site_key]

domains.head()

Unnamed: 0_level_0,site_domain
site_key,Unnamed: 1_level_1
aawiki,aa.wikipedia.org
aawiktionary,aa.wiktionary.org
aawikibooks,aa.wikibooks.org
abwiki,ab.wikipedia.org
abwiktionary,ab.wiktionary.org


In [None]:
# Upload one MassMessage target list per stratum.
# The site object does not depend on the stratum, so it is built once.
site = pwb.Site()

for row in defs.itertuples(index = False):
    # A stratum with nobody to sample has no list to upload; `or []` also
    # covers a falsy placeholder such as False stored in sampled_users.
    users = row.sampled_users or []
    if not users:
        continue

    # One "{{target | user = ... | site = ...}}" bullet per sampled user.
    # The text is built with join: starting from None and using += raises
    # a TypeError on the first user.
    text = "".join(
        "* {{{{target | user = {} | site = {}}}}}\n".format(user[0], lookup_domain(user[1]))
        for user in users
    )

    page = pwb.Page(site, "Community Engagement Insights/MassMessages/Lists/2016/" + row.code)
    page.text = text
    page.save(u"Upload a target list for the 2016 Performance Survey", minor = False)

    # Dry-run guard: stops after the first uploaded list.
    #Turn this off to do it for real
    break