In [1]:
# Snapshot of wmf.mediawiki_history that the query reads from.
HIVE_SNAPSHOT = "2018-01"

# Timestamp that the query's look-back windows are measured back from.
END_OF_DATA = "2018-02-01 00:00:00"

# Length in days of the "last 3 months" window that ends at END_OF_DATA:
# November (30) + December (31) + January (31).
DAYS_IN_LAST_3_MO = 30 + 31 + 31

In [2]:
# HiveQL template for the sampling frame. Fill it in with
# frame_query.format(hive_snapshot=..., end_of_data=..., days_in_last_3_mo=...).
#
# Result: one row per registered, non-bot user whose name does not contain
# "WMF" and who had at least 2 calendar months with >= 5 weighted content
# edits inside the recent window. Columns:
#   user          - user name
#   home_wiki     - wiki with the most weighted edits in the 365 days before
#                   end_of_data (ties broken by the most recent edit)
#   global_edits  - weighted edits on all wikis over those 365 days
#   enwiki_edits  - weighted edits on enwiki over those 365 days
#   active_months - number of calendar months in the recent window with >= 5
#                   weighted content edits
# A Wikidata edit is weighted as 0.1 of an edit everywhere in the query.
#
# Both time windows are half-open: [end_of_data - N days, end_of_data).
# The lower bound is inclusive so an edit made exactly at the start of the
# window is counted, and the upper bound keeps any rows dated at or after
# end_of_data (should the snapshot contain them) out of the counts.
frame_query = """
-- TABLE OF
with mo_edits as (
    select
        event_user_text as user,
        year(event_timestamp) as year,
        month(event_timestamp) as month,
        sum(if(wiki_db = "wikidatawiki", 0.1, 1)) as edits
    from wmf.mediawiki_history
    where
        -- REGISTERED
        event_user_is_anonymous = false and

        -- NON-BOT
        event_user_is_bot_by_name = false and
        not array_contains(event_user_groups, "bot") and

        -- CONTENT
        page_namespace_is_content_historical = true and

        -- EDITS
        event_entity = "revision" and
        event_type = "create" and

        -- FROM THE LAST 3 MONTHS (UP TO, BUT NOT INCLUDING, THE END OF DATA)
        unix_timestamp(event_timestamp, "yyyy-MM-dd HH:mm:ss.0")
            >= (unix_timestamp("{end_of_data}") - (60 * 60 * 24 * {days_in_last_3_mo})) and
        unix_timestamp(event_timestamp, "yyyy-MM-dd HH:mm:ss.0")
            < unix_timestamp("{end_of_data}") and

        -- FROM THE LATEST SNAPSHOT
        snapshot = "{hive_snapshot}"
    -- PER USER, PER MONTH
    group by event_user_text, year(event_timestamp), month(event_timestamp)
), 

-- TABLE OF
yr_wiki_edits as (
    select
        event_user_text as user,
        wiki_db as wiki,
        sum(if(wiki_db = "wikidatawiki", 0.1, 1)) as edits,
        max(event_timestamp) as latest_edit
    from wmf.mediawiki_history
    where
        -- REGISTERED
        event_user_is_anonymous = false and

        -- NON-BOT
        event_user_is_bot_by_name = false and
        not array_contains(event_user_groups, "bot") and

        -- EDITS
        event_entity = "revision" and
        event_type = "create" and

        -- FROM THE LAST YEAR (UP TO, BUT NOT INCLUDING, THE END OF DATA)
        unix_timestamp(event_timestamp, "yyyy-MM-dd HH:mm:ss.0")
            >= (unix_timestamp("{end_of_data}") - (60 * 60 * 24 * 365)) and
        unix_timestamp(event_timestamp, "yyyy-MM-dd HH:mm:ss.0")
            < unix_timestamp("{end_of_data}") and

        -- FROM THE LATEST SNAPSHOT
        snapshot = "{hive_snapshot}"

    -- PER USER, PER WIKI
    group by event_user_text, wiki_db
)

-- FINAL SELECT OF
select 
    recent_actives.user as user,
    wikis_ranked.wiki as home_wiki,
    global_edits,
    coalesce(yr_edits.enwiki_edits, 0) as enwiki_edits,
    active_months

-- USERS ACTIVE IN 2 OF 3 MONTHS
from (
    select
        user,
        sum(if(edits >= 5, 1, 0)) as active_months
    from mo_edits
    where
        -- WHO ARE NOT WMF STAFF
        user not like "%WMF%"
    group by user
    having active_months >= 2
) recent_actives

-- JOINED TO THEIR HOME WIKI
left join (
    select
        user,
        wiki,
        -- in the unlikely event that wikis are tied by edit count and latest edit, 
        -- row_number() will break it somehow
        row_number() over (partition by user order by edits desc, latest_edit desc) as rank
    from yr_wiki_edits
) wikis_ranked
on 
    recent_actives.user = wikis_ranked.user and
    rank = 1

-- JOINED TO THEIR GLOBAL AND ENWIKI EDIT COUNTS
left join (
    select
    user,
    sum(edits) as global_edits,
    sum(if(wiki = "enwiki", edits, 0)) as enwiki_edits
    from yr_wiki_edits
    group by user
)  yr_edits
on 
    recent_actives.user = yr_edits.user
"""

In [3]:
# Substitute the notebook-level parameters into the query template and run the
# finished query through run_hive; later cells use the result as a DataFrame.
frame = run_hive(frame_query.format(
    hive_snapshot=HIVE_SNAPSHOT,
    end_of_data=END_OF_DATA,
    days_in_last_3_mo=DAYS_IN_LAST_3_MO,
))

In [4]:
# Preview the first 10 rows only; an explicit head() keeps the rendered output
# bounded no matter what the pandas display options are set to.
frame.head(10)

Unnamed: 0,user,home_wiki,global_edits,enwiki_edits,active_months
0,! Bikkit !,dewiki,559.2,1.0,3
1,!NewLondon31,jawiki,46.0,1.0,2
2,!Silent,ptwiki,22349.8,2.0,3
3,"""Colorado Campeão""!",ptwiki,422.0,4.0,3
4,"""quasi"" tuttologo",itwiki,115.0,1.0,3
5,$andlo17,itwiki,79.0,0.0,2
6,$uperFan32,enwiki,2627.0,2627.0,3
7,%Pier%,itwiki,3315.3,0.0,3
8,&beer&love,wikidatawiki,19484.3,1.0,3
9,'Inyan,frwiki,2108.4,2.0,3


In [7]:
# Check for duplicate users: the frame must hold exactly one row per user, so
# fail loudly if any name repeats rather than relying on eyeballing a count.
assert frame["user"].is_unique, "sampling frame contains duplicate users"

# Number of distinct users in the frame.
frame["user"].nunique()

52496

In [10]:
# 10 random users to check against an edit counter (https://xtools.wmflabs.org/ec).
# The seed is fixed so that re-running the notebook draws the same 10 users,
# which keeps the manual spot check reproducible.
frame.sample(n=10, random_state=0)

Unnamed: 0,user,home_wiki,global_edits,enwiki_edits,active_months
12184,Durandal2010,jawiki,82.0,0.0,2
50645,محمد جبيرة,frwiki,20.0,6.0,2
37720,Rolf Ulmer,dewiki,26.0,0.0,2
22611,JuninhoTotal90,ptwiki,473.0,0.0,3
43006,Teddyktchan,zhwiki,27.0,5.0,3
19903,Interfase,ruwiki,7367.1,78.0,3
8396,ChrisTakey,enwiki,133.0,133.0,3
20109,It's Wiki Time,enwiki,182.0,178.0,2
39318,Scottandrewhutchins,enwiki,175.0,174.0,2
4621,B14709,enwiki,691.0,685.0,3


In [17]:
# The spot-checked users look right, so persist the whole frame as a
# tab-separated file, leaving out the DataFrame index column.
frame.to_csv(
    "./sampling-frame.tsv",
    sep="\t",
    index=False,
)