In [4]:
# Snapshot partition of wmf.mediawiki_history that the query reads from
HIVE_SNAPSHOT = "2018-02"

# Upper end of the time windows used in the query
END_OF_DATA = "2018-03-01 00:00:00"

# Length of the 3-month window counted back from END_OF_DATA:
# December 2017 (31) + January 2018 (31) + February 2018 (28) = 90 days
DAYS_IN_LAST_3_MO = 31 + 31 + 28

In [5]:
# Hive query template that builds the sampling frame. It has three str.format
# placeholders: {hive_snapshot}, {end_of_data} and {days_in_last_3_mo}.
#
# - mo_edits: per-user, per-calendar-month edit counts in content namespaces by
#   registered, non-bot users, limited to the {days_in_last_3_mo} days before
#   {end_of_data}.
# - yr_proj_edits: per-user, per-wiki edit counts and latest edit timestamp for
#   registered, non-bot users over the 365 days before {end_of_data}
#   (no namespace restriction).
# - Final select: users whose name does not contain "WMF" and who reach a weighted
#   edit count of at least 5 in at least 2 of the months in mo_edits, left-joined
#   to their top-ranked wiki (most edits, ties broken by most recent edit) as
#   home_proj, plus global_edits, their total across all wikis in yr_proj_edits.
#
# In both CTEs an edit on wikidatawiki is weighted 0.1 and any other edit 1.
frame_query = """
-- TABLE OF
with mo_edits as (
    select
        event_user_text as user,
        year(event_timestamp) as year,
        month(event_timestamp) as month,
        sum(if(wiki_db = "wikidatawiki", 0.1, 1)) as edits
    from wmf.mediawiki_history
    where
        -- REGISTERED
        event_user_is_anonymous = false and
        
        -- NON-BOT
        event_user_is_bot_by_name = false and
        not array_contains(event_user_groups, "bot") and
        
        -- CONTENT
        page_namespace_is_content_historical = true and
        
        -- EDITS
        event_entity = "revision" and
        event_type = "create" and
        
        -- FROM THE LAST 3 MONTHS
        unix_timestamp(event_timestamp, "yyyy-MM-dd HH:mm:ss.0") 
            > (unix_timestamp("{end_of_data}") - (60 * 60 * 24 * {days_in_last_3_mo})) and

        -- FROM THE LATEST SNAPSHOT
        snapshot = "{hive_snapshot}"    
    -- PER USER, PER MONTH
    group by event_user_text, year(event_timestamp), month(event_timestamp)
), 

-- TABLE OF
yr_proj_edits as (
    select
        event_user_text as user,
        wiki_db as proj,
        sum(if(wiki_db = "wikidatawiki", 0.1, 1)) as edits,
        max(event_timestamp) as latest_edit
    from wmf.mediawiki_history
    where
        -- REGISTERED
        event_user_is_anonymous = false and
        
        -- NON-BOT
        event_user_is_bot_by_name = false and
        not array_contains(event_user_groups, "bot") and
        
        -- EDITS
        event_entity = "revision" and
        event_type = "create" and
        
        -- FROM THE LAST YEAR
        unix_timestamp(event_timestamp, "yyyy-MM-dd HH:mm:ss.0") 
            > (unix_timestamp("{end_of_data}") - (60 * 60 * 24 * 365)) and
        
        -- FROM THE LATEST SNAPSHOT
        snapshot = "{hive_snapshot}"
    
    -- PER USER, PER WIKI
    group by event_user_text, wiki_db
)

-- FINAL SELECT OF
select 
    recent_actives.user as user,
    yr_edits.proj as home_proj,
    global_edits

-- USERS ACTIVE IN 2 OF 3 MONTHS
from (
    select
        user,
        sum(if(edits >= 5, 1, 0)) as active_months
    from mo_edits
    where
        -- WHO ARE NOT WMF STAFF
        user not like "%WMF%"
    group by user
    having active_months >= 2
) recent_actives

-- JOINED TO THEIR HOME WIKI AND GLOBAL EDITS
left join (
    select
        user,
        proj,
        -- in the unlikely event that wikis are tied by edit count and latest edit, 
        -- row_number() will break it somehow
        row_number() over (partition by user order by edits desc, latest_edit desc) as rank,
        sum(edits) over (partition by user) as global_edits
    from yr_proj_edits
) yr_edits
on 
    recent_actives.user = yr_edits.user and
    rank = 1
"""

In [6]:
# Substitute the config constants into the query template and run it via run_hive;
# later cells use the result as a pandas DataFrame.
frame = run_hive(frame_query.format(
    days_in_last_3_mo=DAYS_IN_LAST_3_MO,
    end_of_data=END_OF_DATA,
    hive_snapshot=HIVE_SNAPSHOT,
))

In [42]:
# Preview the first five rows of the query result
frame.head(n=5)

Unnamed: 0,user,home_proj,global_edits
0,! Bikkit !,dewiki,219.0
1,!NewLondon31,jawiki,46.0
2,!Silent,ptwiki,20688.8
3,"""Colorado Campeão""!",ptwiki,417.0
4,"""quasi"" tuttologo",itwiki,117.0


In [43]:
# Load the wiki -> project-group lookup and discard its human-readable name column
proj_groups = pd.read_csv("project-groups.tsv", sep="\t")
proj_groups = proj_groups.drop("project_name", axis="columns")

proj_groups.head(n=5)

Unnamed: 0,project_key,project_group
0,aawiki,ssa_wps
1,abwiki,mena_wps
2,acewiki,asia_wps
3,adywiki,cee_wps
4,afwiki,ssa_wps


In [49]:
# Attach each user's project group by matching their home wiki to the lookup key;
# the key column duplicates home_proj, so it is removed after the join.
rich_frame = (
    frame
    .merge(proj_groups, how="left", left_on="home_proj", right_on="project_key")
    .drop("project_key", axis="columns")
)
rich_frame.head(n=5)

Unnamed: 0,user,home_proj,global_edits,project_group
0,! Bikkit !,dewiki,219.0,dewiki
1,!NewLondon31,jawiki,46.0,jawiki
2,!Silent,ptwiki,20688.8,ptwiki
3,"""Colorado Campeão""!",ptwiki,417.0,ptwiki
4,"""quasi"" tuttologo",itwiki,117.0,itwiki


In [50]:
# Load the wiki -> domain lookup; the first column is read as the index and then
# turned back into an ordinary column.
domains = (
    pd.read_table("project-domains.tsv", index_col=0)
    .reset_index()
)
domains.head(n=5)

Unnamed: 0,project_key,project_domain
0,aawiki,aa.wikipedia.org
1,aawiktionary,aa.wiktionary.org
2,aawikibooks,aa.wikibooks.org
3,abwiki,ab.wikipedia.org
4,abwiktionary,ab.wiktionary.org


In [51]:
# Attach each home wiki's domain, then remove the join key that duplicates home_proj
rich_frame = (
    rich_frame
    .merge(domains, how="left", left_on="home_proj", right_on="project_key")
    .drop("project_key", axis="columns")
)
rich_frame.head(n=5)

Unnamed: 0,user,home_proj,global_edits,project_group,project_domain
0,! Bikkit !,dewiki,219.0,dewiki,de.wikipedia.org
1,!NewLondon31,jawiki,46.0,jawiki,ja.wikipedia.org
2,!Silent,ptwiki,20688.8,ptwiki,pt.wikipedia.org
3,"""Colorado Campeão""!",ptwiki,417.0,ptwiki,pt.wikipedia.org
4,"""quasi"" tuttologo",itwiki,117.0,itwiki,it.wikipedia.org


In [52]:
# Home wikis with no match in project-groups.tsv have a missing project_group after
# the left join; they all go into the "other" group.
rich_frame.loc[rich_frame["project_group"].isnull(), "project_group"] = "other"

In [12]:
# Number of rows in the enriched frame, used by the sanity checks that follow
frame_len = len(rich_frame["user"].index)

In [53]:
# Are all our users unique? The distinct-user count is compared with the frame's
# current length, measured here, so the check cannot pass or fail because of a
# length saved by a cell that was run before the frame was last rebuilt.
rich_frame["user"].nunique() == len(rich_frame)

True

In [15]:
# Are all our users in a group? The per-group user counts are summed and compared
# with the frame's current length, measured here rather than taken from a value
# saved by another cell, so the check always reflects the frame as it is now.
rich_frame.groupby("project_group")["user"].count().sum() == len(rich_frame)

True

In [54]:
# The frame should contain exactly 19 distinct (non-missing) project groups
len(rich_frame["project_group"].dropna().unique()) == 19

True

In [55]:
# Pick 10 random users to check against [an edit counter](https://xtools.wmflabs.org/ec).
# A fixed random_state makes the draw repeatable: re-running the notebook on the
# same frame returns the same 10 users, so the manual check can be re-verified.
rich_frame.sample(n = 10, random_state = 0)

Unnamed: 0,user,home_proj,global_edits,project_group,project_domain
8168,Chipb82,enwiki,13.0,enwiki,en.wikipedia.org
22025,Jordiventura96,cawiki,1454.2,weur_wps,ca.wikipedia.org
36405,Rax,dewiki,4419.5,dewiki,de.wikipedia.org
51712,火燐,jawiki,43.0,jawiki,ja.wikipedia.org
20174,J R Gainey,enwiki,103.0,enwiki,en.wikipedia.org
25372,Lesponne,frwikiquote,533.1,other,fr.wikiquote.org
6422,Bqn1996,enwiki,1157.0,enwiki,en.wikipedia.org
47703,XXnickiXx,dewiki,124.0,dewiki,de.wikipedia.org
21478,Jlandin,svwiki,732.0,weur_wps,sv.wikipedia.org
48652,Zcbmxvn,jawiki,55.0,jawiki,ja.wikipedia.org


In [57]:
# The spot-checked users look right, so save the frame as a tab-separated file
# without the row index.
rich_frame.to_csv(
    "./sampling-frame.tsv",
    sep="\t",
    index=False,
)