# Creating raw frame

In [4]:
# Value matched against the `snapshot` column of wmf.mediawiki_history in the frame query
HIVE_SNAPSHOT = "2018-02"

# Reference timestamp from which the query's look-back windows are measured
END_OF_DATA = "2018-03-01 00:00:00"

# Dec 2017 (31) + Jan 2018 (31) + Feb 2018 (28): the three calendar months before END_OF_DATA
DAYS_IN_LAST_3_MO = 31 + 31 + 28

In [5]:
# Hive SQL template for the raw sampling frame.
#
# Placeholders, filled in with str.format():
#   {hive_snapshot}      value compared against the `snapshot` column
#   {end_of_data}        "yyyy-MM-dd HH:mm:ss" timestamp marking the end of both windows
#   {days_in_last_3_mo}  length in days of the recent-activity window
#
# Result: one row per registered, non-bot user whose name does not contain "WMF" and
# who made at least 5 content edits in at least 2 calendar months of the recent window.
# Each row carries:
#   home_proj     the wiki with the most edits over the last 365 days
#                 (ties broken by most recent edit, then arbitrarily by row_number())
#   global_edits  the user's edits summed over all wikis for the last 365 days
# Wikidata edits are weighted 0.1 in every edit count.
#
# Both time windows are half-open, [end_of_data - N days, end_of_data): an edit stamped
# exactly at the window start is included, and nothing at or after end_of_data is counted.
frame_query = """
-- TABLE OF
with mo_edits as (
    select
        event_user_text as user,
        year(event_timestamp) as year,
        month(event_timestamp) as month,
        sum(if(wiki_db = "wikidatawiki", 0.1, 1)) as edits
    from wmf.mediawiki_history
    where
        -- REGISTERED
        event_user_is_anonymous = false and
        
        -- NON-BOT
        event_user_is_bot_by_name = false and
        not array_contains(event_user_groups, "bot") and
        
        -- CONTENT
        page_namespace_is_content_historical = true and
        
        -- EDITS
        event_entity = "revision" and
        event_type = "create" and
        
        -- FROM THE LAST 3 MONTHS
        -- (start of window inclusive, end of window exclusive)
        unix_timestamp(event_timestamp, "yyyy-MM-dd HH:mm:ss.0") 
            >= (unix_timestamp("{end_of_data}") - (60 * 60 * 24 * {days_in_last_3_mo})) and
        unix_timestamp(event_timestamp, "yyyy-MM-dd HH:mm:ss.0") 
            < unix_timestamp("{end_of_data}") and

        -- FROM THE LATEST SNAPSHOT
        snapshot = "{hive_snapshot}"    
    -- PER USER, PER MONTH
    group by event_user_text, year(event_timestamp), month(event_timestamp)
), 

-- TABLE OF
yr_proj_edits as (
    select
        event_user_text as user,
        wiki_db as proj,
        sum(if(wiki_db = "wikidatawiki", 0.1, 1)) as edits,
        max(event_timestamp) as latest_edit
    from wmf.mediawiki_history
    where
        -- REGISTERED
        event_user_is_anonymous = false and
        
        -- NON-BOT
        event_user_is_bot_by_name = false and
        not array_contains(event_user_groups, "bot") and
        
        -- EDITS
        event_entity = "revision" and
        event_type = "create" and
        
        -- FROM THE LAST YEAR
        -- (start of window inclusive, end of window exclusive)
        unix_timestamp(event_timestamp, "yyyy-MM-dd HH:mm:ss.0") 
            >= (unix_timestamp("{end_of_data}") - (60 * 60 * 24 * 365)) and
        unix_timestamp(event_timestamp, "yyyy-MM-dd HH:mm:ss.0") 
            < unix_timestamp("{end_of_data}") and
        
        -- FROM THE LATEST SNAPSHOT
        snapshot = "{hive_snapshot}"
    
    -- PER USER, PER WIKI
    group by event_user_text, wiki_db
)

-- FINAL SELECT OF
select 
    recent_actives.user as user,
    yr_edits.proj as home_proj,
    global_edits

-- USERS ACTIVE IN 2 OF 3 MONTHS
from (
    select
        user,
        sum(if(edits >= 5, 1, 0)) as active_months
    from mo_edits
    where
        -- WHO ARE NOT WMF STAFF
        user not like "%WMF%"
    group by user
    having active_months >= 2
) recent_actives

-- JOINED TO THEIR HOME WIKI AND GLOBAL EDITS
left join (
    select
        user,
        proj,
        -- in the unlikely event that wikis are tied by edit count and latest edit, 
        -- row_number() will break it somehow
        row_number() over (partition by user order by edits desc, latest_edit desc) as rank,
        sum(edits) over (partition by user) as global_edits
    from yr_proj_edits
) yr_edits
on 
    recent_actives.user = yr_edits.user and
    rank = 1
"""

In [6]:
# Substitute the notebook constants into the query template and hand the SQL to run_hive
raw_frame = run_hive(
    frame_query.format(**{
        "hive_snapshot": HIVE_SNAPSHOT,
        "end_of_data": END_OF_DATA,
        "days_in_last_3_mo": DAYS_IN_LAST_3_MO,
    })
)

In [60]:
# Preview the first five rows of the query result
raw_frame.head(n = 5)

Unnamed: 0,user,home_proj,global_edits
0,! Bikkit !,dewiki,219.0
1,!NewLondon31,jawiki,46.0
2,!Silent,ptwiki,20688.8
3,"""Colorado Campeão""!",ptwiki,417.0
4,"""quasi"" tuttologo",itwiki,117.0


In [79]:
# Save the untouched query result as a tab-separated file, without the row index
raw_frame.to_csv(
    "data/raw/sampling-frame.tsv",
    sep = "\t",
    index = False,
)

# Enriching frame

In [80]:
# Load the project -> group lookup table, then discard the human-readable name column
proj_groups = pd.read_csv("data/raw/project-groups.tsv", sep = "\t")
proj_groups = proj_groups.drop(columns = ["proj_name"])
proj_groups.head()

Unnamed: 0,proj_key,proj_group
0,aawiki,meaf_wps
1,abwiki,meaf_wps
2,acewiki,asia_wps
3,adywiki,cee_wps
4,afwiki,meaf_wps


In [81]:
# Look up each user's project group from their home project.
# The left join keeps every user; the lookup key is redundant afterwards, so drop it.
frame = (
    raw_frame
    .merge(proj_groups, how = "left", left_on = "home_proj", right_on = "proj_key")
    .drop(columns = ["proj_key"])
)
frame.head()

Unnamed: 0,user,home_proj,global_edits,proj_group
0,! Bikkit !,dewiki,219.0,dewiki
1,!NewLondon31,jawiki,46.0,jawiki
2,!Silent,ptwiki,20688.8,ptwiki
3,"""Colorado Campeão""!",ptwiki,417.0,ptwiki
4,"""quasi"" tuttologo",itwiki,117.0,itwiki


In [82]:
# Load the project -> domain lookup table; the first file column is read as the index
# and then moved back into a regular column
domains = pd.read_csv("data/raw/project-domains.tsv", sep = "\t", index_col = 0)
domains = domains.reset_index()
domains.head()

Unnamed: 0,proj_key,proj_domain
0,aawiki,aa.wikipedia.org
1,aawiktionary,aa.wiktionary.org
2,aawikibooks,aa.wikibooks.org
3,abwiki,ab.wikipedia.org
4,abwiktionary,ab.wiktionary.org


In [83]:
# Look up the domain of each user's home project.
# The left join keeps every user; the lookup key is redundant afterwards, so drop it.
frame = (
    frame
    .merge(domains, how = "left", left_on = "home_proj", right_on = "proj_key")
    .drop(columns = ["proj_key"])
)
frame.head()

Unnamed: 0,user,home_proj,global_edits,proj_group,proj_domain
0,! Bikkit !,dewiki,219.0,dewiki,de.wikipedia.org
1,!NewLondon31,jawiki,46.0,jawiki,ja.wikipedia.org
2,!Silent,ptwiki,20688.8,ptwiki,pt.wikipedia.org
3,"""Colorado Campeão""!",ptwiki,417.0,ptwiki,pt.wikipedia.org
4,"""quasi"" tuttologo",itwiki,117.0,itwiki,it.wikipedia.org


In [84]:
# Home projects missing from project-groups.tsv came out of the left join with no group;
# put them in the "other" group
frame["proj_group"] = frame["proj_group"].where(frame["proj_group"].notna(), "other")

In [85]:
# Number of rows in the frame, used as the expected total in the checks below
frame_len = len(frame.index)

In [86]:
# Are all our users unique?
# nunique() ignores missing values, so this also fails if any user name is missing.
# Asserting (rather than only displaying the result) stops "Run All" here,
# before the frame is written out, when the check does not hold.
users_are_unique = frame["user"].nunique() == frame_len
assert users_are_unique, "sampling frame has duplicate or missing user names"
users_are_unique

True

In [87]:
# Are all our users in a group?
# groupby() leaves out rows whose proj_group is missing, so the per-group counts
# only add up to the frame length when every user has a group.
# Asserting (rather than only displaying the result) stops "Run All" here,
# before the frame is written out, when the check does not hold.
all_users_grouped = frame.groupby("proj_group")["user"].count().sum() == frame_len
assert all_users_grouped, "some users in the sampling frame have no project group"
all_users_grouped

True

In [88]:
# Do we have the 18 groups we want?
# Asserting (rather than only displaying the result) stops "Run All" here,
# before the frame is written out, when the check does not hold.
has_expected_groups = frame["proj_group"].nunique() == 18
assert has_expected_groups, "sampling frame does not contain exactly 18 project groups"
has_expected_groups

True

In [76]:
# Pick 10 random users to check against [an edit counter](https://xtools.wmflabs.org/ec)
# to make sure their edit counts and home projects are correct.
# random_state pins the draw, so re-running the notebook on the same frame selects
# the same 10 users and the manual check stays reproducible.
frame.sample(n = 10, random_state = 0)

Unnamed: 0,user,home_proj,global_edits,proj_group,proj_domain
20708,Jampilot,enwiki,142.0,enwiki,en.wikipedia.org
52139,사과우유,kowiki,48.0,asia_wps,ko.wikipedia.org
6208,Boldbdd,mnwiki,39.4,asia_wps,mn.wikipedia.org
10959,Dheillyx,frwiki,148.0,frwiki,fr.wikipedia.org
7041,CHARQUIN,frwiki,2525.4,frwiki,fr.wikipedia.org
9800,Daerl,enwiki,1368.1,enwiki,en.wikipedia.org
11052,Dieschwarzgelben,thwiki,100.0,asia_wps,th.wikipedia.org
23860,Kleiner Stampfi,dewiki,55.0,dewiki,de.wikipedia.org
42260,TEP 60,ruwiki,171.0,ruwiki,ru.wikipedia.org
43688,Tia Canita,eswiki,1568.4,eswiki,es.wikipedia.org


In [89]:
# The spot-checked users look right, so save the enriched frame
# as a tab-separated file, without the row index
frame.to_csv(
    "data/interim/sampling-frame.tsv",
    sep = "\t",
    index = False,
)