In [1]:
# Select matplotlib's `notebook` backend for the figures drawn in this notebook.
%matplotlib notebook

In [3]:
# Snapshot partition of wmf.mediawiki_history that the queries below read from.
HIVE_SNAPSHOT = '2018-01'

# Upper anchor for the look-back windows in the queries below; it is
# interpolated into the SQL as a "yyyy-MM-dd HH:mm:ss" timestamp string.
# NOTE(review): presumably the first instant after the snapshot month, so it
# has to be updated together with HIVE_SNAPSHOT -- confirm.
END_OF_DATA = '2018-02-01 00:00:00'

In [None]:
# Build one row per recently active editor.
#
# A user qualifies when, within the 92 days before END_OF_DATA, they have at
# least 2 calendar months with 5 or more weighted content-namespace edits
# (a Wikidata edit is weighted 0.1, any other edit 1). For each such user the
# query returns:
#   user          - user name (event_user_text)
#   active_months - number of months in the window with >= 5 weighted edits
#   year_edits    - weighted edits across all wikis in the 365 days before
#                   END_OF_DATA (all namespaces, unlike the monthly counts)
#   home_wiki     - wiki on which the user has their highest yearly edit count
#   enwiki_edits  - the user's yearly edit count on enwiki (null if none)
#
# Registered, non-bot, non-"WMF" accounts only; data comes from the
# HIVE_SNAPSHOT partition of wmf.mediawiki_history.
frame = run_hive(
"""
-- TABLE OF WEIGHTED CONTENT EDITS PER USER, PER CALENDAR MONTH
with mo_edits as (
    select
        event_user_text as user,
        year(event_timestamp) as year,
        month(event_timestamp) as month,
        -- A WIKIDATA EDIT COUNTS AS 0.1 OF AN EDIT
        sum(if(wiki_db = "wikidatawiki", 0.1, 1)) as edits
    from wmf.mediawiki_history
    where
        -- REGISTERED
        event_user_is_anonymous = false and

        -- NON-BOT
        event_user_is_bot_by_name = false and
        not array_contains(event_user_groups, "bot") and

        -- NON-WMF
        event_user_text not like "%WMF%" and

        -- CONTENT
        page_namespace_is_content_historical = true and

        -- EDITS
        event_entity = "revision" and
        event_type = "create" and

        -- FROM THE LAST 3 MONTHS (92 DAYS)
        unix_timestamp(event_timestamp, "yyyy-MM-dd HH:mm:ss.0")
            > (unix_timestamp("{end_of_data}") - (60 * 60 * 24 * 92)) and

        -- FROM THE LATEST SNAPSHOT
        snapshot = "{hive_snapshot}"

    -- PER USER, PER MONTH
    group by event_user_text, year(event_timestamp), month(event_timestamp)
),

-- TABLE OF WEIGHTED EDITS (ALL NAMESPACES) PER USER, PER WIKI, OVER THE LAST YEAR
yr_wiki_edits as (
    select
        event_user_text as user,
        wiki_db as wiki,
        -- A WIKIDATA EDIT COUNTS AS 0.1 OF AN EDIT
        sum(if(wiki_db = "wikidatawiki", 0.1, 1)) as edits
    from wmf.mediawiki_history
    where
        -- REGISTERED
        event_user_is_anonymous = false and

        -- NON-BOT
        event_user_is_bot_by_name = false and
        not array_contains(event_user_groups, "bot") and

        -- NON-WMF
        event_user_text not like "%WMF%" and

        -- EDITS
        event_entity = "revision" and
        event_type = "create" and

        -- FROM THE LAST YEAR (365 DAYS)
        unix_timestamp(event_timestamp, "yyyy-MM-dd HH:mm:ss.0")
            > (unix_timestamp("{end_of_data}") - (60 * 60 * 24 * 365)) and

        -- FROM THE LATEST SNAPSHOT
        snapshot = "{hive_snapshot}"

    -- PER USER, PER WIKI
    group by event_user_text, wiki_db
)

-- FINAL SELECT
select
    recent_active.user as user,
    active_months,
    year_edits,
    home_wiki.wiki as home_wiki,
    enwiki.edits as enwiki_edits

-- USERS ACTIVE (5+ WEIGHTED EDITS) IN AT LEAST 2 MONTHS OF THE WINDOW
from (
    select
        user,
        sum(if(edits >= 5, 1, 0)) as active_months
    from mo_edits
    group by user
    having active_months >= 2
) recent_active

-- JOINED TO THEIR YEARLY EDITS
left join (
    select
        user,
        sum(edits) as year_edits
    from yr_wiki_edits
    group by user
) year_edits
on recent_active.user = year_edits.user

-- JOINED TO THEIR MAX PER-WIKI EDIT COUNT
left join (
    select
        user,
        max(edits) as max_edits
    from yr_wiki_edits
    group by user
) max_edits
on recent_active.user = max_edits.user

-- JOINED TO THE WIKI WHERE THEY ACHIEVED THAT MAX
-- MATCH ON THE USER AS WELL AS THE COUNT. MATCHING ON THE COUNT ALONE WOULD
-- PAIR EACH USER WITH EVERY USER-WIKI ROW THAT HAPPENS TO HAVE THE SAME COUNT.
-- A USER WHOSE MAX IS TIED ACROSS SEVERAL WIKIS STILL GETS ONE ROW PER TIED WIKI.
left join
yr_wiki_edits home_wiki
on
    max_edits.user = home_wiki.user and
    max_edits.max_edits = home_wiki.edits

-- JOINED TO THEIR ENWIKI EDIT COUNT
left join
yr_wiki_edits enwiki
on recent_active.user = enwiki.user and enwiki.wiki = "enwiki"
""".format(
        hive_snapshot = HIVE_SNAPSHOT,
        end_of_data = END_OF_DATA
    )
)

In [15]:
# Preview the first rows only instead of rendering the whole result set.
frame.head()

Unnamed: 0,user,active_months,year_edits,home_wiki
0,WhatamIdoing,3,2998.0,enwiki
1,-revi,3,13103.3,commonswiki
2,Martin Urbanec,3,18228.0,cswiki


In [None]:
# Number of users with at least 30 edits in the `last_year_edits` column.
# NOTE(review): `kowiki_edits` is not assigned in any visible cell; it must
# already exist in the kernel for this cell to run -- confirm where it is built.
int((kowiki_edits["last_year_edits"] >= 30).sum())

In [None]:
# Histogram of users by yearly edit count, binned on a log10 scale.
#
# The bin edges are written as raw edit counts and converted to log10 so that
# the x ticks can be labelled with the real counts; the values plotted are
# log10(edits), so unlabelled ticks would read 0..5 instead of 1..70000.
# Users whose count falls outside the outermost edges are not drawn.
#
# `plt` is used as the top-level matplotlib package here (`plt.pyplot`), and
# `np` as numpy. NOTE(review): neither they nor `kowiki_edits` are defined in
# the visible cells -- confirm they are set up earlier in the notebook.
edit_bins = [1, 5, 10, 25, 100, 500, 1000, 10000, 70000]
log_bins = np.log10(edit_bins)

fig, ax = plt.pyplot.subplots()
ax.hist(np.log10(kowiki_edits["last_year_edits"]), bins = log_bins)

# Label each bin edge with the edit count it stands for.
ax.set_xticks(log_bins)
ax.set_xticklabels([str(edge) for edge in edit_bins], rotation = 45)

ax.set_title("Yearly edit distribution on the Korean Wikipedia")
ax.set_xlabel("edits per year (log scale)")
ax.set_ylabel("number of users")
plt.pyplot.show()

In [None]:
# Per-user count of revisions created on the wiki named by WIKI during the
# 365 days before END_OF_DATA, for registered users not flagged as bots by
# name. Result columns: `user`, `last_year_edits`.
#
# The window is anchored on END_OF_DATA, not on the current time, because the
# snapshot is pinned: a window relative to "now" returns fewer and fewer rows
# the later the cell is re-run.
#
# NOTE(review): WIKI is not defined in the visible cells -- confirm it is set
# earlier in the notebook.
global_edits = run_hive(
"""
select
    event_user_text as user,
    count(*) as last_year_edits
from wmf.mediawiki_history
where
    -- FROM THE LAST YEAR (365 DAYS)
    unix_timestamp(event_timestamp, "yyyy-MM-dd HH:mm:ss.0")
        > (unix_timestamp("{end_of_data}") - (60 * 60 * 24 * 365)) and

    -- NON-BOT (BY NAME ONLY, THE GROUP CHECK IS DISABLED)
    event_user_is_bot_by_name = false and
    -- not array_contains(event_user_groups, "bot") and

    -- REGISTERED (THE EDITOR OF THE REVISION, HENCE THE event_user_ COLUMN)
    event_user_is_anonymous = false and

    -- ON THE SELECTED WIKI
    wiki_db = "{wiki}" and

    -- EDITS
    event_entity = "revision" and
    event_type = "create" and

    -- FROM THE LATEST SNAPSHOT
    snapshot = "{hive_snapshot}"
group by event_user_text
""".format(
        wiki = WIKI,
        hive_snapshot = HIVE_SNAPSHOT,
        end_of_data = END_OF_DATA
    )
)

In [None]:
# Sanity check: show event_timestamp next to year() and month() of the same
# value for 10 rows. The snapshot comes from HIVE_SNAPSHOT so this check reads
# the same partition as the other queries in the notebook.
run_hive(
"""
select
    event_timestamp,
    year(event_timestamp) as year,
    month(event_timestamp) as month
from wmf.mediawiki_history
where snapshot = "{hive_snapshot}"
limit 10
""".format(
        hive_snapshot = HIVE_SNAPSHOT
    )
)