In [41]:
import pandas as pd
import wmfdata as wmf
from wmfdata.utils import sig_figs
from numbers import Number

In [65]:
%matplotlib inline

In [2]:
# Wiki database names used by the "Mobile-heavy wikis" queries, which select
# rows with `wiki_db in (...)`. The tuple is pasted into HiveQL via repr(), so
# it must keep at least two elements (a one-element tuple's repr ends in a comma).
mob_wikis = (
    "hiwiki", "bnwiki", "idwiki", "arwiki", "mrwiki",
    "fawiki", "swwiki", "tlwiki", "zhwikiquote", "thwiki",
    "arzwiki", "mlwiki", "tawiki", "knwiki", "ptwiktionary",
    "azwiki", "guwiki", "kywiki", "sqwiki", "mswiki",
)

# Two-letter country codes that the "Global South countries" queries exclude
# (`country_code not in (...)`): any editor located outside this list is
# counted as Global South. Order is kept as-is because the tuple's repr() is
# pasted verbatim into the HiveQL text.
gn_countries = (
    # Alphabetical run AD-VA
    "AD", "AL", "AT", "AX", "BA", "BE", "BG",
    "CH", "CY", "CZ", "DE", "DK",
    "EE", "ES", "FI", "FO", "FR", "FX",
    "GB", "GG", "GI", "GL", "GR",
    "HR", "HU", "IE", "IL", "IM", "IS", "IT",
    "JE", "LI", "LU", "LV",
    "MC", "MD", "ME", "MK", "MT", "NL", "NO",
    "PL", "PT", "RO", "RS", "RU",
    "SE", "SI", "SJ", "SK", "SM",
    "TR", "VA",
    # Ten further codes appended after the alphabetical run
    "AU", "CA", "HK", "MO", "NZ", "JP", "SG", "KR", "TW", "US",
)

# Global South countries

In [None]:
# Monthly edit counts (total and non-bot) by registered editors located outside
# the gn_countries list.
#   * CTE `gs_editors`: distinct (wiki_db, user) pairs from wmf.geoeditors_daily
#     with month >= start, country_code not in gn_countries, user_is_anonymous = 0.
#   * Those pairs are joined to revision-create events in wmf.mediawiki_history
#     (one snapshot) and edits made since `start` are counted per calendar month.
#   * repr(gn_countries) renders the Python tuple as the parenthesised, quoted
#     list that follows `not in`.
# NOTE(review): the WHERE clause filters on mediawiki_history columns, so the
#   LEFT JOIN behaves like an inner join -- confirm that is intended.
# NOTE(review): the CTE is not grouped by month, so a user matched in any month
#   since `start` contributes their edits for every month -- confirm.
gs_edits = wmf.hive.run("""
with gs_editors as (
    select 
        wiki_db,
        user_fingerprint_or_id as user_id
    from wmf.geoeditors_daily
    where
        month >= "{start}" and
        country_code not in {gn_countries} and
        user_is_anonymous = 0
    group by wiki_db, user_fingerprint_or_id
)
select
    date_format(event_timestamp, "yyyy-MM-01") as month,
    count(*) as total_edits,
    sum(cast(
        !(event_user_is_bot_by_name or array_contains(event_user_groups, "bot")
    ) as int)) as nonbot_edits
from gs_editors ge
left join wmf.mediawiki_history mh
on
    ge.wiki_db = mh.wiki_db and
    ge.user_id = event_user_id and
    snapshot = "{snapshot}"
where
    event_entity = "revision" and
    event_type = "create" and
    event_timestamp >= "{start}"
group by date_format(event_timestamp, "yyyy-MM-01")
""".format(
    gn_countries=repr(gn_countries),
    snapshot="2018-08",
    start="2018-04"
))

In [18]:
gs_edits

Unnamed: 0,month,total_edits,nonbot_edits
0,2018-04-01,7810767,7810635
1,2018-05-01,8769063,8768780
2,2018-06-01,8501262,8500910
3,2018-07-01,8370261,8370057
4,2018-08-01,9563086,9562694


In [5]:
# Monthly count of "active editors" among registered editors located outside
# the gn_countries list.
#   * CTE `gs_editors` (unrelated to the Python variable of the same name):
#     distinct (wiki_db, user) pairs from wmf.geoeditors_daily, same filter as
#     in the edit-count query.
#   * Inner query: non-bot revision-create events in content namespaces since
#     `start`, counted per (event_user_text, month). It is not grouped by wiki,
#     so a user name's edits on all matched wikis are pooled.
#   * Outer query: per month, how many of those rows have content_edits >= 5.
# NOTE(review): as in the edit-count query, the WHERE clause on
#   mediawiki_history columns makes the LEFT JOIN act like an inner join.
gs_editors = wmf.hive.run("""
with gs_editors as (
    select 
        wiki_db,
        user_fingerprint_or_id as user_id
    from wmf.geoeditors_daily
    where
        month >= "{start}" and
        country_code not in {gn_countries} and
        user_is_anonymous = 0
    group by wiki_db, user_fingerprint_or_id
)
select
    month,
    sum(cast(content_edits >= 5 as int)) as active_editors
from (
    select
        date_format(event_timestamp, "yyyy-MM-01") as month,
        count(*) as content_edits
    from gs_editors ge
    left join wmf.mediawiki_history mh
    on
        ge.wiki_db = mh.wiki_db and
        ge.user_id = event_user_id and
        snapshot = "{snapshot}"
    where
        event_entity = "revision" and
        event_type = "create" and
        event_timestamp >= "{start}" and
        page_namespace_is_content = 1 and
        !(event_user_is_bot_by_name or array_contains(event_user_groups, "bot"))
    group by event_user_text, date_format(event_timestamp, "yyyy-MM-01")
) combined_eds
group by month
""".format(
    gn_countries=repr(gn_countries),
    snapshot="2018-08",
    start="2018-04"
))

In [6]:
gs_editors

Unnamed: 0,month,active_editors
0,2018-04-01,14635
1,2018-05-01,16072
2,2018-06-01,21617
3,2018-07-01,22605
4,2018-08-01,22379


In [7]:
# New-editor counts and second-month retention, per registration-month cohort,
# for registered editors located outside the gn_countries list.
#   * CTE `gs_edits`: wmf.geoeditors_daily rows joined to non-bot
#     revision-create events in wmf.mediawiki_history, restricted to users who
#     were not created by the system and registered in [start, end).
#   * `1st_month`: per (user, registration time, wiki), edits made less than
#     30 days after registration.
#   * `2nd_month`: the same grouping for edits made 30 to <60 days after
#     registration.
#   * Result per cohort (first 7 characters of the registration timestamp):
#     new_editors = users with >= 1 first-window edit; new_editor_retention =
#     users with >= 1 second-window edit divided by new_editors.
# NOTE(review): geoeditors_daily rows are not de-duplicated before the join
#   (the CTEs in the two earlier Global South queries group by wiki/user), so
#   `edits` may count a revision more than once if a user has several matching
#   rows. Only `edits >= 1` is used, so the reported figures should not depend
#   on it -- confirm.
# NOTE(review): end="2018-09" lies past the "2018-08" snapshot, so the latest
#   cohorts presumably cannot have a complete 60-day window; the mobile-heavy
#   retention query uses end="2018-07" with the same snapshot -- confirm which
#   is intended.
gs_ner = wmf.hive.run("""
with gs_edits as (
    select 
        gd.wiki_db,
        event_user_text as user_name,
        event_timestamp as edit_dt,
        event_user_creation_timestamp as registration_dt
    from wmf.geoeditors_daily gd
    left join wmf.mediawiki_history mh
    on
        gd.wiki_db = mh.wiki_db and
        gd.user_fingerprint_or_id = event_user_id and
        snapshot = "{snapshot}"
    where
        month >= "{start}" and
        country_code not in {gn_countries} and
        gd.user_is_anonymous = 0 and
        event_entity = "revision" and
        event_type = "create" and
        event_timestamp >= "{start}" and
        event_user_is_created_by_system = 0 and
        event_user_creation_timestamp >= "{start}" and
        event_user_creation_timestamp < "{end}" and
        not (event_user_is_bot_by_name or array_contains(event_user_groups, "bot"))
)
select 
    1st_month.cohort,
    sum(cast(1st_month.edits >= 1 as int)) as new_editors,
    sum(cast(2nd_month.edits >= 1 as int)) / sum(cast(1st_month.edits >= 1 as int)) as new_editor_retention
from (
    select
        user_name,
        wiki_db as wiki,
        substr(registration_dt, 0, 7) as cohort,
        count(*) as edits
    from gs_edits
    where
        unix_timestamp(edit_dt, "yyyy-MM-dd HH:mm:ss.0") <
            (unix_timestamp(registration_dt, "yyyy-MM-dd HH:mm:ss.0") + (30*24*60*60))
    group by user_name, registration_dt, wiki_db
    ) 1st_month
left join (
    select
        user_name,
        wiki_db as wiki,
        substr(registration_dt, 0, 7) as cohort,
        count(*) as edits
    from gs_edits
    where
        unix_timestamp(edit_dt, "yyyy-MM-dd HH:mm:ss.0") >=
            (unix_timestamp(registration_dt, "yyyy-MM-dd HH:mm:ss.0") + (30*24*60*60)) and
        unix_timestamp(edit_dt, "yyyy-MM-dd HH:mm:ss.0") <
            (unix_timestamp(registration_dt, "yyyy-MM-dd HH:mm:ss.0") + (60*24*60*60))
        group by user_name, registration_dt, wiki_db
    ) 2nd_month
on
    (1st_month.user_name = 2nd_month.user_name and
    1st_month.wiki = 2nd_month.wiki and
    1st_month.cohort = 2nd_month.cohort)
group by 1st_month.cohort
""".format(
    gn_countries=repr(gn_countries),
    snapshot="2018-08",
    start="2018-04",
    end="2018-09"
))

In [8]:
gs_ner

Unnamed: 0,1st_month.cohort,new_editors,new_editor_retention
0,2018-04,2017,0.531978
1,2018-05,4004,0.384116
2,2018-06,34730,0.047855
3,2018-07,35639,0.033615
4,2018-08,36622,0.000109


# Mobile-heavy wikis

In [42]:
def fmt_num(x):
    """
    Format a report value for display.

    Non-numeric inputs (anything that is not a ``numbers.Number``) are
    returned unchanged. Numeric inputs are first passed through
    ``sig_figs(x, 3)``; a result below 5 is treated as a proportion and
    rendered by ``pct``, anything else is rendered as a comma-grouped integer.

    Note that the ``< 5`` test is purely numeric, so any negative value also
    takes the percentage branch.
    """
    if not isinstance(x, Number):
        return x

    rounded = sig_figs(x, 3)
    return pct(rounded) if rounded < 5 else "{:,.0f}".format(rounded)

def pct(n):
    """Render ``n`` as a percentage with one decimal place (0.134 -> "13.4%")."""
    return format(n, ".1%")

def find_latest(ser):
    """
    Return the date of the last non-null observation in ``ser``.

    The series' index labels must provide ``.date()`` (e.g. pandas
    Timestamps). "Last" is positional, so the series should already be in
    chronological order.
    """
    has_value = ~ser.isnull()
    last_label = ser.loc[has_value].index[-1]
    return last_label.date()

def calc_rpt(ser):
    """
    Summarise the latest value of a monthly series and its recent changes.

    Parameters
    ----------
    ser : pandas.Series
        Observations in chronological order whose index labels provide
        ``.date()`` (e.g. a DatetimeIndex). Null entries are dropped before
        anything is computed.

    Returns
    -------
    pandas.Series
        Entries ``latest_month``, ``value``, ``mom_change`` and ``yoy_change``,
        each passed through ``fmt_num``. A change is ``None`` when the series
        has too few non-null points to compute it (2 are needed for the
        month-over-month change, 13 for the year-over-year change).

    Raises
    ------
    IndexError
        If ``ser`` contains no non-null values.
    """
    nn_ser = ser[~ser.isnull()]
    cur_mo = nn_ser.index[-1].date()
    cur = nn_ser.iloc[-1]

    def change_vs(steps_back):
        # Look-back is positional, so it assumes one row per month with no
        # gaps. Short series yield None instead of raising IndexError.
        if len(nn_ser) <= steps_back:
            return None
        return (cur / nn_ser.iloc[-1 - steps_back]) - 1

    mom_change = change_vs(1)
    yoy_change = change_vs(12)

    res = [cur_mo, cur, mom_change, yoy_change]
    return pd.Series(
        # fmt_num returns non-numeric values (the date, None) unchanged.
        [fmt_num(n) for n in res],
        index=["latest_month", "value", "mom_change", "yoy_change"]
    )

In [9]:
# Monthly edit counts on the mob_wikis set, taken from one
# wmf.mediawiki_history snapshot: revision-create events by non-anonymous
# users with start <= event_timestamp < end.
#   * total_edits  = all such edits (bot edits included)
#   * nonbot_edits = those whose user is neither a bot by name nor in the
#     "bot" user group
# repr(mob_wikis) renders the Python tuple as the parenthesised, quoted list
# that follows `wiki_db in`.
mh_edits = wmf.hive.run("""
select
    date_format(event_timestamp, "yyyy-MM-01") as month,
    count(*) as total_edits,
    sum(cast(not (event_user_is_bot_by_name or array_contains(event_user_groups, "bot")) as int)) as nonbot_edits
from wmf.mediawiki_history
where
    snapshot = "{snapshot}" and
    event_entity = "revision" and
    event_type = "create" and
    wiki_db in {wikis} and
    event_timestamp >= "{start}" and
    event_timestamp < "{end}" and
    not event_user_is_anonymous
group by date_format(event_timestamp, "yyyy-MM-01")
""".format(
    snapshot="2018-08",
    start="2017-06",
    end="2018-09",
    wikis=repr(mob_wikis)
))

In [24]:
# Parse the "yyyy-MM-01" month strings into timestamps and use them as the index.
mh_edits = (
    mh_edits
    .assign(month=pd.to_datetime(mh_edits["month"]))
    .set_index("month")
)

In [30]:
# The query has no ORDER BY, and calc_rpt reads rows by position
# (iloc[-1], iloc[-2], iloc[-13]), so put the months in chronological order.
mh_edits = mh_edits.sort_index()

In [45]:
# Latest-month report for all edits by non-anonymous users (bots included)
# on the mob_wikis set.
calc_rpt(mh_edits["total_edits"])

latest_month    2018-08-01
value            1,210,000
mom_change           13.4%
yoy_change           16.4%
dtype: object

In [46]:
# Latest-month report for non-bot edits by non-anonymous users on the
# mob_wikis set.
calc_rpt(mh_edits["nonbot_edits"])

latest_month    2018-08-01
value              487,000
mom_change            6.1%
yoy_change           10.6%
dtype: object

In [None]:
# Monthly count of "active editors" on the mob_wikis set.
#   * Inner query: revision-create events in content namespaces by
#     non-anonymous, non-bot users with start <= event_timestamp < end,
#     counted per (month, event_user_text). It is not grouped by wiki, so a
#     user name's edits across all the listed wikis are pooled.
#   * Outer query: per month, how many of those rows have content_edits >= 5.
mh_editors = wmf.hive.run("""
select
    month,
    sum(cast(content_edits >= 5 as int)) as active_editors
from (
    select
        date_format(event_timestamp, "yyyy-MM-01") as month,
        count(*) as content_edits
    from wmf.mediawiki_history
    where
        snapshot = "{snapshot}" and
        event_entity = "revision" and
        event_type = "create" and
        wiki_db in {wikis} and
        event_timestamp >= "{start}" and
        event_timestamp < "{end}" and
        not event_user_is_anonymous and
        not (event_user_is_bot_by_name or array_contains(event_user_groups, "bot")) and
        page_namespace_is_content
    group by date_format(event_timestamp, "yyyy-MM-01"), event_user_text
) editors
group by month
""".format(
    snapshot="2018-08",
    start="2017-06",
    end="2018-09",
    wikis=repr(mob_wikis)
))

In [51]:
# Order the months chronologically (calc_rpt reads rows by position), then
# parse the month strings into timestamps and use them as the index.
mh_editors = (
    mh_editors
    .sort_values("month")
    .assign(month=lambda frame: pd.to_datetime(frame["month"]))
    .set_index("month")
)

mh_editors.tail()

Unnamed: 0_level_0,active_editors
month,Unnamed: 1_level_1
2018-04-01,3615
2018-05-01,3690
2018-06-01,3429
2018-07-01,3690
2018-08-01,3754


In [52]:
# Latest-month report for active editors (>= 5 content edits in the month)
# on the mob_wikis set.
calc_rpt(mh_editors["active_editors"])

latest_month    2018-08-01
value                3,750
mom_change            1.7%
yoy_change            9.6%
dtype: object

In [13]:
# New-editor counts and second-month retention, per registration-month cohort,
# on the mob_wikis set.
#   * CTE `mh_edits` (unrelated to the Python variable of the same name):
#     revision-create events since `start` by non-anonymous, non-bot users who
#     were not created by the system and registered in [start, end).
#   * `1st_month`: per (user, registration time, wiki), edits made less than
#     30 days after registration.
#   * `2nd_month`: the same grouping for edits made 30 to <60 days after
#     registration.
#   * Result per cohort (first 7 characters of the registration timestamp):
#     new_editors = users with >= 1 first-window edit; new_editor_retention =
#     users with >= 1 second-window edit divided by new_editors.
# The query has no ORDER BY, so sort the result by cohort before reading rows
# by position.
mh_ner = wmf.hive.run("""
with mh_edits as (
    select 
        wiki_db,
        event_user_text as user_name,
        event_timestamp as edit_dt,
        event_user_creation_timestamp as registration_dt
    from wmf.mediawiki_history        
    where
        snapshot = "{snapshot}" and
        event_entity = "revision" and
        event_type = "create" and
        event_timestamp >= "{start}" and
        wiki_db in {wikis} and
        not event_user_is_anonymous and
        not event_user_is_created_by_system and
        event_user_creation_timestamp >= "{start}" and
        event_user_creation_timestamp < "{end}" and
        not (event_user_is_bot_by_name or array_contains(event_user_groups, "bot"))
)
select 
    1st_month.cohort,
    sum(cast(1st_month.edits >= 1 as int)) as new_editors,
    sum(cast(2nd_month.edits >= 1 as int)) / sum(cast(1st_month.edits >= 1 as int)) as new_editor_retention
from (
    select
        user_name,
        wiki_db as wiki,
        substr(registration_dt, 0, 7) as cohort,
        count(*) as edits
    from mh_edits
    where
        unix_timestamp(edit_dt, "yyyy-MM-dd HH:mm:ss.0") <
            (unix_timestamp(registration_dt, "yyyy-MM-dd HH:mm:ss.0") + (30*24*60*60))
    group by user_name, registration_dt, wiki_db
    ) 1st_month
left join (
    select
        user_name,
        wiki_db as wiki,
        substr(registration_dt, 0, 7) as cohort,
        count(*) as edits
    from mh_edits
    where
        unix_timestamp(edit_dt, "yyyy-MM-dd HH:mm:ss.0") >=
            (unix_timestamp(registration_dt, "yyyy-MM-dd HH:mm:ss.0") + (30*24*60*60)) and
        unix_timestamp(edit_dt, "yyyy-MM-dd HH:mm:ss.0") <
            (unix_timestamp(registration_dt, "yyyy-MM-dd HH:mm:ss.0") + (60*24*60*60))
        group by user_name, registration_dt, wiki_db
    ) 2nd_month
on
    (1st_month.user_name = 2nd_month.user_name and
    1st_month.wiki = 2nd_month.wiki and
    1st_month.cohort = 2nd_month.cohort)
group by 1st_month.cohort
""".format(
    snapshot="2018-08",
    start="2017-04",
    end="2018-07",
    wikis=repr(mob_wikis)
))

In [54]:
mh_ner

Unnamed: 0,1st_month.cohort,new_editors,new_editor_retention
0,2017-04,8227,0.042907
1,2017-05,7531,0.061346
2,2017-06,7390,0.052233
3,2017-07,7837,0.049381
4,2017-08,7617,0.042405
5,2017-09,7363,0.049301
6,2017-10,8550,0.044444
7,2017-11,8032,0.042953
8,2017-12,8298,0.043022
9,2018-01,8693,0.044173


In [61]:
# Sort cohorts chronologically before taking the series: the cells below read
# it by position (iloc[-1], iloc[-2], iloc[-13]) and the Hive query that
# produced mh_ner has no ORDER BY, so row order is otherwise not guaranteed.
# Cohort labels are "yyyy-MM" strings, so lexical order is chronological.
ner = mh_ner.sort_values("1st_month.cohort")["new_editor_retention"]

In [62]:
# Month-over-month change: latest cohort's retention relative to the one before it
ner.iloc[-1] / ner.iloc[-2] - 1

0.14475587044955618

In [63]:
# Year-over-year change: latest cohort's retention relative to the cohort 12 rows earlier
ner.iloc[-1] / ner.iloc[-13] - 1

-0.09940100618232217