In [1]:
import pandas as pd
from wmfdata import hive
from wmfdata.utils import pct_str

You can find the source for `wmfdata` at https://github.com/neilpquinn/wmfdata


## Global South new editor retention

In [2]:
# List the monthly partitions present in wmf.geoeditors_daily. The cohort query
# below filters on gd.month, so the cohort month has to be among these to return rows.
partition_listing_sql = "show partitions wmf.geoeditors_daily"
hive.run(partition_listing_sql)

Unnamed: 0,partition
0,month=2018-11
1,month=2018-12
2,month=2019-01
3,month=2019-02
4,month=2019-03
5,month=2019-04
6,month=2019-05


In [5]:
# Prerequisite: neilpquinn.new_editors must already contain the cohort month
# (01b-new_editor_table.ipynb refreshes it); until then this query cannot succeed.
# NOTE(review): `date_params` is only assigned in a later cell of this notebook, so on a
# fresh kernel that cell has to be run before this one.
#
# The query returns one row per (user_name, wiki, 1st_month_edits, 2nd_month_edits) for
# new editors of the retention cohort. Both joins are written as left joins, but the
# WHERE filters on gd.month and economic_region discard the unmatched rows again.
gs_new_editors_sql = """
select 
    user_name, 
    wiki,
    1st_month_edits,
    2nd_month_edits
from neilpquinn.new_editors ne
left join wmf.geoeditors_daily gd
on
    ne.user_id = gd.user_fingerprint_or_id and
    ne.wiki = gd.wiki_db and
    ne.cohort = gd.month
left join canonical_data.countries cdc
on gd.country_code = cdc.iso_code
where
    ne.cohort = "{retention_cohort}" and
    gd.month = "{retention_cohort}" and
    economic_region = "Global South"
group by user_name, wiki, 1st_month_edits, 2nd_month_edits
"""
gs_new_editors = hive.run(gs_new_editors_sql.format(**date_params))

In [6]:
# Retention = share of the cohort that made at least one edit in their second month.
edited_in_second_month = gs_new_editors["2nd_month_edits"] >= 1
gs_second_mo_eds = gs_new_editors[edited_in_second_month]
gs_retention_share = len(gs_second_mo_eds) / len(gs_new_editors)
print(pct_str(gs_retention_share, decimals=2))

4.30%


# Prototyping integration into 02-calculation.ipynb

In [7]:
# mediawiki_history snapshot to query; it must not be older than the metrics month.
MEDIAWIKI_HISTORY_SNAPSHOT = "2019-05"
# Month (YYYY-MM) for which the metrics are calculated.
METRICS_MONTH_TEXT = "2019-05"

In [8]:
# Derive every date string the query templates need from the single metrics month.
metrics_month = pd.Period(METRICS_MONTH_TEXT)
following_month = metrics_month + 1

# Daily-frequency periods for the first/last day of the metrics month and for the
# first day of the month after it.
month_first_day = metrics_month.asfreq("D", how="start")
month_last_day = metrics_month.asfreq("D", how="end")
following_month_first_day = following_month.asfreq("D", how="start")

date_params = {
    "mediawiki_history_snapshot": MEDIAWIKI_HISTORY_SNAPSHOT,
    "metrics_month": str(metrics_month),
    # Timestamps bounding the month: its start, and the start of the next month.
    "metrics_month_start": str(metrics_month.start_time),
    "metrics_month_first_day": str(month_first_day),
    "metrics_month_end": str(following_month.start_time),
    "metrics_month_last_day": str(month_last_day),
    # Same boundaries formatted as compact YYYYMMDD strings.
    "api_metrics_month_first_day": month_first_day.strftime("%Y%m%d"),
    "api_metrics_month_day_after": following_month_first_day.strftime("%Y%m%d"),
    "metrics_prev_month": str(metrics_month - 1),
    # The retention cohort is the month two months before the metrics month.
    "retention_cohort": str(metrics_month - 2),
}

In [9]:
# Sanity check: the retention cohort is derived as the metrics month minus two months.
date_params["retention_cohort"]

'2019-03'

In [None]:
# Global South new editor retention computed entirely in Hive, returning a single row
# (month, global_south_new_editor_retention).
#
# - The CTE yields one row per (user_name, wiki, 1st_month_edits, 2nd_month_edits) for new
#   editors of the retention cohort whose geoeditors_daily rows for that month join to a
#   country with economic_region = "Global South" (economic_region is presumably a column
#   of canonical_data.countries -- confirm).
# - Retention = editors with at least one second-month edit / all editors in the CTE.
#   The comparison is cast to int and summed so that numerator and denominator are both
#   aggregates over the whole cohort; this mirrors the pandas calculation done on
#   gs_new_editors earlier in the notebook's logic (len(filtered) / len(all)).
#
# Requires `date_params` with the keys "retention_cohort" and "metrics_month_first_day".
hive.run("""
with gs_new_editors as (
    select
        user_name,
        wiki,
        1st_month_edits,
        2nd_month_edits
    from neilpquinn.new_editors ne
    left join wmf.geoeditors_daily gd
    on
        ne.user_id = gd.user_fingerprint_or_id and
        ne.wiki = gd.wiki_db and
        ne.cohort = gd.month
    left join canonical_data.countries cdc
    on gd.country_code = cdc.iso_code
    where
        ne.cohort = "{retention_cohort}" and
        gd.month = "{retention_cohort}" and
        economic_region = "Global South"
    group by user_name, wiki, 1st_month_edits, 2nd_month_edits
)
select
    "{metrics_month_first_day}" as month,
    sum(cast(2nd_month_edits >= 1 as int)) / count(*) as global_south_new_editor_retention
from gs_new_editors
""".format(
    **date_params
))