This notebook can be used to backfill new or changed metrics across all history (unlike `03-report.ipynb` which runs all metrics for a single month). To do a new backfill, overwrite this code, run your backfill, and then commit the notebook to Git (so it can be overwritten for the next backfill).

In [13]:
import pandas as pd
from wmfdata import hive
from wmfdata.utils import pd_display_all

In [3]:
FILENAME = "metrics/metrics.tsv"

# old metric column names (to be removed)
OLD_COLUMNS = [
    "active_editors",
    "new_active_editors",
    "returning_active_editors"
]

MEDIAWIKI_HISTORY_SNAPSHOT = "2019-04"

In [4]:
old_metrics = (
    pd.read_csv(FILENAME, sep="\t", parse_dates = ["month"])
    .set_index("month")
)

old_metrics[OLD_COLUMNS].tail()

Unnamed: 0_level_0,active_editors,new_active_editors,returning_active_editors
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-12-01,81075.0,14753.0,65458.0
2019-01-01,86776.0,17108.0,68813.0
2019-02-01,82554.0,16335.0,66219.0
2019-03-01,87711.0,17738.0,69973.0
2019-04-01,84358.0,15535.0,66809.0


In [5]:
old_metrics = old_metrics.drop(OLD_COLUMNS, axis=1)

In [8]:
new_metrics = hive.run("""
select
    month,
    count(*) as active_editors,
    sum(cast(registration_month = month as int)) as new_active_editors,
    count(*) - sum(cast(registration_month = month as int)) as returning_active_editors
from (
    select
        cast(month as date) as month,
        user_name,
        sum(content_edits) as content_edits,
        max(bot_by_group) as bot_by_group,
        cast(trunc(min(user_registration), "MONTH") as date) as registration_month
    from neilpquinn.editor_month
    where
        user_id != 0
    group by month, user_name
) global_edits
where
    content_edits >= 5 and
    not bot_by_group and
    user_name not regexp "bot\\b"
group by month
order by month asc
limit 1000
""".format(mediawiki_history_snapshot=MEDIAWIKI_HISTORY_SNAPSHOT)
).assign(month=lambda df: pd.to_datetime(df["month"])).set_index("month")

In [9]:
new_metrics.head()

Unnamed: 0_level_0,active_editors,new_active_editors,returning_active_editors
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2001-01-01,6,6,0
2001-02-01,9,4,5
2001-03-01,19,12,7
2001-04-01,18,5,13
2001-05-01,21,6,15


In [10]:
new_metrics.tail()

Unnamed: 0_level_0,active_editors,new_active_editors,returning_active_editors
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2018-12-01,80796,14662,66134
2019-01-01,86782,17096,69686
2019-02-01,82554,16335,66219
2019-03-01,87711,17738,69973
2019-04-01,84358,15535,68823


In [11]:
metrics = old_metrics.merge(new_metrics, left_index=True, right_index=True)

In [14]:
pd_display_all(metrics["2017":])

Unnamed: 0_level_0,global_south_active_editors,global_south_edits,global_south_nonbot_edits,mobile-heavy_wiki_active_editors,mobile-heavy_wiki_edits,mobile-heavy_wiki_new_editor_retention,mobile-heavy_wiki_nonbot_edits,mobile_edits,net_new_Commons_content_pages,net_new_Wikidata_entities,net_new_Wikipedia_articles,net_new_content_pages,new_editor_retention,other_nonbot_edits,revert_rate,total_edits,uploads,wikidata_edits,active_editors,new_active_editors,returning_active_editors
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2017-01-01,,,,3300.0,894436.0,0.045183,454376.0,900732.0,793270.0,436983.0,412831.0,1899491.0,0.051108,12042844.0,0.105871,38001265.0,832992.0,17069546.0,85452,18368,67084
2017-02-01,,,,3347.0,1093680.0,0.040924,418319.0,813061.0,793644.0,206788.0,387343.0,1627275.0,0.053877,11105361.0,0.114371,38057326.0,843774.0,17940961.0,80461,16642,63819
2017-03-01,,,,3449.0,969064.0,0.047269,431418.0,929213.0,674367.0,183115.0,308826.0,1386866.0,0.071227,11894294.0,0.113531,33612901.0,710482.0,12928116.0,87366,19565,67801
2017-04-01,,,,3383.0,911166.0,0.042733,415346.0,851164.0,595207.0,537677.0,382293.0,1724616.0,0.060161,11099327.0,0.111393,31602306.0,624298.0,8723317.0,82594,16839,65755
2017-05-01,,,,3421.0,729458.0,0.046426,390945.0,854016.0,833390.0,406552.0,424211.0,1977121.0,0.060064,11024825.0,0.107609,41867781.0,862373.0,12451250.0,85949,19482,66467
2017-06-01,,,,3589.0,738181.0,0.04323,419822.0,855413.0,868011.0,1049593.0,433003.0,2736800.0,0.054192,10972893.0,0.098326,38336583.0,898029.0,17156953.0,81598,18713,62885
2017-07-01,,,,4107.0,944735.0,0.06143,455920.0,934109.0,668452.0,2655035.0,329845.0,4035895.0,0.049984,11234467.0,0.091339,39740681.0,702732.0,19022591.0,80095,16304,63791
2017-08-01,,,,3458.0,1039919.0,0.052176,436618.0,1007777.0,456573.0,3874032.0,406120.0,5097291.0,0.050048,11205655.0,0.099401,44526564.0,488750.0,19872017.0,79296,15372,63924
2017-09-01,,,,3234.0,1242240.0,0.049343,409461.0,923963.0,719268.0,3143746.0,392927.0,4448320.0,0.053856,10554861.0,0.101785,41774909.0,758462.0,20179635.0,79624,16857,62767
2017-10-01,,,,3381.0,1045401.0,0.042514,408665.0,999360.0,571720.0,941342.0,185950.0,1886914.0,0.058635,11736089.0,0.103107,40886220.0,611227.0,16790134.0,81021,16400,64621


In [15]:
metrics.to_csv(FILENAME, sep="\t")