In [1]:
from functools import reduce

from cycler import cycler
from dateutil.relativedelta import relativedelta
import matplotlib.pyplot as plt
from numbers import Number
import numpy as np
import pandas as pd
#from scipy import stats as sstats
import wmfdata as wmf
from wmfdata.utils import pct_str, sig_figs
#from wmfdata.charting import pct_fmt, M_fmt, comma_fmt

In [2]:
%matplotlib inline
#wmf.charting.set_mpl_style()

In [3]:
FILE = "metrics/metrics.tsv"
metrics = pd.read_csv(FILE, sep="\t", parse_dates=["month"]).set_index("month")

FILE_D =  "metrics/diversity_metrics.tsv"
diversity_metrics = pd.read_csv(FILE_D, sep="\t", parse_dates=["month"]).set_index("month")

graph_end = metrics.index[-1]
graph_start = graph_end - relativedelta(months = 47)
prior_month = graph_end- relativedelta(months = 1)

# Data transformation

In [4]:
# Transform content flows into content stocks
def stock_to_flow(arr):
    return arr[-1] - arr[0]

flow_cols = [
    "net_new_content_pages", 
    "net_new_Wikipedia_articles", 
    "net_new_Commons_content_pages", 
    "net_new_Wikidata_entities"
]
stock_cols = [
    "content_pages", 
    "Wikipedia_articles", 
    "Commons_content_pages",
    "Wikidata_entities"
]

name_map = {flow_cols[i]: stock_cols[i] for i in range(len(stock_cols))}

stock_metrics = metrics[flow_cols].cumsum().rename(name_map, axis=1)

metrics = pd.concat([metrics, stock_metrics], axis=1)

In [5]:
metrics_all = pd.concat([metrics, diversity_metrics], axis=1)
metrics_all  = metrics_all.loc[:,~metrics_all.columns.duplicated()]

# Report

In [32]:
report_order = [
     #Editors
    'active_editors',
        'new_active_editors',
        'returning_active_editors',
    'new_editor_retention',
    # Content
    'content_pages', 
        'Wikipedia_articles',
        'Commons_content_pages', 
        'Wikidata_entities',
    'net_new_content_pages',
        'net_new_Wikipedia_articles',
        'net_new_Commons_content_pages',
        'net_new_Wikidata_entities',
    'revert_rate',
    'total_edits',
        'mobile_edits',
        'wikidata_edits',
        'uploads',
        'other_nonbot_edits',
        'anonymous_edits',
        'non_anonymous_edits',
    #Diversity Content
    'global_north_net_new_content',
    'global_south_net_new_content',
    'global_north_edits',
    'global_south_edits',
    #Diversity Editors
    'global_north_active_editors',
    'global_south_active_editors',
    #'global_north_new_editor_retention',
    #'global_south_new_editor_retention'

]

In [18]:
def fmt_num(x):
    if isinstance(x, Number) and not pd.isnull(x):
        x = sig_figs(x, 3)
        
        if x < 5:
            return pct_str(x)
        else:
            return "{:,.0f}".format(x)
    else:
        return x

def calc_rpt(ser):   
    cur = ser[-1]
    
    try:
        yr_prev = ser[-13]
        yoy_value = yr_prev 
        yoy_change = (cur / yr_prev) - 1
    except IndexError:
        yoy_change = None
        
    try:
        yoyoyoy_value = ser[-37]
        cagr_change = (cur  / yoyoyoy_value)**(1/4) - 1
        yoyoyoy = (cur / yoyoyoy_value) - 1
    except IndexError:
        cagr = None
        
    
    res = [cur, yoy_change, yoy_value, yoyoyoy, yoyoyoy_value, cagr_change]
    return pd.Series(
        [fmt_num(n) for n in res],
        index=["value", "yoy_change", "yoy_value", "yoyoyoy", "yoyoyoy_value", "3_yr_cagr"]
    )



#TODO Fix lines below, ensure the logic mirrors the logic updated in calc_rpt
def calc_prev_month_rpt(ser):   
    pm_cur = ser[-2]
    
    try:
        pm_yr_prev = ser[-14]
        pm_yoy_change = (pm_cur / pm_yr_prev) - 1
    except IndexError:
        pm_yoy_change = None
        
    try:
        pm_cagr = (ser[-2] / ser[-49])**(1/4) - 1
    except IndexError:
        pm_cagr = None
        
    
    pm_res = [pm_cur, pm_yoy_change, pm_cagr]
    return pd.Series(
        [fmt_num(n) for n in pm_res],
        index=["previous_M_value", "PM_yoy_change", "PM4_yr_cagr"]
    )

In [33]:
def add_month_header(df):
    header = graph_end.strftime("%Y-%m") + " editing and content metrics"
    new_index = pd.MultiIndex.from_product([[header], df.columns])
    
    df.columns = new_index
    
    return df

(
    metrics_all
    .apply(calc_rpt)
    .transpose()
    .reindex(report_order)
    .pipe(add_month_header)
    .style
    .set_table_styles([{
        'selector': 'th.col_heading.level0',
        'props': 'font-size: 1.5em; text-align: center; font-weight: bold;'
    }])
)

Unnamed: 0_level_0,2023-05 editing and content metrics,2023-05 editing and content metrics,2023-05 editing and content metrics,2023-05 editing and content metrics,2023-05 editing and content metrics,2023-05 editing and content metrics
Unnamed: 0_level_1,value,yoy_change,yoy_value,yoyoyoy,yoyoyoy_value,3_yr_cagr
active_editors,91700,-1.4%,93000,-12.2%,105000,-3.2%
new_active_editors,17000,-6.6%,18200,-31.5%,24800,-9.0%
returning_active_editors,74700,-0.1%,74800,-6.3%,79700,-1.6%
new_editor_retention,6.7%,-4.6%,7.0%,-2.9%,6.9%,-0.7%
content_pages,326000000,7.9%,302000000,28.5%,253000000,6.5%
Wikipedia_articles,63500000,4.3%,60900000,17.7%,54000000,4.2%
Commons_content_pages,94700000,12.6%,84100000,52.0%,62300000,11.0%
Wikidata_entities,106000000,6.2%,99800000,20.5%,87900000,4.8%
net_new_content_pages,3210000,120.0%,1460000,-28.8%,4510000,-8.2%
net_new_Wikipedia_articles,202000,10.1%,184000,-49.0%,396000,-15.5%


In [9]:
print("Previous month's report")
(
    metrics_all
    .apply(calc_prev_month_rpt)
    .transpose()
    .reindex(report_order)
    .fillna("—")
)

Previous month's report


Unnamed: 0,previous_M_value,PM_yoy_change,PM4_yr_cagr
month,—,—,—
active_editors,93500,1.9%,2.6%
new_active_editors,18400,3.3%,4.3%
returning_active_editors,75100,1.5%,2.2%
new_editor_retention,8.1%,5.1%,4.4%
content_pages,321000000,7.3%,11.7%
Wikipedia_articles,63100000,4.3%,5.8%
Commons_content_pages,93100000,12.6%,14.7%
Wikidata_entities,105000000,5.6%,16.6%
net_new_content_pages,1740000,8.9%,-0.6%
