# Imports

In [1]:
import datetime
import time
from functools import reduce
from pathlib import Path

import pandas as pd
import requests
from wmfdata import hive, mariadb
from wmfdata.utils import print_err, pd_display_all

# Parameters

In [2]:
# Path of the TSV file in which the metrics are, or will be, stored
FILENAME = "metrics/diversity_metrics.tsv"

# By default the metrics month is the most recently completed calendar month:
# stepping back one day from the first of the current month lands on the last
# day of the previous month.
# To compute a specific month instead, set both values by hand, e.g.
# METRICS_MONTH_TEXT = '2019-07'
# MEDIAWIKI_HISTORY_SNAPSHOT = '2019-07'
# The mediawiki_history snapshot must be from the metrics month or later.
first_of_this_month = datetime.date.today().replace(day=1)
last_month = first_of_this_month - datetime.timedelta(days=1)

METRICS_MONTH_TEXT = last_month.strftime("%Y-%m")
MEDIAWIKI_HISTORY_SNAPSHOT = METRICS_MONTH_TEXT


# Preparation

In [3]:
# Derive every date representation the queries need from the metrics month and
# gather them in one dict, ready to be substituted into format strings.
metrics_month = pd.Period(METRICS_MONTH_TEXT)
next_month = metrics_month + 1
month_first_day = metrics_month.asfreq("D", how="start")
next_month_first_day = next_month.asfreq("D", how="start")

date_params = dict(
    mediawiki_history_snapshot=MEDIAWIKI_HISTORY_SNAPSHOT,
    metrics_month=str(metrics_month),
    metrics_month_first_day=str(month_first_day),
    metrics_next_month_first_day=str(next_month_first_day),
    # Unlike the day-based entries, this one is the string form of a timestamp
    # (the start of the following month)
    metrics_month_end=str(next_month.start_time),
    metrics_month_last_day=str(metrics_month.asfreq("D", how="end")),
    # Compact YYYYMMDD variants
    api_metrics_month_first_day=month_first_day.strftime("%Y%m%d"),
    api_metrics_month_day_after=next_month_first_day.strftime("%Y%m%d"),
    metrics_prev_month=str(metrics_month - 1),
    # The retention cohort is the month two months before the metrics month
    retention_cohort=str(metrics_month - 2),
)

# Pick up the metrics saved by earlier runs, indexed by month. A missing file
# just means there is nothing to combine with yet.
try:
    saved_metrics = pd.read_csv(FILENAME, sep="\t", parse_dates=["month"])
except FileNotFoundError:
    old_metrics = None
else:
    old_metrics = saved_metrics.set_index("month")
    
def prepare_query(filename):
    """Read the query template at `filename` and fill in the date parameters.

    The file's text is run through `str.format` with the entries of the
    module-level `date_params` dict, so each `{name}` placeholder in the
    template is replaced by the matching value.

    Returns the formatted query text as a string.
    """
    template = Path(filename).read_text()
    return template.format(**date_params)

# MariaDB and Hive query metrics

In [4]:
# Each metric exists in a Global South and a Global North variant. Every entry
# maps the metric name to the file holding its query and the engine that runs it.
# Pairs are (metric name suffix, query file suffix); the two differ only for
# the Wikidata metric.
QUERY_SUFFIXES = [
    ("edits_editors", "edits_editors"),
    ("new_editor_retention", "new_editor_retention"),
    ("net_new_content", "net_new_content"),
    ("wikidata_entities", "net_new_wikidata"),
]

# All Global South entries come first, followed by the Global North ones.
# A fresh inner dict is built per entry, so each can be extended independently.
queries = {
    "global_{}_{}".format(region, metric): {
        "file": "queries/global_{}_{}.hql".format(region, query_name),
        "engine": "hive"
    }
    for region in ("south", "north")
    for metric, query_name in QUERY_SUFFIXES
}


# Run every configured query on its engine and store the resulting dataframe
# on the query's own entry under "result".
for key, val in queries.items():
    query = prepare_query(val["file"])
    engine = val["engine"]
    print_err("Running {} on {}...".format(key, engine))

    if engine == "mariadb":
        # `mariadb` is imported from wmfdata with the other dependencies at the
        # top of the notebook; previously it was never imported, so this branch
        # could only fail with a NameError.
        # NOTE(review): confirm that `mariadb.run` in the installed wmfdata
        # version accepts the query as its only argument.
        result = mariadb.run(query)
    elif engine == "hive":
        result = hive.run(query)
    else:
        raise ValueError("Unknown engine specified.")

    # Every result is expected to carry a `month` column; convert it to
    # datetimes so all results share the same month type.
    result = result.assign(month=lambda df: pd.to_datetime(df["month"]))
    val["result"] = result

Running global_south_edits_editors on hive...
Running global_south_new_editor_retention on hive...
Running global_south_net_new_content on hive...
Running global_south_wikidata_entities on hive...
Running global_north_edits_editors on hive...
Running global_north_new_editor_retention on hive...
Running global_north_net_new_content on hive...
Running global_north_wikidata_entities on hive...


# Combining and saving metrics

In [5]:
# Collect the dataframe stored on each query entry, in definition order
results = [entry["result"] for entry in queries.values()]


In [6]:
def _outer_merge(left, right):
    """Outer-join two result frames on the columns they have in common."""
    return pd.merge(left, right, how="outer")


# Fold all results into one frame; this relies on the month being the only
# column the frames share. The month then becomes the (sorted) index, which
# combine_first needs to work properly, and every value is cast to float.
new_metrics = (
    reduce(_outer_merge, results)
    .set_index("month")
    .sort_index()
    .astype(float)
)

# Freshly computed values take precedence; previously saved values only fill
# in what the new results do not provide.
metrics = new_metrics if old_metrics is None else new_metrics.combine_first(old_metrics)

pd_display_all(metrics.tail())

Unnamed: 0_level_0,global_north_active_editors,global_north_edits,global_north_net_new_content,global_north_new_editor_retention,global_north_nonbot_edits,global_north_wikidata_entities,global_south_active_editors,global_south_edits,global_south_net_new_content,global_south_new_editor_retention,global_south_nonbot_edits,global_south_wikidata_entities
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2021-07-01,69747.0,20691492.0,1859537.0,0.063689,10778187.0,293369.0,25186.0,3184875.0,197044.0,0.05392,2744544.0,38198.0
2021-08-01,70557.0,21739353.0,1918201.0,,12319079.0,490623.0,25249.0,3594274.0,213229.0,,2735392.0,29429.0
2021-09-01,68133.0,21126847.0,1257788.0,,12424305.0,297736.0,24011.0,3010823.0,246326.0,,2498736.0,26384.0
2021-10-01,70549.0,20053414.0,2094859.0,,13132400.0,443349.0,23752.0,3271487.0,240598.0,,2484138.0,34070.0
2021-11-01,71341.0,21041572.0,1070166.0,,11554411.0,375992.0,23066.0,3303526.0,289701.0,,2550849.0,37997.0


In [7]:
# Create the target directory if needed, so that a first run does not fail at
# the very last step (after all queries have already been executed), then
# write the combined metrics as a tab-separated file.
Path(FILENAME).parent.mkdir(parents=True, exist_ok=True)
metrics.to_csv(FILENAME, sep="\t")

In [8]:
# Show the last rows of the combined metrics as the cell's output
metrics.tail()

Unnamed: 0_level_0,global_north_active_editors,global_north_edits,global_north_net_new_content,global_north_new_editor_retention,global_north_nonbot_edits,global_north_wikidata_entities,global_south_active_editors,global_south_edits,global_south_net_new_content,global_south_new_editor_retention,global_south_nonbot_edits,global_south_wikidata_entities
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
2021-07-01,69747.0,20691492.0,1859537.0,0.063689,10778187.0,293369.0,25186.0,3184875.0,197044.0,0.05392,2744544.0,38198.0
2021-08-01,70557.0,21739353.0,1918201.0,,12319079.0,490623.0,25249.0,3594274.0,213229.0,,2735392.0,29429.0
2021-09-01,68133.0,21126847.0,1257788.0,,12424305.0,297736.0,24011.0,3010823.0,246326.0,,2498736.0,26384.0
2021-10-01,70549.0,20053414.0,2094859.0,,13132400.0,443349.0,23752.0,3271487.0,240598.0,,2484138.0,34070.0
2021-11-01,71341.0,21041572.0,1070166.0,,11554411.0,375992.0,23066.0,3303526.0,289701.0,,2550849.0,37997.0
