In [1]:
import datetime as dt
from functools import reduce

import pandas as pd

import wmfdata as wmf

You can find the source for `wmfdata` at https://github.com/neilpquinn/wmfdata


In [2]:
# Most recent mediawiki_history snapshot available in Hive
SNAPSHOT = "2018-04"

# Tab-separated file holding the metrics: read at the start of a run
# (if it already exists) and written again at the end
FILENAME = "metrics.tsv"

# Loading previous results

In [3]:
# Resume from the last saved month when earlier results exist; otherwise
# start from the beginning of the data.
try:
    old_metrics = pd.read_csv(FILENAME, sep="\t")
except (FileNotFoundError, pd.errors.EmptyDataError):
    # Either there is no file yet, or it is a zero-byte file with no header
    # for pandas to parse. Both mean "no previous results".
    old_metrics = None

# A file containing only a header row parses to an empty frame, whose max()
# is NaN and would be formatted into the queries as "nan". Treat it the same
# as a missing file.
if old_metrics is not None and old_metrics.empty:
    old_metrics = None

if old_metrics is None:
    START = "2001-01-01"
else:
    # The file is read without date parsing, so this is the month as the
    # string stored in the file.
    START = old_metrics["month"].max()

# Running queries

In [4]:
# Queries run against MariaDB, keyed by metric name. Each entry records the
# file holding the query text; the query result is attached to the same
# entry under "result" once the query has been run.
mdb_queries = dict(
    active_editors=dict(file="queries/active_editors.sql"),
    edits=dict(file="queries/edits.sql"),
    nonbot_edits=dict(file="queries/nonbot_edits.sql"),
)

# Queries run against Hive, in the same layout as above.
hive_queries = dict(
    edits=dict(file="queries/edits.hql"),
)

In [None]:
def _run_query_set(queries, run, **params):
    """Run every query in `queries` and attach its result to its entry.

    Parameters
    ----------
    queries : dict
        Maps a metric name to a dict with a "file" key (path of the query
        text). Each inner dict gains a "result" key holding what `run`
        returned.
    run : callable
        Called with the formatted query text; its return value is stored as
        the result and must have a "month" column.
    **params
        Values substituted into the query text with `str.format`.
    """
    for name, query in queries.items():
        with open(query["file"]) as query_file:
            template = query_file.read()

        wmf.utils.print_err("Running {}...".format(name))
        query["result"] = run(template.format(**params))
        # Hive returns the month as a string rather than a date, so convert
        # here to give every result the same datetime "month" column.
        query["result"]["month"] = pd.to_datetime(query["result"]["month"])


_run_query_set(mdb_queries, wmf.mariadb.run, start=START)
_run_query_set(hive_queries, wmf.hive.run, start=START, snapshot=SNAPSHOT)

Running edits...
Running active_editors...


# Combining and saving metrics

In [None]:
# Collect every query result (MariaDB first, then Hive) and join them into a
# single frame. pd.merge without `on` joins on the columns both frames share.
dfs = [query["result"] for query in mdb_queries.values()]
dfs.extend(query["result"] for query in hive_queries.values())
new_metrics = reduce(pd.merge, dfs)

# `old_metrics` is None when no earlier results were loaded, otherwise a
# DataFrame. A DataFrame has no unambiguous truth value (`if old_metrics:`
# raises ValueError), so compare against None explicitly.
if old_metrics is not None:
    # The saved file is read without date parsing, so its "month" values are
    # strings, while the fresh results carry datetimes. Convert before
    # concatenating so the column keeps one dtype and is written back in one
    # format.
    previous_metrics = old_metrics.assign(
        month=pd.to_datetime(old_metrics["month"])
    )
    # NOTE(review): if the queries also return a row for the START month
    # itself, that month will appear twice after this concat; confirm
    # against the query files.
    metrics = pd.concat([previous_metrics, new_metrics], ignore_index=True)
else:
    metrics = new_metrics

metrics.tail()

In [13]:
# Convert every metric column to integers; "month" is the only column left
# untouched.
metric_columns = [name for name in metrics.columns if name != "month"]
for name in metric_columns:
    metrics[name] = metrics[name].apply(int)

In [15]:
# Write the combined metrics back to the TSV file, without the row index.
metrics.to_csv(FILENAME, index=False, sep="\t")