In [None]:
import datetime as dt
import re
import io
from functools import reduce

import pandas as pd
from dateutil.relativedelta import relativedelta

import wmfdata as wmf

In [None]:
# Tab-separated file where metrics are or will be saved. The notebook reads it
# (when it exists) to find the last month already computed, and overwrites it
# with the combined old and new results at the end.
FILENAME = "metrics.tsv"

# Latest mediawiki_history snapshot in Hive; substituted into the Hive queries
# as the `snapshot` parameter.
SNAPSHOT = "2018-04"

# Loading previous results

In [None]:
# Load previously saved metrics, if any. A missing file, a zero-byte file and a
# file with a header but no rows all mean "nothing computed yet"; in each case
# old_metrics is None and the computation starts from the first month.
try:
    old_metrics = pd.read_csv(FILENAME, sep="\t", parse_dates=["month"])
except (FileNotFoundError, pd.errors.EmptyDataError):
    old_metrics = None

if old_metrics is not None and old_metrics.empty:
    # max() of an empty month column is NaT, which cannot be advanced or formatted
    old_metrics = None

if old_metrics is None:
    start_month = pd.Timestamp(2001, 1, 1)
else:
    # Resume with the month after the latest one already saved
    start_month = old_metrics["month"].max() + relativedelta(months=1)

# The queries take the start date as a YYYY-MM-DD string
START = start_month.strftime("%Y-%m-%d")
print(START)

# Single-query metrics

In [None]:
# Queries run against MariaDB, keyed by metric name. Each entry records the
# path of the SQL template; the result frame is attached under "result" later.
mdb_queries = dict(
    active_editors={"file": "queries/active_editors.sql"},
    edits={"file": "queries/edits.sql"},
    nonbot_edits={"file": "queries/nonbot_edits.sql"},
)

# Queries run against Hive, laid out the same way.
hive_queries = dict(
    edits={"file": "queries/edits.hql"},
    new_editor_retention={"file": "queries/new_editor_retention.hql"},
)

In [None]:
def _run_queries(queries, run, **params):
    """Run every query in `queries` and store its result frame on the entry.

    queries -- dict of metric name -> {"file": path to the query template}
    run     -- callable that takes the query text and returns a DataFrame
    params  -- values substituted into the template with str.format

    Each entry gains a "result" key holding the returned frame, with its
    "month" column converted to datetimes.
    """
    for name, query in queries.items():
        with open(query["file"]) as template_file:
            template = template_file.read()

        wmf.utils.print_err("Running {}...".format(name))
        query["result"] = run(template.format(**params))
        # Normalise the month column to datetimes for both backends; the Hive
        # queries return it as a string rather than a date.
        query["result"]["month"] = pd.to_datetime(query["result"]["month"])


_run_queries(mdb_queries, wmf.mariadb.run, start=START)
_run_queries(hive_queries, wmf.hive.run, start=START, snapshot=SNAPSHOT)

# Multiquery metrics

In [None]:
!rmdir data
!mkdir data

In [None]:
# To-do: remove old files matching this pattern
#!cp /mnt/data/xmldatadumps/public/other/wikistats_1/*_main.zip data
!cp /mnt/data/xmldatadumps/public/other/pagecounts-ez/wikistats/csv_*_main.zip data

zipfiles = !ls data/csv_*_main.zip

In [None]:
cols = ["wiki", "group", "month", "articles", "files"]
content = pd.DataFrame(columns=cols)

for f in zipfiles:
    # Extract the Wikistats code for the project family
    grp = re.search(r"data/csv_([a-z]{2})_main.zip", f).group(1)
    
    # Map Wikistats codes for project family to the corresponding database codes
    db_suffix = {
        "wb": "wikibooks",
        "wk": "wiktionary",
        "wn": "wikinews",
        "wo": "wikivoyage",
        "wp": "wiki",
        "wq": "wikiquote",
        "ws": "wikisource",
        "wv": "wikiversity",
        "wx": ""
    }
    
    # Unzip files to stdout and capture it in an IPython SList.
    # Put the newline-separated string (`.n`) of the output in a buffer for Pandas.
    sm = !unzip -p {f} StatisticsMonthly.csv
    sm = io.StringIO(sm.n)
    
    spn = !unzip -p {f} StatisticsPerNamespace.csv
    spn = io.StringIO(spn.n)
     
    # Manually set column numbers because some CSVs are ragged 
    # Select the columns we need, which aren't named so we need to select by location
    art = pd.read_csv(sm, header=None, usecols=[0, 1, 6], names=range(29))
    art.columns = ["wiki", "month", "articles"]

    # Wikisource has extra namespaces so its file has more columns
    if grp == "ws":
        col_nums = range(22)
    else:
        col_nums = range(17)
    
    files = pd.read_csv(spn, header=None, usecols=[0, 1, 5], names=col_nums)
    files.columns = ["wiki", "month", "files"]
        
    grp_content = pd.merge(art, files, on=["wiki", "month"], validate="one_to_one")
        
    # Wiki column just contains the language code (except in wx) so we have to disambiguate across files
    grp_content["wiki"] = grp_content["wiki"] + db_suffix[grp]
    
    grp_content["group"] = grp
    
    content = content.append(grp_content)
    
    sm.close()
    spn.close()

content["month"] = pd.to_datetime(content["month"])
content["articles"] = content["articles"].astype(int)

# Remove "wikis" with zz codes since those are aggregates
not_zz = lambda df: ~df["wiki"].str.match(r"zz.*")
content = content[not_zz]

content.head()

In [None]:
# Content is articles + files on all wikis except Commons,
# where it's files alone since there files count as articles
def count_content(df):
    """Summarise one group of per-wiki rows into global content totals.

    `df` needs the columns "wiki", "group", "articles" and "files".
    Returns a Series indexed by "total_content", "wikipedia_articles",
    "files" and "wikidata_entities".
    """
    articles = df["articles"]

    file_total = df["files"].sum()
    # Commons articles are left out so its files are not counted twice
    articles_outside_commons = articles[df["wiki"] != "commons"].sum()

    totals = {
        "total_content": articles_outside_commons + file_total,
        "wikipedia_articles": articles[df["group"] == "wp"].sum(),
        "files": file_total,
        "wikidata_entities": articles[df["wiki"] == "wikidata"].sum(),
    }

    return pd.Series(list(totals.values()), index=list(totals.keys()))

# Collapse the per-wiki rows into one row of global totals per month
glob_cont = (
    content
    .groupby("month")
    .apply(count_content)
)
glob_cont.tail()

In [None]:
# Remove dates with 0 articles, because those are junk data
glob_cont = glob_cont[glob_cont["wikipedia_articles"] != 0]

# This data is calculated as of the end of a calendar month. In other places,
# the metric is dated the first day of that month it applies to. Let's convert
# to that.
#
# Snapping to the start of the month (rather than subtracting a MonthBegin
# offset) keeps this cell idempotent: running it twice, or feeding it a date
# that is already the 1st, no longer moves the data a whole month back.
glob_cont.index = glob_cont.index.to_period("M").to_timestamp()

glob_cont.tail()

# Combining and saving metrics

In [None]:
# Gather the per-month result frame of every query
dfs = [query["result"] for query in mdb_queries.values()]
dfs.extend(query["result"] for query in hive_queries.values())

# glob_cont carries the month in its index, while the query results carry it
# in a "month" column. Move it into a column so pd.merge, which joins on the
# columns the two frames share, has a key to join on.
dfs.append(glob_cont.rename_axis("month").reset_index())

new_metrics = reduce(lambda l, r: pd.merge(l, r), dfs)

# Compare against None explicitly: the truth value of a DataFrame is ambiguous
# and `if old_metrics:` raises ValueError whenever previous results were loaded.
if old_metrics is not None:
    metrics = pd.concat([old_metrics, new_metrics])
else:
    metrics = new_metrics

metrics.tail()

In [None]:
# Cast every column except "month" to integers, one value at a time.
# NOTE(review): int() truncates toward zero; confirm that no metric is meant
# to stay fractional (e.g. if new_editor_retention is a rate).
value_columns = [name for name in metrics.columns if name != "month"]

for name in value_columns:
    metrics[name] = metrics[name].apply(int)

In [None]:
# Write the combined metrics back to the TSV file, without the row index
metrics.to_csv(
    FILENAME,
    sep="\t",
    index=False,
)