In [1]:
import datetime as dt
import re
import io
from functools import reduce

import pandas as pd
from dateutil.relativedelta import relativedelta

import wmfdata as wmf

You can find the source for `wmfdata` at https://github.com/neilpquinn/wmfdata


In [37]:
# TSV file where metrics are or will be saved. Previous results are read from
# it when it already exists, and the combined table is written back to it.
FILENAME = "metrics/metrics.tsv"

# Latest mediawiki_history snapshot in Hive. Substituted into the Hive queries
# as their `snapshot` parameter.
SNAPSHOT = "2018-04"

# Loading previous results

In [3]:
# TODO: What about preserving partially complete rows?

# Load the previously saved metrics, if any. A missing file and a completely
# empty file both mean there is nothing to resume from.
try:
    old_metrics = pd.read_csv(FILENAME, sep="\t", parse_dates=["month"])
except (FileNotFoundError, pd.errors.EmptyDataError):
    old_metrics = None

# Latest month already saved; null when there are no saved rows at all
# (no file, or a file that only contains a header line).
last_month = pd.NaT if old_metrics is None else old_metrics["month"].max()

if pd.isnull(last_month):
    # Nothing saved yet, so start from the beginning of the data
    start_month = pd.Timestamp(2001, 1, 1)
else:
    # Resume with the month after the last one already saved
    start_month = last_month + relativedelta(months=1)

# The queries take the start date as a "YYYY-MM-DD" string
START = start_month.strftime("%Y-%m-%d")
print(START)

2001-01-01


# Single-query metrics

In [4]:
# Each entry maps a query name to a dict holding the path of its query file
# under the "file" key. The loops that run the queries add the resulting
# data frame to the same dict under "result".

# Queries run against MariaDB
mdb_queries = {
    name: {"file": path}
    for name, path in [
        ("active_editors", "queries/active_editors.sql"),
        ("edits", "queries/edits.sql"),
        ("nonbot_edits", "queries/nonbot_edits.sql"),
    ]
}

# Queries run against Hive
hive_queries = {
    name: {"file": path}
    for name, path in [
        ("edits", "queries/edits.hql"),
        ("new_editor_retention", "queries/new_editor_retention.hql"),
    ]
}

In [None]:
# Run every MariaDB query and keep its data frame next to the query's file path
for name, query in mdb_queries.items():
    with open(query["file"]) as sql_file:
        sql_template = sql_file.read()

    wmf.utils.print_err("Running {}...".format(name))

    # Fill the start date into the query text before running it
    sql = sql_template.format(start = START)
    query["result"] = result = wmf.mariadb.run(sql)

    # Make sure `month` is a datetime column
    result["month"] = pd.to_datetime(result["month"])

Running active_editors...


In [None]:
# Run every Hive query and keep its data frame next to the query's file path
for name, query in hive_queries.items():
    with open(query["file"]) as hql_file:
        hql_template = hql_file.read()

    wmf.utils.print_err("Running {}...".format(name))

    # Fill the start date and the snapshot into the query text before running it
    hql = hql_template.format(start = START, snapshot = SNAPSHOT)
    query["result"] = result = wmf.hive.run(hql)

    # Unlike our MariaDB queries, the Hive query returns a string rather than
    # a date, so parse it
    result["month"] = pd.to_datetime(result["month"])

Running new_editor_retention...
Running edits...


# Multiquery metrics

In [21]:
# Pull Wikistats data files, removing old copies from previous runs
wikistats_files = "csv_*_main.zip"
wikistats_path = "/mnt/data/xmldatadumps/public/other/pagecounts-ez/wikistats/"
!rm data/{wikistats_files}
!cp {wikistats_path}{wikistats_files} data

In [22]:
zipfiles = !ls data/csv_*_main.zip
cols = ["wiki", "group", "month", "articles", "files"]
content = pd.DataFrame(columns=cols)

for f in zipfiles:
    # Extract the Wikistats code for the project family
    grp = re.search(r"data/csv_([a-z]{2})_main.zip", f).group(1)
    
    # Map Wikistats codes for project family to the corresponding database codes
    db_suffix = {
        "wb": "wikibooks",
        "wk": "wiktionary",
        "wn": "wikinews",
        "wo": "wikivoyage",
        "wp": "wiki",
        "wq": "wikiquote",
        "ws": "wikisource",
        "wv": "wikiversity",
        "wx": ""
    }
    
    # Unzip files to stdout and capture it in an IPython SList.
    # Put the newline-separated string (`.n`) of the output in a buffer for Pandas.
    sm = !unzip -p {f} StatisticsMonthly.csv
    sm = io.StringIO(sm.n)
    
    spn = !unzip -p {f} StatisticsPerNamespace.csv
    spn = io.StringIO(spn.n)
     
    # Manually set column numbers because some CSVs are ragged 
    # Select the columns we need, which aren't named so we need to select by location
    art = pd.read_csv(sm, header=None, usecols=[0, 1, 6], names=range(29))
    art.columns = ["wiki", "month", "articles"]

    # Wikisource has extra namespaces so its file has more columns
    if grp == "ws":
        col_nums = range(22)
    else:
        col_nums = range(17)
    
    files = pd.read_csv(spn, header=None, usecols=[0, 1, 5], names=col_nums)
    files.columns = ["wiki", "month", "files"]
        
    grp_content = pd.merge(art, files, on=["wiki", "month"], validate="one_to_one")
        
    # Wiki column just contains the language code (except in wx) so we have to disambiguate across files
    grp_content["wiki"] = grp_content["wiki"] + db_suffix[grp]
    
    grp_content["group"] = grp
    
    content = content.append(grp_content)
    
    sm.close()
    spn.close()

content["month"] = pd.to_datetime(content["month"])
content["articles"] = content["articles"].astype(int)

# Remove "wikis" with zz codes since those are aggregates
not_zz = lambda df: ~df["wiki"].str.match(r"zz.*")
content = content[not_zz]

content.head()

Unnamed: 0,articles,files,group,month,wiki
0,1,0.0,wb,2004-08-31,aawikibooks
1,1,0.0,wb,2004-09-30,aawikibooks
2,1,0.0,wb,2004-10-31,aawikibooks
3,1,0.0,wb,2004-11-30,aawikibooks
4,1,0.0,wb,2004-12-31,aawikibooks


In [28]:
def count_content(df):
    """Sum up the content counts over the rows of `df`.

    Content is articles + files on all wikis except Commons, where it's
    files alone since there files count as articles.

    `df` needs the columns "wiki", "group", "articles" and "files".

    Returns a Series labelled "total_content", "wikipedia_articles",
    "files" and "wikidata_entities".
    """
    articles = df["articles"]

    file_count = df["files"].sum()
    # Articles everywhere except the row(s) for Commons
    articles_outside_commons = articles[df["wiki"] != "commons"].sum()
    # Articles of the Wikipedia project family
    wikipedia_count = articles[df["group"] == "wp"].sum()
    # Wikidata's "articles" are its entities
    wikidata_count = articles[df["wiki"] == "wikidata"].sum()

    labels = ["total_content", "wikipedia_articles", "files", "wikidata_entities"]
    values = [
        articles_outside_commons + file_count,
        wikipedia_count,
        file_count,
        wikidata_count,
    ]
    return pd.Series(values, index=labels)

# Apply count_content to the rows of each month, giving one row of totals per
# month with the month as the index
glob_cont = content.groupby("month").apply(count_content)
glob_cont.tail()

Unnamed: 0_level_0,total_content,wikipedia_articles,files,wikidata_entities
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-10-31,172255421.0,46615082.0,46460043.0,39516435.0
2017-11-30,174855044.0,46780268.0,46992759.0,41196256.0
2017-12-31,178512868.0,46968190.0,47518748.0,43888840.0
2018-01-31,180301255.0,47369882.0,48199383.0,44334692.0
2018-02-28,184123520.0,47533734.0,48939378.0,47026490.0


In [29]:
# Dates with 0 Wikipedia articles are junk data, so keep only the others
has_articles = glob_cont["wikipedia_articles"] != 0
glob_cont = glob_cont.loc[has_articles]

# This data is calculated as of the end of a calendar month. In other places,
# the metric is dated the first day of the month it applies to, so move each
# date back to the start of its month.
month_start = pd.tseries.offsets.MonthBegin()
glob_cont.index = glob_cont.index - month_start

# Turn the month index back into a regular column so we can merge on it
glob_cont = glob_cont.reset_index()

glob_cont.tail()

Unnamed: 0,month,total_content,wikipedia_articles,files,wikidata_entities
202,2017-10-01,172255421.0,46615082.0,46460043.0,39516435.0
203,2017-11-01,174855044.0,46780268.0,46992759.0,41196256.0
204,2017-12-01,178512868.0,46968190.0,47518748.0,43888840.0
205,2018-01-01,180301255.0,47369882.0,48199383.0,44334692.0
206,2018-02-01,184123520.0,47533734.0,48939378.0,47026490.0


# Combining and saving metrics

In [43]:
# Gather every result: the MariaDB queries, then the Hive queries, then the
# Wikistats content counts
dfs = [mdb_queries[k]["result"] for k in mdb_queries]
dfs.extend([hive_queries[k]["result"] for k in hive_queries])
dfs.append(glob_cont)

# Outer-merge the frames one after another so that no month is dropped.
# With no `on` given, pandas joins on the columns the two frames share.
new_metrics = reduce(lambda l, r: pd.merge(l, r, how="outer"), dfs)

# `old_metrics` is either None or a data frame. A data frame has no
# unambiguous truth value (`if old_metrics:` raises ValueError), so compare
# against None explicitly.
if old_metrics is not None:
    # Renumber the rows so the combined table has no repeated index labels
    metrics = pd.concat([old_metrics, new_metrics], ignore_index=True)
else:
    metrics = new_metrics

metrics.tail()

Unnamed: 0,month,active_editors,new_active_editors,second_month_active_editors,total_edits,mobile_web_edits,mobile_app_edits,nonbot_edits,new_editor_retention,uploads,data_edits,total_content,wikipedia_articles,files,wikidata_entities
204,2017-12-01,79558,14387.0,4152.0,37358877.0,925275.0,96439.0,19926673.0,0.052366,522009,13933627,178512868.0,46968190.0,47518748.0,43888840.0
205,2018-01-01,84422,16678.0,3353.0,39437710.0,1054365.0,98878.0,20374752.0,0.07736,675791,9749202,180301255.0,47369882.0,48199383.0,44334692.0
206,2018-02-01,78963,15052.0,4134.0,39574449.0,932776.0,78617.0,19934503.0,0.066922,725015,15704642,184123520.0,47533734.0,48939378.0,47026490.0
207,2018-03-01,86190,17625.0,4347.0,43003671.0,1027516.0,93668.0,25176956.0,,809162,17996980,,,,
208,2018-04-01,83705,16059.0,4739.0,34530321.0,999482.0,107802.0,20762284.0,,634360,12851269,,,,


In [44]:
# Write the combined table to the metrics file as tab-separated values,
# leaving out the row index
metrics.to_csv(FILENAME, sep="\t", index=False)