In [1]:
import datetime as dt
import re
import io
from functools import reduce

import pandas as pd
from dateutil.relativedelta import relativedelta

import wmfdata as wmf

You can find the source for `wmfdata` at https://github.com/neilpquinn/wmfdata


In [2]:
# TSV file where metrics are or will be saved: previously computed metrics are
# read from this path, and the combined metrics are written back to it
FILENAME = "metrics/metrics.tsv"

# Latest mediawiki_history snapshot in Hive; substituted into the Hive queries
# as their `snapshot` parameter
SNAPSHOT = "2018-06"

# Loading previous results

In [3]:
# Open question: what about preserving partially complete rows?

try:
    old_metrics = pd.read_csv(FILENAME, sep="\t", parse_dates = ["month"])
except FileNotFoundError:
    # Nothing has been saved yet, so start from the first day of 2001
    old_metrics = None
    START = pd.Timestamp(2001, 1, 1)
else:
    # Resume from the month after the latest one already saved
    latest_saved_month = old_metrics["month"].max()
    START = latest_saved_month + relativedelta(months=1)

# The queries take the start date as a plain YYYY-MM-DD string
START = START.strftime("%Y-%m-%d")
print(START)

2001-01-01


# Single-query metrics

In [4]:
# Queries run against MariaDB: metric name -> query spec.
# To-do: active editors with null registration aren't classified as existing (?)
mdb_queries = {
    "active_editors": {"file": "queries/active_editors.sql"},
    "edits": {"file": "queries/mobile_edits.sql"},
}

# Queries run against Hive: metric name -> query spec.
hive_queries = {
    "edits": {"file": "queries/edits.hql"},
    "new_editor_retention": {"file": "queries/new_editor_retention.hql"},
}

In [5]:
# Run each MariaDB query from its file, filling in the start date, and keep
# the resulting frame on the query's own spec under "result".
for name, query in mdb_queries.items():
    with open(query["file"]) as sql_file:
        sql_template = sql_file.read()

    wmf.utils.print_err("Running {}...".format(name))
    result = wmf.mariadb.run(sql_template.format(start = START))
    query["result"] = result
    # Make sure the month column is a datetime column
    result["month"] = pd.to_datetime(result["month"])

Running active_editors...
Running edits...


In [6]:
# Run each Hive query from its file, filling in the start date and snapshot,
# and keep the resulting frame on the query's own spec under "result".
for name, query in hive_queries.items():
    with open(query["file"]) as hql_file:
        hql_template = hql_file.read()

    wmf.utils.print_err("Running {}...".format(name))
    result = wmf.hive.run(hql_template.format(start = START, snapshot = SNAPSHOT))
    query["result"] = result
    # Unlike our MariaDB queries, the Hive query returns a string rather than a date
    result["month"] = pd.to_datetime(result["month"])

Running new_editor_retention...
Running edits...


# Multiquery metrics

In [13]:
# Pull Wikistats data files, removing old copies from previous runs
wikistats_files = "csv_*_main.zip"
wikistats_path = "/mnt/data/xmldatadumps/public/other/wikistats_1/"
!rm data/{wikistats_files}
!cp {wikistats_path}{wikistats_files} data

In [14]:
zipfiles = !ls data/csv_*_main.zip
cols = ["wiki", "group", "month", "articles", "files"]
content = pd.DataFrame(columns=cols)

for f in zipfiles:
    # Extract the Wikistats code for the project family
    grp = re.search(r"data/csv_([a-z]{2})_main.zip", f).group(1)
    
    # Map Wikistats codes for project family to the corresponding database codes
    db_suffix = {
        "wb": "wikibooks",
        "wk": "wiktionary",
        "wn": "wikinews",
        "wo": "wikivoyage",
        "wp": "wiki",
        "wq": "wikiquote",
        "ws": "wikisource",
        "wv": "wikiversity",
        "wx": ""
    }
    
    # Unzip files to stdout and capture it in an IPython SList.
    # Put the newline-separated string (`.n`) of the output in a buffer for Pandas.
    sm = !unzip -p {f} StatisticsMonthly.csv
    sm = io.StringIO(sm.n)
    
    spn = !unzip -p {f} StatisticsPerNamespace.csv
    spn = io.StringIO(spn.n)
     
    # Manually set column numbers because some CSVs are ragged 
    # Select the columns we need, which aren't named so we need to select by location
    art = pd.read_csv(sm, header=None, usecols=[0, 1, 6], names=range(29))
    art.columns = ["wiki", "month", "articles"]

    # Wikisource has extra namespaces so its file has more columns
    if grp == "ws":
        col_nums = range(22)
    else:
        col_nums = range(17)
    
    files = pd.read_csv(spn, header=None, usecols=[0, 1, 5], names=col_nums)
    files.columns = ["wiki", "month", "files"]
        
    grp_content = pd.merge(art, files, on=["wiki", "month"], validate="one_to_one")
        
    # Wiki column just contains the language code (except in wx) so we have to disambiguate across files
    grp_content["wiki"] = grp_content["wiki"] + db_suffix[grp]
    
    grp_content["group"] = grp
    
    content = content.append(grp_content, sort=False)
    
    sm.close()
    spn.close()

content["month"] = pd.to_datetime(content["month"])
content["articles"] = content["articles"].astype(int)

# Remove "wikis" with zz codes since those are aggregates
not_zz = lambda df: ~df["wiki"].str.match(r"zz.*")
content = content[not_zz]

content.tail()

Unnamed: 0,wiki,group,month,articles,files
1910,wikidata,wx,2018-02-28,46985914,0.0
1911,wikidata,wx,2018-03-31,47812006,0.0
1912,wikidata,wx,2018-04-30,48959861,0.0
1913,wikidata,wx,2018-05-31,50434708,0.0
1914,wikidata,wx,2018-06-30,51021959,0.0


In [15]:
def count_content(df):
    """Collapse a set of per-wiki rows into global content totals.

    Content is articles + files on all wikis except Commons,
    where it's files alone since there files count as articles.

    Reads the "wiki", "group", "articles" and "files" columns of `df` and
    returns a Series labelled "total_content", "wikipedia_articles", "files"
    and "wikidata_entities".
    """
    articles = df["articles"]
    is_commons = df["wiki"] == "commons"
    is_wikipedia = df["group"] == "wp"
    is_wikidata = df["wiki"] == "wikidata"

    files = df["files"].sum()
    # Commons articles are left out so its files aren't counted twice
    total_content = articles[~is_commons].sum() + files

    labels = ["total_content", "wikipedia_articles", "files", "wikidata_entities"]
    values = [
        total_content,
        articles[is_wikipedia].sum(),
        files,
        articles[is_wikidata].sum(),
    ]
    return pd.Series(values, index=labels)

# Reduce the per-wiki rows to one row of totals for each month
rows_by_month = content.groupby("month")
glob_cont = rows_by_month.apply(count_content)
glob_cont.tail()

Unnamed: 0_level_0,total_content,wikipedia_articles,files,wikidata_entities
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2018-02-28,183898889.0,47486624.0,48826383.0,46985914.0
2018-03-31,185993282.0,47686797.0,49632489.0,47812006.0
2018-04-30,188191961.0,47859087.0,50263125.0,48959861.0
2018-05-31,191363893.0,48033215.0,51001384.0,50434708.0
2018-06-30,193057002.0,48201005.0,51626384.0,51021959.0


In [16]:
# Dates with 0 Wikipedia articles are junk data, so keep only the others
has_articles = glob_cont["wikipedia_articles"] != 0
glob_cont = glob_cont[has_articles]

# The figures are calculated as of the end of a calendar month, whereas in
# other places a metric is dated the first day of the month it applies to.
# Move each date to match that convention.
first_of_month = glob_cont.index - pd.tseries.offsets.MonthBegin()
glob_cont.index = first_of_month

# Keep the rows from START onwards, then turn the index back into a column
# so we can merge
glob_cont = glob_cont[START:].reset_index()

glob_cont.tail()

Unnamed: 0,month,total_content,wikipedia_articles,files,wikidata_entities
206,2018-02-01,183898889.0,47486624.0,48826383.0,46985914.0
207,2018-03-01,185993282.0,47686797.0,49632489.0,47812006.0
208,2018-04-01,188191961.0,47859087.0,50263125.0,48959861.0
209,2018-05-01,191363893.0,48033215.0,51001384.0,50434708.0
210,2018-06-01,193057002.0,48201005.0,51626384.0,51021959.0


# Combining and saving metrics

In [17]:
# Gather every result frame: MariaDB queries, then Hive queries, then content totals
dfs = [spec["result"] for spec in mdb_queries.values()]
dfs += [spec["result"] for spec in hive_queries.values()]
dfs += [glob_cont]

# Fold the frames together from left to right with outer joins
new_metrics = dfs[0]
for frame in dfs[1:]:
    new_metrics = pd.merge(new_metrics, frame, how="outer")

# Put the new rows after whatever was loaded from the previous run
metrics = (
    new_metrics
    if old_metrics is None
    else pd.concat([old_metrics, new_metrics], sort=False)
)

metrics = metrics.reset_index(drop=True)
metrics.tail()

Unnamed: 0,month,active_editors,existing_active_editors,new_active_editors,second_month_active_editors,mobile_edits,new_editor_retention,total_edits,uploads,data_edits,nonbot_nondata_nonupload_edits,revert_rate,total_content,wikipedia_articles,files,wikidata_entities
206,2018-02-01,78963,58887.0,15052.0,4134.0,1011393.0,0.066915,39607703,723342,15702214,13592242,0.081981,183898889.0,47486624.0,48826383.0,46985914.0
207,2018-03-01,86190,63332.0,17625.0,4347.0,1121184.0,0.064066,43033263,806356,17995388,14685039,0.07115,185993282.0,47686797.0,49632489.0,47812006.0
208,2018-04-01,83705,62042.0,16059.0,4739.0,1107284.0,0.057341,34541104,628380,12847031,13760495,0.082339,188191961.0,47859087.0,50263125.0,48959861.0
209,2018-05-01,85424,62698.0,17787.0,4068.0,1165558.0,,39080099,738136,15706861,14360932,0.074504,191363893.0,48033215.0,51001384.0,50434708.0
210,2018-06-01,78549,59034.0,15005.0,3664.0,1128001.0,,37112171,619140,16477748,13004529,0.066875,193057002.0,48201005.0,51626384.0,51021959.0


In [18]:
# Write the combined metrics to FILENAME as tab-separated text, leaving out the row index
metrics.to_csv(FILENAME, sep="\t", index=False)