In [42]:
import datetime as dt
import re
import io
from functools import reduce

import pandas as pd
from dateutil.relativedelta import relativedelta

import wmfdata as wmf

In [43]:
# Path of the TSV file holding the metrics: previous results are read from it
# when it exists, and the combined results are written back to it.
FILENAME = "metrics/metrics.tsv"

# Latest mediawiki_history snapshot in Hive; substituted into the Hive
# queries as `snapshot`.
SNAPSHOT = "2018-04"

# Loading previous results

In [44]:
# Work out the first month that still has to be computed.
# TODO: what about preserving partially complete rows? Resuming from the month
# after the latest saved one means they are never revisited.
try:
    old_metrics = pd.read_csv(FILENAME, sep="\t", parse_dates=["month"])
except FileNotFoundError:
    # Nothing saved yet: compute everything from the very beginning
    old_metrics = None
    first_month = pd.Timestamp(2001, 1, 1)
else:
    # Resume from the month following the latest one already saved
    first_month = old_metrics["month"].max() + relativedelta(months=1)

# The queries take the start date as a plain string
START = first_month.strftime("%Y-%m-%d")
print(START)

# Single-query metrics

In [45]:
# Each metric name maps to a dict holding the path of its query file under
# "file"; the cells that run the queries later add the output under "result".

# Queries run against MariaDB
mdb_queries = {
    name: {"file": path}
    for name, path in [
        ("active_editors", "queries/active_editors.sql"),
        ("edits", "queries/edits.sql"),
        ("nonbot_edits", "queries/nonbot_edits.sql"),
    ]
}

# Queries run against Hive
hive_queries = {
    name: {"file": path}
    for name, path in [
        ("edits", "queries/edits.hql"),
        ("new_editor_retention", "queries/new_editor_retention.hql"),
    ]
}

In [None]:
# Run every MariaDB query and store its output next to its definition
for name, query in mdb_queries.items():
    with open(query["file"]) as sql_file:
        sql = sql_file.read()

    # Announce which query is about to run
    wmf.utils.print_err("Running {}...".format(name))

    # The query text has a `{start}` placeholder for the first month to compute
    query["result"] = wmf.mariadb.run(sql.format(start=START))
    # Make sure the month column holds datetimes so the results can be merged later
    query["result"]["month"] = pd.to_datetime(query["result"]["month"])

In [11]:
# Run every Hive query and store its output next to its definition
for name, query in hive_queries.items():
    with open(query["file"]) as hql_file:
        hql = hql_file.read()

    # Announce which query is about to run
    wmf.utils.print_err("Running {}...".format(name))

    # The query text has `{start}` and `{snapshot}` placeholders
    query["result"] = wmf.hive.run(hql.format(start=START, snapshot=SNAPSHOT))
    # Unlike our MariaDB queries, the Hive query returns a string rather than a date,
    # so parse the month column into datetimes
    query["result"]["month"] = pd.to_datetime(query["result"]["month"])

Running edits...
Running new_editor_retention...


# Multiquery metrics

In [12]:
# Pull Wikistats data files, removing old copies from previous runs
wikistats_files = "csv_*_main.zip"
wikistats_path = "/mnt/data/xmldatadumps/public/other/wikistats_1/"
!rm data/{wikistats_files}
!cp {wikistats_path}{wikistats_files} data

In [13]:
zipfiles = !ls data/csv_*_main.zip
cols = ["wiki", "group", "month", "articles", "files"]
content = pd.DataFrame(columns=cols)

for f in zipfiles:
    # Extract the Wikistats code for the project family
    grp = re.search(r"data/csv_([a-z]{2})_main.zip", f).group(1)
    
    # Map Wikistats codes for project family to the corresponding database codes
    db_suffix = {
        "wb": "wikibooks",
        "wk": "wiktionary",
        "wn": "wikinews",
        "wo": "wikivoyage",
        "wp": "wiki",
        "wq": "wikiquote",
        "ws": "wikisource",
        "wv": "wikiversity",
        "wx": ""
    }
    
    # Unzip files to stdout and capture it in an IPython SList.
    # Put the newline-separated string (`.n`) of the output in a buffer for Pandas.
    sm = !unzip -p {f} StatisticsMonthly.csv
    sm = io.StringIO(sm.n)
    
    spn = !unzip -p {f} StatisticsPerNamespace.csv
    spn = io.StringIO(spn.n)
     
    # Manually set column numbers because some CSVs are ragged 
    # Select the columns we need, which aren't named so we need to select by location
    art = pd.read_csv(sm, header=None, usecols=[0, 1, 6], names=range(29))
    art.columns = ["wiki", "month", "articles"]

    # Wikisource has extra namespaces so its file has more columns
    if grp == "ws":
        col_nums = range(22)
    else:
        col_nums = range(17)
    
    files = pd.read_csv(spn, header=None, usecols=[0, 1, 5], names=col_nums)
    files.columns = ["wiki", "month", "files"]
        
    grp_content = pd.merge(art, files, on=["wiki", "month"], validate="one_to_one")
        
    # Wiki column just contains the language code (except in wx) so we have to disambiguate across files
    grp_content["wiki"] = grp_content["wiki"] + db_suffix[grp]
    
    grp_content["group"] = grp
    
    content = content.append(grp_content)
    
    sm.close()
    spn.close()

content["month"] = pd.to_datetime(content["month"])
content["articles"] = content["articles"].astype(int)

# Remove "wikis" with zz codes since those are aggregates
not_zz = lambda df: ~df["wiki"].str.match(r"zz.*")
content = content[not_zz]

content.tail()

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=True'.


  sort=sort)


Unnamed: 0,articles,files,group,month,wiki
1894,43875472,0.0,wx,2017-12-31,wikidata
1895,44320994,0.0,wx,2018-01-31,wikidata
1896,47011459,0.0,wx,2018-02-28,wikidata
1897,47837922,0.0,wx,2018-03-31,wikidata
1898,48986646,0.0,wx,2018-04-30,wikidata


In [14]:
# Content is articles + files on all wikis except Commons,
# where it's files alone since there files count as articles
def count_content(df):
    """Summarise the content counts found in the rows of `df`.

    `df` needs the columns "wiki", "group", "articles" and "files".

    Returns a Series with four entries:
      - total_content: articles on every wiki except "commons", plus all files
      - wikipedia_articles: articles on wikis whose group is "wp"
      - files: files summed over all wikis
      - wikidata_entities: articles on the wiki named "wikidata"
    """
    is_commons = df["wiki"] == "commons"
    is_wikipedia = df["group"] == "wp"
    is_wikidata = df["wiki"] == "wikidata"

    article_counts = df["articles"]
    file_total = df["files"].sum()

    labels = ["total_content", "wikipedia_articles", "files", "wikidata_entities"]
    values = [
        article_counts[~is_commons].sum() + file_total,
        article_counts[is_wikipedia].sum(),
        file_total,
        article_counts[is_wikidata].sum(),
    ]
    return pd.Series(values, index=labels)

# Collapse the per-wiki rows into one row of global totals per month
glob_cont = content.groupby("month").apply(count_content)
glob_cont.tail()

Unnamed: 0_level_0,total_content,wikipedia_articles,files,wikidata_entities
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2017-12-31,176783401.0,46945492.0,45835672.0,43875472.0
2018-01-31,179327186.0,47347214.0,47271845.0,44320994.0
2018-02-28,183394041.0,47509635.0,48260398.0,47011459.0
2018-03-31,185726550.0,47710088.0,49303612.0,47837922.0
2018-04-30,188284127.0,47883204.0,50291095.0,48986646.0


In [15]:
# Months where Wikipedia shows 0 articles are junk data, so keep only the others
has_articles = glob_cont["wikipedia_articles"] != 0
glob_cont = glob_cont[has_articles]

# This data is calculated as of the end of a calendar month, while in other
# places the metric is dated the first day of the month it applies to.
# Move every date to the start of its month to match.
first_of_month = glob_cont.index - pd.tseries.offsets.MonthBegin()
glob_cont.index = first_of_month

# Turn the month index back into a regular column so the frame can be merged
glob_cont = glob_cont.reset_index()

glob_cont.tail()

Unnamed: 0,month,total_content,wikipedia_articles,files,wikidata_entities
204,2017-12-01,176783401.0,46945492.0,45835672.0,43875472.0
205,2018-01-01,179327186.0,47347214.0,47271845.0,44320994.0
206,2018-02-01,183394041.0,47509635.0,48260398.0,47011459.0
207,2018-03-01,185726550.0,47710088.0,49303612.0,47837922.0
208,2018-04-01,188284127.0,47883204.0,50291095.0,48986646.0


# Combining and saving metrics

In [16]:
# Collect every per-metric result frame: MariaDB queries first, then Hive
# queries, then the Wikistats-derived content counts
dfs = [mdb_queries[k]["result"] for k in mdb_queries]
dfs.extend([hive_queries[k]["result"] for k in hive_queries])
dfs.append(glob_cont)

# Outer-merge the frames pairwise so a month missing from one source is kept
# (with NaN for that source's columns). No `on` is given, so pandas joins on
# whichever columns the two frames have in common.
new_metrics = reduce(lambda l, r: pd.merge(l, r, how="outer"), dfs)

# `old_metrics` is either a DataFrame or None. Compare against None explicitly:
# the truth value of a DataFrame is ambiguous, so `if old_metrics:` raises
# ValueError as soon as previous results exist.
if old_metrics is not None:
    # ignore_index gives the combined frame a fresh 0..n-1 index instead of
    # repeating the index labels of the two inputs
    metrics = pd.concat([old_metrics, new_metrics], ignore_index=True)
else:
    metrics = new_metrics

metrics.tail()

Unnamed: 0,month,total_edits,mobile_web_edits,mobile_app_edits,nonbot_edits,active_editors,new_active_editors,second_month_active_editors,uploads,data_edits,new_editor_retention,total_content,wikipedia_articles,files,wikidata_entities
205,2018-01-01,39437710.0,1054365.0,98878.0,20374752.0,84422,16678.0,3353.0,675791.0,9749202.0,0.077362,179327186.0,47347214.0,47271845.0,44320994.0
206,2018-02-01,39574449.0,932776.0,78617.0,19934503.0,78963,15052.0,4134.0,725015.0,15704642.0,0.066914,183394041.0,47509635.0,48260398.0,47011459.0
207,2018-03-01,43003671.0,1027516.0,93668.0,25176956.0,86190,17625.0,4347.0,809162.0,17996980.0,0.064066,185726550.0,47710088.0,49303612.0,47837922.0
208,2018-04-01,34530321.0,999482.0,107802.0,20762284.0,83705,16059.0,4739.0,634360.0,12851269.0,,188284127.0,47883204.0,50291095.0,48986646.0
209,2018-05-01,39061950.0,1067485.0,98073.0,22231556.0,85424,17787.0,4068.0,,,,,,,


In [17]:
# Write the combined metrics back to the TSV file (tab-separated, without the
# DataFrame index), replacing its previous contents
metrics.to_csv(FILENAME, sep="\t", index=False)