# Parameters

In [1]:
# Path of the tab-separated file holding the metrics. Results from earlier
# runs are read from it (if it exists) and the combined results are written
# back to it at the end of the notebook.
FILENAME = "metrics/metrics.tsv"

# Month to calculate metrics for, written like "2018-10". This string is also
# used as the name of the mediawiki_history snapshot in the query templates,
# so the snapshot for this month must be available.
METRICS_MONTH_TEXT = "2018-10"

# Imports

In [2]:
import datetime as dt
from functools import reduce
import io
from pathlib import Path
import re
import time

from dateutil.relativedelta import relativedelta
import pandas as pd
import requests
import wmfdata as wmf
from wmfdata import hive, mariadb
from wmfdata.utils import print_err, mediawiki_dt

You can find the source for `wmfdata` at https://github.com/neilpquinn/wmfdata


# Preparation

In [3]:
# Derive every date representation the query templates and API calls need
# from the metrics month, gathered in one dict that can be unpacked straight
# into str.format()
metrics_month = pd.Period(METRICS_MONTH_TEXT)
next_month = metrics_month + 1
first_day = metrics_month.asfreq("D", how="start")
last_day = metrics_month.asfreq("D", how="end")

date_params = dict(
    mediawiki_history_snapshot=METRICS_MONTH_TEXT,
    metrics_month=str(metrics_month),
    # Timestamps bounding the month: its first instant and the first instant
    # of the following month
    metrics_month_start=str(metrics_month.start_time),
    metrics_month_end=str(next_month.start_time),
    # Calendar days bounding the month
    metrics_month_first_day=str(first_day),
    metrics_month_last_day=str(last_day),
    # Compact YYYYMMDD dates used in the API URL
    api_metrics_month_first_day=first_day.strftime("%Y%m%d"),
    api_metrics_month_day_after=next_month.asfreq("D", how="start").strftime("%Y%m%d"),
    # Two months before the metrics month
    retention_cohort=str(metrics_month - 2),
)

# Pick up the metrics saved by earlier runs; a missing file simply means
# there is nothing to carry over yet
try:
    saved_metrics = pd.read_csv(FILENAME, sep="\t", parse_dates=["month"])
except FileNotFoundError:
    old_metrics = None
else:
    old_metrics = saved_metrics.set_index("month")
    
def prepare_query(filename):
    """
    Read the query template stored at `filename` and return its text with
    the `{name}` placeholders filled in from `date_params`.

    The substitution is done with `str.format`, so any literal braces in the
    template file have to be doubled.
    """
    template = Path(filename).read_text()
    return template.format(**date_params)

# MariaDB and Hive query metrics

In [4]:
# Each query to run, keyed by a short name: the template file holding it and
# the engine ("mariadb" or "hive") that should execute it
queries = {
    name: {"file": path, "engine": engine}
    for name, path, engine in [
        # To-do: active editors with null registration aren't classified as existing (?)
        ("active_editors", "queries/active_editors.sql", "mariadb"),
        ("mobile_edits", "queries/mobile_edits.sql", "mariadb"),
        ("edits", "queries/edits.hql", "hive"),
        ("new_editor_retention", "queries/new_editor_retention.hql", "hive"),
    ]
}


# Run every query on its engine and store the resulting data frame back on
# its entry in `queries` under the "result" key
engine_runners = {"mariadb": mariadb.run, "hive": hive.run}

for metric_name, spec in queries.items():
    sql = prepare_query(spec["file"])
    engine_name = spec["engine"]
    print_err("Running {} on {}...".format(metric_name, engine_name))

    if engine_name not in engine_runners:
        raise ValueError("Unknown engine specified.")

    raw_result = engine_runners[engine_name](sql)

    # Parse the month column into datetimes
    spec["result"] = raw_result.assign(month=pd.to_datetime(raw_result["month"]))

Running edits on hive...
Running active_editors on mariadb...
Running mobile_edits on mariadb...
Running new_editor_retention on hive...


# Content metrics via API

In [5]:
# URL template for the monthly new-pages endpoint; the placeholders are
# filled in by get_new_pages
NEW_PAGES_API = (
    "https://wikimedia.org/api/rest_v1/metrics/edited-pages/new/"
    "{project}/all-editor-types/{page_type}/monthly/{start}/{end}"
)

# Sent with every API request to identify this notebook
headers = {
    "User-Agent": "https://github.com/wikimedia-research/Editing-movement-metrics (bot)"
}

# Data frames returned by the API calls below are collected here
api_results = list()

def get_new_pages(
    project="all-projects",
    page_type="content",
    start=date_params["api_metrics_month_first_day"],
    end=date_params["api_metrics_month_day_after"],
    timeout=60
):
    """
    Fetch monthly new-page counts from the NEW_PAGES_API endpoint.

    Parameters
    ----------
    project : str
        Value substituted for `{project}` in the URL (for example
        "all-projects" or a wiki domain such as "wikidata.org").
    page_type : str
        Value substituted for `{page_type}` in the URL.
    start, end : str
        Values substituted for `{start}` and `{end}` in the URL. The defaults
        are read from `date_params` once, when this function is defined, so
        later changes to `date_params` require re-running this cell.
    timeout : float
        Seconds to wait for the server before giving up, passed to
        `requests.get`. Without it a stalled connection would block the
        notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        The "results" of the first item in the response, with the "timestamp"
        column parsed to datetimes and renamed to "month".

    Raises
    ------
    requests.HTTPError
        If the API answers with a 4xx or 5xx status.
    """
    url = NEW_PAGES_API.format(
        project = project,
        page_type = page_type,
        start = start,
        end = end
    )

    r = requests.get(url, headers=headers, timeout=timeout)
    # Surface HTTP errors with their status and URL rather than as an
    # unexplained KeyError when the error body has no "items" key
    r.raise_for_status()

    data = r.json()["items"][0]["results"]
    frame = pd.DataFrame(data)
    frame["timestamp"] = pd.to_datetime(frame["timestamp"])
    frame = frame.rename(columns={"timestamp": "month"})

    return frame

## Total

In [6]:
# New content pages using get_new_pages' default arguments
total_columns = {"new_pages": "net_new_content_pages"}
total_new = get_new_pages()
total_new = total_new.rename(columns=total_columns)
api_results.append(total_new)

## Wikidata

In [7]:
# New content pages on wikidata.org
wikidata_columns = {"new_pages": "net_new_Wikidata_entities"}
new_wd = get_new_pages(project="wikidata.org").rename(columns=wikidata_columns)
api_results.append(new_wd)

## Commons

In [8]:
# New content pages on commons.wikimedia.org
commons_columns = {"new_pages": "net_new_Commons_content_pages"}
new_commons = get_new_pages(project="commons.wikimedia.org").rename(columns=commons_columns)
api_results.append(new_commons)

## Wikipedias

In [9]:
# Look up the domain of every site in the "wikipedia" group. With fmt="raw"
# each returned row is a 1-tuple holding the domain.
wp_domains_sql = """
select trim(leading "." from reverse(site_domain))
from enwiki.sites
where site_group = "wikipedia"
"""
wp_domains = wmf.mariadb.run(wp_domains_sql, fmt="raw")

# Ask the API about each project in turn, accumulating one record per
# returned row
page_records = []
n_projects = len(wp_domains)

for position, row in enumerate(wp_domains):
    domain = row[0]

    # Report progress on every tenth project
    if position % 10 == 0:
        print_err("Now on the {}th project of {} ({})".format(position, n_projects, domain))

    project_frame = (
        get_new_pages(project=domain)
        .reset_index()
        .assign(project=domain)
    )
    page_records.extend(project_frame.to_dict("records"))

    # Pause 20 milliseconds between requests
    time.sleep(0.02)

# One row per project and month
new_per_wp = pd.DataFrame(page_records)

# Add up the projects to get new Wikipedia articles per month
new_wp = (
    new_per_wp
    .groupby("month")
    .agg({"new_pages": "sum"})
    .rename(columns={"new_pages": "net_new_Wikipedia_articles"})
    .reset_index()
)

api_results.append(new_wp)

Now on the 0th project of 303 (aa.wikipedia.org)
Now on the 10th project of 303 (arc.wikipedia.org)
Now on the 20th project of 303 (bcl.wikipedia.org)
Now on the 30th project of 303 (bpy.wikipedia.org)
Now on the 40th project of 303 (ch.wikipedia.org)
Now on the 50th project of 303 (cu.wikipedia.org)
Now on the 60th project of 303 (el.wikipedia.org)
Now on the 70th project of 303 (fi.wikipedia.org)
Now on the 80th project of 303 (gag.wikipedia.org)
Now on the 90th project of 303 (hak.wikipedia.org)
Now on the 100th project of 303 (hy.wikipedia.org)
Now on the 110th project of 303 (is.wikipedia.org)
Now on the 120th project of 303 (kg.wikipedia.org)
Now on the 130th project of 303 (krc.wikipedia.org)
Now on the 140th project of 303 (lbe.wikipedia.org)
Now on the 150th project of 303 (lv.wikipedia.org)
Now on the 160th project of 303 (mo.wikipedia.org)
Now on the 170th project of 303 (na.wikipedia.org)
Now on the 180th project of 303 (no.wikipedia.org)
Now on the 190th project of 303 (pa

# Combining and saving metrics

In [10]:
# Collect every result frame: first the query results, then the API results
results = [spec["result"] for spec in queries.values()]
results += api_results

# Outer-merge the frames pairwise on their shared columns (expected to be
# just the month), then index by month so combine_first lines rows up properly
new_metrics = (
    reduce(lambda left, right: pd.merge(left, right, how="outer"), results)
    .set_index("month")
    .sort_index()
)

# Values from this run take precedence; previously saved values only fill
# the gaps
metrics = (
    new_metrics
    if old_metrics is None
    else new_metrics.combine_first(old_metrics)
)

metrics.tail()

Unnamed: 0_level_0,active_editors,data_edits,existing_active_editors,mobile_edits,net_new_Commons_content_pages,net_new_Wikidata_entities,net_new_Wikipedia_articles,net_new_content_pages,new_active_editors,new_editor_retention,nonbot_nondata_nonupload_edits,revert_rate,second_month_active_editors,total_edits,uploads
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
2018-06-01,78549.0,16473924.0,59034.0,1128001.0,569621.0,577595.0,177203.0,1638254.0,15005.0,0.057341,12995238.0,0.076353,3664.0,37115031.0,604437.0
2018-07-01,77734.0,14115389.0,59386.0,1169233.0,631394.0,458593.0,221459.0,1809947.0,14037.0,0.047123,13385582.0,0.076803,3455.0,36942656.0,665106.0
2018-08-01,78681.0,18257565.0,60371.0,1226835.0,763082.0,520235.0,221579.0,1818411.0,14018.0,0.051513,13888662.0,0.069798,3433.0,40969220.0,799857.0
2018-09-01,82084.0,17810116.0,59265.0,1171268.0,800330.0,377222.0,185644.0,1635067.0,18284.0,0.056814,13439283.0,0.076115,3671.0,38588449.0,835420.0
2018-10-01,82567.0,23090068.0,60815.0,1231434.0,549132.0,1147804.0,191087.0,2101982.0,16690.0,0.066411,13382193.0,0.073543,4251.0,43471807.0,588857.0


In [11]:
# Make sure the output directory exists before writing: on a first run (no
# previously saved file) it may be missing, and failing here would throw away
# the results of all the queries above
Path(FILENAME).parent.mkdir(parents=True, exist_ok=True)

# Write the combined metrics back as a tab-separated file, month index included
metrics.to_csv(FILENAME, sep="\t")