In [143]:
import datetime as dt
import io
from functools import reduce
import re
import time

import pandas as pd
import requests
from dateutil.relativedelta import relativedelta

import wmfdata as wmf
from wmfdata.utils import print_err

In [144]:
# Path of the TSV file that previous metrics are read from and that the
# combined metrics are written back to
FILENAME = "metrics/metrics.tsv"

# Month the metrics are computed for, as a YYYY-MM string
METRICS_MONTH = "2018-09"

# Latest mediawiki_history snapshot in Hive; passed to the Hive queries as
# their `snapshot` parameter
SNAPSHOT = "2018-09"

# Date wrangling

In [145]:
# Metrics month as a pandas Period, plus its first and last calendar day
# both as daily Periods and as YYYY-MM-DD strings.
# The period is bound to `mm` (the name later cells use); the original
# referenced an undefined `mm_period`, which raised NameError.
mm = pd.Period(METRICS_MONTH)
mm_first_day = mm.asfreq("D", how="start")
mm_first_day_str = mm_first_day.strftime("%Y-%m-%d")
mm_last_day = mm.asfreq("D", how="end")
mm_last_day_str = mm_last_day.strftime("%Y-%m-%d")

# Loading previous results

In [160]:
# Load previously saved metrics (if any) and work out the month from which
# the queries should start. START ends up as a YYYY-MM-DD string.
try:
    old_metrics = pd.read_csv(FILENAME, sep="\t", parse_dates=["month"])
    old_metrics = old_metrics.set_index("month")

    # "month" is now the index rather than a column, so the months with
    # missing values have to be looked up through the index
    incomplete_months = old_metrics.index[old_metrics.isnull().any(axis=1)]

    if len(incomplete_months) > 0:
        first_null = incomplete_months.min()
    else:
        # Every saved month is complete: continue after the last saved month
        # instead of propagating NaT (which would fail in strftime below)
        first_null = old_metrics.index.max()

    START = first_null + relativedelta(months=1)
except FileNotFoundError:
    # No saved metrics yet: compute everything from the beginning of 2001
    START = pd.Timestamp(2001, 1, 1)
    old_metrics = None

START = START.strftime("%Y-%m-%d")
print(START)

2018-09-01


# Single-query metrics

In [4]:
# Queries run against MariaDB, keyed by a short name; each entry records the
# path of the SQL file to execute.
# To-do: active editors with null registration aren't classified as existing (?)
mdb_queries = dict(
    active_editors=dict(file="queries/active_editors.sql"),
    edits=dict(file="queries/mobile_edits.sql"),
)

# Queries run against Hive, laid out the same way
hive_queries = dict(
    edits=dict(file="queries/edits.hql"),
    new_editor_retention=dict(file="queries/new_editor_retention.hql"),
)

In [5]:
# Run every MariaDB query: read its SQL file, fill in the start date, and
# store the resulting frame under the entry's "result" key
for name, query in mdb_queries.items():
    with open(query["file"]) as sql_file:
        sql_template = sql_file.read()

    print_err("Running {}...".format(name))
    query["result"] = wmf.mariadb.run(sql_template.format(start=START))

    # Normalise the month column to datetimes
    result = query["result"]
    result["month"] = pd.to_datetime(result["month"])

Running active_editors...
Running edits...


In [6]:
# Run every Hive query: read its HQL file, fill in the start date and the
# snapshot, and store the resulting frame under the entry's "result" key
for name, query in hive_queries.items():
    with open(query["file"]) as hql_file:
        hql_template = hql_file.read()

    print_err("Running {}...".format(name))
    query["result"] = wmf.hive.run(
        hql_template.format(start=START, snapshot=SNAPSHOT)
    )

    # Hive hands the month back as a string (unlike the MariaDB queries,
    # which return a date), so parse it into datetimes
    result = query["result"]
    result["month"] = pd.to_datetime(result["month"])

Running new_editor_retention...
Running edits...


# Content metrics via API

In [176]:
# Compute the API end date (first day of the month after the metrics month,
# as YYYYMMDD) before displaying it, so this cell also works on a fresh
# kernel; the name is otherwise only assigned in a later cell.
api_end_string = (mm + 1).asfreq("D", how="start").strftime("%Y%m%d")
api_end_string

'20181001'

In [191]:
# URL template of the REST endpoint for monthly counts of new pages; the
# placeholders are filled in per request
NEW_PAGES_API = (
    "https://wikimedia.org/api/rest_v1/metrics/edited-pages/new/"
    "{project}/all-editor-types/{page_type}/monthly/{start}/{end}"
)

# Headers sent with every API request
headers = {"User-Agent": "https://github.com/wikimedia-research/Editing-movement-metrics (bot)"}

# Default end date for API requests: the first day of the month after the
# metrics month, formatted as YYYYMMDD
api_end_string = (mm + 1).start_time.strftime("%Y%m%d")

def get_new_pages(project="all-projects", page_type="content", start="20010101", end=api_end_string, timeout=60):
    """Fetch monthly new-page counts from the new-pages REST endpoint.

    Parameters
    ----------
    project : str
        Value substituted for ``{project}`` in ``NEW_PAGES_API``.
    page_type : str
        Value substituted for ``{page_type}`` in ``NEW_PAGES_API``.
    start, end : str
        Values substituted for ``{start}`` and ``{end}``; the defaults are
        ``YYYYMMDD`` strings.
    timeout : float
        Seconds to wait for the server before giving up, so a stalled
        connection cannot hang the notebook indefinitely.

    Returns
    -------
    pandas.DataFrame
        The records under ``items[0]["results"]`` of the JSON response, with
        the ``timestamp`` column parsed to datetimes and renamed to ``month``.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    url = NEW_PAGES_API.format(
        project = project,
        page_type = page_type,
        start = start,
        end = end
    )

    r = requests.get(url, headers=headers, timeout=timeout)
    # Surface HTTP errors directly instead of failing later with an opaque
    # KeyError when the error payload has no "items" key
    r.raise_for_status()

    data = r.json()["items"][0]["results"]
    frame = pd.DataFrame(data)
    frame["timestamp"] = pd.to_datetime(frame["timestamp"])
    frame = frame.rename(columns={"timestamp": "month"})

    return frame

## Total

In [205]:
# New pages with the function's default arguments (all projects, content
# pages), with the count column renamed to its metric name
total_new = get_new_pages()
total_new = total_new.rename(columns={"new_pages": "net_new_content_pages"})

## Wikidata

In [206]:
# New Wikidata pages per month, with the count column renamed to its metric
# name. The end date is derived from the metrics month instead of being
# hard-coded, so it stays correct when METRICS_MONTH changes: it is the
# second day of the following month ("20181002" for 2018-09).
# NOTE(review): the original used an end date one day later than the default
# end; that offset is preserved here, but the reason for it is not visible
# in this notebook. Confirm whether it is still needed.
new_wd = get_new_pages(
    project="wikidata.org",
    end=((mm + 1).asfreq("D", how="start") + 1).strftime("%Y%m%d")
).rename(columns={
    "new_pages": "net_new_Wikidata_entities"
})

## Commons

In [207]:
# New Commons pages per month, with the count column renamed to its metric
# name. The end date is derived from the metrics month instead of being
# hard-coded, so it stays correct when METRICS_MONTH changes: it is the
# second day of the following month ("20181002" for 2018-09).
# NOTE(review): the original used an end date one day later than the default
# end; that offset is preserved here, but the reason for it is not visible
# in this notebook. Confirm whether it is still needed.
new_commons = get_new_pages(
    project="commons.wikimedia.org",
    end=((mm + 1).asfreq("D", how="start") + 1).strftime("%Y%m%d")
).rename(columns={
    "new_pages": "net_new_Commons_content_pages"
})

## Wikipedias

In [None]:
# Fetch the domain of every Wikipedia (each row comes back as a 1-tuple)
wp_domains = wmf.mariadb.run("""
select trim(leading "." from reverse(site_domain))
from enwiki.sites
where site_group = "wikipedia"
""", fmt="raw")

# Ask the API about each project in turn, gathering all of the returned
# records in one flat list
results = []
n = len(wp_domains)

for position, row in enumerate(wp_domains):
    domain = row[0]

    # Report progress on every tenth project
    if position % 10 == 0:
        print_err("Now on the {}th project of {} ({})".format(position, n, domain))

    per_project = get_new_pages(project=domain).reset_index()
    per_project["project"] = domain
    results.extend(per_project.to_dict("records"))

    # Pause for 50 milliseconds between requests
    time.sleep(0.05)

# One row per project and month
new_per_wp = pd.DataFrame(results)

# Total new Wikipedia articles per month, summed over all projects
new_wp = (
    new_per_wp
    .groupby("month")["new_pages"]
    .sum()
    .rename("net_new_Wikipedia_articles")
    .reset_index()
)

# Combining and saving metrics

In [209]:
# Gather every result frame, merge them on their shared column(s), and layer
# the fresh values over the previously saved metrics.

# MariaDB results
dfs = [mdb_queries[k]["result"] for k in mdb_queries]

# Hive results
dfs.extend([hive_queries[k]["result"] for k in hive_queries])

# AQS content results
dfs.extend([total_new, new_commons, new_wd, new_wp])

# Wikistats 1 content results.
# `glob_cont` is not assigned anywhere in this notebook, so on a fresh kernel
# the original unconditional append raised NameError. Include it when it is
# defined and warn otherwise, so the remaining metrics are still combined.
try:
    dfs.append(glob_cont)
except NameError:
    print_err("glob_cont is not defined; skipping Wikistats 1 content results")

# Merge them all, assuming that the month is the only common column
new_metrics = reduce(lambda l, r: pd.merge(l, r, how="outer"), dfs)

# Set the month as an index so combine_first works properly
new_metrics = new_metrics.set_index("month")

# Fresh values take precedence; saved values only fill the gaps
if old_metrics is not None:
    metrics = new_metrics.combine_first(old_metrics)
else:
    metrics = new_metrics

metrics = metrics.sort_index()

metrics.tail()

Unnamed: 0_level_0,active_editors,data_edits,existing_active_editors,files,mobile_edits,net_new_Commons_content_pages,net_new_Wikidata_entities,net_new_Wikipedia_articles,net_new_content_pages,new_active_editors,new_editor_retention,nonbot_nondata_nonupload_edits,revert_rate,second_month_active_editors,total_content,total_edits,uploads,wikidata_entities,wikipedia_articles
month,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1
2018-05-01,85424.0,15706189.0,62698.0,50974289.0,1165558.0,687827,1466249,177423,3119985,17787.0,0.047123,14355475.0,0.076782,4068.0,191319173.0,39081127.0,735020.0,50429005.0,48018756.0
2018-06-01,78549.0,16473924.0,59034.0,51555579.0,1128001.0,570494,578664,177167,1640181,15005.0,0.051513,12995238.0,0.076353,3664.0,192895236.0,37115031.0,604437.0,50978404.0,48161127.0
2018-07-01,77734.0,14115389.0,59386.0,52227012.0,1169233.0,632174,459588,221481,1811794,14037.0,0.056814,13385582.0,0.076803,3455.0,194750830.0,36942656.0,665106.0,51446133.0,48378242.0
2018-08-01,78681.0,18261169.0,60371.0,53044373.0,1226835.0,765093,521824,221601,1822092,14018.0,,13890731.0,0.066912,3433.0,196624959.0,40968361.0,802116.0,51977456.0,48591849.0
2018-09-01,82084.0,17797137.0,59265.0,,1171268.0,804334,377916,187533,1641871,18284.0,,13445328.0,0.068043,3671.0,,38570897.0,840120.0,,


In [210]:
# Write the combined metrics back to the TSV file, tab-separated
metrics.to_csv(path_or_buf=FILENAME, sep="\t")