## Setup

In [16]:
import datetime
from functools import reduce
from pathlib import Path
import time

import pandas as pd
import requests

import wmfdata
from wmfdata import hive, mariadb, spark
from wmfdata.utils import print_err, pd_display_all

In [62]:
# The metrics month is the most recent complete calendar month: step back one
# day from the first of the current month to land in it.
first_of_this_month = datetime.date.today().replace(day=1)
last_month = first_of_this_month - datetime.timedelta(days=1)

METRICS_MONTH_TEXT = f"{last_month:%Y-%m}"
MEDIAWIKI_HISTORY_SNAPSHOT = f"{last_month:%Y-%m}"
metrics_month = pd.Period(METRICS_MONTH_TEXT)

# Derived periods reused by several of the parameters below
following_month = metrics_month + 1
month_first_day = metrics_month.asfreq("D", how="start")
following_month_first_day = following_month.asfreq("D", how="start")

# Every format of the metrics month that the query templates need, collected
# in one dict so it can be passed straight to str.format
date_params = dict(
    api_metrics_month_first_day=month_first_day.strftime("%Y%m%d"),
    api_metrics_month_day_after=following_month_first_day.strftime("%Y%m%d"),
    mediawiki_history_snapshot=MEDIAWIKI_HISTORY_SNAPSHOT,
    metrics_cur_month=last_month.month,
    metrics_month=str(metrics_month),
    metrics_month_first_day=str(month_first_day),
    metrics_month_end=str(following_month.start_time),
    metrics_month_last_day=str(metrics_month.asfreq("D", how="end")),
    metrics_month_start=str(metrics_month.start_time),
    metrics_next_month_first_day=str(following_month_first_day),
    metrics_prev_month=str(metrics_month - 1),
    metrics_year=last_month.year,
    retention_cohort=str(metrics_month - 2),
)

def prepare_query(filename):
    """
    Read the query template stored at `filename` and return its text with the
    `{placeholders}` filled in from `date_params` using `str.format`.
    """
    template = Path(filename).read_text()
    return template.format(**date_params)

In [33]:
class MetricSet:
    """
    A table of monthly metrics backed by a tab-separated file and refreshed by
    running a set of queries.

    `filename` is the path of the TSV file the metrics are loaded from and
    saved to. `queries` is a dict mapping a query name to a dict with two keys:
    "file" (path of the query template) and "engine" ("hive" or "mariadb").

    Class assumptions:
    * Each query contains a "month" column which Pandas can parse into a date.
    * The column names used in the queries are unique across queries.
    """

    def __init__(self, filename, queries):
        self.filename = filename
        self.queries = queries
        self.load_data()

    def load_data(self):
        """
        Load previously saved metrics from `self.filename` into `self.data`,
        indexed by month. If the file does not exist yet, start from an empty
        data frame.
        """
        try:
            self.data = (
                pd
                .read_csv(self.filename, sep="\t", parse_dates=["month"])
                .set_index("month")
            )
        except FileNotFoundError:
            self.data = pd.DataFrame()

    def add_data(self, new_data):
        """
        Takes a Pandas data frame with a date index giving the month and one or more
        columns of metrics.

        Values in `new_data` take precedence; existing values are kept only
        where `new_data` has no value for that month and column.
        """
        self.data = new_data.combine_first(self.data)

    def run_queries(self):
        """
        Run every query in `self.queries` on its engine and merge each result
        into `self.data`.

        Raises ValueError if a query names an engine other than "mariadb" or
        "hive".
        """
        for key, val in self.queries.items():
            query = prepare_query(val["file"])
            engine = val["engine"]
            print_err("Running {} on {}...".format(key, engine))

            if engine == "mariadb":
                result = mariadb.run(query)
            elif engine == "hive":
                # Queries labelled "hive" are executed through Spark
                result = spark.run(query)
            else:
                raise ValueError("Unknown engine specified.")

            # Index the result by month so it lines up with the stored metrics
            result = (
                result
                .assign(month=lambda df: pd.to_datetime(df["month"]))
                .set_index("month")
            )

            self.add_data(result)

    def save_data(self):
        """
        Write `self.data` to `self.filename` as a tab-separated file, creating
        the containing directory first if it does not exist.
        """
        # Without this, a missing output directory would only be discovered
        # after all the queries have already been run.
        Path(self.filename).parent.mkdir(parents=True, exist_ok=True)
        self.data.to_csv(self.filename, sep="\t")

## Editing metrics

In [57]:
# (query name, query file) pairs, in the order they are run
editing_query_files = [
    ("active_editors", "queries/active_editors.sql"),
    ("edits", "queries/edits.hql"),
    ("new_editor_retention", "queries/new_editor_retention.hql"),
    ("global_south_edits_editors", "queries/global_south_edits_editors.hql"),
    ("mobile-heavy_edits_editors", "queries/mobile-heavy_edits_editors.hql"),
    ("mobile-heavy_new_editor_retention", "queries/mobile-heavy_new_editor_retention.hql"),
    ("global_south_new_editor_retention", "queries/global_south_new_editor_retention.hql"),
]

# Every editing query uses the "hive" engine
editing_queries = {
    query_name: {"file": query_file, "engine": "hive"}
    for query_name, query_file in editing_query_files
}

editing_metrics = MetricSet("metrics/editing_metrics.tsv", editing_queries)
editing_metrics.run_queries()

Running active_editors on hive...





Running edits on hive...                                                        





Running new_editor_retention on hive...                                         





Running global_south_edits_editors on hive...                                   





Running mobile-heavy_edits_editors on hive...                                   





Running mobile-heavy_new_editor_retention on hive...                            





Running global_south_new_editor_retention on hive...                            





### Content metrics via API

In [42]:
# URL template for the monthly "new pages" metrics endpoint; the placeholders
# are filled in by get_new_pages
NEW_PAGES_API = "/".join([
    "https://wikimedia.org/api/rest_v1/metrics",
    "edited-pages/new/{project}/all-editor-types/{page_type}/monthly/{start}/{end}",
])

# Sent with every API request to identify this client
headers = {"User-Agent": "https://github.com/wikimedia-research/movement-metrics (bot)"}

api_results = list()

def get_new_pages(
    project="all-projects",
    page_type="content",
    start=date_params["api_metrics_month_first_day"],
    end=date_params["api_metrics_month_day_after"],
    timeout=60
):
    """
    Fetch monthly new-page counts from the `NEW_PAGES_API` endpoint.

    Parameters:
    * project: project to query, e.g. "all-projects" or a wiki domain such as
      "wikidata.org".
    * page_type: page type segment of the API URL.
    * start, end: date range as YYYYMMDD strings. The defaults are read from
      `date_params` once, when this function is defined.
    * timeout: seconds to wait for the API before giving up.

    Returns a data frame indexed by "month" (the API's "timestamp" parsed to a
    timezone-naive datetime) holding the remaining fields of the first item's
    results. Callers in this notebook rely on its "new_pages" column.

    Raises requests.HTTPError if the API responds with an error status, and
    a requests timeout exception if it does not answer within `timeout` seconds.
    """
    url = NEW_PAGES_API.format(
        project=project,
        page_type=page_type,
        start=start,
        end=end
    )

    # Without a timeout, one stalled connection would hang the whole run
    r = requests.get(url, headers=headers, timeout=timeout)
    # Report HTTP errors as such, rather than as a KeyError when the error
    # payload turns out to have no "items" key
    r.raise_for_status()

    data = r.json()["items"][0]["results"]
    frame = pd.DataFrame(data)
    # Drop the timezone so the index matches the naive month index used elsewhere
    frame["timestamp"] = pd.to_datetime(frame["timestamp"]).dt.tz_localize(None)
    frame = (
        frame
        .rename(columns={"timestamp": "month"})
        .set_index("month")
    )

    return frame

In [45]:
def _new_pages_as(metric_name, **api_args):
    """Fetch new-page counts and rename the "new_pages" column to `metric_name`."""
    return get_new_pages(**api_args).rename(columns={"new_pages": metric_name})


# New content pages across all projects
total_new = _new_pages_as("net_new_content_pages")
editing_metrics.add_data(total_new)

# New content pages on Wikidata
wikidata_new = _new_pages_as("net_new_Wikidata_entities", project="wikidata.org")
editing_metrics.add_data(wikidata_new)

# New content pages on Commons
commons_new = _new_pages_as("net_new_Commons_content_pages", project="commons.wikimedia.org")
editing_metrics.add_data(commons_new)

In [46]:
# Domain names of all wikis in the "wikipedia" database group
wp_domains = spark.run("""
    SELECT domain_name
    FROM canonical_data.wikis
    WHERE database_group = "wikipedia"
""")["domain_name"]

results = []
n = len(wp_domains)

for p, domain in enumerate(wp_domains, start=1):
    # Report progress on every 50th project
    if p % 50 == 0:
        print_err(f"Now on project {p} of {n} ({domain})")

    # Tag each project's counts with its domain before collecting them
    results.append(get_new_pages(project=domain).assign(project=domain))

    # Be polite to the API
    time.sleep(0.02)

new_per_wp = pd.concat(results)

# Sum across projects to get new Wikipedia articles per month
monthly_totals = new_per_wp.groupby("month")[["new_pages"]].sum()
wikipedia_new = monthly_totals.rename(columns={"new_pages": "net_new_Wikipedia_articles"})

editing_metrics.add_data(wikipedia_new)

Now on project 50 of 335 (ceb.wikipedia.org)                                    





Now on project 100 of 335 (glk.wikipedia.org)





Now on project 150 of 335 (kj.wikipedia.org)





Now on project 200 of 335 (mus.wikipedia.org)





Now on project 250 of 335 (roa-tara.wikipedia.org)





Now on project 300 of 335 (tpi.wikipedia.org)





In [51]:
# Write the accumulated editing metrics (query results plus the API-derived
# new-page counts added above) to the metric set's TSV file
editing_metrics.save_data()

## Readers metrics

In [56]:
# Each readers query lives in queries/<name>.hql and uses the "hive" engine
readers_query_names = (
    "pageviews",
    "automated_pageviews",
    "page_previews",
    "unique_devices",
)
readers_queries = {
    query_name: {"file": f"queries/{query_name}.hql", "engine": "hive"}
    for query_name in readers_query_names
}

readers_metrics = MetricSet("metrics/readers_metrics.tsv", readers_queries)
readers_metrics.run_queries()

# Interactions are the sum of previews seen and total pageviews
readers_data = readers_metrics.data
readers_metrics.data = readers_data.assign(
    interactions=readers_data["previews_seen"] + readers_data["total_pageview"]
)
readers_metrics.save_data()

Running pageviews on hive...





Running automated_pageviews on hive...                                          





Running page_previews on hive...                                                





Running unique_devices on hive...                                               





## Editing diversity metrics

In [60]:
# (query name, query file) pairs, in the order they are run. The Wikidata
# query names differ from their file names, so both are spelled out.
editing_diversity_query_files = [
    ("global_south_edits_editors", "queries/global_south_edits_editors.hql"),
    ("global_south_new_editor_retention", "queries/global_south_new_editor_retention.hql"),
    ("global_south_net_new_content", "queries/global_south_net_new_content.hql"),
    ("global_south_wikidata_entities", "queries/global_south_net_new_wikidata.hql"),
    ("global_north_edits_editors", "queries/global_north_edits_editors.hql"),
    ("global_north_new_editor_retention", "queries/global_north_new_editor_retention.hql"),
    ("global_north_net_new_content", "queries/global_north_net_new_content.hql"),
    ("global_north_wikidata_entities", "queries/global_north_net_new_wikidata.hql"),
]

# Every editing diversity query uses the "hive" engine
editing_diversity_queries = {
    query_name: {"file": query_file, "engine": "hive"}
    for query_name, query_file in editing_diversity_query_files
}

editing_diversity_metrics = MetricSet(
    "metrics/editing_diversity_metrics.tsv",
    editing_diversity_queries
)
editing_diversity_metrics.run_queries()
editing_diversity_metrics.save_data()

Running global_south_edits_editors on hive...





Running global_south_new_editor_retention on hive...                            





Running global_south_net_new_content on hive...                                 





Running global_south_wikidata_entities on hive...                               





Running global_north_edits_editors on hive...                                   





Running global_north_new_editor_retention on hive...                            





Running global_north_net_new_content on hive...





Running global_north_wikidata_entities on hive...                               





                                                                                

In [68]:
# Each readers diversity query lives in queries/<name>.hql and uses the "hive" engine
readers_diversity_query_names = (
    "global_south_pageviews",
    "global_south_previews",
    "global_north_previews",
    "global_north_pageviews",
)
readers_diversity_queries = {
    query_name: {"file": f"queries/{query_name}.hql", "engine": "hive"}
    for query_name in readers_diversity_query_names
}

readers_diversity_metrics = MetricSet(
    "metrics/readers_diversity_metrics.tsv",
    readers_diversity_queries
)
readers_diversity_metrics.run_queries()

# Interactions are previews plus pageviews, computed separately for the
# "gs" and "gn" columns
diversity_data = readers_diversity_metrics.data
readers_diversity_metrics.data = diversity_data.assign(
    gs_interactions=diversity_data["gs_previews"] + diversity_data["gs_pageviews"],
    gn_interactions=diversity_data["gn_previews"] + diversity_data["gn_pageviews"],
)

readers_diversity_metrics.save_data()

Running global_south_pageviews on hive...





Running global_south_previews on hive...                                        





Running global_north_previews on hive...                                        





Running global_north_pageviews on hive...                                       





                                                                                