In [1]:
import json

from wmfdata import charting, mariadb, hive
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

You can find the source for `wmfdata` at https://github.com/neilpquinn/wmfdata


In [2]:
# Apply the matplotlib style shipped with wmfdata (`charting.mpl_style`) to the charts in this notebook.
mpl.style.use(charting.mpl_style)

Let's grab a couple of monthly samples of events: one from August 2018 and another from May 2018 (before events from phone editors stopped being logged). Note that these logs are a 6.25% (1-in-16) sample of all events, so the absolute numbers need to be multiplied by 16 to estimate the true number of events.

Also, the 2010 wikitext editor has [kept sending events to an old schema version](https://phabricator.wikimedia.org/T203621) even after the schema was updated in December 2017, so we'll have to combine (union) the two tables. The [update](https://meta.wikimedia.org/w/index.php?title=Schema%3AEdit&type=revision&diff=17541122&oldid=13457736) involved adding `loaded` as a possible value for `event_action` and a corresponding `event_action.loaded.timing` field.

In [9]:
# Query template that grabs Edit events from BOTH schema revisions
# (log.Edit_13457736 and log.Edit_17541122) for one time period and stacks
# them with `union all`.
#
# `{start}` and `{end}` are placeholders to be filled in with `str.format`;
# they are compared against the `timestamp` column with `between`.
#
# The older revision (13457736) has no `event_action.loaded.timing` field, so
# that column is selected as `null` there to keep both halves of the union
# aligned column-for-column.
events_query = """
select
    timestamp,
    userAgent,
    wiki,
    event_action,
    `event_action.abort.mechanism`,
    `event_action.abort.timing`,
    `event_action.abort.type`,
    `event_action.init.mechanism`,
    `event_action.init.timing`,
    `event_action.init.type`,
    null as `event_action.loaded.timing`,
    `event_action.ready.timing`,
    `event_action.saveAttempt.timing`,
    `event_action.saveFailure.message`,
    `event_action.saveFailure.timing`,
    `event_action.saveFailure.type`,
    `event_action.saveIntent.timing`,
    `event_action.saveSuccess.timing`,
    `event_editingSessionId`,
    event_editor,
    event_platform,
    `event_user.class`,
    `event_user.editCount`
from log.Edit_13457736
where timestamp between "{start}" and "{end}"

union all

select
    timestamp,
    userAgent,
    wiki,
    event_action,
    `event_action.abort.mechanism`,
    `event_action.abort.timing`,
    `event_action.abort.type`,
    `event_action.init.mechanism`,
    `event_action.init.timing`,
    `event_action.init.type`,
    `event_action.loaded.timing`,
    `event_action.ready.timing`,
    `event_action.saveAttempt.timing`,
    `event_action.saveFailure.message`,
    `event_action.saveFailure.timing`,
    `event_action.saveFailure.type`,
    `event_action.saveIntent.timing`,
    `event_action.saveSuccess.timing`,
    `event_editingSessionId`,
    event_editor,
    event_platform,
    `event_user.class`,
    `event_user.editCount`
from log.Edit_17541122
where timestamp between "{start}" and "{end}"
"""

# Some helper functions to combine two tables of events and clean them
def el_rename(x):
    """Shorten an EventLogging column name.

    Removes every "event_" and "action." substring and then turns the
    remaining dots into underscores, e.g.
    "event_action.abort.timing" -> "abort_timing" and
    "event_user.editCount" -> "user_editCount".

    Parameters
    ----------
    x : str
        The original column name.

    Returns
    -------
    str
        The shortened column name.
    """
    return x.replace("event_", "").replace("action.", "").replace(".", "_")

def json_try_load(obj):
    """Decode `obj` as JSON, giving back None when `json.loads` rejects its type.

    Only `TypeError` is handled (raised for inputs that are not str, bytes
    or bytearray, such as None or a float NaN). A string holding malformed
    JSON still raises the decoder's error.
    """
    try:
        decoded = json.loads(obj)
    except TypeError:
        decoded = None
    return decoded

def prepare_events(events_1, events_2):
    """Combine two tables of events into one cleaned, chronologically sorted table.

    Parameters
    ----------
    events_1, events_2 : pandas.DataFrame
        Event tables to stack. After renaming with `el_rename`, the combined
        table must have `timestamp` and `action` columns.

    Returns
    -------
    pandas.DataFrame
        All rows of both inputs, with shortened column names, `action` as an
        ordered categorical, rows sorted by `timestamp` then `action`, a fresh
        0..n-1 index, and missing values represented as NaN.
    """
    # Stack BOTH inputs; rows of events_2 follow those of events_1 before sorting
    events = pd.concat([events_1, events_2], ignore_index=True, sort=False)

    # Shorten the names of the events
    events = events.rename(columns=el_rename)

    # TODO: How do we parse UA?

    # Make action a categorical variable with a roughly progressive sort order
    action_names = ["init", "loaded", "ready", "saveIntent", "saveAttempt", "saveSuccess", "saveFailure", "abort"]
    events["action"] = pd.Categorical(events["action"], categories=action_names, ordered=True)

    # Sort the events for chronological display.
    # If actions have the same timestamp, they'll get sorted by the actions in their custom sort order.
    events = events.sort_values(["timestamp", "action"])

    # Reset the index, which is a mess because of the concatenation and sorting
    events = events.reset_index(drop=True)

    # Some missing values are represented as `None`, others as `NaN`. Make that consistent.
    events = events.fillna(value=np.nan)

    return events

Let's do this for May first.

In [10]:

# Save both monthly event samples as tab-separated files, without the index column.
# NOTE(review): `aug_events` and `may_events` are not defined in any cell visible
# in this notebook — the cells that built them appear to be missing, so this cell
# presumably fails on a fresh kernel. Confirm and restore them.
aug_events.to_csv("data/aug_events.tsv", sep="\t", index=False)
may_events.to_csv("data/may_events.tsv", sep="\t", index=False)

AttributeError: 'str' object has no attribute 'value_counts'

And now for August.

# Group sessions in MariaDB

In [20]:
# Query template that collapses events into one row per editing session.
#
# The inner query stacks the events of both schema revisions
# (log.Edit_13457736 and log.Edit_17541122) for the period given by the
# `{start}` and `{end}` placeholders (filled in with `str.format`).
# The outer query groups by `editing_session_id` and returns, per session:
#   - `actions`: the session's actions in timestamp order, joined with "-"
#   - `edit_count`: the smallest edit count seen in the session
#   - `editor`, `platform`, `user_class`: the distinct values seen, joined with "-"
#   - one `*_count` column per counted action type
#
# NOTE(review): `saveFailure` events get no count column, and the session ID
# itself is not part of the output — confirm both are intended.
# NOTE(review): `group_concat` output length is limited by the server's
# `group_concat_max_len` setting — confirm long sessions are not truncated.
sessions_q = """
select
    group_concat(action order by timestamp separator "-") as actions,
    min(edit_count) as edit_count,
    group_concat(distinct editor separator "-") as editor,
    group_concat(distinct platform separator "-") as platform,
    group_concat(distinct user_class separator "-") as user_class,
    sum(action = "init") as init_count,
    sum(action = "loaded") as loaded_count,
    sum(action = "ready") as ready_count,
    sum(action = "saveIntent") as save_intent_count,
    sum(action = "saveAttempt") as save_attempt_count,
    sum(action = "saveSuccess") as save_success_count,
    sum(action = "abort") as abort_count
from (
    select 
        event_editingSessionId as editing_session_id,
        timestamp,
        event_action as action,
        `event_user.editCount` as edit_count,
        event_editor as editor,
        event_platform as platform,
        `event_user.class` as user_class
    from log.Edit_13457736
    where timestamp between "{start}" and "{end}"
    
    union all
    
    select 
        event_editingSessionId as editing_session_id,
        timestamp,
        event_action as action,
        `event_user.editCount` as edit_count,
        event_editor as editor,
        event_platform as platform,
        `event_user.class` as user_class
    from log.Edit_17541122
    where timestamp between "{start}" and "{end}"
) as events
group by editing_session_id
"""

In [21]:
# Run the session query for the May 2018 window (timestamps between "201805"
# and "201806") with `host="logs"`.
may_sessions = mariadb.run(
    sessions_q.format(start="201805", end="201806"),
    host="logs"
)

# Save the result as a tab-separated file. No `index=False` is passed here,
# so the DataFrame index is written as the first column.
may_sessions.to_csv("data/may_sessions.tsv", sep="\t")

In [22]:
# Run the session query for the August 2018 window (timestamps between "201808"
# and "201809") with `host="logs"`.
aug_sessions = mariadb.run(
    sessions_q.format(start="201808", end="201809"),
    host="logs"
)

# Save the result as a tab-separated file. No `index=False` is passed here,
# so the DataFrame index is written as the first column.
aug_sessions.to_csv("data/aug_sessions.tsv", sep="\t")