In [None]:
import json

from wmfdata import charting, mariadb, hive
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

You can find the source for `wmfdata` at https://github.com/neilpquinn/wmfdata


In [2]:
# Apply the Matplotlib style exposed by wmfdata's charting module as `charting.mpl_style`.
mpl.style.use(charting.mpl_style)

Let's grab a couple of monthly samples of events, one from August and another from May (before events from phone editors stopped being logged). Note that these logs are a 6.25% sample of all events, so the absolute numbers need to be multiplied by 16 to estimate the true number of events.

Also, the 2010 wikitext editor has [kept sending events to an old schema version](https://phabricator.wikimedia.org/T203621) even after the schema was updated in December 2017, so we'll have to combine the two tables with a `union all`. The [update](https://meta.wikimedia.org/w/index.php?title=Schema%3AEdit&type=revision&diff=17541122&oldid=13457736) involved adding `loaded` as a possible value for `event_action` and a corresponding `event_action.loaded.timing` field, which is why the query below fills that column with `null` for the old table.

In [None]:
# A query to grab events for a specified time period.
# `{start}` and `{end}` are placeholders filled in with str.format() and compared
# against `timestamp` using BETWEEN.
# The two halves select the same columns from the old (Edit_13457736) and the new
# (Edit_17541122) schema tables. The old table has `null as event_action.loaded.timing`
# in place of the field that only the new table selects, so that the column lists
# line up for the `union all`.
events_q = """
select
    timestamp,
    userAgent,
    wiki,
    event_action,
    `event_action.abort.mechanism`,
    `event_action.abort.timing`,
    `event_action.abort.type`,
    `event_action.init.mechanism`,
    `event_action.init.timing`,
    `event_action.init.type`,
    null as `event_action.loaded.timing`,
    `event_action.ready.timing`,
    `event_action.saveAttempt.timing`,
    `event_action.saveFailure.message`,
    `event_action.saveFailure.timing`,
    `event_action.saveFailure.type`,
    `event_action.saveIntent.timing`,
    `event_action.saveSuccess.timing`,
    `event_editingSessionId`,
    event_editor,
    event_platform,
    `event_user.class`,
    `event_user.editCount`
from log.Edit_13457736
where timestamp between "{start}" and "{end}"

union all

select
    timestamp,
    userAgent,
    wiki,
    event_action,
    `event_action.abort.mechanism`,
    `event_action.abort.timing`,
    `event_action.abort.type`,
    `event_action.init.mechanism`,
    `event_action.init.timing`,
    `event_action.init.type`,
    `event_action.loaded.timing`,
    `event_action.ready.timing`,
    `event_action.saveAttempt.timing`,
    `event_action.saveFailure.message`,
    `event_action.saveFailure.timing`,
    `event_action.saveFailure.type`,
    `event_action.saveIntent.timing`,
    `event_action.saveSuccess.timing`,
    `event_editingSessionId`,
    event_editor,
    event_platform,
    `event_user.class`,
    `event_user.editCount`
from log.Edit_17541122
where timestamp between "{start}" and "{end}"
"""

# Some helper functions to rename the columns of the combined events and clean them up
def el_rename(x):
    """Shorten an EventLogging column name.

    Removes every "event_" and "action." substring and then turns the
    remaining dots into underscores, for example
    "event_action.abort.mechanism" -> "abort_mechanism".
    """
    shortened = x
    for fragment in ("event_", "action."):
        shortened = shortened.replace(fragment, "")
    return shortened.replace(".", "_")

def json_try_load(obj):
    """Parse `obj` as JSON, returning None when it cannot be parsed.

    Parameters:
        obj: the value to decode; normally a JSON string.

    Returns:
        The decoded Python object, or None if `obj` is not something
        `json.loads` accepts (for example None or a float NaN) or if it is
        a string that does not contain valid JSON.
    """
    try:
        return json.loads(obj)
    except (TypeError, ValueError):
        # TypeError: `obj` is not a str/bytes/bytearray (e.g. a missing value).
        # ValueError: malformed JSON text (json.JSONDecodeError subclasses ValueError).
        return None

def prepare_events(events):
    """Return a tidied copy of a raw events frame.

    The columns are renamed with `el_rename`, `action` becomes an ordered
    categorical, the rows are sorted by timestamp (then action) with a fresh
    index, and missing values are filled with NaN.
    """
    # Open question carried over from the original: how do we parse UA?

    # Action names in a roughly progressive order; used as the categorical sort order
    action_order = ["init", "loaded", "ready", "saveIntent", "saveAttempt", "saveSuccess", "saveFailure", "abort"]

    # Shorten the column names
    tidy = events.rename(columns=el_rename)
    tidy["action"] = pd.Categorical(tidy["action"], categories=action_order, ordered=True)

    tidy = (
        tidy
        # Chronological order; rows sharing a timestamp fall back to the action order
        .sort_values(["timestamp", "action"])
        # Replace the index left over from sorting with a clean 0..n-1 range
        .reset_index(drop=True)
        # Some missing values are `None`, others `NaN`; make that consistent
        .fillna(value=np.nan)
    )

    return tidy

Let's do this for May first.

In [None]:
# Fetch the May 2018 events from the "logs" host, clean them, and save them as a TSV.
may_query = events_q.format(start="201805", end="201806")
may_events = prepare_events(mariadb.run(may_query, host="logs"))

may_events.to_csv("data/may_events.tsv", sep="\t", index=False)

In [None]:
# Fetch the August 2018 events from the "logs" host, clean them, and save them as a TSV.
aug_query = events_q.format(start="201808", end="201809")
aug_events = prepare_events(mariadb.run(aug_query, host="logs"))

aug_events.to_csv("data/aug_events.tsv", sep="\t", index=False)

# Sessions

In [35]:
# One row per editing session (`group by editing_session_id`), built from the
# union of the old and new Edit schema tables for the `{start}`..`{end}` window
# (placeholders filled in with str.format()).
# `actions` is the session's actions concatenated in timestamp order with "-";
# `editor`, `platform` and `user_class` are the distinct values seen in the
# session, also joined with "-".
# NOTE(review): `wiki` and `page_namespace` are selected without an aggregate
# while grouping only by session ID -- presumably they are constant within a
# session; confirm.
# NOTE(review): there is no count for the "saveFailure" action, unlike the
# other actions -- confirm that this is intentional.
sessions_q = """
select
    wiki,
    group_concat(action order by timestamp separator "-") as actions,
    min(edit_count) as edit_count,
    group_concat(distinct editor separator "-") as editor,
    group_concat(distinct platform separator "-") as platform,
    group_concat(distinct user_class separator "-") as user_class,
    page_namespace,
    sum(action = "init") as init_count,
    sum(action = "loaded") as loaded_count,
    sum(action = "ready") as ready_count,
    sum(action = "saveIntent") as save_intent_count,
    sum(action = "saveAttempt") as save_attempt_count,
    sum(action = "saveSuccess") as save_success_count,
    sum(action = "abort") as abort_count,
    max(timestamp) as max_timestamp,
    min(timestamp) as min_timestamp
from (
    select 
        event_editingSessionId as editing_session_id,
        timestamp,
        wiki,
        `event_page.ns` as page_namespace,
        event_action as action,
        `event_user.editCount` as edit_count,
        event_editor as editor,
        event_platform as platform,
        `event_user.class` as user_class
    from log.Edit_13457736
    where timestamp between "{start}" and "{end}"
    
    union all
    
    select 
        event_editingSessionId as editing_session_id,
        timestamp,
        wiki,
        `event_page.ns` as page_namespace,
        event_action as action,
        `event_user.editCount` as edit_count,
        event_editor as editor,
        event_platform as platform,
        `event_user.class` as user_class
    from log.Edit_17541122
    where timestamp between "{start}" and "{end}"
) as events
group by editing_session_id
"""

In [36]:
# Aggregate the May 2018 events into sessions on the "logs" host and save them as a TSV.
may_sessions_query = sessions_q.format(start="201805", end="201806")
may_sessions = mariadb.run(may_sessions_query, host="logs")

may_sessions.to_csv("data/may_sessions.tsv", sep="\t", index=False)

In [37]:
# Aggregate the August 2018 events into sessions on the "logs" host and save them as a TSV.
aug_sessions_query = sessions_q.format(start="201808", end="201809")
aug_sessions = mariadb.run(aug_sessions_query, host="logs")

aug_sessions.to_csv("data/aug_sessions.tsv", sep="\t", index=False)

# Extracting and loading change tags

In [None]:
# Not taking into account deleted edits!
# (presumably because deleted revisions are no longer in `revision` -- TODO confirm)
# Bounds for `rev_timestamp`, substituted into the query with str.format().
TAG_START = "201710"
TAG_END = "201810"

# For every revision in the window that carries the "mobile web edit" and/or
# "visualeditor" change tag, return one row with a count per tag (`group by rev_id`).
# `database() as wiki` records which database each row came from.
# NOTE(review): `mariadb.multirun` appears to run the query against each wiki in
# turn (see the per-wiki "completed in N s" output) -- confirm in wmfdata.
tags = mariadb.multirun("""
select
    database() as wiki,
    rev_id,
    sum(ct_tag = "mobile web edit") as mobile_web,
    sum(ct_tag = "visualeditor") as visual_editor
from change_tag
inner join revision
on ct_rev_id = rev_id
where 
    ct_tag in ("mobile web edit", "visualeditor") and
    rev_timestamp between "{start}" and "{end}"
group by rev_id
""".format(start=TAG_START, end=TAG_END))

aawiki completed in 0 s
aawikibooks completed in 0 s
aawiktionary completed in 0 s
abwiki completed in 0 s
abwiktionary completed in 0 s
acewiki completed in 0 s
adywiki completed in 0 s
afwiki completed in 2 s
afwikibooks completed in 0 s
afwikiquote completed in 0 s
afwiktionary completed in 0 s
akwiki completed in 0 s
akwikibooks completed in 0 s
akwiktionary completed in 0 s
alswiki completed in 1 s
alswikibooks completed in 0 s
alswikiquote completed in 0 s
alswiktionary completed in 0 s
amwiki completed in 0 s
amwikiquote completed in 0 s
amwiktionary completed in 0 s
angwiki completed in 0 s
angwikibooks completed in 0 s
angwikiquote completed in 0 s
angwikisource completed in 0 s
angwiktionary completed in 0 s
anwiki completed in 0 s
anwiktionary completed in 0 s
arcwiki completed in 0 s
arwiki completed in 202 s
arwikibooks completed in 0 s
arwikinews completed in 0 s
arwikiquote completed in 0 s
arwikisource completed in 0 s
arwikiversity completed in 0 s
arwiktionary complet

In [48]:
# The query returns per-tag counts; turn them into True/False flags.
for flag_column in ("mobile_web", "visual_editor"):
    tags[flag_column] = tags[flag_column].astype(bool)

In [53]:
# Show the first few rows of `tags` after the flag conversion.
tags.head()

Unnamed: 0,wiki,rev_id,mobile_web,visual_editor
0,abwiki,54766,False,True
1,abwiki,55951,True,False
2,abwiki,56128,True,False
3,abwiki,56288,False,True
4,abwiki,56517,True,False


In [70]:
def compute_editor(row):
    """Name the editor used for one row of tag flags.

    `row` is indexed by "mobile_web" and "visual_editor"; each value is
    treated as a truthy/falsy flag. Returns one of "mobile visual editor",
    "mobile wikitext editor" or "desktop visual editor".
    """
    if not row["mobile_web"]:
        # Edits made with the desktop wikitext editors aren't included in this dataset
        return "desktop visual editor"

    return "mobile visual editor" if row["visual_editor"] else "mobile wikitext editor"
        
# Label every row by passing it to compute_editor (axis=1 applies the function
# row by row), then show the first few rows of the result.
tags["editor"] = tags.apply(compute_editor, axis=1)
tags.head()

Unnamed: 0,wiki,rev_id,mobile_web,visual_editor,editor
0,abwiki,54766,False,True,desktop visual editor
1,abwiki,55951,True,False,mobile wikitext editor
2,abwiki,56128,True,False,mobile wikitext editor
3,abwiki,56288,False,True,desktop visual editor
4,abwiki,56517,True,False,mobile wikitext editor


In [71]:
# Write the table without the two flag columns as a tab-separated file
# with no header row and no index column.
tags_without_flags = tags.drop(columns=["mobile_web", "visual_editor"])
tags_without_flags.to_csv("data/mob_or_ve_edits.tsv", sep="\t", index=False, header=False)

In [72]:
# Remove any previous copy of the table before it is recreated.
# `if exists` keeps this cell from failing when the table has not been created
# yet (for example on a first run), so the notebook can be re-run from the top.
hive.run("""
drop table if exists neilpquinn.mob_or_ve_edits
""")

In [74]:
# Create the Hive table that will receive the exported TSV: one row per
# revision with its wiki, revision ID and editor label.
# The Python string is not raw, so '\t' reaches Hive as a literal tab
# character, used here as the field delimiter.
hive.run("""
create table neilpquinn.mob_or_ve_edits (
    wiki string,
    rev_id int,
    editor string
)
row format delimited fields terminated by '\t'
""")

In [76]:
# Load the exported TSV from the local filesystem into the Hive table,
# replacing whatever rows it already holds (OVERWRITE).
# NOTE(review): the input path is an absolute path inside one user's home
# directory, so this cell only works where that exact path exists.
!hive -e "LOAD DATA LOCAL INPATH '/home/neilpquinn-wmf/proj/2018-08-Editing-metrics-snapshots/data/mob_or_ve_edits.tsv' OVERWRITE INTO TABLE neilpquinn.mob_or_ve_edits"

log4j:WARN No such property [maxBackupIndex] in org.apache.log4j.DailyRollingFileAppender.

Logging initialized using configuration in file:/etc/hive/conf.analytics-hadoop/hive-log4j.properties
Loading data to table neilpquinn.mob_or_ve_edits
OK
Time taken: 18.49 seconds


In [77]:
# Spot-check the load by reading back ten rows from the Hive table.
hive.run("select * from neilpquinn.mob_or_ve_edits limit 10")

Unnamed: 0,mob_or_ve_edits.wiki,mob_or_ve_edits.rev_id,mob_or_ve_edits.editor
0,abwiki,54766,desktop visual editor
1,abwiki,55951,mobile wikitext editor
2,abwiki,56128,mobile wikitext editor
3,abwiki,56288,desktop visual editor
4,abwiki,56517,mobile wikitext editor
5,abwiki,56518,mobile wikitext editor
6,abwiki,56883,mobile wikitext editor
7,abwiki,57138,mobile wikitext editor
8,abwiki,57139,mobile wikitext editor
9,abwiki,57140,desktop visual editor


In [78]:
# Count the rows now in the Hive table.
hive.run("select count(*) from neilpquinn.mob_or_ve_edits")

Unnamed: 0,_c0
0,20160021
