In [286]:
from ast import literal_eval
from copy import deepcopy
import json

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from wmfdata import charting, mariadb, hive

In [3]:
%matplotlib inline
mpl.style.use(charting.mpl_style)

When we changed the schema to `EditAttemptStep`, there were some initial problems logging all the events, but as far as I know, the [final fix](https://phabricator.wikimedia.org/T209431) was fully live with [MediaWiki 1.33.0-wmf.6](https://www.mediawiki.org/wiki/MediaWiki_1.33/wmf.6#MobileFrontend), which was fully rolled out by 30 November 2018.

The whitelist has not been fully applied to either `EditAttemptStep` or `VisualEditorFeatureUse` so we'll have to stick with the 90 days of "raw" events.

In [30]:
eas_count_r = hive.run("""
select
    to_date(dt) as day,
    event.platform as platform,
    event.editor_interface as interface,
    event.action as action,
    count(*)
from event.editattemptstep
where
    year >= 2018 and
    not event.is_oversample
group by
    to_date(dt),
    event.platform,
    event.editor_interface,
    event.action
""")

eas_count_r.head()

Unnamed: 0,day,platform,interface,action,_c4
0,2018-10-31,desktop,wikitext,init,3146
1,2018-11-12,desktop,visualeditor,abort,1
2,2018-11-14,desktop,visualeditor,init,78
3,2018-11-15,desktop,wikitext,saveSuccess,16041
4,2018-11-17,desktop,wikitext,saveFailure,2074


In [46]:
eas_count = (
    eas_count_r
    .pivot_table(
        values="_c4",
        index=["day"],
        columns=["platform", "interface", "action"],
        fill_value=0
    )
)

In [None]:
# A query to grab events for a specified time period.
events_q = """
select
    timestamp,
    userAgent,
    wiki,
    event_action,
    `event_action.abort.mechanism`,
    `event_action.abort.timing`,
    `event_action.abort.type`,
    `event_action.init.mechanism`,
    `event_action.init.timing`,
    `event_action.init.type`,
    null as `event_action.loaded.timing`,
    `event_action.ready.timing`,
    `event_action.saveAttempt.timing`,
    `event_action.saveFailure.message`,
    `event_action.saveFailure.timing`,
    `event_action.saveFailure.type`,
    `event_action.saveIntent.timing`,
    `event_action.saveSuccess.timing`,
    `event_editingSessionId`,
    event_editor,
    event_platform,
    `event_user.class`,
    `event_user.editCount`
from log.Edit_13457736
where timestamp between "{start}" and "{end}"

union all

select
    timestamp,
    userAgent,
    wiki,
    event_action,
    `event_action.abort.mechanism`,
    `event_action.abort.timing`,
    `event_action.abort.type`,
    `event_action.init.mechanism`,
    `event_action.init.timing`,
    `event_action.init.type`,
    `event_action.loaded.timing`,
    `event_action.ready.timing`,
    `event_action.saveAttempt.timing`,
    `event_action.saveFailure.message`,
    `event_action.saveFailure.timing`,
    `event_action.saveFailure.type`,
    `event_action.saveIntent.timing`,
    `event_action.saveSuccess.timing`,
    `event_editingSessionId`,
    event_editor,
    event_platform,
    `event_user.class`,
    `event_user.editCount`
from log.Edit_17541122
where timestamp between "{start}" and "{end}"
"""

# Some functions to join two tables of events together and clean them
el_rename = lambda x: x.replace("event_", "").replace("action.", "").replace(".", "_")

def json_try_load(obj):
    try:
        return json.loads(obj)
    except TypeError:
        return None

def prepare_events(events):
    # Shorten the names of the events
    events = events.rename(columns=el_rename)
    
    # How do we parse UA?

    # Make action a categorical variable with a roughly progressive sort order
    action_names = ["init", "loaded", "ready", "saveIntent", "saveAttempt", "saveSuccess", "saveFailure", "abort"]
    events["action"] = pd.Categorical(events["action"], categories=action_names, ordered=True)

    # Sort the events for chronological display.
    # If actions have the same timestamp, they'll get sorted by the actions in their custom sort order.
    events = events.sort_values(["timestamp", "action"])

    # Reset the index, which is a mess because of the concatenation
    events = events.reset_index(drop=True)

    # Some missing values are represented as `None`, others as `NaN`. Make that consistent.
    events = events.fillna(value=np.nan)
    
    return events

Let's do this for May first.

In [None]:
may_events = prepare_events(
    mariadb.run(
        events_q.format(start="201805", end="201806"),
        host="logs"
    )
)

may_events.to_csv("data/may_events.tsv", sep="\t", index=False)

In [None]:
aug_events = prepare_events(
    mariadb.run(
        events_q.format(start="201808", end="201809"),
        host="logs"
    )
)

aug_events.to_csv("data/aug_events.tsv", sep="\t", index=False)

# Sessions

In [None]:
sessions_r = hive.run("""
select *
from (
    select
        event.editing_session_id as editing_session_id,
        wiki,
        event.platform as platform,
        collect_set(event.editor_interface) as editor,
        event.user_class as user_class,
        event.page_ns as page_ns,
        collect_list(event.action) as funnel_actions,
        min(event.user_editcount) as edit_count,
        sum(cast(event.action = "init" as int)) as init_count,
        sum(cast(event.action = "loaded" as int)) as loaded_count,
        sum(cast(event.action = "ready" as int)) as ready_count,
        sum(cast(event.action = "saveIntent" as int)) as save_intent_count,
        sum(cast(event.action = "saveAttempt" as int)) as save_attempt_count,
        sum(cast(event.action = "saveSuccess" as int)) as save_success_count,
        sum(cast(event.action = "abort" as int)) as abort_count,
        max(dt) as max_timestamp,
        min(dt) as min_timestamp
    from event.editattemptstep
    where
        year >= 2018 and dt >= "2018-11-30" and
        not event.is_oversample
    group by event.editing_session_id, wiki, event.platform, event.user_class, event.page_ns
) eas
left join (
    select
        event.editingsessionid as editing_session_id,
        collect_list(concat_ws("-", event.feature, event.action)) as feature_actions
    from event.visualeditorfeatureuse
    where
        year >= 2018 and dt >= "2018-11-30"
    group by event.editingsessionid
) vefu
on eas.editing_session_id = vefu.editing_session_id
""")

In [331]:
sessions_r.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13044682 entries, 0 to 13044681
Data columns (total 19 columns):
editing_session_id    object
wiki                  object
platform              object
editor                object
user_class            object
page_ns               int64
funnel_actions        object
edit_count            int64
init_count            int64
loaded_count          int64
ready_count           int64
save_intent_count     int64
save_attempt_count    int64
save_success_count    int64
abort_count           int64
max_timestamp         object
min_timestamp         object
editing_session_id    object
feature_actions       object
dtypes: int64(9), object(10)
memory usage: 1.8+ GB


In [332]:
def eval_arrays(x):
    if x is None:
        return None
    else:
        return literal_eval(x)
    
def apply_assign(df, **kwargs):
    df_copy = df.copy()
    for col, func in kwargs.items():
        df_copy[col] = df_copy[col].apply(func)
    return df_copy

In [333]:
%%time
sessions = (
    sessions_r
    .pipe(apply_assign,
        funnel_actions=eval_arrays,
        feature_actions=eval_arrays,
        min_timestamp=pd.to_datetime,
        max_timestamp=pd.to_datetime,
        editor=eval_arrays
    )
)

CPU times: user 29min 51s, sys: 10.6 s, total: 30min 1s
Wall time: 30min 1s


In [355]:
# Rename the duplicate `editing_session_id` column
#new_cols = sessions.columns.tolist()
#new_cols[-2] = "vefu_session_id"
#sessions.columns = new_cols
sessions.drop("vefu_session_id", axis=1, inplace=True)

MemoryError: 

In [349]:

sessions.to_parquet("data/sessions.parquet")

MemoryError: 

# Extracting and loading change tags

In [None]:
# Not taking into account deleted edits!
TAG_START = "201710"
TAG_END = "201810"

tags = mariadb.multirun("""
select
    database() as wiki,
    rev_id,
    sum(ct_tag = "mobile web edit") as mobile_web,
    sum(ct_tag = "visualeditor") as visual_editor
from change_tag
inner join revision
on ct_rev_id = rev_id
where 
    ct_tag in ("mobile web edit", "visualeditor") and
    rev_timestamp between "{start}" and "{end}"
group by rev_id
""".format(start=TAG_START, end=TAG_END))

aawiki completed in 0 s
aawikibooks completed in 0 s
aawiktionary completed in 0 s
abwiki completed in 0 s
abwiktionary completed in 0 s
acewiki completed in 0 s
adywiki completed in 0 s
afwiki completed in 2 s
afwikibooks completed in 0 s
afwikiquote completed in 0 s
afwiktionary completed in 0 s
akwiki completed in 0 s
akwikibooks completed in 0 s
akwiktionary completed in 0 s
alswiki completed in 1 s
alswikibooks completed in 0 s
alswikiquote completed in 0 s
alswiktionary completed in 0 s
amwiki completed in 0 s
amwikiquote completed in 0 s
amwiktionary completed in 0 s
angwiki completed in 0 s
angwikibooks completed in 0 s
angwikiquote completed in 0 s
angwikisource completed in 0 s
angwiktionary completed in 0 s
anwiki completed in 0 s
anwiktionary completed in 0 s
arcwiki completed in 0 s
arwiki completed in 202 s
arwikibooks completed in 0 s
arwikinews completed in 0 s
arwikiquote completed in 0 s
arwikisource completed in 0 s
arwikiversity completed in 0 s
arwiktionary complet

In [48]:
tags["mobile_web"] = tags["mobile_web"].astype(bool)
tags["visual_editor"] = tags["visual_editor"].astype(bool)

In [53]:
tags.head()

Unnamed: 0,wiki,rev_id,mobile_web,visual_editor
0,abwiki,54766,False,True
1,abwiki,55951,True,False
2,abwiki,56128,True,False
3,abwiki,56288,False,True
4,abwiki,56517,True,False


In [70]:
def compute_editor(row):
    if row["mobile_web"]:
        if row["visual_editor"]:
            return "mobile visual editor"
        else:
            return "mobile wikitext editor"
    else:
        # Edits made with the desktop wikitext editors aren't included in this dateset
        return "desktop visual editor"
        
tags["editor"] = tags.apply(compute_editor, axis=1)
tags.head()

Unnamed: 0,wiki,rev_id,mobile_web,visual_editor,editor
0,abwiki,54766,False,True,desktop visual editor
1,abwiki,55951,True,False,mobile wikitext editor
2,abwiki,56128,True,False,mobile wikitext editor
3,abwiki,56288,False,True,desktop visual editor
4,abwiki,56517,True,False,mobile wikitext editor


In [71]:
tags.drop(
    labels=["mobile_web", "visual_editor"],
    axis=1
).to_csv(5
    "data/mob_or_ve_edits.tsv",
    sep="\t",
    index=False,
    header=False
)

In [72]:
hive.run("""
drop table neilpquinn.mob_or_ve_edits
""")

In [74]:
hive.run("""
create table neilpquinn.mob_or_ve_edits (
    wiki string,
    rev_id int,
    editor string
)
row format delimited fields terminated by '\t'
""")

In [76]:
!hive -e "LOAD DATA LOCAL INPATH '/home/neilpquinn-wmf/proj/2018-08-Editing-metrics-snapshots/data/mob_or_ve_edits.tsv' OVERWRITE INTO TABLE neilpquinn.mob_or_ve_edits"

log4j:WARN No such property [maxBackupIndex] in org.apache.log4j.DailyRollingFileAppender.

Logging initialized using configuration in file:/etc/hive/conf.analytics-hadoop/hive-log4j.properties
Loading data to table neilpquinn.mob_or_ve_edits
OK
Time taken: 18.49 seconds


In [77]:
hive.run("select * from neilpquinn.mob_or_ve_edits limit 10")

Unnamed: 0,mob_or_ve_edits.wiki,mob_or_ve_edits.rev_id,mob_or_ve_edits.editor
0,abwiki,54766,desktop visual editor
1,abwiki,55951,mobile wikitext editor
2,abwiki,56128,mobile wikitext editor
3,abwiki,56288,desktop visual editor
4,abwiki,56517,mobile wikitext editor
5,abwiki,56518,mobile wikitext editor
6,abwiki,56883,mobile wikitext editor
7,abwiki,57138,mobile wikitext editor
8,abwiki,57139,mobile wikitext editor
9,abwiki,57140,desktop visual editor


In [78]:
hive.run("select count(*) from neilpquinn.mob_or_ve_edits")

Unnamed: 0,_c0
0,20160021


## Feature use snapshot

In [354]:
sessions.query("editor == ['visualeditor']")

MemoryError: 