In [169]:
from ast import literal_eval
from copy import deepcopy
import json

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from wmfdata import charting, mariadb, hive
from wmfdata.utils import pd_display_all, pct_str

In [170]:
%matplotlib inline
mpl.style.use(charting.mpl_style)

In [14]:
ve_sessions_r = hive.run("""
select *
from (
    select
        event.editing_session_id as editing_session_id,
        wiki,
        event.platform as platform,
        event.user_class as user_class,
        event.page_ns as page_ns,
        collect_list(event.action) as funnel_actions,
        min(event.user_editcount) as edit_count,
        sum(cast(event.action = "init" as int)) as init_count,
        sum(cast(event.action = "loaded" as int)) as loaded_count,
        sum(cast(event.action = "ready" as int)) as ready_count,
        sum(cast(event.action = "saveIntent" as int)) as save_intent_count,
        sum(cast(event.action = "saveAttempt" as int)) as save_attempt_count,
        sum(cast(event.action = "saveSuccess" as int)) as save_success_count,
        sum(cast(event.action = "abort" as int)) as abort_count,
        max(dt) as max_timestamp,
        min(dt) as min_timestamp
    from event.editattemptstep
    where
        year >= 2018 and dt >= "2018-11-30" and
        event.editor_interface = "visualeditor" and
        not event.is_oversample
    group by event.editing_session_id, wiki, event.platform, event.user_class, event.page_ns
) eas
left join (
    select
        event.editingsessionid as editing_session_id,
        collect_list(concat_ws("-", event.feature, event.action)) as feature_actions
    from event.visualeditorfeatureuse
    where
        year >= 2018 and dt >= "2018-11-30"
    group by event.editingsessionid
) vefu
on eas.editing_session_id = vefu.editing_session_id
""")

In [15]:
%%time
def eval_arrays(x):
    if x is None:
        return None
    else:
        return literal_eval(x)
    
def apply_assign(df, **kwargs):
    df_copy = df.copy()
    for col, func in kwargs.items():
        df_copy[col] = df_copy[col].apply(func)
    return df_copy

# Rename the duplicate `editing_session_id` column
new_cols = ve_sessions_r.columns.tolist()
new_cols[-2] = "vefu_editing_session_id"
ve_sessions_r.columns = new_cols

ve_sessions = (
    ve_sessions_r
    .drop("vefu_editing_session_id", axis=1)
    .pipe(apply_assign,
        funnel_actions=eval_arrays,
        feature_actions=eval_arrays,
        min_timestamp=pd.to_datetime,
        max_timestamp=pd.to_datetime
    )
)

CPU times: user 1min 22s, sys: 512 ms, total: 1min 22s
Wall time: 1min 22s


Right now, when we see sessions that ended before the user tried to save, we can't tell whether the user actually attempted to do anything or not. So, to avoid that messing with our results, I'll limit this to only sessions where they user actually attempted to save.

In [78]:
ve_sessions = ve_sessions.query("save_intent_count >= 1")
ve_sessions.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 94835 entries, 0 to 631243
Data columns (total 17 columns):
editing_session_id    94835 non-null object
wiki                  94835 non-null object
platform              94835 non-null object
user_class            36528 non-null object
page_ns               94835 non-null int64
funnel_actions        94835 non-null object
edit_count            94835 non-null int64
init_count            94835 non-null int64
loaded_count          94835 non-null int64
ready_count           94835 non-null int64
save_intent_count     94835 non-null int64
save_attempt_count    94835 non-null int64
save_success_count    94835 non-null int64
abort_count           94835 non-null int64
max_timestamp         94835 non-null datetime64[ns]
min_timestamp         94835 non-null datetime64[ns]
feature_actions       81762 non-null object
dtypes: datetime64[ns](2), int64(9), object(6)
memory usage: 13.0+ MB


How many sessions are there?

In [61]:
n_sess = session_acts["editing_session_id"].nunique()
n_sess

94805

In [102]:
n_desktop_sess = session_acts.query("platform == 'desktop'")["editing_session_id"].nunique()
n_desktop_sess

81732

In [28]:
%%time
feature_acts = hive.run("""
select
    dt,
    event.editingsessionid as editing_session_id,
    event.feature as feat,
    event.action as act
from event.visualeditorfeatureuse
where
    year >= 2018 and dt >= "2018-11-30"
""").assign(
    dt=lambda df: pd.to_datetime(df["dt"])
)

CPU times: user 24.4 s, sys: 308 ms, total: 24.7 s
Wall time: 3min 24s


In [142]:
session_acts = pd.merge(
    ve_sessions, 
    feature_acts, 
    how="inner", 
    on="editing_session_id"
)

session_acts.head()

Unnamed: 0,editing_session_id,wiki,platform,user_class,page_ns,funnel_actions,edit_count,init_count,loaded_count,ready_count,save_intent_count,save_attempt_count,save_success_count,abort_count,max_timestamp,min_timestamp,feature_actions,dt,feat,act
0,00005580d0f93f8c83da,eswiki,desktop,,0,"[init, saveAttempt, saveIntent, saveAttempt, s...",10,1,1,1,1,2,1,0,2018-12-12 17:30:23,2018-12-12 16:51:41,"[citefromid-window-open, mwSave-window-open, c...",2018-12-12 16:53:43,citefromid,window-open
1,00005580d0f93f8c83da,eswiki,desktop,,0,"[init, saveAttempt, saveIntent, saveAttempt, s...",10,1,1,1,1,2,1,0,2018-12-12 17:30:23,2018-12-12 16:51:41,"[citefromid-window-open, mwSave-window-open, c...",2018-12-12 16:53:47,cite-web,window-open
2,00005580d0f93f8c83da,eswiki,desktop,,0,"[init, saveAttempt, saveIntent, saveAttempt, s...",10,1,1,1,1,2,1,0,2018-12-12 17:30:23,2018-12-12 16:51:41,"[citefromid-window-open, mwSave-window-open, c...",2018-12-12 17:29:59,mwSave,window-open
3,00005580d0f93f8c83da,eswiki,desktop,,0,"[init, saveAttempt, saveIntent, saveAttempt, s...",10,1,1,1,1,2,1,0,2018-12-12 17:30:23,2018-12-12 16:51:41,"[citefromid-window-open, mwSave-window-open, c...",2018-12-12 17:29:35,cite-web,window-open
4,00005580d0f93f8c83da,eswiki,desktop,,0,"[init, saveAttempt, saveIntent, saveAttempt, s...",10,1,1,1,1,2,1,0,2018-12-12 17:30:23,2018-12-12 16:51:41,"[citefromid-window-open, mwSave-window-open, c...",2018-12-12 17:29:03,citefromid,window-open


In [143]:
session_acts = (
    session_acts.
    assign(
        wiki=lambda df: pd.Categorical(df["wiki"]),
        platform=lambda df: pd.Categorical(df["platform"]),
        user_class=lambda df: pd.Categorical(df["user_class"]),
    )
)

In [144]:
session_acts.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 512202 entries, 0 to 512201
Data columns (total 20 columns):
editing_session_id    512202 non-null object
wiki                  512202 non-null category
platform              512202 non-null category
user_class            100267 non-null category
page_ns               512202 non-null int64
funnel_actions        512202 non-null object
edit_count            512202 non-null int64
init_count            512202 non-null int64
loaded_count          512202 non-null int64
ready_count           512202 non-null int64
save_intent_count     512202 non-null int64
save_attempt_count    512202 non-null int64
save_success_count    512202 non-null int64
abort_count           512202 non-null int64
max_timestamp         512202 non-null datetime64[ns]
min_timestamp         512202 non-null datetime64[ns]
feature_actions       512202 non-null object
dt                    512202 non-null datetime64[ns]
feat                  512202 non-null object
act          

It'll just mess things up if we try to include sessions with no feature—and since opening the save dialog is a feature, I'm not sure where the

# What kinds of features are we logging?

In [168]:
feature_stats = (
    session_acts
    .groupby(["feat", "act"])
    .agg({
        "dt": "count",
        "editing_session_id": pd.Series.nunique
    })
    .sort_values(by="editing_session_id", ascending=False)
    .transform({
        "dt": lambda s: s,
        "editing_session_id": lambda s: s.div(n_desktop_sess)
    })
    .reset_index()
    .rename({
        "feat": "feature",
        "act": "action",
        "dt": "use count",
        "editing_session_id": "sessions using",

    }, axis=1)
)

pd_display_all(feature_stats)

Unnamed: 0,feature,action,use count,sessions using
0,mwSave,window-open,89430,0.999474
1,clipboard,paste,106225,0.259507
2,link,window-open,104912,0.255591
3,transclusion,window-open,30207,0.156879
4,citefromid,window-open,36305,0.144179
5,clipboard,copy,36179,0.123966
6,clipboard,dragstart,13175,0.063145
7,clipboard,cut,11833,0.058459
8,textStyle/bold,toggle-selection,11739,0.047803
9,textStyle/italic,toggle-selection,12180,0.04598


# How many sessions use basic features?

In [133]:
cite_acts = session_acts["feat"].str.match('cite|reference', case=False)
link_acts = session_acts["feat"].str.match('link', case=False) & ~(session_acts["feat"] == "linkEducationPopup")
format_acts = session_acts["feat"].str.match('textStyle', case=False)
image_acts = session_acts["feat"] == "media"
basic_acts = cite_acts | link_acts | format_acts | image_acts

def pct_sessions(bool_arr):
    return (
        session_acts[bool_arr]
        .groupby("platform")
        ["editing_session_id"].nunique()
        / n_desktop_sess
    ).apply(pct_str)

In [134]:
pct_sessions(basic_acts)

platform
desktop    39.9%
Name: editing_session_id, dtype: object

## Just citations?

In [135]:
pct_sessions(cite_acts)

platform
desktop    16.7%
Name: editing_session_id, dtype: object

## Just links?

In [137]:
pct_sessions(link_acts)

platform
desktop    26.3%
Name: editing_session_id, dtype: object

## Just text formatting?

In [138]:
pct_sessions(format_acts)

platform
desktop    8.7%
Name: editing_session_id, dtype: object

## Just images?

In [139]:
pct_sessions(image_acts)

platform
desktop    4.1%
Name: editing_session_id, dtype: object

# How many sessions use any features?

"Any feature" excludes:
* Clipboard use (`feat = clipboard`)
* Education popups clicks (`feat like "EducationPopup"`)
* Save dialog opens (`feat == "mwSave"`)

This leaves many features in addition to the basics above: templates, page settings, find and replace, tables, special characters.

In [109]:
(
    session_acts
    .query("feat != 'clipboard' & not feat.str.match('EducationPopup') & feat != 'mwSave'")
    .groupby("platform")
    ["editing_session_id"].nunique()
    / n_desktop_sess
).apply(pct_str)

platform
desktop    54.6%
Name: editing_session_id, dtype: object

* `alienExtension`: opening an inspector to edit the wikitext of something VE doesn't support (like a blockquote)
* `meta`: opening page settings
* `pages`: ?

# Where is the mobile VE feature use data?

In [98]:
hive.run("""
select
    vefu.event.action as action, 
    vefu.event.feature as feature,
    count(*) as count
from event.editattemptstep eas
inner join event.visualeditorfeatureuse vefu
on eas.event.editing_session_id = vefu.event.editingsessionid
where
    eas.event.platform = "phone" and
    eas.event.editor_interface = "visualeditor" and
    eas.year >= 2018 and eas.dt >= "2018-11-30" and
    vefu.year >= 2018 and vefu.dt >= "2018-11-30"
group by vefu.event.action, vefu.event.feature
""")

Unnamed: 0,action,feature,count


In [99]:
hive.run("""
select
    vefu.event.action as action, 
    vefu.event.feature as feature,
    count(*) as count
from event.editattemptstep eas
inner join event.visualeditorfeatureuse vefu
on eas.event.editing_session_id = vefu.event.editingsessionid
wherej
    eas.event.platform = "desktop" and
    eas.event.editor_interface = "visualeditor" and
    eas.year >= 2018 and eas.dt >= "2018-11-30" and
    vefu.year >= 2018 and vefu.dt >= "2018-11-30"
group by vefu.event.action, vefu.event.feature
""")

Unnamed: 0,action,feature,count
0,clear,language,59
1,clear,link,9068
2,clear,link/internal,61397
3,copy,clipboard,356864
4,cut,clipboard,94352
5,dragstart,clipboard,124260
6,paste,clipboard,884842
7,show,citefromidEducationPopup,48335
8,show,helpPopup,9855
9,show,linkEducationPopup,58269


In [154]:
pd_display_all(_99)

Unnamed: 0,action,feature,count
0,clear,language,59
1,clear,link,9068
2,clear,link/internal,61397
3,copy,clipboard,356864
4,cut,clipboard,94352
5,dragstart,clipboard,124260
6,paste,clipboard,884842
7,show,citefromidEducationPopup,48335
8,show,helpPopup,9855
9,show,linkEducationPopup,58269
