In [49]:
from collections import OrderedDict

import matplotlib as mpl
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pyarrow.parquet as pq
import seaborn as sns
from wmfdata import charting, hive
from wmfdata.utils import pd_display_all, pct_str

In [2]:
# Render figures inline and apply the matplotlib style exposed by wmfdata's charting module.
%matplotlib inline
mpl.style.use(charting.mpl_style)

# Data wrangling

Right now, when we see sessions that ended before the user tried to save, we can't tell whether the user actually attempted to do anything or not. So, to keep that from distorting our results, I'll limit this analysis to sessions where the user actually attempted to save.

In [3]:
# Columns to load as pandas categoricals rather than plain strings.
ves_categories = ["wiki", "platform", "integration", "editor", "user_experience"]
# Session-level data, restricted to VisualEditor sessions with at least one
# recorded save intent.
ve_sessions = (
    pq.read_table("data/sessions.parquet")
    .to_pandas(categories=ves_categories)
    .query("editor == 'visualeditor' & save_intent_count >= 1")
    # Renumber the rows 0..n-1 after filtering.
    .reset_index(drop=True)
)
# Print the column summary, then show the first rows as the cell output.
ve_sessions.info()
ve_sessions.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 38551 entries, 0 to 38550
Data columns (total 17 columns):
editing_session_id    38551 non-null object
wiki                  38551 non-null category
platform              38551 non-null category
integration           38551 non-null category
editor                38551 non-null category
page_ns               38551 non-null int64
init_count            38551 non-null int64
loaded_count          38551 non-null int64
ready_count           38551 non-null int64
save_intent_count     38551 non-null int64
save_attempt_count    38551 non-null int64
save_success_count    38551 non-null int64
abort_count           38551 non-null int64
max_timestamp         38551 non-null datetime64[ns]
min_timestamp         38551 non-null datetime64[ns]
user_experience       38551 non-null category
duration              38551 non-null int64
dtypes: category(5), datetime64[ns](2), int64(9), object(1)
memory usage: 3.8+ MB


Unnamed: 0,editing_session_id,wiki,platform,integration,editor,page_ns,init_count,loaded_count,ready_count,save_intent_count,save_attempt_count,save_success_count,abort_count,max_timestamp,min_timestamp,user_experience,duration
0,007e98003e2312d8b6d6,frwiki,desktop,page,visualeditor,0,1,1,1,1,1,1,0,2019-02-24 01:30:24,2019-02-24 01:30:01,1000+ edits,23
1,00804a50ff2ded097d53,dewiki,desktop,page,visualeditor,0,1,1,1,1,1,1,1,2019-02-19 15:59:38,2019-02-19 15:58:27,IP,71
2,00982ec0f49183639dfc,nlwiki,desktop,page,visualeditor,0,1,1,1,1,1,1,0,2019-02-22 09:00:23,2019-02-22 08:59:58,10-99 edits,25
3,00df47f0a0ff24670d76,enwiki,desktop,page,visualeditor,0,1,1,1,1,1,1,0,2019-02-20 02:38:15,2019-02-20 02:34:39,1-9 edits,216
4,00e0d590151587cb30ef,frwiki,desktop,page,visualeditor,0,1,1,1,1,1,1,0,2019-02-19 15:48:31,2019-02-19 15:48:15,100-999 edits,16


In [4]:
# Columns to load as pandas categoricals rather than plain strings.
fu_categories = ["feature", "action"]
# One row per logged feature use: timestamp, session ID, feature and action.
feature_uses = (
    pq.read_table("data/feature_uses.parquet").to_pandas(categories=fu_categories)
)
# Print the column summary, then show the first rows as the cell output.
feature_uses.info()
feature_uses.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 350913 entries, 0 to 350912
Data columns (total 4 columns):
timestamp             350913 non-null datetime64[ns]
editing_session_id    350913 non-null object
feature               350913 non-null category
action                350913 non-null category
dtypes: category(2), datetime64[ns](1), object(1)
memory usage: 8.7+ MB


Unnamed: 0,timestamp,editing_session_id,feature,action
0,2019-02-10 00:34:02,4e656710bee99109c67d,link,window-open
1,2019-02-10 00:42:02,99ff91c0b31b4f1fae12,link,window-open
2,2019-02-10 00:25:26,04791050ec87c8958236,citefromid,window-open
3,2019-02-10 00:37:29,ebc3ff909aa528ffa4ab,citefromid,window-open
4,2019-02-10 00:34:07,637a6560b16658920ed6,mwSave,window-open


In [18]:
# Attach the session-level columns to every feature use. The inner join keeps
# only session IDs that appear in both tables, so sessions without any logged
# feature use are not carried forward.
session_uses = ve_sessions.merge(feature_uses, how="inner", on="editing_session_id")

session_uses.info()
session_uses.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 235325 entries, 0 to 235324
Data columns (total 20 columns):
editing_session_id    235325 non-null object
wiki                  235325 non-null category
platform              235325 non-null category
integration           235325 non-null category
editor                235325 non-null category
page_ns               235325 non-null int64
init_count            235325 non-null int64
loaded_count          235325 non-null int64
ready_count           235325 non-null int64
save_intent_count     235325 non-null int64
save_attempt_count    235325 non-null int64
save_success_count    235325 non-null int64
abort_count           235325 non-null int64
max_timestamp         235325 non-null datetime64[ns]
min_timestamp         235325 non-null datetime64[ns]
user_experience       235325 non-null category
duration              235325 non-null int64
timestamp             235325 non-null datetime64[ns]
feature               235325 non-null category
action 

Unnamed: 0,editing_session_id,wiki,platform,integration,editor,page_ns,init_count,loaded_count,ready_count,save_intent_count,save_attempt_count,save_success_count,abort_count,max_timestamp,min_timestamp,user_experience,duration,timestamp,feature,action
0,007e98003e2312d8b6d6,frwiki,desktop,page,visualeditor,0,1,1,1,1,1,1,0,2019-02-24 01:30:24,2019-02-24 01:30:01,1000+ edits,23,2019-02-24 01:30:21,mwSave,window-open
1,007e98003e2312d8b6d6,frwiki,desktop,page,visualeditor,0,1,1,1,1,1,1,0,2019-02-24 01:30:24,2019-02-24 01:30:01,1000+ edits,23,2019-02-24 01:30:06,table,style-header
2,007e98003e2312d8b6d6,frwiki,desktop,page,visualeditor,0,1,1,1,1,1,1,0,2019-02-24 01:30:24,2019-02-24 01:30:01,1000+ edits,23,2019-02-24 01:30:08,table,style-header
3,007e98003e2312d8b6d6,frwiki,desktop,page,visualeditor,0,1,1,1,1,1,1,0,2019-02-24 01:30:24,2019-02-24 01:30:01,1000+ edits,23,2019-02-24 01:30:13,table,style-header
4,007e98003e2312d8b6d6,frwiki,desktop,page,visualeditor,0,1,1,1,1,1,1,0,2019-02-24 01:30:24,2019-02-24 01:30:01,1000+ edits,23,2019-02-24 01:30:11,table,style-header


In [20]:
# Feature uses from desktop sessions, and how many distinct sessions they cover.
desktop_uses = session_uses.loc[lambda uses: uses["platform"] == "desktop"]
n_desktop_sess = desktop_uses["editing_session_id"].nunique()
n_desktop_sess

33735

In [21]:
# Feature uses from phone sessions, and how many distinct sessions they cover.
phone_uses = session_uses.loc[lambda uses: uses["platform"] == "phone"]
n_phone_sess = phone_uses["editing_session_id"].nunique()
n_phone_sess

4621

# Basic stats

In [29]:
def calc_feature_stats(df):
    """Display per-(feature, action) usage statistics for a set of feature uses.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per feature use, with at least the columns
        ``editing_session_id``, ``feature``, ``action`` and ``timestamp``.

    Returns
    -------
    None
        The table is shown with ``pd_display_all``; nothing is returned.

    Notes
    -----
    The displayed table has one row per (feature, action) pair, sorted by the
    number of distinct sessions using that pair (descending), with columns:

    * ``use count``: number of rows with a non-null timestamp for the pair
    * ``sessions using``: distinct sessions using the pair, as a share of all
      distinct sessions in ``df``, formatted with ``pct_str``
    """
    n_sess = df["editing_session_id"].nunique()

    stats = (
        df
        .groupby(["feature", "action"])
        .agg({
            "timestamp": "count",
            "editing_session_id": pd.Series.nunique
        })
        # Sort while the column still holds raw counts, before it is turned
        # into formatted percentage strings.
        .sort_values(by="editing_session_id", ascending=False)
        .assign(
            editing_session_id=lambda t: (t["editing_session_id"] / n_sess).apply(pct_str)
        )
        .reset_index()
        # The aggregated columns keep the names of their source columns, so
        # those are the labels that have to be mapped to display names.
        .rename({
            "timestamp": "use count",
            "editing_session_id": "sessions using",
        }, axis=1)
    )

    pd_display_all(stats)

What feature uses are we seeing...on desktop?

In [30]:
# Per-(feature, action) usage table for the desktop feature uses.
calc_feature_stats(desktop_uses)

Unnamed: 0,feature,action,timestamp,sessions using
0,mwSave,window-open,36862,99.9%
1,link,window-open,42586,26.1%
2,clipboard,paste,45513,25.5%
3,transclusion,window-open,13043,15.6%
4,citefromid,window-open,13644,14.3%
5,clipboard,copy,16632,12.6%
6,clipboard,dragstart,5482,6.3%
7,clipboard,cut,4996,5.7%
8,textStyle/bold,toggle-selection,5886,5.5%
9,linkEducationPopup,show,1867,4.8%


...on mobile?

In [31]:
# Per-(feature, action) usage table for the phone feature uses.
calc_feature_stats(phone_uses)

Unnamed: 0,feature,action,timestamp,sessions using
0,mwSave,window-open,5321,99.8%
1,link,window-open,2825,19.2%
2,transclusion,window-open,1709,19.1%
3,clipboard,paste,761,9.0%
4,citefromid,window-open,466,6.6%
5,textStyle/bold,toggle-insertion,324,3.3%
6,textStyle/bold,toggle-selection,235,2.8%
7,link/internal,clear,253,2.5%
8,clipboard,copy,199,2.4%
9,textStyle/italic,toggle-insertion,153,1.6%


Notes on some of the less obvious feature names:

* `alienExtension`: opening an inspector to edit the wikitext of something VE doesn't support (like a blockquote)
* `meta`: opening page settings
* `pages`: ? (TODO: work out what this feature is)

# How many sessions use basic features?

In [52]:
def feature_type_stats(df):
    """Print the share of sessions that used each broad type of feature.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per feature use, with at least the columns
        ``editing_session_id`` and ``feature``.

    Returns
    -------
    None
        One line per feature type is printed as ``"<type>: <pct>"``, where the
        percentage is the number of distinct sessions with at least one
        matching use divided by the number of distinct sessions in ``df``,
        formatted with ``pct_str``.

    Notes
    -----
    Feature types are derived from the feature name:

    * Citations: name starts with ``cite`` or ``reference`` (case-insensitive)
    * Links: name starts with ``link`` (case-insensitive), except
      ``linkEducationPopup``
    * Formatting: name starts with ``textStyle`` (case-insensitive)
    * Images: name is exactly ``media``
    * Basic features: any of the four types above
    * Any feature: everything except ``clipboard``, ``mwSave`` and any name
      containing ``EducationPopup``
    """
    n_sess = df["editing_session_id"].nunique()
    features = df["feature"]

    # str.match only matches at the start of the string, so it can never find
    # "EducationPopup" inside a prefixed name such as "linkEducationPopup".
    # A literal substring test is what actually excludes the popups.
    education_popups = features.str.contains("EducationPopup", regex=False)

    cite_uses = features.str.match('cite|reference', case=False)
    link_uses = features.str.match('link', case=False) & ~(features == "linkEducationPopup")
    format_uses = features.str.match('textStyle', case=False)
    image_uses = features == "media"
    basic_uses = cite_uses | link_uses | format_uses | image_uses
    any_uses = ~(features == 'clipboard') & ~education_popups & ~(features == 'mwSave')

    # Ordered so that the lines are printed from specific types to aggregates.
    uses_arrays = OrderedDict([
        ("Citations", cite_uses),
        ("Links", link_uses),
        ("Formatting", format_uses),
        ("Images", image_uses),
        ("Basic features", basic_uses),
        ("Any feature", any_uses)
    ])

    for feature_type, mask in uses_arrays.items():
        pct_sess = df.loc[mask, "editing_session_id"].nunique() / n_sess
        print("{feature_type}: {pct}".format(feature_type=feature_type, pct=pct_str(pct_sess)))

In [53]:
# Share of desktop sessions using each feature type.
feature_type_stats(desktop_uses)

Citations: 16.4%
Links: 26.8%
Formatting: 10.4%
Images: 4.3%
Basic features: 41.1%
Any feature: 55.4%


In [56]:
# Share of phone sessions using each feature type.
feature_type_stats(phone_uses)

Citations: 7.4%
Links: 20.2%
Formatting: 8.1%
Images: 0.6%
Basic features: 27.5%
Any feature: 44.9%
