In [18]:
import wmfdata as wmf
from wmfdata.utils import (
    pd_display_all,
    df_to_remarkup
)

# Contributor events

In [9]:
# Pull the contributor-side Wikistories events, one row per logged event.
# NOTE(review): `year >= 0` does not narrow the date range; presumably it is
# only there because the table expects a predicate on `year` — confirm.
contributor_event_sql = """
SELECT
    FROM_ISO8601_TIMESTAMP(meta.dt) AS dt,
    meta.domain AS domain,
    access_method,
    context_page_title,
    event_type,
    publish_failure_message,
    story_already_exists,
    story_title,
    user_edit_count_bucket,
    user_is_anonymous,
    user_name,
    contribution_attempt_id
FROM event.mediawiki_wikistories_contribution_event
WHERE
    year >= 0
"""

contributor_event = wmf.presto.run(contributor_event_sql)

In [None]:
# First sanity check: pandas' summary statistics for the fetched events.
contributor_event.describe()

Well, this is bizarre! The `contribution_attempt_id` field was introduced a few weeks ago, so all the null values are expected. The `codfw` and `eqiad` values are not, though.

In [22]:
# Ten most frequent values of `contribution_attempt_id`, with nulls counted as
# a value of their own (dropna=False).
#
# The output columns are named with `rename_axis` + `reset_index(name=...)`
# instead of renaming "index" and the series name after the fact. pandas 2.0
# changed the labels `value_counts` gives its index and its result, so a
# label-based rename would silently put "frequency" on the value column there.
# This form yields the columns `value` and `frequency` on both old and new
# pandas.
attempt_id = (
    contributor_event
    ["contribution_attempt_id"]
    .value_counts(dropna=False)
    .head(10)
    .rename_axis("value")
    .reset_index(name="frequency")
)

attempt_id

Unnamed: 0,value,frequency
0,,1340
1,codfw,1215
2,eqiad,50
3,7d4a499adcd850ec5769,3
4,c8d489b1613f97a9124c,3
5,ce00b7c2a11114e5ea50,2
6,92de0c331e9d2d29fae3,2
7,cdfce631ceeaab2e262c,2
8,03f21bdc208d425b7ffd,2
9,88f3a3f6b6b005c520e8,2


However, if we only look at the data since the start of the deployment of the code setting this field, those weird values go away. That suggests the problem happened after ingestion, not in the instrumentation code.

In [25]:
# Ten most frequent values of `contribution_attempt_id` among events dated
# 2022-10-17 or later, with nulls counted as a value of their own
# (dropna=False).
#
# The output columns are named with `rename_axis` + `reset_index(name=...)`
# instead of renaming "index" and the series name after the fact. pandas 2.0
# changed the labels `value_counts` gives its index and its result, so a
# label-based rename would silently put "frequency" on the value column there.
# This form yields the columns `value` and `frequency` on both old and new
# pandas.
recent_attempt_id = (
    contributor_event
    .query("dt >= '2022-10-17'")
    ["contribution_attempt_id"]
    .value_counts(dropna=False)
    .head(10)
    .rename_axis("value")
    .reset_index(name="frequency")
)

recent_attempt_id

Unnamed: 0,value,frequency
0,,27
1,7d4a499adcd850ec5769,3
2,c8d489b1613f97a9124c,3
3,3bd60ed9a56b8fb2fda0,2
4,cdfce631ceeaab2e262c,2
5,65de010762602c8ea53c,2
6,56e1fecbdd705f7ba006,2
7,a6c128c4b0315732fbf5,2
8,92de0c331e9d2d29fae3,2
9,3aa0bd519016f1a5ed58,2


These do seem to be normal events otherwise.

In [None]:
# Show ten of the events whose attempt ID holds a datacenter name, with every
# column visible. The fixed `random_state` keeps the sample stable across
# re-runs, so conclusions drawn from these rows can be re-checked against the
# same rows.
(
    contributor_event
    .query("contribution_attempt_id in ('codfw', 'eqiad')")
    .sample(10, random_state=0)
    .pipe(pd_display_all)
)

Ahh, it seems like the datacenter field is getting duplicated into this one?

In [33]:
# Fetch the attempt ID next to the `datacenter` column, restricted to rows
# whose attempt ID is one of the two datacenter names, so the two columns can
# be compared directly.
attempt_id_datacenter_sql = """
SELECT
    contribution_attempt_id,
    datacenter
FROM event.mediawiki_wikistories_contribution_event
WHERE
    year >= 0
    AND contribution_attempt_id IN ('codfw', 'eqiad')
"""

attempt_id_datacenter = wmf.presto.run(attempt_id_datacenter_sql)

In [34]:
# Count each distinct (contribution_attempt_id, datacenter) pair, keeping
# pairs that contain nulls.
attempt_id_datacenter.value_counts(dropna=False)

contribution_attempt_id  datacenter
codfw                    codfw         1215
eqiad                    eqiad           50
dtype: int64

So far, all of the different paths through the funnel that we've recorded look very reasonable.

In [45]:
# Mask of rows that carry an attempt ID; the query string refers to it as `@cai`.
cai = contributor_event["contribution_attempt_id"].notna()

# Events since 2022-10-17 that have an attempt ID, in chronological order.
recent_attempt_events = (
    contributor_event
    .query("dt >= '2022-10-17' & @cai")
    .sort_values("dt")
)

# One string per attempt: its event types joined with "-" in the sorted order.
funnel_paths = (
    recent_attempt_events
    .groupby("contribution_attempt_id")
    ["event_type"]
    .agg("-".join)
)

# How many attempts followed each distinct path.
funnel_paths.value_counts()

story_builder_open                                    68
story_builder_open-publish_success                    25
story_builder_open-publish_failure-publish_success     2
Name: event_type, dtype: int64

`story_already_exists` is no longer hardcoded to false.

In [49]:
# Successful publishes dated 2022-10-03 or later.
publish_successes = contributor_event.query(
    "dt >= '2022-10-03' & event_type == 'publish_success'"
)

# Distribution of the `story_already_exists` flag across those publishes.
publish_successes["story_already_exists"].value_counts()

False    321
True      40
Name: story_already_exists, dtype: int64

For this sample, the values of `story_already_exists` match the on-wiki history.

In [None]:
# Ten successful publishes dated 2022-10-03 or later, reduced to the columns
# needed to compare `story_already_exists` with the on-wiki history. The fixed
# `random_state` means a re-run shows the same rows, so a manual check made
# against this sample stays verifiable.
(
    contributor_event
    .query("dt >= '2022-10-03' & event_type == 'publish_success'")
    .sample(10, random_state=0)
    [["dt", "story_title", "user_name", "story_already_exists"]]
)

# Consumer events

In [51]:
# Pull the consumer-side Wikistories events, one row per logged event.
# NOTE(review): `year >= 0` does not narrow the date range; presumably it is
# only there because the table expects a predicate on `year` — confirm.
consumer_events_sql = """
SELECT
    FROM_ISO8601_TIMESTAMP(meta.dt) AS dt,
    meta.domain AS domain,
    access_method,
    activity_session_id,
    event_type,
    experiment_entry_date,
    experiment_group,
    next_story_opened,
    page_story_count,
    page_title,
    page_visible_time,
    pageview_id,
    referrer_type,
    session_days,
    session_is_first_of_day,
    story_completed,
    story_frame_count,
    story_frames_viewed,
    story_open_time,
    story_title
FROM event.mediawiki_wikistories_consumption_event
WHERE
    year >= 0
"""

consumer_events = wmf.presto.run(consumer_events_sql)

Only story view events have `story_open_time` logged.

In [57]:
# Which event types ever carry a non-null `story_open_time`?
has_open_time = consumer_events["story_open_time"].notna()

consumer_events.loc[has_open_time, "event_type"].value_counts()

story_view    2108
Name: event_type, dtype: int64

All story_view events have a `story_open_time`.

In [58]:
# Story views that lack a `story_open_time`; an empty frame means none do.
is_story_view = consumer_events["event_type"] == "story_view"
lacks_open_time = consumer_events["story_open_time"].isna()

consumer_events[is_story_view & lacks_open_time]

Unnamed: 0,dt,domain,access_method,activity_session_id,event_type,experiment_entry_date,experiment_group,next_story_opened,page_story_count,page_title,page_visible_time,pageview_id,referrer_type,session_days,session_is_first_of_day,story_completed,story_frame_count,story_frames_viewed,story_open_time,story_title


In [63]:
# Non-null open times divided by 1000 and rounded to whole numbers.
# NOTE(review): presumably a milliseconds-to-seconds conversion — confirm the
# unit against the schema.
(
    consumer_events["story_open_time"]
    .dropna()
    .div(1000)
    .round()
    .astype(int)
)

0       49
1       16
3        8
4       74
7        2
        ..
5267     5
5268     6
5269     5
5271    26
5278     4
Name: story_open_time, Length: 2108, dtype: int64

Successive story open times within the same pageview do not always increase, which means that `story_open_time` is calculated independently for each story rather than being cumulative. I will have to note that in the schema documentation.

In [69]:
# For each pageview, list the open times of its story views in chronological
# order as a comma-separated string, then show 25 pageviews.
#
# `story_open_time` is divided by 1000 and rounded to an integer first; the
# `astype(int)` depends on story views never having a null open time.
# The fixed `random_state` keeps the displayed pageviews stable across re-runs,
# so the observation drawn from them can be re-checked against the same rows.
(
    consumer_events
    .query("event_type == 'story_view'")
    .assign(story_open_time=lambda df: (df["story_open_time"] / 1000).round().astype(int))
    .sort_values("dt")
    .groupby("pageview_id")
    ["story_open_time"]
    .agg(lambda s: ", ".join(s.astype(str)))
    .sample(25, random_state=0)
)

pageview_id
634bb09f447ae4e965e9            20
f26da9cd44b0bdf9f2a8    24, 16, 19
2e362abe04de3e3bf06d             7
7caa52d18acf2a34f262      151, 826
1b4bc20a98c78a57d81b             5
e4066e7cf25baf7a45ab             5
20e510b644c9b493e0fa             1
fd97f3230b41126b8d0b         8, 15
97c1fc5e2d359dcc4d30           153
ccd5db5a214ba3bdb087             2
493cfb48ca348922033e          4, 8
8a8cdb2660a6259884c3    43, 87, 58
72be0d4c5f3d31be5011             6
4a398535074c104ac958             5
85c19851e13ca9188d4d            26
bfa21d7a64a99e1007a2     12, 16, 2
10b46233b0773c67e96a             2
e2df73fc4a4df1b346f9             3
43e30ff6aab618402c39             7
74a79927de2f04524754         3, 75
6eb6d859c7f5a63d0676            23
a1dc725f6d2386d86bd6          2, 5
ed3cb29074a60050ef1e            11
615312e168d7026a447f            39
bb99b8ac1dda60ae4e90           121
Name: story_open_time, dtype: object