In [235]:
from IPython.display import display
import numpy as np
import pandas as pd
import wmfdata as wmf
from wmfdata.utils import pd_display_all, df_to_remarkup, pct_str

In [229]:
# Presumably applies wmfdata's shared Matplotlib styling to later charts — confirm in wmfdata.
wmf.charting.set_mpl_style()
# Show floats in engineering notation with SI prefixes. Uses the public pandas
# entry point instead of reaching into the internal pandas.core.api module.
pd.set_eng_float_format(use_eng_prefix=True)

# Data collection and cleaning

In [240]:
# Fetch Inuka pageview events sent by the KaiOS app (client_type "kaios-app")
# from the year=2020, month=3, day>=23 partitions.
# Each LATERAL VIEW INLINE(ARRAY(<struct>)) expands the `event` / `useragent`
# struct so that `ev.*` and `ua.*` select its fields as individual columns.
app_events = wmf.spark.run("""
SELECT
  ip,
  wiki,
  dt,
  webhost,
  geocoded_data,
  ev.*,
  ua.*
FROM event.inukapageview ipv
LATERAL VIEW INLINE(ARRAY(event)) ev
LATERAL VIEW INLINE(ARRAY(useragent)) ua
WHERE
  event.client_type = "kaios-app" AND
  year = 2020 AND 
  month = 3 AND
  day >= 23
""").assign(
  # Parse the two timestamp columns into pandas datetimes.
  dt=lambda df: pd.to_datetime(df["dt"]),
  load_dt=lambda df: pd.to_datetime(df["load_dt"]),
  # The two duration columns are interpreted as milliseconds and become timedeltas.
  page_open_time=lambda df: pd.to_timedelta(df["page_open_time"], unit="ms"),
  page_visible_time=lambda df: pd.to_timedelta(df["page_visible_time"], unit="ms")
)

In [241]:
# Turn the geocoded_data values into their own frame (assumes each value is a
# dict-like record, so its keys become columns — TODO confirm), then swap the
# original column for those expanded columns. The concat aligns on the index.
locations = pd.DataFrame(app_events["geocoded_data"].tolist())
app_events = (
  pd.concat([app_events, locations], axis="columns")
  .drop(columns="geocoded_data")
)

There are a lot of events geolocated to Ashburn, Virginia that are almost certainly coming from the Inuka app's continuous integration service. Let's filter those out.

In [296]:
# Index labels of every event geolocated to Ashburn, followed by their removal.
ci_events = app_events.index[app_events["city"] == "Ashburn"]
app_events = app_events.drop(index=ci_events)

In [None]:
# Quick look at the cleaned events: a random sample, then column dtypes and
# non-null counts, then summary statistics for every column.
app_events.sample(5).pipe(pd_display_all)
# info() prints its report itself and returns None; wrapping it in display()
# only added a stray "None" to the cell output.
app_events.info()
app_events.describe(include="all").pipe(pd_display_all)

In [299]:
def last_event(df):
  """Return the row of `df` holding the largest `page_open_time`.

  When several rows share that maximum, the first of them (in `df`'s row
  order) is returned. The result is a single row as a Series.
  """
  longest_open = df["page_open_time"].max()
  at_maximum = df["page_open_time"] == longest_open
  return df[at_maximum].iloc[0]

# One row per pageview token: the event with the largest page_open_time in that pageview.
app_pvs = app_events.groupby("pageview_token").apply(last_event)

In [None]:
# Quick look at the per-pageview frame: a random sample, then column dtypes and
# non-null counts, then summary statistics for every column.
app_pvs.sample(5).pipe(pd_display_all)
# info() prints its report itself and returns None; wrapping it in display()
# only added a stray "None" to the cell output.
app_pvs.info()
app_pvs.describe(include="all").pipe(pd_display_all)

# Identifiers and uniqueness

If we assume that each unique combination of location (country, subdivision, and city) and OS family is a unique device (relatively safe, since we're talking about a small group of developers), some devices have a LOT of different user IDs. However, these are all desktop devices, and a chat with the developers suggests that their development environments reinstall the app constantly.

The KaiOS (Firefox OS) devices tend to have relatively few user IDs, suggesting that this will work properly for our users.

In [304]:
# Columns whose combined values are treated as identifying one physical device.
device_id_cols = ["country", "subdivision", "city", "os_family"]

def device_stats(df):
  """Summarize one device's events.

  Returns a Series with the number of rows in `df` ("device_events") and the
  number of distinct `user_id` values among them ("unique_user_IDs").
  """
  summary = {
    "device_events": len(df),
    "unique_user_IDs": df["user_id"].nunique(),
  }
  return pd.Series(summary)

# Per-device summary. os_family is moved back into a column and the remaining
# (location) index levels are dropped, so the table shows no location values.
app_events.groupby(device_id_cols).apply(device_stats).reset_index("os_family").reset_index(drop=True)

Unnamed: 0,os_family,device_events,unique_user_IDs
0,Firefox OS,53,5
1,Mac OS X,9,2
2,Windows,1,1
3,Firefox OS,13,1
4,Windows,182,1
5,Firefox OS,94,2
6,Mac OS X,2,1
7,Android,7,2
8,Firefox OS,209,6
9,Mac OS X,56,4


In [305]:
# How many user IDs have 1, 2, 3, ... distinct session IDs.
(app_events
 .groupby("user_id")
 ["session_id"]
 .nunique()
 .value_counts()
 .sort_index()
 .rename("frequency")
 .rename_axis("sessions per user")
 .to_frame()
)

Unnamed: 0_level_0,frequency
sessions per user,Unnamed: 1_level_1
1,186
2,265
3,3
4,1
5,1
10,1
13,1


There are a lot of sessions which have multiple user IDs, which should never happen.

In [319]:
# How many session IDs are associated with 1, 2, ... distinct user IDs.
(app_events
 .groupby("session_id")
 ["user_id"]
 .nunique()
 .value_counts()
 .sort_index()
 .rename("frequency")
 .rename_axis("user IDs per session")
 .to_frame()
)

Unnamed: 0_level_0,frequency
user IDs per session,Unnamed: 1_level_1
1,215
2,271


However, all of these sessions come from desktop devices, so they're likely just a quirk of the desktop development environments.

In [320]:
# Keep every event that belongs to a session in which more than one user ID appears.
multiuser_sessions = (app_events
  .groupby("session_id")
  .filter(lambda g: g["user_id"].nunique() > 1)
)

# Number of such events per OS family. size() is the built-in group row count
# and replaces the slower, hand-rolled apply(len).
multiuser_sessions.groupby("os_family").size()

os_family
Mac OS X    859
dtype: int64

In [307]:
# How many session IDs contain 1, 2, 3, ... distinct pageview tokens.
(app_events
 .groupby("session_id")
 ["pageview_token"]
 .nunique()
 .value_counts()
 .sort_index()
 .rename("frequency")
 .rename_axis("pageviews per session")
 .to_frame()
)

Unnamed: 0_level_0,frequency
pageviews per session,Unnamed: 1_level_1
1,132
2,162
3,43
4,73
5,39
6,15
7,1
8,2
9,1
10,1


A large number of pageviews have more than one session ID.

In [308]:
# How many pageview tokens are associated with 1, 2, ... distinct session IDs.
(app_events
 .groupby("pageview_token")
 ["session_id"]
 .nunique()
 .value_counts()
 .sort_index()
 .rename("frequency")
 .rename_axis("session IDs per pageview")
 .to_frame()
)

Unnamed: 0_level_0,frequency
session IDs per pageview,Unnamed: 1_level_1
1,860
2,400


This can be correct if a user closes the app for more than an hour and then reopens it, viewing the same page. However, many of the events are less than an hour apart, meaning they should be part of the same session. Crucially, some of these events are coming from KaiOS devices, meaning our actual users might encounter this bug too.  

In [344]:
# Keep every event whose pageview token is associated with more than one session ID.
multisession_pvs = app_events.groupby("pageview_token").filter(lambda g: g["session_id"].nunique() > 1)

def interevent_time(df):
  """Bucket the time span covered by a group of events.

  The span runs from the earliest `load_dt` to the latest `dt` in `df`.
  Returns "≤ 1 hour" when that span is at most one hour and "> 1 hour"
  otherwise (including when the comparison is false because of missing times).
  """
  elapsed = df["dt"].max() - df["load_dt"].min()
  within_hour = elapsed <= pd.Timedelta("1 hour")
  return "≤ 1 hour" if within_hour else "> 1 hour"

# Bucket each multi-session pageview by its load-to-last-event span, then count
# pageviews per OS family and bucket. size() is the built-in group row count
# and replaces the hand-rolled apply(len).
(multisession_pvs
  .groupby(["pageview_token", "os_family"])
  .apply(interevent_time)
  .rename("time_between_events")
  .reset_index()
  .groupby(["os_family", "time_between_events"])
  .size()
)

os_family   time_between_events
Firefox OS  > 1 hour                44
            ≤ 1 hour                 6
Mac OS X    > 1 hour                12
            ≤ 1 hour               331
Windows     > 1 hour                 7
dtype: int64

This is the count of events, excluding each user's first event, grouped by operating system, the time since the user's previous event, and whether the session ID differs from the previous one. There are no cases where a session ID is improperly retained after an hour, but there are cases where the session ID improperly changed before an hour had passed. Most come from Mac OS X, but a couple come from KaiOS devices.

Oddly, there are only two such KaiOS events, whereas there were six KaiOS pageviews that had multiple session IDs despite lasting less than an hour.

In [447]:
# Order each user's events chronologically and renumber the rows 0..n-1, so the
# "previous" event is simply the row directly above within the same user.
app_events = app_events.sort_values(["user_id", "dt"]).reset_index(drop=True)

# GroupBy.shift(1) gives every row the value from the same user's preceding row
# (missing for a user's first event) and keeps the frame's own index.
# The earlier .apply(lambda x: x.shift(1)) spelling returns a Series indexed by
# (user_id, row) on pandas 2.0+, which cannot be assigned back as a column.
previous_dt = (
  app_events
  .groupby("user_id")
  ["dt"]
  .shift(1)
)

previous_session_id = (
  app_events
  .groupby("user_id")
  ["session_id"]
  .shift(1)
)

app_events["previous_dt"] = previous_dt
app_events["previous_session_id"] = previous_session_id

def time_since_previous(s):
  """Bucket the gap between an event and the one before it.

  `s` must provide "dt" and "previous_dt". Returns "≤ 1 hour" when
  dt - previous_dt is at most one hour; every other case, including a
  missing previous_dt, yields "> 1 hour".
  """
  gap = s["dt"] - s["previous_dt"]
  if gap <= pd.Timedelta("1 hour"):
    return "≤ 1 hour"
  return "> 1 hour"
  
def stats(row):
  """Describe one event relative to the same user's previous event.

  Returns a Series with the event's `os_family`, whether its session ID
  differs from the previous one ("new_session_id"), and the bucketed gap
  since the previous event ("time_since_previous").
  """
  session_changed = row["previous_session_id"] != row["session_id"]
  gap_bucket = time_since_previous(row)
  return pd.Series({
    "os_family": row["os_family"],
    "new_session_id": session_changed,
    "time_since_previous": gap_bucket
  })

# Skip rows with no previous session ID (each user's first event), describe
# every remaining event, and count events per combination. size() is the
# built-in group row count and replaces the hand-rolled apply(len).
(app_events
  .query("~previous_session_id.isna()")
  .apply(stats, axis=1)
  .groupby(["os_family", "time_since_previous", "new_session_id"])
  .size()
)

os_family   time_since_previous  new_session_id
Android     ≤ 1 hour             False               5
Firefox OS  > 1 hour             True               16
            ≤ 1 hour             False             432
                                 True                2
Mac OS X    > 1 hour             True               12
            ≤ 1 hour             False             554
                                 True              306
Windows     > 1 hour             True               12
            ≤ 1 hour             False             170
dtype: int64

The gap comes from another weird thing: some pageviews "reappear" after an intervening pageview.

In [480]:
def is_consecutive(df):
  """Tell whether the rows of `df` occupy one unbroken run of index labels.

  Only the index and length of `df` are used. The check compares the distance
  between the first and last index label with the number of rows, so it is
  meaningful only while the parent frame is sorted by user ID and then dt and
  carries consecutive integer index labels.
  """
  first_label, last_label = df.index[0], df.index[-1]
  covered = last_label - first_label + 1
  return covered == len(df)

# Flag every event with whether its pageview's events sit in one unbroken run
# of rows; transform() broadcasts the per-pageview answer to each of its rows.
app_events["pageview_is_consecutive"] = (
  app_events
  .groupby(["pageview_token"])
  ["dt"]
  .transform(is_consecutive)
)

# Distinct pageviews per OS family, split by that flag.
(
  app_events
  .groupby(["os_family", "pageview_is_consecutive"])
  ["pageview_token"]
  .nunique()
)

os_family   pageview_is_consecutive
Android     False                        1
            True                         4
Firefox OS  False                      181
            True                        91
Mac OS X    False                      224
            True                       636
Windows     False                       51
            True                        72
Name: pageview_token, dtype: int64

Among the 181 non-consecutive KaiOS pageviews, 82% were supposedly not hidden at any point!

In [513]:
# For every non-consecutive "Firefox OS" pageview, compute the share of its
# open time that was not visible (1 - max visible / max open), then take the
# fraction of those pageviews for which that share is exactly zero.
k = (app_events
  .query("os_family == 'Firefox OS' & ~pageview_is_consecutive")
  .groupby("pageview_token")
  .apply(lambda pv: 1 - pv["page_visible_time"].max() / pv["page_open_time"].max())
  .rename("hidden_time")
  .pipe(lambda hidden: (hidden == 0).sum() / len(hidden))
)

pct_str(k)

'81.8%'

In [285]:
# How many pageview tokens are associated with 1, 2, ... distinct user IDs.
(app_events
 .groupby("pageview_token")
 ["user_id"]
 .nunique()
 .value_counts()
 .sort_index()
 .rename("frequency")
 .rename_axis("user IDs per pageview")
 .to_frame()
)

Unnamed: 0_level_0,frequency
user IDs per pageview,Unnamed: 1_level_1
1,1977


In [278]:
# How many pageview tokens have 1, 2, ... events. size() is the built-in
# group row count and replaces the hand-rolled apply(len).
(app_events
 .groupby("pageview_token")
 .size()
 .value_counts()
 .to_frame(name="frequency")
 .sort_index()
 .rename_axis("events per pageview")
)

Unnamed: 0_level_0,frequency
events per pageview,Unnamed: 1_level_1
1,911
2,1066


The load time is constant for each pageview, as it should be.

In [450]:
# True only if every pageview token has exactly one distinct load_dt.
app_events.groupby("pageview_token")["load_dt"].nunique().eq(1).all()

True

There are no cases of duplicate events for the same pageview (i.e. where the page open time is the same).

In [451]:
def duplicates(df):
  """Return True when `df` has fewer distinct `page_open_time` values than rows.

  nunique() leaves out missing values by default, so a row with a missing
  page_open_time also makes this return True.
  """
  distinct_open_times = df["page_open_time"].nunique()
  return distinct_open_times != len(df)

# True only if no pageview token is flagged by duplicates().
app_events.groupby("pageview_token").apply(duplicates).eq(False).all()

True

# User agent parsing

For some reason, the KaiOS devices are getting classified as Firefox OS devices.

In [122]:
# Event count per parsed OS family and device family. size() is the built-in
# group row count and replaces the hand-rolled apply(len).
(app_events
  .groupby(["os_family", "device_family"])
  .size()
  .rename("events")
)

os_family   device_family     
Android     Nexus 5                  8
Firefox OS  Generic Smartphone     613
Linux       Other                 3354
Mac OS X    Other                 3004
Windows     Other                  401
Name: events, dtype: int64

This seems to be a general issue with EventLogging.

In [215]:
# Over all events with client_type "kaios-web" or "kaios-app": the total count
# and the shares whose parsed user-agent OS family is "KaiOS" vs. "Firefox OS".
# NOTE(review): `year > 0` looks like a catch-all partition predicate so that
# every partition is read — confirm.
classification_rates = wmf.spark.run("""
SELECT
  COUNT(1) as kaios_events,
  SUM(CAST(useragent.os_family = "KaiOS" AS INT)) / COUNT(1) AS kaios_classified,
  SUM(CAST(useragent.os_family = "Firefox OS" AS INT)) / COUNT(1) AS firefox_classified
FROM event.inukapageview ipv
WHERE
  event.client_type IN ("kaios-web", "kaios-app") AND
  year > 0
""")

In [453]:
# Render both share columns through pct_str.
classification_rates.style.format(
  dict.fromkeys(["kaios_classified", "firefox_classified"], pct_str)
)

Unnamed: 0,kaios_events,kaios_classified,firefox_classified
0,416398,0.0%,97.8%


# Times

In [124]:
# Summary statistics of page_open_time across all events.
app_events["page_open_time"].describe()

count                      7380
mean     0 days 00:53:33.856936
std      0 days 08:59:01.010264
min      0 days 00:00:00.003000
25%      0 days 00:00:00.093000
50%      0 days 00:00:00.534000
75%      0 days 00:00:03.062000
max      5 days 14:18:26.206000
Name: page_open_time, dtype: object

In [191]:
# Summary statistics of page_visible_time across all events.
app_events["page_visible_time"].describe()

count                      7477
mean     0 days 00:48:07.725528
std      0 days 08:33:49.800681
min      0 days 00:00:00.003000
25%      0 days 00:00:00.093000
50%      0 days 00:00:00.531000
75%      0 days 00:00:03.013000
max      5 days 14:18:26.206000
Name: page_visible_time, dtype: object

"-1 days +23:59:59.369984" is a [very weird way](https://github.com/pandas-dev/pandas/issues/31924) of writing "-00:00:00.630016", but still, the `page_open_time` is always very close to the span between the time when the page was loaded and the time when the event was received.

In [190]:
# Difference between the load-to-receipt span (dt - load_dt) and the reported
# page_open_time; values near zero mean the two agree.
(app_events["dt"] - app_events["load_dt"] - app_events["page_open_time"]).describe()

count                        7477
mean     -1 days +23:59:59.369984
std        0 days 00:00:02.109687
min      -1 days +23:59:49.107000
25%      -1 days +23:59:59.243000
50%      -1 days +23:59:59.514000
75%      -1 days +23:59:59.793000
max        0 days 00:01:20.255000
dtype: object

In [193]:
# Per-event difference between page_open_time and page_visible_time.
(app_events["page_open_time"] - app_events["page_visible_time"]).describe()

count                      7477
mean     0 days 00:04:50.292598
std      0 days 02:28:37.777302
min             0 days 00:00:00
25%             0 days 00:00:00
50%             0 days 00:00:00
75%             0 days 00:00:00
max      5 days 13:04:04.255000
dtype: object

The pageviews that were open for very long periods of time all come from KaiOS, and most of them are from pageviews that aren't interrupted by others.

In [504]:
# Events open for more than three days, counted per OS family and consecutiveness flag.
# size() is the built-in group row count and replaces the hand-rolled apply(len).
app_events.query("page_open_time > '3 days'").groupby(["os_family", "pageview_is_consecutive"]).size()

os_family   pageview_is_consecutive
Firefox OS  False                       2
            True                       32
dtype: int64

Most of them (30) have the exact same page visible time as page open time, asserting that they've never been hidden.

In [519]:
# Among events open for more than three days, count those whose visible time
# equals their open time. The column-wise comparison replaces a row-by-row
# apply(axis=1) and still yields a plain count (0) when no event matches.
(app_events
  .query("page_open_time > '3 days'")
  .pipe(lambda long_pvs: long_pvs["page_open_time"].eq(long_pvs["page_visible_time"]))
  .sum()
)

30

# Other fields

In [260]:
# Number of pageviews per wiki. size() is the built-in group row count and
# replaces the hand-rolled apply(len).
(app_pvs
  .groupby("wiki")
  .size()
  .rename("pageviews per wiki")
)

wiki
abwiki        1
bnwiki        1
dewiki        1
enwiki     1823
eswiki        1
frpwiki       3
frwiki       25
hiwiki        7
jawiki        2
mrwiki       21
pawiki        1
plwiki       20
ptwiki       71
Name: pageviews per wiki, dtype: int64

All events in the same pageview have the same section count.

In [265]:
# True only if every pageview token has exactly one distinct section_count.
app_events.groupby("pageview_token")["section_count"].nunique().eq(1).all()

True

No events with more opened sections than total sections.

In [262]:
# Events reporting more opened sections than sections in total.
app_events.query("section_count < opened_section_count")

Unnamed: 0,ip,wiki,dt,webhost,user_id,session_id,pageview_token,client_type,referring_domain,load_dt,...,wmf_app_version,city,continent,country,country_code,latitude,longitude,postal_code,subdivision,timezone


All events from the same pageview have the same namespace.

In [271]:
# True only if every pageview token has exactly one distinct page_namespace.
app_events.groupby("pageview_token")["page_namespace"].nunique().eq(1).all()

True

The namespaces are mostly the main page (-1) and articles (0), as expected. The other ones represented are Help, Project, and Template, which seems reasonable since those are sometimes linked from articles.

In [269]:
# Number of pageviews per page_namespace value, most frequent first.
app_pvs["page_namespace"].value_counts()

-1     1121
 0      824
 12      27
 4        4
 10       1
Name: page_namespace, dtype: int64

The main page and the search page are always the same here, as expected.

In [272]:
# True only if is_main_page and is_search_page agree on every event.
app_events["is_main_page"].eq(app_events["is_search_page"]).all()

True