In [2]:
from IPython.display import display
import numpy as np
import pandas as pd
import wmfdata as wmf
from wmfdata.utils import pd_display_all, df_to_remarkup, pct_str

import secrets

In [3]:
# Re-import the project-local `secrets` module so that edits made to secrets.py
# after the kernel started are picked up without restarting.
from importlib import reload
reload(secrets)

<module 'secrets' from '/srv/home/neilpquinn-wmf/proj/KaiOS-readership/secrets.py'>

In [4]:
# wmfdata's chart styling helper (presumably sets the Matplotlib defaults).
wmf.charting.set_mpl_style()
# Render floats in engineering notation with SI prefixes (e.g. 1.724k, 348.028m).
# Call the public pandas entry point instead of reaching into pd.core.api.
pd.set_eng_float_format(use_eng_prefix=True)

# Data collection and cleaning

A [crucial instrumentation fix](https://github.com/wikimedia/wikipedia-kaios/pull/194) was merged on 31 March; let's see what's happened to the data from the developers' test devices since then.

In [5]:
def _parse_time_columns(df):
  """Return `df` with its timestamp and duration columns converted.

  `dt` and `load_dt` go through `pd.to_datetime`; `page_open_time` and
  `page_visible_time` are read as millisecond counts and become timedeltas.
  """
  return df.assign(
    dt=pd.to_datetime(df["dt"]),
    load_dt=pd.to_datetime(df["load_dt"]),
    page_open_time=pd.to_timedelta(df["page_open_time"], unit="ms"),
    page_visible_time=pd.to_timedelta(df["page_visible_time"], unit="ms")
  )

# Fetch the KaiOS app events and flatten the `event` and `useragent` structs
# into top-level columns.
events = wmf.spark.run("""
SELECT
  ip,
  wiki,
  dt,
  webhost,
  geocoded_data,
  ev.*,
  ua.*
FROM event.inukapageview ipv
LATERAL VIEW INLINE(ARRAY(event)) ev
LATERAL VIEW INLINE(ARRAY(useragent)) ua
WHERE
  event.client_type = "kaios-app" AND
  year = 2020 AND 
  (month = 5 AND day >= 18 OR month = 6)
""").pipe(_parse_time_columns)

# Spread each event's geocoded_data entry into its own columns, appended after
# the existing ones in place of the original geocoded_data column.
locations = pd.DataFrame(events["geocoded_data"].to_list())
events = pd.concat([events.drop(columns="geocoded_data"), locations], axis=1)

Let's see where the events are coming from.

In [None]:
# Count events per (country, subdivision, city) with the built-in group size
# rather than calling len() on every group and renaming the resulting column.
events.groupby(["country", "subdivision", "city"]).size().to_frame(name="events")

We can attribute each event to a developer based on its geolocation and then drop that sensitive data.

In [7]:
def identify_user(row):
  """Return the developer attributed to an event's location, or None.

  The lookup key is "<subdivision>, <country>", resolved against
  `secrets.developer_locations`. Events whose subdivision or country is
  missing cannot be attributed, so they yield None instead of raising a
  TypeError while the key is being built.
  """
  subdivision = row["subdivision"]
  country = row["country"]
  if pd.isna(subdivision) or pd.isna(country):
    return None

  loc = subdivision + ", " + country
  return secrets.developer_locations.get(loc)

# Columns holding the IP address or geolocation; they are only needed to
# attribute events to developers and are removed straight afterwards.
sensitive_cols = [
  "ip", "city", "continent", "country",
  "country_code", "latitude", "longitude", "postal_code",
  "subdivision", "timezone"
]

events["user"] = events.apply(identify_user, axis=1)
events = events.drop(columns=sensitive_cols)

In [8]:
# Events per attributed developer; dropna=False keeps the unattributed (NaN) events visible.
events["user"].value_counts(dropna=False)

Eduardo      667
Stephane     453
Huei         364
José         126
Sudhanshu     92
NaN           13
Angie          9
Name: user, dtype: int64

In [9]:
# Events per page-load date, using the built-in group size instead of
# building a one-column frame and applying len() to each group.
events.groupby(events["load_dt"].dt.date).size()

load_dt
2020-05-14      1
2020-05-19     52
2020-05-20    144
2020-05-21     71
2020-05-22    116
2020-05-23     28
2020-05-24     31
2020-05-25    396
2020-05-26    182
2020-05-27    175
2020-05-28     52
2020-05-29     65
2020-05-31    173
2020-06-01    212
2020-06-02     26
dtype: int64

In [10]:
# Show a few example rows; the fixed seed keeps the sample stable across re-runs.
events.sample(5, random_state=0).pipe(pd_display_all)
# DataFrame.info() prints its report and returns None, so call it directly
# instead of handing that None to display().
events.info()
events.describe(include="all").pipe(pd_display_all)

Unnamed: 0,wiki,dt,webhost,user_id,session_id,pageview_token,client_type,referring_domain,load_dt,page_open_time,page_visible_time,section_count,opened_section_count,page_namespace,is_main_page,is_search_page,app_version,browser_family,browser_major,browser_minor,device_family,is_bot,is_mediawiki,os_family,os_major,os_minor,wmf_app_version,user
1459,enwiki,2020-05-23 03:50:17+00:00,en.wikipedia.org,d8ddfbff456b5305034d,cbecba770c8aac695f3c,5d5eb1843ccc2adcb27a,kaios-app,kaios-app,2020-05-23 03:47:11.343000+00:00,00:03:06.562000,00:00:14.647000,0,0,-1,True,True,1.0.0,Chrome,81,0,Mac,False,False,Mac OS X,10,15.0,-,José
191,enwiki,2020-05-31 14:49:18+00:00,en.wikipedia.org,5195362c0faa201957d3,77a25358faf36b5cc43b,8d85a812b47e67cece80,kaios-app,kaios-app,2020-05-31 14:49:12.069000+00:00,00:00:06.632000,00:00:06.632000,0,0,-1,True,True,1.0.0,Firefox,76,0,Mac,False,False,Mac OS X,10,15.0,-,Eduardo
1110,enwiki,2020-05-21 13:54:19+00:00,en.wikipedia.org,9ec8cbfa14dd10f70f3f,7eae3feda29968a66512,f26dbae0fd0ed2e053f8,kaios-app,kaios-app,2020-05-21 13:31:01.250000+00:00,00:23:19.462000,00:02:30.610000,11,0,0,False,False,1.0.0,Firefox Mobile,48,0,LYF F120B,False,False,KaiOS,2,5.0,-,Stephane
1004,bewiki,2020-05-20 11:23:46+00:00,be.wikipedia.org,453e0596fa6f0034f8cf,8e3596ab9fc64672002d,764b4c9fee9f1dd3c9c4,kaios-app,kaios-app,2020-05-20 11:23:37.084000+00:00,00:00:08.816000,00:00:08.816000,3,0,0,False,False,1.0.0,Firefox,49,0,Other,False,False,Windows,10,,-,Huei
1541,enwiki,2020-05-21 23:06:13+00:00,en.wikipedia.org,1c6500788afe1d29af21,f73d5aee5856186524da,fa7512bcaa21ed89e925,kaios-app,kaios-app,2020-05-21 23:09:11.761000+00:00,00:01:12.913000,00:00:49.226000,0,0,-1,True,True,1.0.0,Firefox,57,0,Mac,False,False,Mac OS X,10,12.0,-,Eduardo


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1724 entries, 0 to 1723
Data columns (total 28 columns):
 #   Column                Non-Null Count  Dtype              
---  ------                --------------  -----              
 0   wiki                  1724 non-null   object             
 1   dt                    1724 non-null   datetime64[ns, UTC]
 2   webhost               1724 non-null   object             
 3   user_id               1724 non-null   object             
 4   session_id            1724 non-null   object             
 5   pageview_token        1724 non-null   object             
 6   client_type           1724 non-null   object             
 7   referring_domain      1724 non-null   object             
 8   load_dt               1724 non-null   datetime64[ns, UTC]
 9   page_open_time        1724 non-null   timedelta64[ns]    
 10  page_visible_time     1724 non-null   timedelta64[ns]    
 11  section_count         1724 non-null   int64              
 12  opened

None

Unnamed: 0,wiki,dt,webhost,user_id,session_id,pageview_token,client_type,referring_domain,load_dt,page_open_time,page_visible_time,section_count,opened_section_count,page_namespace,is_main_page,is_search_page,app_version,browser_family,browser_major,browser_minor,device_family,is_bot,is_mediawiki,os_family,os_major,os_minor,wmf_app_version,user
count,1724,1724,1724,1724,1724,1724,1724,1724,1724,1724,1724,1.724k,1.724k,1.724k,1724,1724,1724,1724,1709.0,1709.0,1724,1724,1724,1724,1696.0,1352.0,1724,1711
unique,16,1560,16,227,290,1184,1,1,1184,,,,,,2,2,1,6,9.0,2.0,8,1,1,6,3.0,6.0,1,6
top,enwiki,2020-05-25 19:03:24+00:00,en.wikipedia.org,1c6500788afe1d29af21,c0313d32eb2dc353336d,bb07d67c6e49026bd44e,kaios-app,kaios-app,2020-05-22 16:24:33.348000+00:00,,,,,,False,False,1.0.0,Firefox,57.0,0.0,Mac,False,False,Mac OS X,10.0,14.0,-,Eduardo
freq,1637,5,1637,343,124,44,1724,1724,44,,,,,,1087,1087,1724,1106,340.0,1420.0,1100,1724,1724,1100,1444.0,387.0,1724,667
first,,2020-05-19 13:52:54+00:00,,,,,,,2020-05-14 19:28:53.009000+00:00,,,,,,,,,,,,,,,,,,,
last,,2020-06-02 06:57:10+00:00,,,,,,,2020-06-02 05:34:07.110000+00:00,,,,,,,,,,,,,,,,,,,
mean,,,,,,,,,,0 days 00:19:39.808794,0 days 00:16:30.584053,8.307,348.028m,-355.568m,,,,,,,,,,,,,,
std,,,,,,,,,,0 days 03:29:17.485511,0 days 03:28:57.639475,6.858,989.552m,640.587m,,,,,,,,,,,,,,
min,,,,,,,,,,0 days 00:00:00.008000,0 days 00:00:00.008000,0.000,0.000,-1.000,,,,,,,,,,,,,,
25%,,,,,,,,,,0 days 00:00:02.090000,0 days 00:00:02.039500,0.000,0.000,-1.000,,,,,,,,,,,,,,


# Identifiers and uniqueness

If we assume that each unique combination of user, OS family, and device family is a unique device, some devices have a LOT of different user IDs. However, these are all desktop devices, and a chat with the developers suggests that their development environments reinstall the app constantly. 

The KaiOS devices tend to have relatively few user IDs, suggesting that this will work properly for our users.

In [57]:
# Columns whose combination is treated as identifying one physical device.
device_id_cols = ["user", "os_family", "device_family"]

def device_stats(df):
  """Summarise one group of events.

  Returns a Series with the number of rows ("events") and the number of
  distinct `user_id` values ("user_IDs") in `df`.
  """
  summary = {
    "events": len(df),
    "user_IDs": df["user_id"].nunique()
  }
  return pd.Series(summary)

# One row per (user, os_family, device_family) combination with its event and user-ID counts.
events.groupby(device_id_cols).apply(device_stats).reset_index()

Unnamed: 0,user,os_family,device_family,events,user_IDs
0,Angie,KaiOS,LYF F300B,9,2
1,Eduardo,KaiOS,Nokia 8110,29,2
2,Eduardo,Mac OS X,Mac,638,2
3,Huei,Android,Nexus 5,4,1
4,Huei,KaiOS,Nokia 2720,7,2
5,Huei,Mac OS X,Mac,9,1
6,Huei,Windows,Other,344,4
7,José,KaiOS,Generic Smartphone,24,2
8,José,KaiOS,LYF F300B,40,4
9,José,KaiOS,Nokia 2720,2,1


In [58]:
# Distinct session IDs observed for each user ID...
sessions_per_user = events.groupby("user_id")["session_id"].nunique()

# ...and how many user IDs have each of those session counts.
(sessions_per_user
 .value_counts()
 .sort_index()
 .rename_axis("sessions per user")
 .to_frame(name="frequency")
)

Unnamed: 0_level_0,frequency
sessions per user,Unnamed: 1_level_1
1,125
2,92
3,3
4,1
6,1
7,1
9,2
10,1
15,1


There are a large number of sessions which have multiple user IDs, which should never happen.

In [59]:
# Distinct user IDs observed within each session...
user_ids_per_session = events.groupby("session_id")["user_id"].nunique()

# ...and how many sessions have each of those user-ID counts.
(user_ids_per_session
 .value_counts()
 .sort_index()
 .rename_axis("user IDs per session")
 .to_frame(name="frequency")
)

Unnamed: 0_level_0,frequency
user IDs per session,Unnamed: 1_level_1
1,202
2,88


However, all of these sessions come from Stephane's macOS device, so they're likely just a quirk of his development environment.

In [60]:
# Keep only the events of sessions that report more than one user ID.
# The frame's name must match the one used below (it was previously defined as
# `multiuser_sessions_events`, which made the next statement a NameError).
multiuser_session_events = (events
  .groupby("session_id")
  .filter(lambda g: g["user_id"].nunique() > 1)
)

# Number of affected sessions per developer and operating system.
multiuser_session_events.groupby(["user", "os_family"]).apply(lambda x: x["session_id"].nunique())

user      os_family
Stephane  Mac OS X     88
dtype: int64

In [61]:
# Distinct pageview tokens observed within each session...
pageviews_per_session = events.groupby("session_id")["pageview_token"].nunique()

# ...and how many sessions have each of those pageview counts.
(pageviews_per_session
 .value_counts()
 .sort_index()
 .rename_axis("pageviews per session")
 .to_frame(name="frequency")
)

Unnamed: 0_level_0,frequency
pageviews per session,Unnamed: 1_level_1
1,126
2,76
3,27
4,14
5,7
6,2
7,4
8,3
9,2
10,2


In [17]:
# Distribution of the number of events sent for each pageview.
# Uses `events` (already restricted to the KaiOS app by the query); the
# previously referenced `app_events` is not defined anywhere in this notebook.
(events
 .groupby("pageview_token")
 .size()
 .value_counts()
 .to_frame(name="frequency")
 .sort_index()
 .rename_axis("events per pageview")
)

Unnamed: 0_level_0,frequency
events per pageview,Unnamed: 1_level_1
1,1044
2,65
3,20
4,17
5,8
6,6
7,5
9,6
10,2
11,1


This is the count of events, excluding the first by a user, grouped by their operating system, the time since the previous event, and whether the session ID is different from the previous one.

In the past, we've seen cases where a session ID looks improperly changed before an hour ("interrupted sessions") and cases where it looks improperly retained after an hour ("stuck sessions"). This time, we didn't see any of those from KaiOS devices.

In [62]:
# Order each user's events chronologically so that "previous" means the event
# immediately before it from the same user ID.
events = events.sort_values(["user_id", "dt"]).reset_index(drop=True)

# GroupBy.shift keeps the original row index, so the results align with
# `events` directly. Shifting inside .apply() depends on pandas'
# version-specific group_keys handling and can return a MultiIndex instead.
previous_dt = events.groupby("user_id")["dt"].shift(1)
previous_session_id = events.groupby("user_id")["session_id"].shift(1)

# The first event of every user has no predecessor, so both columns are
# missing (NaT/NaN) on that row.
events["previous_dt"] = previous_dt
events["previous_session_id"] = previous_session_id

def time_since_previous(s):
  """Bucket the gap between an event and the previous one.

  `s` must provide "dt" and "previous_dt". Returns "≤ 1 hour" when the gap is
  at most one hour and "> 1 hour" otherwise (including when the comparison is
  false because the gap is missing).
  """
  elapsed = s["dt"] - s["previous_dt"]
  return "≤ 1 hour" if elapsed <= pd.Timedelta("1 hour") else "> 1 hour"
  
def stats(row):
  """Describe one event relative to the previous event of the same user ID.

  Returns a Series with the event's operating system, whether its session ID
  differs from the previous event's, and the bucketed time since that event.
  """
  session_changed = row["previous_session_id"] != row["session_id"]
  summary = {
    "os_family": row["os_family"],
    "new_session_id": session_changed,
    "time_since_previous": time_since_previous(row)
  }
  return pd.Series(summary)

# Skip each user's first event (it has no predecessor), classify the rest, and
# count the events in every (OS, gap bucket, session changed) combination.
# GroupBy.size() is the built-in equivalent of applying len() to each group.
(events
  .query("~previous_session_id.isna()")
  .apply(stats, axis=1)
  .groupby(["os_family", "time_since_previous", "new_session_id"])
  .size()
)

os_family  time_since_previous  new_session_id
Android    ≤ 1 hour             False               3
KaiOS      > 1 hour             True               14
           ≤ 1 hour             False             205
Mac OS X   > 1 hour             True               27
           ≤ 1 hour             False             788
                                True               93
Other      > 1 hour             True                1
           ≤ 1 hour             False              13
Ubuntu     > 1 hour             True                3
           ≤ 1 hour             False               9
Windows    > 1 hour             True               21
           ≤ 1 hour             False             320
dtype: int64

In [63]:
# Columns shown when inspecting individual events.
key_cols = [
  "user", "os_family", "device_family", "user_id", "session_id", 
  "pageview_token", "dt", "load_dt", "page_open_time", "page_visible_time"
]

# A "stuck" session keeps the same ID although more than an hour passed since
# the user's previous event. The comparison is strictly greater-than so that a
# gap of exactly one hour is bucketed the same way as in time_since_previous
# ("≤ 1 hour" vs "> 1 hour").
has_previous = events["previous_session_id"].notna()
same_session = events["previous_session_id"] == events["session_id"]
over_an_hour = (events["dt"] - events["previous_dt"]) > pd.Timedelta("1 hour")

stuck_sessions = events.loc[has_previous & same_session & over_an_hour, "session_id"].unique()

events.query("session_id.isin(@stuck_sessions)")[key_cols].pipe(pd_display_all)

Unnamed: 0,user,os_family,device_family,user_id,session_id,pageview_token,dt,load_dt,page_open_time,page_visible_time


In [64]:
# An "interrupted" session is one whose ID changed although at most an hour
# passed since the user's previous event. The comparison is inclusive so that a
# gap of exactly one hour is bucketed the same way as in time_since_previous
# ("≤ 1 hour" vs "> 1 hour").
has_previous = events["previous_session_id"].notna()
session_changed = events["previous_session_id"] != events["session_id"]
within_an_hour = (events["dt"] - events["previous_dt"]) <= pd.Timedelta("1 hour")

interrupted_session_users = events.loc[has_previous & session_changed & within_an_hour, "user_id"].unique()

# All events of the affected user IDs, in chronological order per user.
interrupted_session_events = events.query("user_id.isin(@interrupted_session_users)")[key_cols].sort_values(["user_id", "dt"])
interrupted_session_events.head()

Unnamed: 0,user,os_family,device_family,user_id,session_id,pageview_token,dt,load_dt,page_open_time,page_visible_time
0,Stephane,Mac OS X,Mac,007477041675da58382e,59d9e5efef9047edbde3,bc06d5e4778dbbaeb1ca,2020-05-25 19:03:16+00:00,2020-05-25 19:03:15.962000+00:00,00:00:00.021000,00:00:00.021000
1,Stephane,Mac OS X,Mac,007477041675da58382e,ba95d1cbdad262750465,4ef299ecbc1fb8ddd44f,2020-05-25 19:03:17+00:00,2020-05-25 19:03:16.121000+00:00,00:00:01.229000,00:00:01.229000
91,Stephane,Mac OS X,Mac,06c22e33c07687fdaa08,297c9ef0ee7bd88f4ae2,a910fa85558591419d23,2020-05-25 18:55:47+00:00,2020-05-25 18:55:17.167000+00:00,00:00:29.892000,00:00:29.892000
92,Stephane,Mac OS X,Mac,06c22e33c07687fdaa08,504a369accf44ba0033e,88ba209de43884d66884,2020-05-25 18:55:55+00:00,2020-05-25 18:55:47.641000+00:00,00:00:08.128000,00:00:08.128000
93,Stephane,Mac OS X,Mac,094da21d9bec6f206909,073ea7c460b2078225ab,44e311f2e4d3baefcc09,2020-05-25 19:10:18+00:00,2020-05-25 19:10:18.219000+00:00,00:00:00.098000,00:00:00.098000


All of these interrupted sessions come from Stephane's Mac!

In [32]:
# Number of user IDs with an interrupted session, per developer and OS.
from_interrupted_users = events["user_id"].isin(interrupted_session_users)

(
  events[from_interrupted_users]
  .groupby(["user", "os_family"])
  .apply(lambda group: group["user_id"].nunique())
)

user      os_family
Stephane  Mac OS X     85
dtype: int64

In [21]:
# Distinct user IDs observed for each pageview token...
user_ids_per_pageview = events.groupby("pageview_token")["user_id"].nunique()

# ...and how many pageviews have each of those user-ID counts.
(user_ids_per_pageview
 .value_counts()
 .sort_index()
 .rename_axis("user IDs per pageview")
 .to_frame(name="frequency")
)

Unnamed: 0_level_0,frequency
user IDs per pageview,Unnamed: 1_level_1
1,1184


The load time is constant for each pageview, as it should be.

In [22]:
# Every pageview token must map to exactly one load timestamp.
assert events.groupby("pageview_token")["load_dt"].nunique().eq(1).all()

There are no cases of duplicate events for the same pageview (i.e. where the page open time is the same).

In [23]:
def has_duplicates(df):
  """Return True when `df` has fewer distinct `page_open_time` values than rows."""
  distinct_open_times = df["page_open_time"].nunique()
  return len(df) != distinct_open_times

# No pageview may contain two events with the same page_open_time.
assert not events.groupby("pageview_token").apply(has_duplicates).any()

There are nearly a hundred cases where the same device sends two events at the same second.

In [68]:
# Events that share their receipt second with at least one other event from
# the same developer and device family.
duplicate_dt_events = (
  events
  .groupby(["user", "device_family", "dt"])
  .filter(lambda df: len(df) >= 2)
  [key_cols]
  .sort_values("dt")
)

# Number of distinct (user, device family, second) collisions. The previous
# expression had an unbalanced parenthesis, and len() of DataFrame.nunique()
# would have counted columns rather than cases.
duplicate_dt_events.groupby(["user", "device_family", "dt"]).ngroups

93

Some of the duplicated events seem to be legitimate cases of distinct events being received during the same second, but others appear to be some kind of error.

In [70]:
# First few same-second events, in chronological order (the frame is sorted by dt).
duplicate_dt_events.head(6)

Unnamed: 0,user,os_family,device_family,user_id,session_id,pageview_token,dt,load_dt,page_open_time,page_visible_time
603,Huei,Windows,Other,453e0596fa6f0034f8cf,37d4db3486c2d024a8b7,0d9b03445e5194ab9d93,2020-05-20 09:11:32+00:00,2020-05-20 08:46:11.960000+00:00,00:25:04.840000,00:25:04.840000
604,Huei,Windows,Other,453e0596fa6f0034f8cf,37d4db3486c2d024a8b7,8d85a5bae3757dfab8f7,2020-05-20 09:11:32+00:00,2020-05-20 09:11:16.823000+00:00,00:00:04.637000,00:00:04.637000
535,Stephane,Mac OS X,Mac,3c39a75573d800dfe96d,1a105c5ea1d23c7a1202,e54302cf789bfafbda02,2020-05-25 19:03:06+00:00,2020-05-25 19:03:06.151000+00:00,00:00:00.015000,00:00:00.015000
1541,Stephane,Mac OS X,Mac,bbe9f0210cb010a5b038,1a105c5ea1d23c7a1202,0e61dae0c86847904ed5,2020-05-25 19:03:06+00:00,2020-05-25 19:03:06.017000+00:00,00:00:00.040000,00:00:00.040000
536,Stephane,Mac OS X,Mac,3c39a75573d800dfe96d,1aac3b7a1091f5d12e16,ef799491dffbff4b8c70,2020-05-25 19:03:09+00:00,2020-05-25 19:03:06.290000+00:00,00:00:03.122000,00:00:03.122000
501,Stephane,Mac OS X,Mac,2988552f0eb97deb2d94,1aac3b7a1091f5d12e16,dd4d6d2e20ab80fd2bb0,2020-05-25 19:03:09+00:00,2020-05-25 19:03:09.470000+00:00,00:00:00.011000,00:00:00.011000


However, all of the duplicate events were from desktop operating systems, and almost all were from Stephane's Mac! (Noticing a theme here?)

In [34]:
# Same-second events per developer and operating system. This has to group
# `duplicate_dt_events`; grouping `events` would count every event instead.
duplicate_dt_events.groupby(["user", "os_family"]).size()

user      os_family
Huei      Windows        4
Stephane  Mac OS X     250
dtype: int64

# User agent parsing

The issues with correctly parsing KaiOS user agents ([T248560](https://phabricator.wikimedia.org/T248560)) have been solved.

In [35]:
# Events per parsed operating system and device family, using the built-in
# group size instead of applying len() to each group.
(events
  .groupby(["os_family", "device_family"])
  .size()
  .rename("events")
)

os_family  device_family     
Android    Nexus 5                  4
KaiOS      Generic Smartphone      24
           LYF F120B               70
           LYF F300B               73
           Nokia 2720               9
           Nokia 8110              72
Mac OS X   Mac                   1100
Other      Other                   15
Ubuntu     Other                   13
Windows    Other                  344
Name: events, dtype: int64

# Times

When looking at event durations, we ignore the desktop events because desktop apps can stay open for very long times, while KaiOS apps can't.

There are no impossibly long times, so everything looks good!

In [71]:
# Summary statistics of page_open_time, restricted to KaiOS events.
events.loc[events["os_family"] == "KaiOS", "page_open_time"].describe()

count                       248
mean     0 days 00:03:43.136056
std      0 days 00:12:22.635709
min      0 days 00:00:00.343000
25%      0 days 00:00:02.196000
50%      0 days 00:00:10.901000
75%      0 days 00:01:00.850000
max      0 days 01:57:33.250000
Name: page_open_time, dtype: object

In [72]:
# Summary statistics of page_visible_time, restricted to KaiOS events.
events.loc[events["os_family"] == "KaiOS", "page_visible_time"].describe()

count                       248
mean     0 days 00:00:46.152717
std      0 days 00:01:56.655377
min      0 days 00:00:00.343000
25%      0 days 00:00:02.196000
50%      0 days 00:00:10.792000
75%      0 days 00:00:59.842250
max      0 days 00:19:38.187000
Name: page_visible_time, dtype: object

Pandas has a [very weird way of writing time intervals](https://github.com/pandas-dev/pandas/issues/31924), but still, the `page_open_time` is always very close to the span between the time when the page was loaded and the time when the event was received.

In [38]:
# Difference between the load-to-receipt span and the reported page_open_time.
load_to_receipt = events["dt"] - events["load_dt"]
(load_to_receipt - events["page_open_time"]).describe()

count                        1724
mean     -1 days +23:59:06.961936
std        0 days 00:01:42.924385
min      -1 days +23:55:33.732000
25%      -1 days +23:59:55.924250
50%      -1 days +23:59:59.235500
75%      -1 days +23:59:59.692250
max        0 days 00:01:47.844000
dtype: object

# Other fields

In [40]:
# Rows per wiki, using the built-in group size instead of applying len() to
# each group.
(events
  .groupby("wiki")
  .size()
  .rename("pageviews per wiki")
)

wiki
acewiki       1
alswiki       1
bewiki        8
dewiki        1
doiwiki       1
enwiki     1637
frwiki        7
hewiki        1
hiwiki        2
knwiki        1
kswiki        5
mlwiki        1
plwiki       14
ptwiki       33
tawiki        7
tewiki        4
Name: pageviews per wiki, dtype: int64

All events in the same pageview have the same section count.

In [41]:
# Every pageview token must map to exactly one section count.
assert events.groupby("pageview_token")["section_count"].nunique().eq(1).all()

No events with more opened sections than total sections.

In [42]:
# Events reporting more opened sections than the page has (expected: none).
events[events["opened_section_count"] > events["section_count"]]

Unnamed: 0,wiki,dt,webhost,user_id,session_id,pageview_token,client_type,referring_domain,load_dt,page_open_time,...,device_family,is_bot,is_mediawiki,os_family,os_major,os_minor,wmf_app_version,user,previous_dt,previous_session_id


All events from the same pageview have the same namespace.

The namespaces are mostly the main page (-1) and articles (0), as expected. The other one represented is Help (12), which seems reasonable since Help pages are sometimes linked from articles.

In [43]:
# Events per page namespace.
# NOTE(review): this cell does not itself check that a pageview's events all share one namespace — confirm separately.
events["page_namespace"].value_counts()

 0     1085
-1      637
 12       2
Name: page_namespace, dtype: int64

The search page is the same thing as the main page, as expected.

In [46]:
# is_main_page and is_search_page must agree on every event.
assert events["is_main_page"].eq(events["is_search_page"]).all()