In [1]:
from inspect import cleandoc
import pandas as pd
import scipy.stats as ss

import wmfdata as wmf
from wmfdata.utils import pct_str

In [2]:
# Load the experiment dataset (presumably one row per user — TODO confirm).
user = pd.read_parquet("data/2022-03-07_experiment_user.parquet")

In [3]:
def ttest(name, df, col):
    """Print a control vs. trending-articles comparison of one metric.

    Splits `df` on its `experiment_group` column, then prints the mean of
    `col` in each group, the relative difference of the trending mean over
    the control mean, and the result of an unequal-variance (Welch) t-test.

    Parameters
    ----------
    name : str
        Heading printed on the first line of the report.
    df : pandas.DataFrame
        Must contain an `experiment_group` column holding the labels
        'control' and 'trending-articles', plus the column named by `col`.
    col : str
        Name of the numeric column to compare.

    Returns
    -------
    None
        The report is printed, not returned.
    """
    control = df.query("experiment_group == 'control'")
    trending = df.query("experiment_group == 'trending-articles'")
    control_mean = control[col].mean()
    trending_mean = trending[col].mean()
    # Relative lift of trending over control (0.10 means 10% higher).
    trending_advantage = trending_mean / control_mean - 1

    # `Series.mean()` skips NaN, whereas `ttest_ind` propagates NaN by default
    # and would print a nan result next to valid means. Omitting NaN makes the
    # test use the same observations as the means; results are unchanged when
    # the column has no missing values. equal_var=False selects Welch's test.
    test_result = ss.ttest_ind(
        control[col], trending[col], equal_var=False, nan_policy="omit"
    )

    output = f"""
    {name}
        Control mean: {round(control_mean, 2)}
        Trending mean: {round(trending_mean, 2)}
        Trending advantage: {pct_str(trending_advantage)}
        {test_result}
    """

    # cleandoc strips the leading blank line and the common indentation.
    print(cleandoc(output))

## Basic tests

In [4]:
# Compare mean session length (minutes) between the two groups, all users.
ttest("Session length (min)", user, "mean_session_min")

Session length (min)
    Control mean: 2.16
    Trending mean: 2.51
    Trending advantage: 16.2%
    Ttest_indResult(statistic=-13.93857256553007, pvalue=4.013745333720957e-44)


In [5]:
# Compare the per-user session count between the two groups, all users.
ttest("Sessions", user, "sessions")

Sessions
    Control mean: 2.12
    Trending mean: 2.18
    Trending advantage: 3.1%
    Ttest_indResult(statistic=-2.941168742951041, pvalue=0.003270426330140515)


## Basic tests, known enwiki users only
Users with their language set to something other than English did not see any recommendations, so ideally we would analyze only those users who we knew consistently used English.

However, since we only started storing the `wiki` field partway through, these known English users only amount to about 40% of our total users, even though the true proportion of consistent English users is about 90%.

In [6]:
# Total number of rows in the user dataset.
len(user)

116420

In [7]:
# Row counts for the 20 most common values of the `wiki` column.
user.groupby("wiki").size().sort_values(ascending=False).head(20)

wiki
                        54543
enwiki                  45441
-enwiki                  5477
enwiki-                  5146
swwiki                    465
urwiki                    255
enwiki-swwiki             245
swwiki-enwiki             202
enwiki-dewiki             172
-swwiki                   168
swwiki-                   142
enwiki-urwiki             132
urwiki-enwiki             106
frwiki                    101
enwiki-enwiki-dewiki       88
swwiki-swwiki-enwiki       82
enwiki-enwiki-swwiki       81
enwiki-dewiki-enwiki       79
enwiki-swwiki-enwiki       77
swwiki-enwiki-swwiki       76
dtype: int64

In [8]:
# Exact match only: blank values and compound values such as '-enwiki',
# 'enwiki-' or 'enwiki-swwiki' are excluded from this subset.
enwiki_user = user.query("wiki == 'enwiki'")

In [9]:
# Compare mean session length (minutes) between groups, wiki == 'enwiki' only.
ttest("Session length (min)", enwiki_user, "mean_session_min")

Session length (min)
    Control mean: 2.01
    Trending mean: 2.38
    Trending advantage: 18.5%
    Ttest_indResult(statistic=-9.35226749245592, pvalue=8.98847828993865e-21)


In [10]:
# Compare the per-user session count between groups, wiki == 'enwiki' only.
ttest("Sessions", enwiki_user, "sessions")

Sessions
    Control mean: 1.88
    Trending mean: 1.93
    Trending advantage: 2.7%
    Ttest_indResult(statistic=-1.965069188684298, pvalue=0.04941260124191154)


## Deeper analysis

Interestingly (and contrary to my initial impression), there is significant variation between countries.

Some possible reasons are varying qualities of the recommendations (although my general impression was that the recommendations for Pakistan, Tanzania, and Uganda were quite good and the ones for Nigeria somewhat worse, which doesn't really align with the variation seen here), differences in the proportions of installs that were organic instead of ad-driven, or differences in the makeup of KaiOS users in each country.

### Comparing countries

In [12]:
# Mean of `sessions`: one row per experiment group, one column per country.
user.groupby(["experiment_group", "country"])["sessions"].mean().unstack()

country,NG,PK,TZ,UG
experiment_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
control,2.8297,1.690294,2.170552,2.677971
trending-articles,2.89025,1.677969,2.270477,2.922993


In [13]:
# Run the control vs. trending comparison of session counts once per country.
print("SESSIONS")
for name, group in user.groupby("country"):
    ttest(name, group, "sessions")

SESSIONS
NG
    Control mean: 2.83
    Trending mean: 2.89
    Trending advantage: 2.1%
    Ttest_indResult(statistic=-0.4868598905980566, pvalue=0.626364798576808)
PK
    Control mean: 1.69
    Trending mean: 1.68
    Trending advantage: -0.7%
    Ttest_indResult(statistic=0.6586818935099917, pvalue=0.5101030998810226)
TZ
    Control mean: 2.17
    Trending mean: 2.27
    Trending advantage: 4.6%
    Ttest_indResult(statistic=-3.439965308622941, pvalue=0.0005824441912062435)
UG
    Control mean: 2.68
    Trending mean: 2.92
    Trending advantage: 9.1%
    Ttest_indResult(statistic=-3.520812693878272, pvalue=0.00043165079623369557)


In [14]:
# Mean of `mean_session_min`: one row per experiment group, one column per country.
user.groupby(["experiment_group", "country"])["mean_session_min"].mean().unstack()

country,NG,PK,TZ,UG
experiment_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
control,3.620569,1.480431,2.159232,2.931609
trending-articles,4.054853,1.59192,2.643883,3.646966


In [15]:
# Run the control vs. trending comparison of mean session length once per country.
print("SESSION TIME (MIN)")
for name, group in user.groupby("country"):
    ttest(name, group, "mean_session_min")

SESSION TIME (MIN)
NG
    Control mean: 3.62
    Trending mean: 4.05
    Trending advantage: 12.0%
    Ttest_indResult(statistic=-4.5568548694063695, pvalue=5.232449292427866e-06)
PK
    Control mean: 1.48
    Trending mean: 1.59
    Trending advantage: 7.5%
    Ttest_indResult(statistic=-3.6000194316132057, pvalue=0.0003185180969607342)
TZ
    Control mean: 2.16
    Trending mean: 2.64
    Trending advantage: 22.4%
    Ttest_indResult(statistic=-12.003759868694605, pvalue=3.92919065802522e-33)
UG
    Control mean: 2.93
    Trending mean: 3.65
    Trending advantage: 24.4%
    Ttest_indResult(statistic=-8.71545388978544, pvalue=3.2058046455092798e-18)


In [16]:
# Per-user total time in minutes, reconstructed as session count x mean session length.
# This adds a column to `user` in place; frames derived from `user` before this
# cell ran (e.g. `enwiki_user`) do not receive it.
user["total_session_time"] = user["sessions"] * user["mean_session_min"]

In [17]:
# Mean of `total_session_time`: one row per experiment group, one column per country.
user.groupby(["experiment_group", "country"])["total_session_time"].mean().unstack()

country,NG,PK,TZ,UG
experiment_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
control,20.896831,3.717367,6.606925,11.031064
trending-articles,21.580839,3.406447,8.127728,15.24182


In [18]:
# Run the control vs. trending comparison of total session time once per country.
print("TOTAL SESSION TIME (MIN)")
for name, group in user.groupby("country"):
    ttest(name, group, "total_session_time")

TOTAL SESSION TIME (MIN)
NG
    Control mean: 20.9
    Trending mean: 21.58
    Trending advantage: 3.3%
    Ttest_indResult(statistic=-0.2906064475806428, pvalue=0.771356289796685)
PK
    Control mean: 3.72
    Trending mean: 3.41
    Trending advantage: -8.4%
    Ttest_indResult(statistic=0.9109888659658609, pvalue=0.3623069413359067)
TZ
    Control mean: 6.61
    Trending mean: 8.13
    Trending advantage: 23.0%
    Ttest_indResult(statistic=-2.886810954163714, pvalue=0.003894163328956138)
UG
    Control mean: 11.03
    Trending mean: 15.24
    Trending advantage: 38.2%
    Ttest_indResult(statistic=-3.712564952447016, pvalue=0.00020605260072266204)


### Second-week new reader retention

In [19]:
# Keep only users whose `experiment_entry` is at least two weeks before 2022-01-08,
# presumably so that their second week falls entirely before that date.
# NOTE(review): 2022-01-08 is hard-coded; it looks like the end of the observation
# window, but that is not shown here — confirm.
uncensored_cutoff = pd.Timestamp('2022-01-08') - pd.Timedelta(weeks=2)
uncensored_user = user.query("experiment_entry <= @uncensored_cutoff")

In [20]:
def retention_rate(df):
    """Return the share of rows in `df` flagged as retained in their second week.

    The numerator is the sum of the `second_week_retained` column and the
    denominator is the total row count of `df`.
    """
    cohort_size = len(df)
    retained_count = df["second_week_retained"].sum()
    return retained_count / cohort_size

In [21]:
# Second-week retention rate per experiment group, formatted with pct_str.
uncensored_user.groupby("experiment_group").apply(retention_rate).apply(pct_str)

experiment_group
control              9.0%
trending-articles    9.4%
dtype: object

In [22]:
# Second-week retention rate by experiment group (rows) and country (columns),
# with every cell formatted by pct_str.
(
    uncensored_user
    .groupby(["experiment_group", "country"])
    .apply(retention_rate)
    .unstack()
    .applymap(pct_str)
)

country,NG,PK,TZ,UG
experiment_group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
control,12.2%,5.7%,10.6%,12.7%
trending-articles,11.7%,6.1%,11.1%,14.2%


#