In [2]:
import wmfdata as wmf

In [4]:
# Second-day retention of KaiOS app readers, per cohort day.
#
# * cohort: one row per (user_id, calendar day) with at least one
#   'kaios-app' pageview on that day; first_read_dt is the user's earliest
#   pageview timestamp within that day.
# * retained: the same user_id has at least one pageview whose dt falls
#   between first_read_dt + 1 day and first_read_dt + 2 days (BETWEEN is
#   inclusive on both ends).
# * Only cohort days earlier than CURRENT_DATE - 2 days are kept, so the
#   retention window of every included cohort has already elapsed.
#
# The joined (retention) side carries the same year/month/day partition
# bound as the cohort side. Retention events are always at least one day
# later than a cohort read, so the bound does not remove any matching row;
# it only lets Spark prune partitions instead of scanning the whole table.
#
# NOTE(review): the retention side is not restricted to
# client_type = 'kaios-app'; confirm that counting any pageview with the
# same user_id is intended.
query = """
SELECT
    cohort_date,
    SUM(CAST(retention_actions > 0 AS INT)) / COUNT(1) AS second_day_retention
FROM (
    SELECT
        cohort_date,
        cohort.user_id AS user_id,
        COUNT(retention_actions.dt) AS retention_actions
    FROM (
        SELECT
            CAST(DATE_TRUNC('day', TO_TIMESTAMP(dt)) AS DATE) AS cohort_date,
            event.user_id AS user_id,
            TO_TIMESTAMP(MIN(dt)) AS first_read_dt
        FROM event.inukapageview
        WHERE
            event.client_type = 'kaios-app' AND
            (year = 2020 AND month = 9 AND day >= 9 OR
             year = 2020 AND month > 9 OR
             year > 2020) AND
            DATE_TRUNC('day', TO_TIMESTAMP(dt)) < CURRENT_DATE - INTERVAL '2' DAY
        GROUP BY
            event.user_id,
            DATE_TRUNC('day', TO_TIMESTAMP(dt))
    ) cohort
    LEFT JOIN event.inukapageview retention_actions
    ON
        cohort.user_id = retention_actions.event.user_id AND
        (retention_actions.year = 2020 AND retention_actions.month = 9 AND retention_actions.day >= 9 OR
         retention_actions.year = 2020 AND retention_actions.month > 9 OR
         retention_actions.year > 2020) AND
        TO_TIMESTAMP(retention_actions.dt) BETWEEN
            first_read_dt + INTERVAL '1' DAY AND
            first_read_dt + INTERVAL '2' DAY
    GROUP BY
        cohort.cohort_date,
        cohort.user_id
) retention_actions_per_user
GROUP BY cohort_date
ORDER BY cohort_date
"""

second_day_retention = wmf.spark.run(query, session_type="large")

In [5]:
# Render the retention share column through wmf.utils.pct_str.
second_day_retention.style.format(dict(second_day_retention=wmf.utils.pct_str))

Unnamed: 0,cohort_date,second_day_retention
0,2020-09-09,13.6%
1,2020-09-10,13.7%
2,2020-09-11,13.7%
3,2020-09-12,13.8%
4,2020-09-13,13.2%
5,2020-09-14,13.0%
6,2020-09-15,12.8%
7,2020-09-16,12.7%
8,2020-09-17,12.8%
9,2020-09-18,12.8%


In [24]:
# Dump the full frame as plain text, omitting the index column.
print(second_day_retention.to_string(index=False))

cohort_date  second_day_retention
 2020-09-09              0.136215
 2020-09-10              0.137292
 2020-09-11              0.136838
 2020-09-12              0.137823
 2020-09-13              0.131592
 2020-09-14              0.130276
 2020-09-15              0.127931
 2020-09-16              0.127399
 2020-09-17              0.128289
 2020-09-18              0.128372
 2020-09-19              0.120973
 2020-09-20              0.120641
 2020-09-21              0.120104
 2020-09-22              0.117796
 2020-09-23              0.110060
 2020-09-24              0.110213
 2020-09-25              0.110666
 2020-09-26              0.110744
 2020-09-27              0.112330
 2020-09-28              0.109785
 2020-09-29              0.111854
 2020-09-30              0.113027
 2020-10-01              0.110700
 2020-10-02              0.106378
 2020-10-03              0.104399
 2020-10-04              0.102410
 2020-10-05              0.102650
 2020-10-06              0.102769
 2020-10-07   

In [14]:
# Second-week retention of KaiOS app readers, per cohort day.
#
# * cohort: one row per (user_id, calendar day) with at least one
#   'kaios-app' pageview on that day; first_read_dt is the user's earliest
#   pageview timestamp within that day.
# * retained: the same user_id has at least one pageview whose dt falls
#   between first_read_dt + 1 week and first_read_dt + 2 weeks (BETWEEN is
#   inclusive on both ends).
# * Only cohort days earlier than CURRENT_DATE - 2 weeks are kept, so the
#   retention window of every included cohort has already elapsed.
#
# The joined (retention) side carries the same year/month/day partition
# bound as the cohort side. Retention events are always at least one week
# later than a cohort read, so the bound does not remove any matching row;
# it only lets Spark prune partitions instead of scanning the whole table.
#
# NOTE(review): the retention side is not restricted to
# client_type = 'kaios-app'; confirm that counting any pageview with the
# same user_id is intended.
query = """
SELECT
    cohort_date,
    SUM(CAST(retention_actions > 0 AS INT)) / COUNT(1) AS second_week_retention
FROM (
    SELECT
        cohort_date,
        cohort.user_id AS user_id,
        COUNT(retention_actions.dt) AS retention_actions
    FROM (
        SELECT
            CAST(DATE_TRUNC('day', TO_TIMESTAMP(dt)) AS DATE) AS cohort_date,
            event.user_id AS user_id,
            TO_TIMESTAMP(MIN(dt)) AS first_read_dt
        FROM event.inukapageview
        WHERE
            event.client_type = 'kaios-app' AND
            (year = 2020 AND month = 9 AND day >= 9 OR
             year = 2020 AND month > 9 OR
             year > 2020) AND
            DATE_TRUNC('day', TO_TIMESTAMP(dt)) < CURRENT_DATE - INTERVAL '2' WEEK
        GROUP BY
            event.user_id,
            DATE_TRUNC('day', TO_TIMESTAMP(dt))
    ) cohort
    LEFT JOIN event.inukapageview retention_actions
    ON
        cohort.user_id = retention_actions.event.user_id AND
        (retention_actions.year = 2020 AND retention_actions.month = 9 AND retention_actions.day >= 9 OR
         retention_actions.year = 2020 AND retention_actions.month > 9 OR
         retention_actions.year > 2020) AND
        TO_TIMESTAMP(retention_actions.dt) BETWEEN
            first_read_dt + INTERVAL '1' WEEK AND
            first_read_dt + INTERVAL '2' WEEK
    GROUP BY
        cohort.cohort_date,
        cohort.user_id
) retention_actions_per_user
GROUP BY cohort_date
ORDER BY cohort_date
"""

second_week_retention = wmf.spark.run(query, session_type="large")

In [15]:
# Render the retention share column through wmf.utils.pct_str.
second_week_retention.style.format(dict(second_week_retention=wmf.utils.pct_str))

Unnamed: 0,cohort_date,second_week_retention
0,2020-09-09,13.6%
1,2020-09-10,14.5%
2,2020-09-11,15.9%
3,2020-09-12,16.4%
4,2020-09-13,16.7%
5,2020-09-14,16.5%
6,2020-09-15,16.7%
7,2020-09-16,16.6%
8,2020-09-17,16.8%
9,2020-09-18,16.6%


In [41]:
# Daily active KaiOS app readers, bucketed by time since their first app run.
#
# * cohort: one row per (user_id, calendar day) with at least one
#   'kaios-app' pageview on that day; first_read_dt is the earliest pageview
#   timestamp of that user within that day.
# * s_since_first_run: seconds between that day's first read and the user's
#   earliest first-run event (matched on first_run.event.app_id).
#   MIN(first_run.dt) is used rather than FIRST(...) so the result is
#   deterministic when an app_id has more than one first-run row.
# * Buckets are half-open [lower, upper) ranges in seconds:
#   86400 = 1d, 172800 = 2d, 604800 = 7d, 1209600 = 14d, 2592000 = 30d,
#   7776000 = 90d, 31536000 = 365d. Note that `14_30d_users` therefore
#   covers 14-29 days; the column names are kept unchanged for
#   compatibility with anything that reads them.
# * Users with no matching first-run row get a NULL age: every bucket
#   comparison is NULL, SUM skips it, and they are counted in no column.
# * A negative age (first run recorded after the read) satisfies
#   `< 86400` and lands in `less_than_1d_users`.
query = """
SELECT
    cohort_date,
    SUM(CAST(s_since_first_run < 86400 AS INT)) AS less_than_1d_users,
    SUM(CAST(s_since_first_run >= 86400 AND s_since_first_run < 172800 AS INT)) AS 1d_users,
    SUM(CAST(s_since_first_run >= 172800 AND s_since_first_run < 604800 AS INT)) AS 2_6d_users,
    SUM(CAST(s_since_first_run >= 604800 AND s_since_first_run < 1209600 AS INT)) AS 7_13d_users,
    SUM(CAST(s_since_first_run >= 1209600 AND s_since_first_run < 2592000 AS INT)) AS 14_30d_users,
    SUM(CAST(s_since_first_run >= 2592000 AND s_since_first_run < 7776000 AS INT)) AS 30_89d_users,
    SUM(CAST(s_since_first_run >= 7776000 AND s_since_first_run < 31536000 AS INT)) AS 90_364d_day_users,
    SUM(CAST(s_since_first_run >= 31536000 AS INT)) AS 1yr_plus_users
FROM (
    SELECT
      cohort.cohort_date,
      cohort.user_id,
      TO_UNIX_TIMESTAMP(MIN(cohort.first_read_dt)) - TO_UNIX_TIMESTAMP(TO_TIMESTAMP(MIN(first_run.dt))) AS s_since_first_run
    FROM (
        SELECT
            CAST(DATE_TRUNC('day', TO_TIMESTAMP(dt)) AS DATE) AS cohort_date,
            event.user_id AS user_id,
            TO_TIMESTAMP(MIN(dt)) AS first_read_dt
        FROM event.inukapageview
        WHERE
            event.client_type = 'kaios-app' AND
            (year = 2020 AND month = 9 AND day >= 9 OR
             year = 2020 AND month > 9 OR
             year > 2020) AND
             DATE_TRUNC('day', TO_TIMESTAMP(dt)) < CURRENT_DATE
        GROUP BY
            event.user_id,
            DATE_TRUNC('day', TO_TIMESTAMP(dt))
    ) cohort
    LEFT JOIN event_sanitized.kaiosappfirstrun first_run
    ON cohort.user_id = first_run.event.app_id
    GROUP BY
        cohort.cohort_date,
        cohort.user_id
) time_since_install
GROUP BY cohort_date
ORDER BY cohort_date
"""

users_by_time_since_install = wmf.spark.run(query, session_type="large")

In [42]:
# Rich display of the per-cohort-day bucket counts.
users_by_time_since_install

Unnamed: 0,cohort_date,less_than_1d_users,1d_users,2_6d_users,7_13d_users,14_30d_users,30_89d_users,90_364d_day_users,1yr_plus_users
0,2020-09-09,47863,0,0,0,1,0,0,0
1,2020-09-10,76427,1987,0,0,1,0,0,0
2,2020-09-11,62838,8595,1460,0,1,0,0,0
3,2020-09-12,54091,6703,7132,0,0,0,0,0
4,2020-09-13,48185,5654,10024,0,0,0,0,0
5,2020-09-14,43220,4840,11743,0,0,0,0,0
6,2020-09-15,39550,4043,13243,0,1,1,0,0
7,2020-09-16,37585,3803,13041,596,0,4,0,0
8,2020-09-17,35041,3443,11273,2840,0,3,1,0
9,2020-09-18,33613,3181,9766,4340,0,1,0,0


In [43]:
# Dump the full frame as plain text, omitting the index column.
print(users_by_time_since_install.to_string(index=False))

cohort_date  less_than_1d_users  1d_users  2_6d_users  7_13d_users  14_30d_users  30_89d_users  90_364d_day_users  1yr_plus_users
 2020-09-09               47863         0           0            0             1             0                  0               0
 2020-09-10               76427      1987           0            0             1             0                  0               0
 2020-09-11               62838      8595        1460            0             1             0                  0               0
 2020-09-12               54091      6703        7132            0             0             0                  0               0
 2020-09-13               48185      5654       10024            0             0             0                  0               0
 2020-09-14               43220      4840       11743            0             0             0                  0               0
 2020-09-15               39550      4043       13243            0             1          