In [2]:
import wmfdata as wmf

In [3]:
# Second-day retention for daily cohorts of KaiOS app readers.
#
# Cohort definition (innermost subquery): one row per (user, calendar day) on
# which the user logged at least one page view; first_read_dt is the earliest
# page view timestamp of that user on that day.
#
# A cohort member is "retained" when they have at least one page view whose
# timestamp falls between first_read_dt + 1 day and first_read_dt + 2 days
# (BETWEEN makes both bounds inclusive). The outer query turns that into the
# share of retained users per cohort_date.
#
# Cohort days are limited to those strictly before CURRENT_DATE - 2 days, so
# the end of every retention window lies before the start of today.
#
# The joined side (retention_actions) repeats the client_type and partition
# predicates of the cohort subquery. Without them the self-join reads every
# partition of event.inukapageview and can match page views from other client
# types. They are placed in the ON clause (not WHERE) so the LEFT JOIN still
# keeps cohort members with zero matching page views.
# NOTE(review): this assumes the year/month/day partitions track `dt` closely
# enough that no page view inside a retention window sits in a partition
# before 2020-09-09 -- confirm against the table's partitioning scheme.
query = """
SELECT
    cohort_date,
    SUM(CAST(retention_actions > 0 AS INT)) / COUNT(1) AS second_day_retention
FROM (
    SELECT
        cohort_date,
        cohort.user_id AS user_id,
        COUNT(retention_actions.dt) AS retention_actions
    FROM (
        SELECT
            CAST(DATE_TRUNC('day', TO_TIMESTAMP(dt)) AS DATE) AS cohort_date,
            event.user_id AS user_id,
            TO_TIMESTAMP(MIN(dt)) AS first_read_dt
        FROM event.inukapageview
        WHERE
            event.client_type = 'kaios-app' AND
            (year = 2020 AND month = 9 AND day >= 9 OR
             year = 2020 AND month > 9 OR
             year > 2020) AND
            DATE_TRUNC('day', TO_TIMESTAMP(dt)) < CURRENT_DATE - INTERVAL '2' DAY
        GROUP BY
            event.user_id,
            DATE_TRUNC('day', TO_TIMESTAMP(dt))
    ) cohort
    LEFT JOIN event.inukapageview retention_actions
    ON
        cohort.user_id = retention_actions.event.user_id AND
        retention_actions.event.client_type = 'kaios-app' AND
        (retention_actions.year = 2020 AND retention_actions.month = 9 AND retention_actions.day >= 9 OR
         retention_actions.year = 2020 AND retention_actions.month > 9 OR
         retention_actions.year > 2020) AND
        TO_TIMESTAMP(retention_actions.dt) BETWEEN
            first_read_dt + INTERVAL '1' DAY AND
            first_read_dt + INTERVAL '2' DAY
    GROUP BY
        cohort.cohort_date,
        cohort.user_id
) retention_actions_per_user
GROUP BY cohort_date
ORDER BY cohort_date
"""

second_day_retention = wmf.spark.run(query, session_type="large")

In [4]:
# Render the retention share as a percentage string; the styled frame is the
# cell's last expression so the notebook displays it.
second_day_formatters = {"second_day_retention": wmf.utils.pct_str}
second_day_retention.style.format(second_day_formatters)

Unnamed: 0,cohort_date,second_day_retention
0,2020-11-05,10.7%
1,2020-11-06,10.9%
2,2020-11-07,10.7%
3,2020-11-08,10.5%
4,2020-11-09,10.5%
5,2020-11-10,10.8%
6,2020-11-11,10.1%
7,2020-11-12,10.6%
8,2020-11-13,10.6%
9,2020-11-14,11.6%


In [6]:
# Second-week retention for daily cohorts of KaiOS app readers.
#
# Cohort definition (innermost subquery): one row per (user, calendar day) on
# which the user logged at least one page view; first_read_dt is the earliest
# page view timestamp of that user on that day.
#
# A cohort member is "retained" when they have at least one page view whose
# timestamp falls between first_read_dt + 1 week and first_read_dt + 2 weeks
# (BETWEEN makes both bounds inclusive). The outer query turns that into the
# share of retained users per cohort_date.
#
# Cohort days are limited to those strictly before CURRENT_DATE - 2 weeks, so
# the end of every retention window lies before the start of today.
#
# The joined side (retention_actions) repeats the client_type and partition
# predicates of the cohort subquery. Without them the self-join reads every
# partition of event.inukapageview and can match page views from other client
# types. They are placed in the ON clause (not WHERE) so the LEFT JOIN still
# keeps cohort members with zero matching page views.
# NOTE(review): this assumes the year/month/day partitions track `dt` closely
# enough that no page view inside a retention window sits in a partition
# before 2020-09-09 -- confirm against the table's partitioning scheme.
query = """
SELECT
    cohort_date,
    SUM(CAST(retention_actions > 0 AS INT)) / COUNT(1) AS second_week_retention
FROM (
    SELECT
        cohort_date,
        cohort.user_id AS user_id,
        COUNT(retention_actions.dt) AS retention_actions
    FROM (
        SELECT
            CAST(DATE_TRUNC('day', TO_TIMESTAMP(dt)) AS DATE) AS cohort_date,
            event.user_id AS user_id,
            TO_TIMESTAMP(MIN(dt)) AS first_read_dt
        FROM event.inukapageview
        WHERE
            event.client_type = 'kaios-app' AND
            (year = 2020 AND month = 9 AND day >= 9 OR
             year = 2020 AND month > 9 OR
             year > 2020) AND
            DATE_TRUNC('day', TO_TIMESTAMP(dt)) < CURRENT_DATE - INTERVAL '2' WEEK
        GROUP BY
            event.user_id,
            DATE_TRUNC('day', TO_TIMESTAMP(dt))
    ) cohort
    LEFT JOIN event.inukapageview retention_actions
    ON
        cohort.user_id = retention_actions.event.user_id AND
        retention_actions.event.client_type = 'kaios-app' AND
        (retention_actions.year = 2020 AND retention_actions.month = 9 AND retention_actions.day >= 9 OR
         retention_actions.year = 2020 AND retention_actions.month > 9 OR
         retention_actions.year > 2020) AND
        TO_TIMESTAMP(retention_actions.dt) BETWEEN
            first_read_dt + INTERVAL '1' WEEK AND
            first_read_dt + INTERVAL '2' WEEK
    GROUP BY
        cohort.cohort_date,
        cohort.user_id
) retention_actions_per_user
GROUP BY cohort_date
ORDER BY cohort_date
"""

second_week_retention = wmf.spark.run(query, session_type="large")

In [7]:
# Render the retention share as a percentage string; the styled frame is the
# cell's last expression so the notebook displays it.
second_week_formatters = {"second_week_retention": wmf.utils.pct_str}
second_week_retention.style.format(second_week_formatters)

Unnamed: 0,cohort_date,second_week_retention
0,2020-11-05,18.0%
1,2020-11-06,18.2%
2,2020-11-07,18.2%
3,2020-11-08,18.4%
4,2020-11-09,18.8%
5,2020-11-10,18.7%
6,2020-11-11,18.7%
7,2020-11-12,18.6%
8,2020-11-13,19.4%
9,2020-11-14,19.7%


In [11]:
# Plain-text dump of every cohort row (unformatted fractions, no index column).
second_week_table = second_week_retention.to_string(index=False)
print(second_week_table)

cohort_date  second_week_retention
 2020-11-05               0.180415
 2020-11-06               0.182463
 2020-11-07               0.182047
 2020-11-08               0.183770
 2020-11-09               0.187658
 2020-11-10               0.186788
 2020-11-11               0.186797
 2020-11-12               0.185730
 2020-11-13               0.194118
 2020-11-14               0.196727
 2020-11-15               0.197932
 2020-11-16               0.192084
 2020-11-17               0.189913
 2020-11-18               0.192797
 2020-11-19               0.192282
 2020-11-20               0.188222
 2020-11-21               0.199443
 2020-11-22               0.195344
 2020-11-23               0.191698
 2020-11-24               0.192734
 2020-11-25               0.192071
 2020-11-26               0.190430
 2020-11-27               0.194573
 2020-11-28               0.192712
 2020-11-29               0.194162
 2020-11-30               0.200227
 2020-12-01               0.200840
 2020-12-02         

In [8]:
# Daily KaiOS app readers bucketed by how long ago they first ran the app.
#
# Cohort definition (innermost subquery): one row per (user, calendar day)
# before today on which the user logged at least one page view; first_read_dt
# is the earliest page view timestamp of that user on that day.
#
# Each cohort row is joined to the user's first-run events
# (cohort.user_id = first_run.event.app_id) and s_since_first_run is the number
# of seconds between the first-run timestamp and first_read_dt. The outer query
# counts users per cohort_date in these half-open ranges:
#   < 86400 s (1 d), [1 d, 2 d), [2 d, 7 d), [7 d, 14 d), [14 d, 30 d),
#   [30 d, 90 d), [90 d, 365 d), >= 31536000 s (365 d).
# Note that "14_30d_users" therefore excludes day 30 itself; the column names
# are left unchanged because later cells select columns by these names.
#
# The first-run timestamp is taken with MIN rather than FIRST: FIRST returns an
# arbitrary row of the group, so a user with more than one first-run event
# could land in different buckets from one run to the next. MIN always uses the
# earliest recorded first run. first_read_dt is constant within each
# (cohort_date, user_id) group, so MIN only makes that selection explicit.
#
# Users with no matching first-run event get a NULL s_since_first_run and are
# not counted in any bucket (SUM skips the NULL comparison results).
query = """
SELECT
    cohort_date,
    SUM(CAST(s_since_first_run < 86400 AS INT)) AS less_than_1d_users,
    SUM(CAST(s_since_first_run >= 86400 AND s_since_first_run < 172800 AS INT)) AS 1d_users,
    SUM(CAST(s_since_first_run >= 172800 AND s_since_first_run < 604800 AS INT)) AS 2_6d_users,
    SUM(CAST(s_since_first_run >= 604800 AND s_since_first_run < 1209600 AS INT)) AS 7_13d_users,
    SUM(CAST(s_since_first_run >= 1209600 AND s_since_first_run < 2592000 AS INT)) AS 14_30d_users,
    SUM(CAST(s_since_first_run >= 2592000 AND s_since_first_run < 7776000 AS INT)) AS 30_89d_users,
    SUM(CAST(s_since_first_run >= 7776000 AND s_since_first_run < 31536000 AS INT)) AS 90_364d_day_users,
    SUM(CAST(s_since_first_run >= 31536000 AS INT)) AS 1yr_plus_users
FROM (
    SELECT
      cohort.cohort_date,
      cohort.user_id,
      TO_UNIX_TIMESTAMP(MIN(cohort.first_read_dt)) - TO_UNIX_TIMESTAMP(MIN(TO_TIMESTAMP(first_run.dt))) AS s_since_first_run
    FROM (
        SELECT
            CAST(DATE_TRUNC('day', TO_TIMESTAMP(dt)) AS DATE) AS cohort_date,
            event.user_id AS user_id,
            TO_TIMESTAMP(MIN(dt)) AS first_read_dt
        FROM event.inukapageview
        WHERE
            event.client_type = 'kaios-app' AND
            (year = 2020 AND month = 9 AND day >= 9 OR
             year = 2020 AND month > 9 OR
             year > 2020) AND
             DATE_TRUNC('day', TO_TIMESTAMP(dt)) < CURRENT_DATE
        GROUP BY
            event.user_id,
            DATE_TRUNC('day', TO_TIMESTAMP(dt))
    ) cohort
    LEFT JOIN event_sanitized.kaiosappfirstrun first_run
    ON cohort.user_id = first_run.event.app_id
    GROUP BY
        cohort.cohort_date,
        cohort.user_id
) time_since_install
GROUP BY cohort_date
ORDER BY cohort_date
"""

users_by_time_since_install = wmf.spark.run(query, session_type="large")

In [9]:
# Display the per-cohort user counts for each time-since-first-run bucket.
users_by_time_since_install

Unnamed: 0,cohort_date,less_than_1d_users,1d_users,2_6d_users,7_13d_users,14_30d_users,30_89d_users,90_364d_day_users,1yr_plus_users
0,2020-11-05,8253,542,1387,1164,1988,3848,1,0
1,2020-11-06,8068,529,1400,1204,1906,3897,0,0
2,2020-11-07,7824,503,1354,1139,1877,3849,1,0
3,2020-11-08,7636,499,1315,1131,1794,3801,0,0
4,2020-11-09,7285,457,1204,1042,1714,3652,0,0
...,...,...,...,...,...,...,...,...,...
85,2021-01-29,5157,303,826,649,893,1580,1726,0
86,2021-01-30,5410,307,766,629,870,1531,1795,0
87,2021-01-31,5314,297,844,661,836,1549,1758,0
88,2021-02-01,4745,358,825,651,906,1490,1698,0


In [14]:
# Plain-text dump with the buckets ordered from longest-installed to newest.
# The "1yr_plus_users" column is not part of this selection.
install_bucket_columns = [
    "cohort_date",
    "90_364d_day_users",
    "30_89d_users",
    "14_30d_users",
    "7_13d_users",
    "2_6d_users",
    "1d_users",
    "less_than_1d_users",
]
install_bucket_table = users_by_time_since_install[install_bucket_columns]
print(install_bucket_table.to_string(index=False))

cohort_date  90_364d_day_users  30_89d_users  14_30d_users  7_13d_users  2_6d_users  1d_users  less_than_1d_users
 2020-11-05                  1          3848          1988         1164        1387       542                8253
 2020-11-06                  0          3897          1906         1204        1400       529                8068
 2020-11-07                  1          3849          1877         1139        1354       503                7824
 2020-11-08                  0          3801          1794         1131        1315       499                7636
 2020-11-09                  0          3652          1714         1042        1204       457                7285
 2020-11-10                  0          3913          1758         1056        1208       442                7039
 2020-11-11                  0          3733          1565          973        1236       420                7187
 2020-11-12                  0          3809          1615          964        1187     