## Анализ пользователей по когортам



In [None]:
import pandas as pd
import datetime

import some_funcs
from some_funcs import simple_ch_client, plotly_df, highlight_vals

In [None]:
# ---------- Fill in your own values in the three variables below ----------
CH_HOST_NAME = 'rc1b-bbmw2gw58fm2dbeb.mdb.yandexcloud.net'
CH_USER      = 'z_sergey'
CH_DB_NAME   = 'metrica_data'

# ---------- Derived connection settings ----------
# The password is read from a file kept outside the notebook. The context
# manager closes the file handle as soon as the content has been read,
# instead of leaving it open until garbage collection.
with open('../.chpass.txt') as pass_file:
    CH_PASS  = pass_file.read().strip()
CH_HOST      = f'https://{CH_HOST_NAME}:8443'
CH_CASERT    = 'YandexInternalRootCA.crt'

In [None]:
# Build the ClickHouse client used by every query below from the host URL,
# user name, password and certificate file name configured above.
# NOTE(review): the exact meaning of each positional argument is defined in
# some_funcs.simple_ch_client — confirm there.
my_client = simple_ch_client(CH_HOST, CH_USER, CH_PASS, CH_CASERT)

In [None]:
# Bounds of the analysis window: only cohorts whose first week falls between
# these two dates (inclusive) are selected by the queries below.
start_date = datetime.datetime(year=2020, month=7, day=1)
end_date = datetime.datetime(year=2020, month=9, day=30)

In [None]:
# Drop any earlier copy of the helper table so it can be rebuilt from scratch.
q = 'DROP TABLE IF EXISTS {db}.retention_users'.format(db=CH_DB_NAME)
my_client.get_clickhouse_data(q)

In [None]:
# Build the helper table `retention_users`: one row per client per week
# (weeks are keyed by their Monday) with the number of visits, purchases and
# revenue in that week, plus `min_date` — the Monday of the client's very
# first week, attached through the join on client_id.
#
# The query is a plain f-string. The former trailing `.format(start_date=...,
# end_date=...)` call was removed: the text contains no such placeholders, so
# the call did nothing and only suggested a date filter that is not applied.
q = f'''
    CREATE TABLE {CH_DB_NAME}.retention_users ENGINE = Log AS
    Select a.*, b.min_date
    from 
    (
        SELECT ClientID as client_id, toMonday(StartDate) as date, count() as visits, 
            sum(Purchases) as purchases, sum(Revenue) as revenue
        FROM {CH_DB_NAME}.visits
        WHERE (client_id != 0)
        group by client_id, date
    ) as a
    any left join
    (
        SELECT ClientID as client_id, min(toMonday(StartDate)) as min_date
        FROM {CH_DB_NAME}.visits
        WHERE (client_id != 0)
        group by client_id
    
    ) as b
    using (client_id)
'''

my_client.get_clickhouse_data(q)


In [None]:
# Sanity check on the helper table: total row count and number of distinct
# clients.
q = (
    'SELECT count(), uniq(client_id) '
    f'FROM {CH_DB_NAME}.retention_users '
    'FORMAT TabSeparatedWithNames'
)
my_client.get_clickhouse_df(q)

In [None]:
# Preview (10 rows) of the data behind the retention tables: for every client
# whose first week lies inside [start_date, end_date], list the first week,
# the last week and each week in which the client was active.
#
# The dates are rendered as YYYY-MM-DD through f-string format specs.
# Previously the f-string interpolated the datetime objects themselves
# (giving '2020-07-01 00:00:00'), and the trailing `.format(...)` that was
# meant to apply strftime never ran on anything, because no placeholders
# were left by then.
q = f'''
SELECT 
    client_id,
    min_date, 
    max_date,
    date
FROM
    (
        SELECT
            client_id,
            min(date) as min_date,
            max(date) as max_date
        FROM {CH_DB_NAME}.retention_users
        GROUP BY client_id
        HAVING (min_date <= '{end_date:%Y-%m-%d}') AND (min_date >= '{start_date:%Y-%m-%d}')
    ) as a
    ALL INNER JOIN
    (
        SELECT 
            client_id,
            date
        FROM {CH_DB_NAME}.retention_users
    ) as b
    USING client_id
LIMIT 10
FORMAT TabSeparatedWithNames
'''

my_client.get_clickhouse_df(q)

### Простой retention

In [None]:
# Simple retention: for every cohort (clients grouped by the Monday of their
# first week, restricted to [start_date, end_date]) count the distinct
# clients active in each later week. `week_num` is the distance between the
# activity week and the cohort week, in weeks.
#
# The dates are rendered as YYYY-MM-DD through f-string format specs.
# Previously the f-string interpolated the datetime objects themselves
# (giving '2020-07-01 00:00:00'), and the trailing `.format(...)` that was
# meant to apply strftime had no placeholders left to fill.
q = f'''
SELECT 
    uniq(client_id) as clients,
    min_date, 
    (date - min_date)/7 as week_num
FROM
    (
        SELECT
            client_id,
            min(date) as min_date,
            max(date) as max_date
        FROM {CH_DB_NAME}.retention_users
        GROUP BY client_id
        HAVING (min_date <= '{end_date:%Y-%m-%d}') AND (min_date >= '{start_date:%Y-%m-%d}')
    ) as a
    ALL INNER JOIN
    (
        SELECT 
            client_id,
            date
        FROM {CH_DB_NAME}.retention_users
    ) as b
    USING client_id
GROUP BY
    week_num,
    min_date
FORMAT TabSeparatedWithNames
'''

raw_ret_df = my_client.get_clickhouse_df(q)

In [None]:
# Reshape the long query result into a week_num x cohort table of client
# counts; combinations absent from the query result become 0.
ret_df = (
    pd.pivot_table(
        raw_ret_df,
        values='clients',
        index='min_date',
        columns='week_num',
    )
    .fillna(0)
    .transpose()
)

In [None]:
# Express every cell as a percentage of its cohort's week-0 size.
# Dividing by the week-0 row aligns on the columns (cohorts), so the whole
# table is normalized in one vectorized step instead of a Python-level
# row-by-row apply. Cells whose count is 0 are blanked to NaN with `where`,
# which replaces the element-wise `applymap` (deprecated in recent pandas).
ret_df_norm = (100 * ret_df).div(ret_df.loc[0], axis='columns').where(ret_df != 0)

In [None]:
# Plot the normalized simple-retention table with the project helper.
# NOTE(review): how plotly_df maps rows/columns to axes and traces is defined
# in some_funcs — confirm there.
plotly_df(ret_df_norm)

In [None]:
# Show the retention percentages as a table with one row per cohort; missing
# values are displayed as empty strings and every cell is styled by the
# project helper highlight_vals.
ret_df_norm.transpose().fillna('').style.applymap(highlight_vals)

### Rolling retention


In [None]:
# Rolling retention: a client counts as retained in every week from week 0 up
# to the week of their last recorded activity. For each client the inner
# query expands range(0 .. (max_date - min_date)/7) into one row per week
# number via arrayJoin; the outer query then counts distinct clients per
# cohort (min_date) and week number.
#
# The dates are rendered as YYYY-MM-DD through f-string format specs.
# Previously the f-string interpolated the datetime objects themselves
# (giving '2020-07-01 00:00:00'), and the trailing `.format(...)` that was
# meant to apply strftime had no placeholders left to fill.
q = f'''
SELECT
    uniq(client_id) as clients,
    min_date,
    week_num
FROM
    (SELECT 
        client_id,
        min_date, 
        arrayJoin(range(toUInt64((max_date - min_date)/7) + 1)) as week_num
    FROM
        (
            SELECT
                client_id,
                min(date) as min_date,
                max(date) as max_date
            FROM {CH_DB_NAME}.retention_users
            GROUP BY client_id
            HAVING (min_date <= '{end_date:%Y-%m-%d}') AND (min_date >= '{start_date:%Y-%m-%d}')
        ))
GROUP BY
    min_date,
    week_num
FORMAT TabSeparatedWithNames
'''

raw_roll_ret_df = my_client.get_clickhouse_df(q)

In [None]:
# Reshape the long rolling-retention result into a week_num x cohort table of
# client counts; combinations absent from the query result become 0.
roll_ret_df = (
    pd.pivot_table(
        raw_roll_ret_df,
        values='clients',
        index='min_date',
        columns='week_num',
    )
    .fillna(0)
    .transpose()
)

In [None]:
# Express every cell as a percentage of its cohort's week-0 size.
# Dividing by the week-0 row aligns on the columns (cohorts), so the whole
# table is normalized in one vectorized step instead of a Python-level
# row-by-row apply. Cells whose count is 0 are blanked to NaN with `where`,
# which replaces the element-wise `applymap` (deprecated in recent pandas).
roll_ret_df_norm = (100 * roll_ret_df).div(roll_ret_df.loc[0], axis='columns').where(roll_ret_df != 0)

In [None]:
# Plot the normalized rolling-retention table with the project helper.
# NOTE(review): how plotly_df maps rows/columns to axes and traces is defined
# in some_funcs — confirm there.
plotly_df(roll_ret_df_norm)

In [None]:
# Show the rolling-retention percentages as a table with one row per cohort;
# missing values are displayed as empty strings and every cell is styled by
# the project helper highlight_vals.
roll_ret_df_norm.transpose().fillna('').style.applymap(highlight_vals)