In [49]:
import os
import numpy as np
import pandas as pd
from tqdm import tqdm
import seaborn as sns
import sqlalchemy as sa
import matplotlib.dates as md
import matplotlib.pyplot as plt
from dotenv import dotenv_values
from datetime import datetime, timedelta
from scipy.stats import ttest_ind, chi2_contingency, mannwhitneyu

sns.set(rc={'figure.figsize':(10,5)})

In [50]:
config = dotenv_values("/home/jovyan/.env")

def get_query_clickhouse(q: str) -> pd.DataFrame:
    ch_host = config['CH_HOST']
    ch_port = config['CH_PORT']
    ch_db   = config['CH_READ_DB']
    ch_user = config['CH_READ_USER']
    ch_pass = config['CH_READ_PASS']
    
    engine = sa.create_engine(
        f"clickhouse+native://{ch_user}:"
        f"{ch_pass}@{ch_host}:"
        f"{ch_port}/{ch_db}?secure=True"
    )
    return pd.read_sql_query(q, con=engine)

In [129]:
def calculate_pvalue(test, control, metric, group_id, df, metric_type='proportion'):
        if metric_type == 'average' or metric_type == 'ratio':
            equal_var = False
        else:
            equal_var = True

        if metric_type == 'ratio':
            c_num_values = df[df[group_id] == control][metric[0]].values
            c_denom_values = df[df[group_id] == control][metric[1]].values
            
            t_num_values = df[df[group_id] == test][metric[0]].values
            t_denom_values = df[df[group_id] == test][metric[1]].values
                       
            control_mean = c_num_values.sum() / c_denom_values.sum()
            test_mean = t_num_values.sum() / t_denom_values.sum()

            c_values = c_num_values - c_denom_values * control_mean
            t_values = t_num_values - t_denom_values * control_mean
        else:
            c_values = df[df[group_id] == control][metric].values
            t_values = df[df[group_id] == test][metric].values
            
            control_mean = c_values.mean()
            test_mean = t_values.mean()
    
        ttest_pvalue = ttest_ind(c_values, t_values, equal_var=equal_var).pvalue
        chi_pvalue = chi2_contingency(np.array([[np.sum(c_values == 1), np.sum(c_values == 0)],
                                                [np.sum(t_values == 1), np.sum(t_values == 0)]]))[1]
        mannwhitneyu_pvalue = mannwhitneyu(c_values, t_values).pvalue
        return ttest_pvalue, chi_pvalue, mannwhitneyu_pvalue, control_mean, test_mean
        
def calculate_proportion_metrics(test, control, metrics, group_id, df):
    """
    Вычисляет значение p-value для конверсионных метрик.
    Используется стандартный t-test.
    Параметры
    ----------
    test : str
        Название тестовой группы
    control : str
        Название контрольной группы
    metrics : list of str
        Список метрик, для которых нужно посчитать статистику
    group_id : str
        Название колонки, по которой определяется группа
    df : pd.Dataframe
        Датафрейм с данными, уровень группировки - айдишник пользователя. Примеры в запросе для денежных метрик.
    Возвращает
    -------
    Печатает результат теста
    """
    for metric in metrics:
        result =  calculate_pvalue(test, control, metric, group_id, df, metric_type='proportion')
        if result[0]<0.01 or result[1]<0.01 or result[2]<0.01:
            print('STATZNACHIMO')
        elif result[0]<0.05 or result[1]<0.05 or result[2]<0.05:
            print('statznachimo?')

        print('ttest p-value for', metric, ' is ', result[0], '\n',
              'chi2 p-value for', metric, ' is ', result[1], '\n',
              'mannwhitneyu p-value for', metric, ' is ', result[2], '\n',
              metric, ' mean value in control is ', result[3], '\n',
              metric, ' mean value in test is ', result[4], '\n',
              '---------------------------------------------------', '\n',
              'p-value:', round(np.mean(result[0:3]), 2), '\n',
              '(контрольная) ', round(result[3]*100, 2), '% --> (тестовая) ', round(result[4]*100, 2), '% \n'
              'абсолютные изменения:', round(abs(result[3]-result[4])*100, 3), '% \n'
              )
        
    return result
        
def calculate_average_metrics(test, control, metrics, group_id, df):
    """
    Вычисляет значение p-value для денежных и других средних поюзерных метрик.
    Используется Welch t-test с поправкой на неравенство дисперсий.
    Параметры
    ----------
    test : str
        Название тестовой группы
    control : str
        Название контрольной группы
    metrics : list of str
        Список метрик, для которых нужно посчитать статистику
    group_id : str
        Название колонки, по которой определяется группа
    df : pd.Dataframe
        Датафрейм с данными, уровень группировки - айдишник пользователя. Примеры в запросе для денежных метрик.
    Возвращает
    -------
    Печатает результат теста
    """
    for metric in metrics:
        result =  calculate_pvalue(test, control, metric, group_id, df, metric_type='average')
        if result[0]<0.01 or result[1]<0.01 or result[2]<0.01:
            print('STATZNACHIMO')
        elif result[0]<0.05 or result[1]<0.05 or result[2]<0.05:
            print('statznachimo?')

        print('ttest p-value for', metric, ' is ', result[0], '\n',
              'chi2 p-value for', metric, ' is ', result[1], '\n',
              'mannwhitneyu p-value for', metric, ' is ', result[2], '\n',
              metric, ' mean value in control is ', result[3], '\n',
              metric, ' mean value in test is ', result[4], '\n',
              '---------------------------------------------------', '\n',
              'p-value:', round(np.mean(result[0:3]), 2), '\n',
              '(контрольная) ', round(result[3]*100, 2), '% --> (тестовая) ', round(result[4]*100, 2), '% \n'
              'абсолютные изменения:', round(abs(result[3]-result[4])*100, 3), '% \n'
              )
        
    return result

def calculate_ratio_metrics(test, control, metrics, group_id, df):
    """
    Вычисляет значение p-value для глобальных средних метрик или ratio-метрик, например для среднего чека.
    Используется Welch t-test после линеаризации - про линеаризацию тут https://instamart.atlassian.net/wiki/spaces/ANLT/pages/edit-v2/2061107789
    Параметры
    ----------
    test : str
        Название тестовой группы
    control : str
        Название контрольной группы
    metrics : list of tuples
        Список числителей и знаменателей ratio метрики в формате [('num1', 'denom1'), ('num2', 'denom2')]
    group_id : str
        Название колонки, по которой определяется группа
    df : pd.Dataframe
        Датафрейм с данными, уровень группировки тот - по какой сущности считается ratio метрика. Например, для среднего чека уровнем группировки будет order_id. Пример запроса выгружающего такой датафрейм можно найти ниже в расчете среднего чека.
    Возвращает
    -------
    Печатает результат теста
    """
    for metric in metrics:
        result =  calculate_pvalue(test, control, metric, group_id, df, metric_type='ratio')
        if result[0]<0.01 or result[1]<0.01 or result[2]<0.01:
            print('STATZNACHIMO')
        elif result[0]<0.05 or result[1]<0.05 or result[2]<0.05:
            print('statznachimo?')
            
        print('ttest p-value for', metric, ' is ', result[0], '\n',
              'chi2 p-value for', metric, ' is ', result[1], '\n',
              'mannwhitneyu p-value for', metric, ' is ', result[2], '\n',
              metric, ' mean value in control is ', result[3], '\n',
              metric, ' mean value in test is ', result[4], '\n',
              '---------------------------------------------------', '\n',
              'p-value:', round(np.mean(result[0:3]), 2), '\n',
              '(контрольная) ', round(result[3]*100, 2), '% --> (тестовая) ', round(result[4]*100, 2), '% \n'
              'абсолютные изменения:', round(abs(result[3]-result[4])*100, 3), '% \n'
              )
        
    return result

In [71]:
# параметры эксперимента
start_date = '2024-04-25'
end_date = '2024-05-23'
exp_id = '43713c06-39f9-4f49-a959-4a4986427a78'

control = 'control'
test = 'test'

# Целевая - Доля отмененных заказов самовывоза

In [154]:
q = f"""
with 
    toDate('{start_date}') as start_date, 
    toDate('{end_date}') as end_date,
    toString('{exp_id}') as exp_id
    
    , ab_groups as (
        select 
            toString(anonymous_id) as anonymous_id,
            group,
            min(date_msk) as dt
        from cdm.ab__groups__anon
        where 1=1 
            and toDate(date_msk) between start_date and end_date
            and test_id = exp_id
        group by anonymous_id, group
    )

    , t1 as (
        select distinct
            order_number,
            if(max(shipping_category_id) = 2, 1, 0) as flg_only_food,
            if(sum(shipping_category_id) %% 3 = 0, 1, 0) as flg_only_alco
        from bi_shipments_financial
        where toDate(completed_at) between start_date and end_date + interval 1 day
            and shipping_method_kind = 'pickup'
            and retailer_category_name != 'Аптека'
        group by order_number
    )
                
    , first_action as (
        select
            toString(anonymous_id) as anonymous_id,
            params['order_number'] as order_number
        from event.new_app
        where toDate(ts) between start_date and end_date + Interval 1 day
            and ts between start_date and end_date + Interval 1 day
            and event = 'Order Completed'
            and params['type_delivery']='pickup'
            and toString(anonymous_id) global in (select anonymous_id from ab_groups)
        group by anonymous_id, order_number
    )
        
    , second_action as (
        select
            order_number
        from bi_shipments_financial
        where toDate(completed_at) between start_date and end_date + Interval 1 day
            and shipment_state = 'canceled'
            and order_number global in (select order_number from first_action)
        group by order_number
    )
    
select 
    group, if(second_action.order_number != '', 1, 0) as is_converted
    , case
        when flg_only_alco = 1 and flg_only_food = 0 then 'only alco'
        when flg_only_food = 1 and flg_only_alco = 0 then 'only food'
        else 'mix'
    end as order_type
from first_action left join ab_groups
    on first_action.anonymous_id = ab_groups.anonymous_id
left join second_action
    on toNullable(first_action.order_number) = toNullable(second_action.order_number)
left join t1
    on t1.order_number = first_action.order_number
"""

first = get_query_clickhouse(q)

In [157]:
# без срезов
conv = calculate_proportion_metrics(test, control, ['is_converted'], 'group', 
                                          first)

STATZNACHIMO
ttest p-value for is_converted  is  1.3739724827699777e-07 
 chi2 p-value for is_converted  is  1.420440818551692e-07 
 mannwhitneyu p-value for is_converted  is  1.3753553175050233e-07 
 is_converted  mean value in control is  0.2057467627850218 
 is_converted  mean value in test is  0.19570446345503642 
 --------------------------------------------------- 
 p-value: 0.0 
 (контрольная)  20.57 % --> (тестовая)  19.57 % 
абсолютные изменения: 1.004 % 



In [158]:
# алкосрез
print('Только алко:')
p = calculate_proportion_metrics(test, control, ['is_converted'], 'group', first[first.order_type == 'only alco'])
print('Еда:')
p = calculate_proportion_metrics(test, control, ['is_converted'], 'group', first[first.order_type == 'only food'])
print('Микс:')
p = calculate_proportion_metrics(test, control, ['is_converted'], 'group', first[first.order_type == 'mix'])


Только алко:
STATZNACHIMO
ttest p-value for is_converted  is  0.0019827931244718374 
 chi2 p-value for is_converted  is  0.002036472817302834 
 mannwhitneyu p-value for is_converted  is  0.001983166292187591 
 is_converted  mean value in control is  0.23195597061818812 
 is_converted  mean value in test is  0.22336479733481399 
 --------------------------------------------------- 
 p-value: 0.0 
 (контрольная)  23.2 % --> (тестовая)  22.34 % 
абсолютные изменения: 0.859 % 

Еда:
STATZNACHIMO
ttest p-value for is_converted  is  0.005098185843991946 
 chi2 p-value for is_converted  is  0.005264403737929701 
 mannwhitneyu p-value for is_converted  is  0.005098952211728884 
 is_converted  mean value in control is  0.16167818586850974 
 is_converted  mean value in test is  0.15397894498528414 
 --------------------------------------------------- 
 p-value: 0.01 
 (контрольная)  16.17 % --> (тестовая)  15.4 % 
абсолютные изменения: 0.77 % 

Микс:
STATZNACHIMO
ttest p-value for is_converted  

# Прокси - отмены со стороны Сбермаркета

In [160]:
q = f"""
with 
    toDate('{start_date}') as start_date, 
    toDate('{end_date}') as end_date,
    toString('{exp_id}') as exp_id
    
    , ab_groups as (
        select 
            toString(anonymous_id) as anonymous_id,
            group,
            min(date_msk) as dt
        from cdm.ab__groups__anon
        where 1=1 
            and toDate(date_msk) between start_date and end_date
            and test_id = exp_id
        group by anonymous_id, group
    )
    
    , t1 as (
        select distinct
            order_number,
            if(max(shipping_category_id) = 2, 1, 0) as flg_only_food,
            if(sum(shipping_category_id) %% 3 = 0, 1, 0) as flg_only_alco
        from bi_shipments_financial
        where toDate(completed_at) between start_date and end_date + interval 1 day
            and shipping_method_kind = 'pickup'
            and retailer_category_name != 'Аптека'
        group by order_number
    )

    , new_user as (
        select 
            anonymous_id,
            params['order_number'] as order_number,
            min(ts) as dt
        from event.new_app
        where 1=1
            and toDate(ts) < start_date
            and event = 'Order Completed'
            and anonymous_id global in (select anonymous_id from ab_groups)
        group by anonymous_id, order_number
    )
                
    , first_action as (
        select
            toString(anonymous_id) as anonymous_id,
            params['order_number'] as order_number,
            if(anonymous_id global in (select anonymous_id from new_user)
                    and order_number global not in (select order_number from new_user),
                    'old_user', 'new_user') as user_type
        from event.new_app
        where dwh_dt between start_date and end_date + Interval 1 day
            and ts between start_date and end_date + Interval 1 day
            and event = 'Order Completed'
            and params['type_delivery']='pickup'
            and toString(anonymous_id) global in (select anonymous_id from ab_groups)
        group by anonymous_id, order_number
    )
        
    , second_action as (
        select
            toDate(completed_at) as dt,
            order_number,
            max(if(cancelation_reason_id global in (2, 5, 7, 8, 9, 10, 11, 12, 13, 14, 16, 17, 25,
            26, 27, 28, 32, 33, 34, 38, 39, 42, 44, 47, 49, 50, 54, 55, 56, 57, 58, 59, 60,
            61, 62, 63, 64, 65, 66, 67, 68, 70, 71, 72, 73, 75, 76, 77, 78, 79, 80, 81, 83,
            84, 85, 86, 87, 88, 89, 91, 92, 93, 94, 96), 1, 0)) as is_converted
        from bi_shipments_financial
        where completed_at between start_date and end_date + Interval 1 day
            and shipment_state = 'canceled'
            and order_number global in (select order_number from first_action)
        group by dt, order_number
    )
    
select group, is_converted,
    case
            when flg_only_alco = 1 and flg_only_food = 0 then 'only alco'
            when flg_only_food = 1 and flg_only_alco = 0 then 'only food'
            else 'mix'
    end as order_type,
    user_type
from second_action left join first_action
    on toNullable(second_action.order_number) = toNullable(first_action.order_number)
left join ab_groups
    on first_action.anonymous_id = ab_groups.anonymous_id
left join t1
    on t1.order_number = first_action.order_number
"""

second = get_query_clickhouse(q)

In [161]:
second[second.group=='control'].is_converted.value_counts()

0    9771
1    8437
Name: is_converted, dtype: int64

In [162]:
second[second.group=='test'].is_converted.value_counts()

0    8669
1    8575
Name: is_converted, dtype: int64

In [163]:
# без срезов
conv = calculate_proportion_metrics(test, control, ['is_converted'], 'group', 
                                          second)

STATZNACHIMO
ttest p-value for is_converted  is  1.67388819220599e-10 
 chi2 p-value for is_converted  is  1.8133134312068874e-10 
 mannwhitneyu p-value for is_converted  is  1.692638251343936e-10 
 is_converted  mean value in control is  0.4633677504393673 
 is_converted  mean value in test is  0.4972744142890281 
 --------------------------------------------------- 
 p-value: 0.0 
 (контрольная)  46.34 % --> (тестовая)  49.73 % 
абсолютные изменения: 3.391 % 



In [164]:
# алкосрез
print('Только алко:')
p = calculate_proportion_metrics(test, control, ['is_converted'], 'group', second[second.order_type == 'only alco'])
print('Еда:')
p = calculate_proportion_metrics(test, control, ['is_converted'], 'group', second[second.order_type == 'only food'])
print('Микс:')
p = calculate_proportion_metrics(test, control, ['is_converted'], 'group', second[second.order_type == 'mix'])


Только алко:
STATZNACHIMO
ttest p-value for is_converted  is  1.0920862648625012e-05 
 chi2 p-value for is_converted  is  1.1684453702792604e-05 
 mannwhitneyu p-value for is_converted  is  1.0964559802562035e-05 
 is_converted  mean value in control is  0.4446001494768311 
 is_converted  mean value in test is  0.4750348120151184 
 --------------------------------------------------- 
 p-value: 0.0 
 (контрольная)  44.46 % --> (тестовая)  47.5 % 
абсолютные изменения: 3.043 % 

Еда:
STATZNACHIMO
ttest p-value for is_converted  is  1.800326223874007e-05 
 chi2 p-value for is_converted  is  1.972931230153798e-05 
 mannwhitneyu p-value for is_converted  is  1.8124367355337794e-05 
 is_converted  mean value in control is  0.5322381204734146 
 is_converted  mean value in test is  0.5727155727155727 
 --------------------------------------------------- 
 p-value: 0.0 
 (контрольная)  53.22 % --> (тестовая)  57.27 % 
абсолютные изменения: 4.048 % 

Микс:
ttest p-value for is_converted  is  0.0

In [165]:
# новичок/старичок
print('Новичок')
p = calculate_proportion_metrics(test, control, ['is_converted'], 'group', second[second.user_type == 'new_user'])
print('Старичок')
p = calculate_proportion_metrics(test, control, ['is_converted'], 'group', second[second.user_type == 'old_user'])


Новичок
STATZNACHIMO
ttest p-value for is_converted  is  0.0044022024739390885 
 chi2 p-value for is_converted  is  0.004854346737613966 
 mannwhitneyu p-value for is_converted  is  0.004414264465502582 
 is_converted  mean value in control is  0.4508272058823529 
 is_converted  mean value in test is  0.4944524843222383 
 --------------------------------------------------- 
 p-value: 0.0 
 (контрольная)  45.08 % --> (тестовая)  49.45 % 
абсолютные изменения: 4.363 % 

Старичок
STATZNACHIMO
ttest p-value for is_converted  is  8.403939662533967e-09 
 chi2 p-value for is_converted  is  9.05648266335557e-09 
 mannwhitneyu p-value for is_converted  is  8.473592862114668e-09 
 is_converted  mean value in control is  0.46506986027944114 
 is_converted  mean value in test is  0.49766000922813264 
 --------------------------------------------------- 
 p-value: 0.0 
 (контрольная)  46.51 % --> (тестовая)  49.77 % 
абсолютные изменения: 3.259 % 



# Контр - Доля пользователей, оформивших заказ, которые захотели обратиться в поддержку

In [166]:
q = f"""
with 
    toDate('{start_date}') as start_date, 
    toDate('{end_date}') as end_date,
    toString('{exp_id}') as exp_id
    
     , ab_groups as (
        select 
            toString(anonymous_id) as anonymous_id,
            group,
            min(date_msk) as dt
        from cdm.ab__groups__anon
        where 1=1 
            and toDate(date_msk) between start_date and end_date
            and test_id = exp_id
        group by anonymous_id, group
    )
    
    , new_user as (
        select 
            anonymous_id,
            params['order_number'] as order_number,
            min(ts) as dt
        from event.new_app
        where 1=1
            and toDate(ts) < start_date
            and event = 'Order Completed'
            and anonymous_id global in (select anonymous_id from ab_groups)
        group by anonymous_id, order_number
    )
    
    , t1 as (
        select distinct
            order_number,
            if(max(shipping_category_id) = 2, 1, 0) as flg_only_food,
            if(sum(shipping_category_id) %% 3 = 0, 1, 0) as flg_only_alco
        from bi_shipments_financial
        where toDate(completed_at) between start_date and end_date + interval 1 day
            and shipping_method_kind = 'pickup'
            and retailer_category_name != 'Аптека'
        group by order_number
    )
                
    , first_action as (
        select
            toString(anonymous_id) as anonymous_id,
            params['order_number'] as order_number,
            min(ts) as timestamp,
            if(anonymous_id global in (select anonymous_id from new_user)
                    and order_number global not in (select order_number from new_user),
                    'old_user', 'new_user') as user_type,
            max(platform) as platform
        from event.new_app
        where dwh_dt between start_date and end_date + Interval 1 day
            and ts between start_date and end_date + Interval 1 day
            and event = 'Order Completed'
            and params['type_delivery']='pickup'
            and toString(anonymous_id) global in (select anonymous_id from ab_groups)
        group by anonymous_id, order_number
    )
        
    , second_action as (
        select
            anonymous_id,
            min(ts) as timestamp
        from event.new_app
        where dwh_dt between start_date and end_date + interval 1 day
            and toDate(ts) between start_date and end_date
            and event = 'Contact Us Button Clicked'
            and delivery_method = 'pickup'
            and toString(anonymous_id) global in (select anonymous_id from ab_groups)
        group by anonymous_id
    )
        
select 
    first_action.anonymous_id as anonymous_id,
    first_action.user_type as user_type,
    first_action.platform as platform,
    ab_groups.group as group
    , max(if(if(second_action.anonymous_id != '', 1,0)
        and second_action.timestamp > first_action.timestamp
        and toDate(second_action.timestamp) = toDate(first_action.timestamp)
        , 1, 0)) as is_converted
    , case
            when min(flg_only_alco) = 1 and min(flg_only_food) = 0 then 'only alco'
            when min(flg_only_food) = 1 and min(flg_only_alco) = 0 then 'only food'
            else 'mix'
    end as order_type
from first_action 
left join ab_groups
    on ab_groups.anonymous_id = first_action.anonymous_id
left join second_action
  on toNullable(second_action.anonymous_id) = toNullable(first_action.anonymous_id)
left join t1
    on t1.order_number = first_action.order_number
group by anonymous_id, user_type, platform, group

"""

c10 = get_query_clickhouse(q)

In [167]:
c10[c10.group=='control'].is_converted.value_counts()

0    41456
1     2518
Name: is_converted, dtype: int64

In [168]:
c10[c10.group=='test'].is_converted.value_counts()

0    40710
1     3195
Name: is_converted, dtype: int64

In [169]:
# без среза
conv = calculate_proportion_metrics(test, control, ['is_converted'], 'group', 
                                          c10)

STATZNACHIMO
ttest p-value for is_converted  is  1.0933987221423034e-20 
 chi2 p-value for is_converted  is  1.2696290125289495e-20 
 mannwhitneyu p-value for is_converted  is  1.1166176714897523e-20 
 is_converted  mean value in control is  0.05726110883704007 
 is_converted  mean value in test is  0.07277075503928937 
 --------------------------------------------------- 
 p-value: 0.0 
 (контрольная)  5.73 % --> (тестовая)  7.28 % 
абсолютные изменения: 1.551 % 



In [170]:
# алкосрез
print('Только алко:')
p = calculate_proportion_metrics(test, control, ['is_converted'], 'group', c10[c10.order_type == 'only alco'])
print('Еда:')
p = calculate_proportion_metrics(test, control, ['is_converted'], 'group', c10[c10.order_type == 'only food'])
print('Микс:')
p = calculate_proportion_metrics(test, control, ['is_converted'], 'group', c10[c10.order_type == 'mix'])

Только алко:
STATZNACHIMO
ttest p-value for is_converted  is  1.9284693304763496e-11 
 chi2 p-value for is_converted  is  2.2936661249917706e-11 
 mannwhitneyu p-value for is_converted  is  1.9554465176481457e-11 
 is_converted  mean value in control is  0.04735770057711893 
 is_converted  mean value in test is  0.06381985765954964 
 --------------------------------------------------- 
 p-value: 0.0 
 (контрольная)  4.74 % --> (тестовая)  6.38 % 
абсолютные изменения: 1.646 % 

Еда:
STATZNACHIMO
ttest p-value for is_converted  is  0.0007700327219630046 
 chi2 p-value for is_converted  is  0.000835886013434135 
 mannwhitneyu p-value for is_converted  is  0.000770602154645493 
 is_converted  mean value in control is  0.05630762049247287 
 is_converted  mean value in test is  0.0649552927603115 
 --------------------------------------------------- 
 p-value: 0.0 
 (контрольная)  5.63 % --> (тестовая)  6.5 % 
абсолютные изменения: 0.865 % 

Микс:
STATZNACHIMO
ttest p-value for is_converted

In [171]:
# новичок/старичок
print('Новичок')
p = calculate_proportion_metrics(test, control, ['is_converted'], 'group', c10[c10.user_type == 'new_user'])
print('Старичок')
p = calculate_proportion_metrics(test, control, ['is_converted'], 'group', c10[c10.user_type == 'old_user'])


Новичок
STATZNACHIMO
ttest p-value for is_converted  is  0.0008698819359929015 
 chi2 p-value for is_converted  is  0.0010147807216426081 
 mannwhitneyu p-value for is_converted  is  0.0008718486014528725 
 is_converted  mean value in control is  0.046318508949990775 
 is_converted  mean value in test is  0.06071230854401181 
 --------------------------------------------------- 
 p-value: 0.0 
 (контрольная)  4.63 % --> (тестовая)  6.07 % 
абсолютные изменения: 1.439 % 

Старичок
STATZNACHIMO
ttest p-value for is_converted  is  2.7244759587964003e-18 
 chi2 p-value for is_converted  is  3.150441411785007e-18 
 mannwhitneyu p-value for is_converted  is  2.7747132805769055e-18 
 is_converted  mean value in control is  0.05879911814291272 
 is_converted  mean value in test is  0.07446863794626618 
 --------------------------------------------------- 
 p-value: 0.0 
 (контрольная)  5.88 % --> (тестовая)  7.45 % 
абсолютные изменения: 1.567 % 



# Контр - Конверсия в оформление повторного заказа

In [138]:
q = f"""
with 
    toDate('{start_date}') as start_date, 
    toDate('{end_date}') as end_date,
    toString('{exp_id}') as exp_id
    
    , ab_groups as (
        select 
            toString(anonymous_id) as anonymous_id,
            group,
            min(date_msk) as dt
        from cdm.ab__groups__anon
        where 1=1 
            and toDate(date_msk) between start_date and end_date
            and test_id = exp_id
        group by anonymous_id, group
    )

    , t2 as (
        select anonymous_id, 
            if(count(distinct params['order_number'])>1, 1, 0) as is_converted
        from event.new_app
        where 1=1 
            and dwh_dt between start_date and end_date + Interval 1 day
            and ts between start_date and end_date + Interval 1 day
            and event = 'Order Completed'
            and anonymous_id global in (select anonymous_id from ab_groups)
            and params['type_delivery']='pickup'
        group by anonymous_id
    )
    
    
    select group, is_converted
    from t2 inner join ab_groups
        on t2.anonymous_id = ab_groups.anonymous_id
"""

c4 = get_query_clickhouse(q)

In [139]:
c4[c4.group=='control'].is_converted.value_counts()

0    25983
1    17991
Name: is_converted, dtype: int64

In [140]:
c4[c4.group=='test'].is_converted.value_counts()

0    26278
1    17627
Name: is_converted, dtype: int64

In [141]:
# без среза
conv = calculate_proportion_metrics(test, control, ['is_converted'], 'group', 
                                          c4)

statznachimo?
ttest p-value for is_converted  is  0.020949925294998176 
 chi2 p-value for is_converted  is  0.021334465607553876 
 mannwhitneyu p-value for is_converted  is  0.02095078383803281 
 is_converted  mean value in control is  0.40912812116250513 
 is_converted  mean value in test is  0.40148046919485253 
 --------------------------------------------------- 
 p-value: 0.02 
 (контрольная)  40.91 % --> (тестовая)  40.15 % 
абсолютные изменения: 0.765 % 



# Прокси - конверсия в повторный завершенный заказ

In [116]:
q = f"""
with 
    toDate('{start_date}') as start_date, 
    toDate('{end_date}') as end_date,
    toString('{exp_id}') as exp_id
    
    , ab_groups as (
        select 
            toString(anonymous_id) as anonymous_id,
            group,
            min(date_msk) as dt
        from cdm.ab__groups__anon
        where 1=1 
            and toDate(date_msk) between start_date and end_date
            and test_id = exp_id
        group by anonymous_id, group
    )
    
    , t1 as (
        select order_number 
        from analytics.bi_shipments_financial
        where 1=1
            and completed_at between start_date and end_date + Interval 1 day
            and shipping_method_kind = 'pickup'
            and shipment_state = 'shipped'
            and retailer_category_name != 'Аптека'
        group by order_number
    )

    , t2 as (
        select anonymous_id, 
            if(count(distinct params['order_number'])>1, 1, 0) as is_converted
        from event.new_app
        where 1=1 
            and dwh_dt between start_date and end_date + Interval 1 day
            and ts between start_date and end_date + Interval 1 day
            and event = 'Order Completed'
            and anonymous_id global in (select anonymous_id from ab_groups)
            and params['type_delivery']='pickup'
            and params['order_number'] global in (select * from t1)
        group by anonymous_id
    )
    
    
    select group, is_converted
    from t2 inner join ab_groups
        on t2.anonymous_id = ab_groups.anonymous_id
"""

c3 = get_query_clickhouse(q)

In [121]:
c3[c3.group=='control'].is_converted.value_counts()

0    24394
1    13958
Name: is_converted, dtype: int64

In [122]:
c3[c3.group=='test'].is_converted.value_counts()

0    24542
1    13675
Name: is_converted, dtype: int64

In [120]:
# без среза
conv = calculate_proportion_metrics(test, control, ['is_converted'], 'group', 
                                          c3)

ttest p-value for is_converted  is  0.07791499985581374 
 chi2 p-value for is_converted  is  0.07919091268639118 
 mannwhitneyu p-value for is_converted  is  0.07791513777049956 
 is_converted  mean value in control is  0.3639445139758031 
 is_converted  mean value in test is  0.35782505167857237 
 --------------------------------------------------- 
 p-value: 0.08 
 (контрольная)  36.39 % --> (тестовая)  35.78 % 
абсолютные изменения: 0.612 % 



## Доп - Конверсия из чекаута в оформление заказа самовывозом

In [None]:
q = f"""
with 
    toDate('{start_date}') as start_date, 
    toDate('{end_date}') as end_date,
    toString('{exp_id}') as exp_id
    
     , ab_groups as (
        select 
            toString(anonymous_id) as anonymous_id,
            group,
            min(date_msk) as dt
        from cdm.ab__groups__anon
        where 1=1 
            and toDate(date_msk) between start_date and end_date
            and test_id = exp_id
        group by anonymous_id, group
    )
    
    , new_user as (
        select 
            anonymous_id,
            order_id,
            min(ts) as dt
        from event.new_app
        where 1=1
            and event = 'Order Completed'
            and anonymous_id global in (select anonymous_id from ab_groups)
            and toDate(ts) < start_date
        group by anonymous_id, order_id
    )
                
    , first_action as (
            select
                anonymous_id,
                min(ts) as timestamp,
                if(anonymous_id global in (select anonymous_id from new_user)
                    and min(order_id) global not in (select order_id from new_user),
                    'old_user', 'new_user') as user_type,
                max(platform) as platform
            from event.new_app
            where dwh_dt between start_date and end_date + interval 1 day
                and toDate(ts) between start_date and end_date
                and event = 'Checkout Button Clicked'
                and delivery_method = 'pickup'
                and toString(anonymous_id) global in (select anonymous_id from ab_groups)
            group by anonymous_id
    )
        
    , second_action as (
        select
            anonymous_id,
            min(ts) as timestamp
        from event.new_app
        where dwh_dt between start_date and end_date + interval 1 day
            and toDate(ts) between start_date and end_date
            and event = 'Order Completed'
            and params['type_delivery']='pickup'
            and toString(anonymous_id) global in (select anonymous_id from ab_groups)
        group by anonymous_id
    )
        
select 
    first_action.anonymous_id as anonymous_id,
    first_action.user_type as user_type,
    first_action.platform as platform,
    ab_groups.group as group
    , max(if(if(second_action.anonymous_id != '', 1,0)
        and second_action.timestamp > first_action.timestamp
        and toDate(second_action.timestamp) = toDate(first_action.timestamp)
        , 1, 0)) as is_converted
from first_action 
left join ab_groups
    on ab_groups.anonymous_id = first_action.anonymous_id
left join second_action
  on toNullable(second_action.anonymous_id) = toNullable(first_action.anonymous_id)
group by anonymous_id, user_type, platform, group

"""

c7 = get_query_clickhouse(q)
conv = calculate_proportion_metrics(test, control, ['is_converted'], 'group', 
                                          c7)

In [106]:
# новичок/старичок
print('Новичок')
p = calculate_proportion_metrics(test, control, ['is_converted'], 'group', c7[c7.user_type == 'new_user'])
print('Старичок')
p = calculate_proportion_metrics(test, control, ['is_converted'], 'group', c7[c7.user_type == 'old_user'])


Новичок
ttest p-value for is_converted  is  0.3307277982099385 
 chi2 p-value for is_converted  is  0.33678194473517764 
 mannwhitneyu p-value for is_converted  is  0.33072101960247235 
 is_converted  mean value in control is  0.262901376146789 
 is_converted  mean value in test is  0.2675012921380578 
 --------------------------------------------------- 
 p-value: 0.33 
 (контрольная)  26.29 % --> (тестовая)  26.75 % 
абсолютные изменения: 0.46 

Старичок
ttest p-value for is_converted  is  0.8007342711683221 
 chi2 p-value for is_converted  is  0.804854301345132 
 mannwhitneyu p-value for is_converted  is  0.800733323834296 
 is_converted  mean value in control is  0.4461562162994916 
 is_converted  mean value in test is  0.44682108581238916 
 --------------------------------------------------- 
 p-value: 0.8 
 (контрольная)  44.62 % --> (тестовая)  44.68 % 
абсолютные изменения: 0.066 



In [107]:
# ios/android
print('IOS')
p = calculate_proportion_metrics(test, control, ['is_converted'], 'group', c7[c7.platform == 'ios'])
print('Android')
p = calculate_proportion_metrics(test, control, ['is_converted'], 'group', c7[c7.platform == 'android'])


IOS
ttest p-value for is_converted  is  0.16087238023650213 
 chi2 p-value for is_converted  is  0.16316914814994235 
 mannwhitneyu p-value for is_converted  is  0.1608708924942619 
 is_converted  mean value in control is  0.42130014624494594 
 is_converted  mean value in test is  0.4265509112861496 
 --------------------------------------------------- 
 p-value: 0.16 
 (контрольная)  42.13 % --> (тестовая)  42.66 % 
абсолютные изменения: 0.525 

Android
ttest p-value for is_converted  is  0.6874613572062556 
 chi2 p-value for is_converted  is  0.6920371419846845 
 mannwhitneyu p-value for is_converted  is  0.6874594910244946 
 is_converted  mean value in control is  0.40299807293210793 
 is_converted  mean value in test is  0.40179552608541785 
 --------------------------------------------------- 
 p-value: 0.69 
 (контрольная)  40.3 % --> (тестовая)  40.18 % 
абсолютные изменения: 0.12 



## Прокси - Конверсия из чекаута в завершенный заказ самовывозом

In [108]:
q = f"""
with 
    toDate('{start_date}') as start_date, 
    toDate('{end_date}') as end_date,
    toString('{exp_id}') as exp_id
    
     , ab_groups as (
        select 
            toString(anonymous_id) as anonymous_id,
            group,
            min(date_msk) as dt
        from cdm.ab__groups__anon
        where 1=1 
            and toDate(date_msk) between start_date and end_date
            and test_id = exp_id
        group by anonymous_id, group
    )
    
    , new_user as (
        select 
            anonymous_id,
            order_id,
            min(ts) as dt
        from event.new_app
        where 1=1
            and toDate(ts) < start_date
            and event = 'Order Completed'
            and anonymous_id global in (select anonymous_id from ab_groups)
        group by anonymous_id, order_id
    )
                
    , first_action as (
            select
                anonymous_id,
                min(ts) as timestamp,
                if(anonymous_id global in (select anonymous_id from new_user)
                    and min(order_id) global not in (select order_id from new_user),
                    'old_user', 'new_user') as user_type,
                max(platform) as platform
            from event.new_app
            where dwh_dt between start_date and end_date + interval 1 day
                and toDate(ts) between start_date and end_date
                and event = 'Checkout Button Clicked'
                and delivery_method = 'pickup'
                and toString(anonymous_id) global in (select anonymous_id from ab_groups)
            group by anonymous_id
    )
        
    , second_action as (
        select
            anonymous_id,
            min(ts) as timestamp
        from event.new_app
        where dwh_dt between start_date and end_date + interval 1 day
            and toDate(ts) between start_date and end_date
            and event = 'Order Completed'
            and params['order_number'] global in (
                select distinct order_number
                from bi_shipments_financial
                where completed_at between start_date and end_date + Interval 1 day
                    and shipment_state = 'shipped'
                    and shipping_method_kind = 'pickup'
                )
            and params['type_delivery']='pickup'
            and toString(anonymous_id) global in (select anonymous_id from ab_groups)
        group by anonymous_id
    )
        
select 
    first_action.anonymous_id as anonymous_id,
    first_action.user_type as user_type,
    first_action.platform as platform,
    ab_groups.group as group
    , max(if(if(second_action.anonymous_id != '', 1,0)
        and second_action.timestamp > first_action.timestamp
        and toDate(second_action.timestamp) = toDate(first_action.timestamp)
        , 1, 0)) as is_converted
from first_action 
left join ab_groups
    on ab_groups.anonymous_id = first_action.anonymous_id
left join second_action
  on toNullable(second_action.anonymous_id) = toNullable(first_action.anonymous_id)
group by anonymous_id, user_type, platform, group

"""

c8 = get_query_clickhouse(q)
conv = calculate_proportion_metrics(test, control, ['is_converted'], 'group', 
                                          c8)

ttest p-value for is_converted  is  0.6675166100817223 
 chi2 p-value for is_converted  is  0.6711473103295327 
 mannwhitneyu p-value for is_converted  is  0.6675154103799268 
 is_converted  mean value in control is  0.34684436240024313 
 is_converted  mean value in test is  0.34781576716586604 
 --------------------------------------------------- 
 p-value: 0.67 
 (контрольная)  34.68 % --> (тестовая)  34.78 % 
абсолютные изменения: 0.097 



In [109]:
# ios/android
print('IOS')
p = calculate_proportion_metrics(test, control, ['is_converted'], 'group', c8[c8.platform == 'ios'])
print('Android')
p = calculate_proportion_metrics(test, control, ['is_converted'], 'group', c8[c8.platform == 'android'])


IOS
ttest p-value for is_converted  is  0.25133410370654885 
 chi2 p-value for is_converted  is  0.25460967674778334 
 mannwhitneyu p-value for is_converted  is  0.2513313323225691 
 is_converted  mean value in control is  0.35534654317093456 
 is_converted  mean value in test is  0.3595124475363652 
 --------------------------------------------------- 
 p-value: 0.25 
 (контрольная)  35.53 % --> (тестовая)  35.95 % 
абсолютные изменения: 0.417 

Android
ttest p-value for is_converted  is  0.7000298957980909 
 chi2 p-value for is_converted  is  0.7047960861646907 
 mannwhitneyu p-value for is_converted  is  0.7000280942521816 
 is_converted  mean value in control is  0.3413504298843759 
 is_converted  mean value in test is  0.340237292555272 
 --------------------------------------------------- 
 p-value: 0.7 
 (контрольная)  34.14 % --> (тестовая)  34.02 % 
абсолютные изменения: 0.111 



In [110]:
# новичок/старичок
print('Новичок')
p = calculate_proportion_metrics(test, control, ['is_converted'], 'group', c8[c8.user_type == 'new_user'])
print('Старичок')
p = calculate_proportion_metrics(test, control, ['is_converted'], 'group', c8[c8.user_type == 'old_user'])


Новичок
ttest p-value for is_converted  is  0.1215755988537355 
 chi2 p-value for is_converted  is  0.12476796038974207 
 mannwhitneyu p-value for is_converted  is  0.12157416862933385 
 is_converted  mean value in control is  0.20630733944954127 
 is_converted  mean value in test is  0.21305920863722505 
 --------------------------------------------------- 
 p-value: 0.12 
 (контрольная)  20.63 % --> (тестовая)  21.31 % 
абсолютные изменения: 0.675 

Старичок
ttest p-value for is_converted  is  0.8965207341645189 
 chi2 p-value for is_converted  is  0.9008376126167779 
 mannwhitneyu p-value for is_converted  is  0.896520256101396 
 is_converted  mean value in control is  0.381171132056974 
 is_converted  mean value in test is  0.3808364526751288 
 --------------------------------------------------- 
 p-value: 0.9 
 (контрольная)  38.12 % --> (тестовая)  38.08 % 
абсолютные изменения: 0.033 



## Дополнительная - доля пользователей, открывшая окно с информацией о заказе

In [172]:
q = f"""
with 
    toDate('{start_date}') as start_date, 
    toDate('{end_date}') as end_date,
    toString('{exp_id}') as exp_id
    
     , ab_groups as (
        select 
            toString(anonymous_id) as anonymous_id,
            group,
            min(date_msk) as dt
        from cdm.ab__groups__anon
        where 1=1 
            and toDate(date_msk) between start_date and end_date
            and test_id = exp_id
        group by anonymous_id, group
    )
    
    , t1 as (
        select distinct
            order_number,
            if(max(shipping_category_id) = 2, 1, 0) as flg_only_food,
            if(sum(shipping_category_id) %% 3 = 0, 1, 0) as flg_only_alco
        from bi_shipments_financial
        where toDate(completed_at) between start_date and end_date + interval 1 day
            and shipping_method_kind = 'pickup'
            and retailer_category_name != 'Аптека'
        group by order_number
    )
    
    , new_user as (
        select 
            anonymous_id,
            params['order_number'] as order_number,
            min(ts) as dt
        from event.new_app
        where 1=1
            and toDate(ts) < start_date
            and event = 'Order Completed'
            and anonymous_id global in (select anonymous_id from ab_groups)
        group by anonymous_id, order_number
    )
                
    , first_action as (
        select
            toString(anonymous_id) as anonymous_id,
            params['order_number'] as order_number,
            min(ts) as timestamp,
            if(anonymous_id global in (select anonymous_id from new_user)
                    and order_number global not in (select order_number from new_user),
                    'old_user', 'new_user') as user_type,
            max(platform) as platform
        from event.new_app
        where dwh_dt between start_date and end_date + Interval 1 day
            and ts between start_date and end_date + Interval 1 day
            and event = 'Order Completed'
            and params['type_delivery']='pickup'
            and toString(anonymous_id) global in (select anonymous_id from ab_groups)
        group by anonymous_id, order_number
    )
        
    , second_action as (
        select
            anonymous_id,
            min(ts) as timestamp
        from event.new_app
        where dwh_dt between start_date and end_date + interval 1 day
            and toDate(ts) between start_date and end_date
            and ((event = 'Shipment Status Seen' and params['shipment_type'] = 'pickup') or (event = 'Order Info Viewed'))
            and delivery_method = 'pickup'
            and toString(anonymous_id) global in (select anonymous_id from ab_groups)
        group by anonymous_id
    )
        
select 
    first_action.anonymous_id as anonymous_id,
    first_action.user_type as user_type,
    first_action.platform as platform,
    ab_groups.group as group
    , max(if(if(second_action.anonymous_id != '', 1,0)
        and second_action.timestamp > first_action.timestamp
        and toDate(second_action.timestamp) = toDate(first_action.timestamp)
        , 1, 0)) as is_converted
    , case
            when min(flg_only_alco) = 1 and min(flg_only_food) = 0 then 'only alco'
            when min(flg_only_food) = 1 and min(flg_only_alco) = 0 then 'only food'
            else 'mix'
    end as order_type
from first_action 
left join ab_groups
    on ab_groups.anonymous_id = first_action.anonymous_id
left join second_action
  on toNullable(second_action.anonymous_id) = toNullable(first_action.anonymous_id)
left join t1
    on t1.order_number = first_action.order_number
group by anonymous_id, user_type, platform, group

"""

c9 = get_query_clickhouse(q)
conv = calculate_proportion_metrics(test, control, ['is_converted'], 'group', 
                                          c9)

ttest p-value for is_converted  is  0.5056507777441779 
 chi2 p-value for is_converted  is  0.5103962372934199 
 mannwhitneyu p-value for is_converted  is  0.5056477899986631 
 is_converted  mean value in control is  0.7061445399554283 
 is_converted  mean value in test is  0.708188133469992 
 --------------------------------------------------- 
 p-value: 0.51 
 (контрольная)  70.61 % --> (тестовая)  70.82 % 
абсолютные изменения: 0.204 % 



In [173]:
# алкосрез
print('Только алко:')
p = calculate_proportion_metrics(test, control, ['is_converted'], 'group', c9[c9.order_type == 'only alco'])
print('Еда:')
p = calculate_proportion_metrics(test, control, ['is_converted'], 'group', c9[c9.order_type == 'only food'])
print('Микс:')
p = calculate_proportion_metrics(test, control, ['is_converted'], 'group', c9[c9.order_type == 'mix'])


Только алко:
ttest p-value for is_converted  is  0.10312520017692256 
 chi2 p-value for is_converted  is  0.10555395799952469 
 mannwhitneyu p-value for is_converted  is  0.10312448909777504 
 is_converted  mean value in control is  0.6753988910263664 
 is_converted  mean value in test is  0.6671916929179792 
 --------------------------------------------------- 
 p-value: 0.1 
 (контрольная)  67.54 % --> (тестовая)  66.72 % 
абсолютные изменения: 0.821 % 

Еда:
statznachimo?
ttest p-value for is_converted  is  0.04244695728362578 
 chi2 p-value for is_converted  is  0.043682582113352256 
 mannwhitneyu p-value for is_converted  is  0.04244870261615032 
 is_converted  mean value in control is  0.7154277045162796 
 is_converted  mean value in test is  0.7252379578886645 
 --------------------------------------------------- 
 p-value: 0.04 
 (контрольная)  71.54 % --> (тестовая)  72.52 % 
абсолютные изменения: 0.981 % 

Микс:
ttest p-value for is_converted  is  0.6047724014980527 
 chi2 p-

In [174]:
# ios/android
print('IOS')
p = calculate_proportion_metrics(test, control, ['is_converted'], 'group', c9[c9.platform == 'ios'])
print('Android')
p = calculate_proportion_metrics(test, control, ['is_converted'], 'group', c9[c9.platform == 'android'])


IOS
ttest p-value for is_converted  is  0.35944925261557914 
 chi2 p-value for is_converted  is  0.3654000195017031 
 mannwhitneyu p-value for is_converted  is  0.35944224981596906 
 is_converted  mean value in control is  0.6733117510081218 
 is_converted  mean value in test is  0.6687320452881204 
 --------------------------------------------------- 
 p-value: 0.36 
 (контрольная)  67.33 % --> (тестовая)  66.87 % 
абсолютные изменения: 0.458 % 

Android
ttest p-value for is_converted  is  0.07428603184155037 
 chi2 p-value for is_converted  is  0.07589510578347221 
 mannwhitneyu p-value for is_converted  is  0.07428634869924433 
 is_converted  mean value in control is  0.7280691773808169 
 is_converted  mean value in test is  0.7349724686448456 
 --------------------------------------------------- 
 p-value: 0.07 
 (контрольная)  72.81 % --> (тестовая)  73.5 % 
абсолютные изменения: 0.69 % 



In [175]:
# новичок/старичок
print('Новичок')
p = calculate_proportion_metrics(test, control, ['is_converted'], 'group', c9[c9.user_type == 'new_user'])
print('Старичок')
p = calculate_proportion_metrics(test, control, ['is_converted'], 'group', c9[c9.user_type == 'old_user'])


Новичок
ttest p-value for is_converted  is  0.17034097630299314 
 chi2 p-value for is_converted  is  0.17708573486102952 
 mannwhitneyu p-value for is_converted  is  0.1703311669218297 
 is_converted  mean value in control is  0.7154456541797379 
 is_converted  mean value in test is  0.7272559512825244 
 --------------------------------------------------- 
 p-value: 0.17 
 (контрольная)  71.54 % --> (тестовая)  72.73 % 
абсолютные изменения: 1.181 % 

Старичок
ttest p-value for is_converted  is  0.8393523775627278 
 chi2 p-value for is_converted  is  0.8455309261368152 
 mannwhitneyu p-value for is_converted  is  0.8393510163792942 
 is_converted  mean value in control is  0.704837245493451 
 is_converted  mean value in test is  0.7055032999012628 
 --------------------------------------------------- 
 p-value: 0.84 
 (контрольная)  70.48 % --> (тестовая)  70.55 % 
абсолютные изменения: 0.067 % 

