In [224]:
import pandas as pd

import snowflake.connector

import numpy as np

%matplotlib inline

import matplotlib.pyplot as plt 

from pydataset import data

from datetime import datetime, timedelta

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not just the last one.
InteractiveShell.ast_node_interactivity = "all"

# Let pandas render up to 1000 rows and 1000 columns before truncating output.
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)

import config as cfg

from scipy import stats

# Credentials are read from the local config module (imported as cfg).
snowflake_user, snowflake_pass, snowflake_acct = (
    cfg.snowflake_access[field]
    for field in ('snowflake_user', 'snowflake_pass', 'snowflake_acct')
)

# Warehouse / database / schema used for the session.
snowflake_wh, snowflake_db, snowflake_schema = (
    'BIRDFACTS_PROD_WAREHOUSE',
    'BIRDFACTSDEV',
    'BIRDFACTS_ANALYTICS',
)

# One connection and one cursor are shared by every query cell below.
con = snowflake.connector.connect(
    user=snowflake_user,
    password=snowflake_pass,
    account=snowflake_acct,
    warehouse=snowflake_wh,
    database=snowflake_db,
    schema=snowflake_schema,
)
cs = con.cursor()

### Sales

Test: New York, San Francisco - Oakland - San Jose, Dallas - Ft. Worth, Denver, Philadelphia

Control: Chicago, Boston, Seattle - Tacoma, Houston, Phoenix, San Diego, Hartford & New Haven, Baltimore

In [231]:
def daily_sales_values(test_group, control_group):
    """Compare daily gross sales of a test DMA group against a control group.

    Runs a query over ``fact_sales`` joined to ``dma_to_zip_mapping`` (US
    eCommerce orders, 2020-04-01 through 2020-05-31) using the module-level
    cursor ``cs``, sums gross sales per day for each group, and prints a
    Welch t-test (``equal_var = False``) comparing the daily
    ``test - control`` difference in the pre period (through 2020-05-10)
    with the post period.

    Parameters
    ----------
    test_group : str
        SQL ``IN`` list of DMA names including the parentheses, e.g.
        ``"('New York')"``. The text is concatenated into the query verbatim,
        so pass only trusted, hand-written values.
    control_group : str
        SQL ``IN`` list of the control DMA names, same format.

    Returns
    -------
    None
        The t-test result is printed and the per-day pivot is copied to the
        clipboard.

    Raises
    ------
    KeyError
        If the query returns no rows for the test group or the control group.
    """
    daily_sales_query = """
    select
    case when a.dma in """ + test_group + """ then 'test' else 'control' end as test_control_group,
    case when b.happened_at_local_date between '2020-04-01' and '2020-05-10' then 'pre' else 'post' end as pre_post_period,
    b.happened_at_local_date,
    sum(b.gross_sales_usd) as gross_sales
    from
    birdfactsdev.birdfacts_analytics.dma_to_zip_mapping as a
    left join
    birdfactsdev.birdfacts_analytics.fact_sales as b
    on a.postal_code = left(b.shipping_postal_code, 5)
    where b.happened_at_local_date between '2020-04-01' and '2020-05-31'
    and b.sales_channel = 'eCommerce'
    and b.profit_center_country = 'United States'
    and b.event_type = 'order'
    and (a.dma in """ + test_group + """ or a.dma in """ + control_group + """)
    group by 1, 2, 3
    order by 1, 2, 3
    ;
    """

    daily_sales = pd.DataFrame(cs.execute(daily_sales_query).fetchall(),
                               columns = [col[0] for col in cs.description])

    # A mistyped DMA name would otherwise surface later as a bare KeyError: 'test'.
    missing_groups = {'test', 'control'} - set(daily_sales['TEST_CONTROL_GROUP'])
    if missing_groups:
        raise KeyError('query returned no rows for group(s): '
                       + ', '.join(sorted(missing_groups))
                       + '; check the DMA names passed in')

    daily_sales['GROSS_SALES'] = pd.to_numeric(daily_sales['GROSS_SALES'])

    # fill_value = 0: a day on which one group has no rows counts as zero
    # instead of NaN, which would turn the whole t-test result into nan.
    daily_sales_pivot = (
        daily_sales
        .pivot_table(index = ['PRE_POST_PERIOD', 'HAPPENED_AT_LOCAL_DATE'],
                     columns = 'TEST_CONTROL_GROUP',
                     values = 'GROSS_SALES',
                     aggfunc = 'sum',
                     fill_value = 0)
        .reset_index()
        .sort_values(by = 'HAPPENED_AT_LOCAL_DATE')
        .reset_index(drop = True)
    )

    daily_sales_pivot['diff'] = daily_sales_pivot['test'] - daily_sales_pivot['control']

    pre_diff = daily_sales_pivot.loc[daily_sales_pivot['PRE_POST_PERIOD'] == 'pre', 'diff']
    post_diff = daily_sales_pivot.loc[daily_sales_pivot['PRE_POST_PERIOD'] == 'post', 'diff']
    print(stats.ttest_ind(pre_diff, post_diff, equal_var = False))

    # Side effect: puts the pivot on the system clipboard for pasting elsewhere.
    daily_sales_pivot.to_clipboard(index = False)

In [232]:
# Sales: New York (test) against Chicago and Boston (control).
daily_sales_values(test_group="('New York')",
                   control_group="('Chicago', 'Boston')")

Ttest_indResult(statistic=-4.486957463972254, pvalue=4.1480834950535234e-05)


In [233]:
# Sales: San Francisco - Oakland - San Jose (test) against Seattle - Tacoma (control).
daily_sales_values(test_group="('San Francisco - Oakland - San Jose')",
                   control_group="('Seattle - Tacoma')")

Ttest_indResult(statistic=1.5060337301050926, pvalue=0.13968132972505765)


In [234]:
# Sales: Dallas - Ft. Worth (test) against Houston and Phoenix (control).
daily_sales_values(test_group="('Dallas - Ft. Worth')",
                   control_group="('Houston', 'Phoenix')")

Ttest_indResult(statistic=-1.1767971999106517, pvalue=0.24425855220911163)


In [235]:
# Sales: Denver (test) against San Diego (control).
daily_sales_values(test_group="('Denver')",
                   control_group="('San Diego')")

Ttest_indResult(statistic=-0.19929103219005373, pvalue=0.8429767419202714)


In [236]:
# Sales: Philadelphia (test) against Hartford & New Haven and Baltimore (control).
daily_sales_values(test_group="('Philadelphia')",
                   control_group="('Hartford & New Haven', 'Baltimore')")

Ttest_indResult(statistic=-0.7879434715561839, pvalue=0.43390824092610225)


### Orders

Test: New York, San Francisco - Oakland - San Jose, Dallas - Ft. Worth, Denver, Philadelphia

Control: Chicago, Boston, Seattle - Tacoma, Houston, Phoenix, San Diego, Hartford & New Haven, Baltimore

In [237]:
def daily_orders_values(test_group, control_group):
    """Compare daily order counts of a test DMA group against a control group.

    Runs a query over ``fact_sales`` joined to ``dma_to_zip_mapping`` (US
    eCommerce orders, 2020-04-01 through 2020-05-31) using the module-level
    cursor ``cs``, counts distinct ``order_name`` values per day for each
    group, and prints a Welch t-test (``equal_var = False``) comparing the
    daily ``test - control`` difference in the pre period (through
    2020-05-10) with the post period.

    Parameters
    ----------
    test_group : str
        SQL ``IN`` list of DMA names including the parentheses, e.g.
        ``"('New York')"``. The text is concatenated into the query verbatim,
        so pass only trusted, hand-written values.
    control_group : str
        SQL ``IN`` list of the control DMA names, same format.

    Returns
    -------
    None
        The t-test result is printed and the per-day pivot is copied to the
        clipboard.

    Raises
    ------
    KeyError
        If the query returns no rows for the test group or the control group.
    """
    daily_orders_query = """
    select
    case when a.dma in """ + test_group + """ then 'test' else 'control' end as test_control_group,
    case when b.happened_at_local_date between '2020-04-01' and '2020-05-10' then 'pre' else 'post' end as pre_post_period,
    b.happened_at_local_date,
    count(distinct b.order_name) as num_orders
    from
    birdfactsdev.birdfacts_analytics.dma_to_zip_mapping as a
    left join
    birdfactsdev.birdfacts_analytics.fact_sales as b
    on a.postal_code = left(b.shipping_postal_code, 5)
    where b.happened_at_local_date between '2020-04-01' and '2020-05-31'
    and b.sales_channel = 'eCommerce'
    and b.profit_center_country = 'United States'
    and b.event_type = 'order'
    and (a.dma in """ + test_group + """ or a.dma in """ + control_group + """)
    group by 1, 2, 3
    order by 1, 2, 3
    ;
    """

    daily_orders = pd.DataFrame(cs.execute(daily_orders_query).fetchall(),
                                columns = [col[0] for col in cs.description])

    # A mistyped DMA name would otherwise surface later as a bare KeyError: 'test'.
    missing_groups = {'test', 'control'} - set(daily_orders['TEST_CONTROL_GROUP'])
    if missing_groups:
        raise KeyError('query returned no rows for group(s): '
                       + ', '.join(sorted(missing_groups))
                       + '; check the DMA names passed in')

    daily_orders['NUM_ORDERS'] = pd.to_numeric(daily_orders['NUM_ORDERS'])

    # fill_value = 0: a day on which one group has no rows counts as zero
    # instead of NaN, which would turn the whole t-test result into nan.
    daily_orders_pivot = (
        daily_orders
        .pivot_table(index = ['PRE_POST_PERIOD', 'HAPPENED_AT_LOCAL_DATE'],
                     columns = 'TEST_CONTROL_GROUP',
                     values = 'NUM_ORDERS',
                     aggfunc = 'sum',
                     fill_value = 0)
        .reset_index()
        .sort_values(by = 'HAPPENED_AT_LOCAL_DATE')
        .reset_index(drop = True)
    )

    daily_orders_pivot['diff'] = daily_orders_pivot['test'] - daily_orders_pivot['control']

    pre_diff = daily_orders_pivot.loc[daily_orders_pivot['PRE_POST_PERIOD'] == 'pre', 'diff']
    post_diff = daily_orders_pivot.loc[daily_orders_pivot['PRE_POST_PERIOD'] == 'post', 'diff']
    print(stats.ttest_ind(pre_diff, post_diff, equal_var = False))

    # Side effect: puts the pivot on the system clipboard for pasting elsewhere.
    daily_orders_pivot.to_clipboard(index = False)

In [238]:
# Orders: New York (test) against Chicago and Boston (control).
daily_orders_values(test_group="('New York')",
                    control_group="('Chicago', 'Boston')")

Ttest_indResult(statistic=-5.314629979259995, pvalue=3.62950518374536e-06)


In [239]:
# Orders: San Francisco - Oakland - San Jose (test) against Seattle - Tacoma (control).
daily_orders_values(test_group="('San Francisco - Oakland - San Jose')",
                    control_group="('Seattle - Tacoma')")

Ttest_indResult(statistic=1.695793269338536, pvalue=0.09744140524170279)


In [240]:
# Orders: Dallas - Ft. Worth (test) against Houston and Phoenix (control).
daily_orders_values(test_group="('Dallas - Ft. Worth')",
                    control_group="('Houston', 'Phoenix')")

Ttest_indResult(statistic=-0.9896175180868448, pvalue=0.3264833969403833)


In [241]:
# Orders: Denver (test) against San Diego (control).
daily_orders_values(test_group="('Denver')",
                    control_group="('San Diego')")

Ttest_indResult(statistic=0.4079619823690547, pvalue=0.6851592420264843)


In [242]:
# Orders: Philadelphia (test) against Hartford & New Haven and Baltimore (control).
daily_orders_values(test_group="('Philadelphia')",
                    control_group="('Hartford & New Haven', 'Baltimore')")

Ttest_indResult(statistic=-1.3682260830130462, pvalue=0.17645907077359566)


### NCA

Test: New York, San Francisco - Oakland - San Jose, Dallas - Ft. Worth, Denver, Philadelphia

Control: Chicago, Boston, Seattle - Tacoma, Houston, Phoenix, San Diego, Hartford & New Haven, Baltimore

In [249]:
def daily_nca_values(test_group, control_group):
    """Compare daily new-customer counts of a test DMA group against a control group.

    Runs a query over ``fact_sales`` joined to ``dma_to_zip_mapping`` (US
    eCommerce orders with ``is_new_customer = 'True'``, 2020-04-01 through
    2020-05-31) using the module-level cursor ``cs``, counts distinct
    ``customer_id`` values per day for each group, and prints a Welch t-test
    (``equal_var = False``) comparing the daily ``test - control`` difference
    in the pre period (through 2020-05-10) with the post period.

    Parameters
    ----------
    test_group : str
        SQL ``IN`` list of DMA names including the parentheses, e.g.
        ``"('New York')"``. The text is concatenated into the query verbatim,
        so pass only trusted, hand-written values.
    control_group : str
        SQL ``IN`` list of the control DMA names, same format.

    Returns
    -------
    None
        The t-test result is printed and the per-day pivot is copied to the
        clipboard.

    Raises
    ------
    KeyError
        If the query returns no rows for the test group or the control group.
    """
    daily_nca_query = """
    select
    case when a.dma in """ + test_group + """ then 'test' else 'control' end as test_control_group,
    case when b.happened_at_local_date between '2020-04-01' and '2020-05-10' then 'pre' else 'post' end as pre_post_period,
    b.happened_at_local_date,
    count(distinct b.customer_id) as nca
    from
    birdfactsdev.birdfacts_analytics.dma_to_zip_mapping as a
    left join
    birdfactsdev.birdfacts_analytics.fact_sales as b
    on a.postal_code = left(b.shipping_postal_code, 5)
    where b.happened_at_local_date between '2020-04-01' and '2020-05-31'
    and b.sales_channel = 'eCommerce'
    and b.profit_center_country = 'United States'
    and b.event_type = 'order'
    and b.is_new_customer = 'True'
    and (a.dma in """ + test_group + """ or a.dma in """ + control_group + """)
    group by 1, 2, 3
    order by 1, 2, 3
    ;
    """

    daily_nca = pd.DataFrame(cs.execute(daily_nca_query).fetchall(),
                             columns = [col[0] for col in cs.description])

    # A mistyped DMA name would otherwise surface later as a bare KeyError: 'test'.
    missing_groups = {'test', 'control'} - set(daily_nca['TEST_CONTROL_GROUP'])
    if missing_groups:
        raise KeyError('query returned no rows for group(s): '
                       + ', '.join(sorted(missing_groups))
                       + '; check the DMA names passed in')

    daily_nca['NCA'] = pd.to_numeric(daily_nca['NCA'])

    # fill_value = 0: a day on which one group has no rows counts as zero
    # instead of NaN, which would turn the whole t-test result into nan.
    daily_nca_pivot = (
        daily_nca
        .pivot_table(index = ['PRE_POST_PERIOD', 'HAPPENED_AT_LOCAL_DATE'],
                     columns = 'TEST_CONTROL_GROUP',
                     values = 'NCA',
                     aggfunc = 'sum',
                     fill_value = 0)
        .reset_index()
        .sort_values(by = 'HAPPENED_AT_LOCAL_DATE')
        .reset_index(drop = True)
    )

    daily_nca_pivot['diff'] = daily_nca_pivot['test'] - daily_nca_pivot['control']

    pre_diff = daily_nca_pivot.loc[daily_nca_pivot['PRE_POST_PERIOD'] == 'pre', 'diff']
    post_diff = daily_nca_pivot.loc[daily_nca_pivot['PRE_POST_PERIOD'] == 'post', 'diff']
    print(stats.ttest_ind(pre_diff, post_diff, equal_var = False))

    # Side effect: puts the pivot on the system clipboard for pasting elsewhere.
    daily_nca_pivot.to_clipboard(index = False)

In [250]:
# NCA: New York (test) against Chicago and Boston (control).
daily_nca_values(test_group="('New York')",
                 control_group="('Chicago', 'Boston')")

Ttest_indResult(statistic=-3.944442587587477, pvalue=0.00025652045253443196)


In [251]:
# NCA: San Francisco - Oakland - San Jose (test) against Seattle - Tacoma (control).
daily_nca_values(test_group="('San Francisco - Oakland - San Jose')",
                 control_group="('Seattle - Tacoma')")

Ttest_indResult(statistic=1.9793417747871571, pvalue=0.05449057661744126)


In [252]:
# NCA: Dallas - Ft. Worth (test) against Houston and Phoenix (control).
daily_nca_values(test_group="('Dallas - Ft. Worth')",
                 control_group="('Houston', 'Phoenix')")

Ttest_indResult(statistic=-2.447026854734558, pvalue=0.017486422006917204)


In [253]:
# NCA: Denver (test) against San Diego (control).
daily_nca_values(test_group="('Denver')",
                 control_group="('San Diego')")

Ttest_indResult(statistic=0.3518171533894892, pvalue=0.7267470945125262)


In [254]:
# NCA: Philadelphia (test) against Hartford & New Haven and Baltimore (control).
daily_nca_values(test_group="('Philadelphia')",
                 control_group="('Hartford & New Haven', 'Baltimore')")

Ttest_indResult(statistic=-2.387989982836635, pvalue=0.0206990178585469)


### Sessions

In [243]:
def daily_sessions_values(test_group, control_group):
    """Compare daily web sessions of a test metro group against a control group.

    Runs a query over ``fivetran.google_analytics_360.ga_session``
    (2020-04-01 through 2020-05-31) using the module-level cursor ``cs``.
    Sessions are counted per metro and day as distinct
    ``(visit_id, visitor_id, visit_start_time)`` combinations, summed per
    group, and a Welch t-test (``equal_var = False``) is printed comparing
    the daily ``test - control`` difference in the pre period (through
    2020-05-10) with the post period.

    Parameters
    ----------
    test_group : str
        SQL ``IN`` list of ``GEO_NETWORK_METRO`` values including the
        parentheses, e.g. ``"('Denver CO')"``. The text is concatenated into
        the query verbatim, so pass only trusted, hand-written values.
    control_group : str
        SQL ``IN`` list of the control metro values, same format.

    Returns
    -------
    None
        The t-test result is printed and the per-day pivot is copied to the
        clipboard.

    Raises
    ------
    KeyError
        If the query returns no rows for the test group or the control group.
    """
    daily_sessions_query = """
    select
    case when GEO_NETWORK_METRO in """ + test_group + """ then 'test' else 'control' end as test_control_group,
    case when session_date between '2020-04-01' and '2020-05-10' then 'pre' else 'post' end as pre_post_period,
    session_date,
    num_sessions as daily_sessions
    from
    (
    select
    GEO_NETWORK_METRO,
    date(visit_start_time) as session_date,
    count(distinct visit_id, visitor_id, visit_start_time) as num_sessions
    from
    fivetran.google_analytics_360.ga_session
    where date(visit_start_time)  between '2020-04-01' and '2020-05-31'
    and (GEO_NETWORK_METRO in """ + test_group + """ or GEO_NETWORK_METRO in """ + control_group + """)
    group by 1, 2
    order by 1, 2
    )
    ;
    """

    daily_sessions = pd.DataFrame(cs.execute(daily_sessions_query).fetchall(),
                                  columns = [col[0] for col in cs.description])

    # A mistyped metro name would otherwise surface later as a bare KeyError: 'test'.
    missing_groups = {'test', 'control'} - set(daily_sessions['TEST_CONTROL_GROUP'])
    if missing_groups:
        raise KeyError('query returned no rows for group(s): '
                       + ', '.join(sorted(missing_groups))
                       + '; check the metro names passed in')

    daily_sessions['DAILY_SESSIONS'] = pd.to_numeric(daily_sessions['DAILY_SESSIONS'])

    # The query returns one row per metro and day; aggfunc = 'sum' adds the
    # metros of a group together. fill_value = 0: a day on which one group
    # has no rows counts as zero instead of NaN, which would turn the whole
    # t-test result into nan.
    daily_sessions_pivot = (
        daily_sessions
        .pivot_table(index = ['PRE_POST_PERIOD', 'SESSION_DATE'],
                     columns = 'TEST_CONTROL_GROUP',
                     values = 'DAILY_SESSIONS',
                     aggfunc = 'sum',
                     fill_value = 0)
        .reset_index()
        .sort_values(by = 'SESSION_DATE')
        .reset_index(drop = True)
    )

    daily_sessions_pivot['diff'] = daily_sessions_pivot['test'] - daily_sessions_pivot['control']

    pre_diff = daily_sessions_pivot.loc[daily_sessions_pivot['PRE_POST_PERIOD'] == 'pre', 'diff']
    post_diff = daily_sessions_pivot.loc[daily_sessions_pivot['PRE_POST_PERIOD'] == 'post', 'diff']
    print(stats.ttest_ind(pre_diff, post_diff, equal_var = False))

    # Side effect: puts the pivot on the system clipboard for pasting elsewhere.
    daily_sessions_pivot.to_clipboard(index = False)
    
    

In [244]:
# Sessions: New York (two metro spellings, test) against Boston-Manchester and Chicago (control).
daily_sessions_values(test_group="('New York, NY', 'New York NY')",
                      control_group="('Boston MA-Manchester NH', 'Chicago IL')")

Ttest_indResult(statistic=-7.72800267117263, pvalue=6.698476003576652e-10)


In [245]:
# Sessions: San Francisco-Oakland-San Jose (test) against Seattle-Tacoma (control).
daily_sessions_values(test_group="('San Francisco-Oakland-San Jose CA')",
                      control_group="('Seattle-Tacoma WA')")

Ttest_indResult(statistic=2.2834455601199597, pvalue=0.02690858171717745)


In [246]:
# Sessions: Dallas-Ft. Worth (test) against Houston and Phoenix (control).
daily_sessions_values(test_group="('Dallas-Ft. Worth TX')",
                      control_group="('Houston TX', 'Phoenix AZ')")

Ttest_indResult(statistic=-11.391680744904338, pvalue=2.0189334291327192e-15)


In [247]:
# Sessions: Denver (test) against San Diego (control).
daily_sessions_values(test_group="('Denver CO')",
                      control_group="('San Diego CA')")

Ttest_indResult(statistic=1.4776041591325029, pvalue=0.1448605844121858)


In [248]:
# Sessions: Philadelphia (test) against Baltimore and Hartford & New Haven (control).
daily_sessions_values(test_group="('Philadelphia PA')",
                      control_group="('Baltimore MD', 'Hartford & New Haven CT')")

Ttest_indResult(statistic=-2.773898550618548, pvalue=0.007485231771770038)
