In [1]:
import pandas as pd

import snowflake.connector

import numpy as np

%matplotlib inline

import matplotlib.pyplot as plt 

from pydataset import data

from datetime import datetime, timedelta

from IPython.core.interactiveshell import InteractiveShell
# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Let pandas render up to 1000 rows and 1000 columns before it truncates a frame's display.
pd.set_option("display.max_rows", 1000)
pd.set_option("display.max_columns", 1000)

import config as cfg

# Credentials are read from the local `config` module, so they are not written into the notebook.
snowflake_user, snowflake_pass, snowflake_acct = (
    cfg.snowflake_access['snowflake_user'],
    cfg.snowflake_access['snowflake_pass'],
    cfg.snowflake_access['snowflake_acct'],
)

# Session context passed to the connection: warehouse, database and schema.
snowflake_wh = 'BIRDFACTS_PROD_WAREHOUSE'
snowflake_db = 'BIRDFACTSDEV'
snowflake_schema = 'BIRDFACTS_ANALYTICS'

# Open the Snowflake connection used by this notebook.
con = snowflake.connector.connect(
    user=snowflake_user,
    password=snowflake_pass,
    account=snowflake_acct,
    warehouse=snowflake_wh,
    database=snowflake_db,
    schema=snowflake_schema,
)

# Cursor that the query cells execute their SQL through.
cs = con.cursor()

Sales

In [143]:
# Daily eCommerce gross sales (sum of gross_sales_usd) per test/control group,
# for United States profit-center rows dated 2020-04-29..2020-05-24.
#   * test = DMA 'New York'; control = the other DMAs kept by the filter ('Chicago', 'Boston'),
#     summed together per day.
#   * pre = 2020-04-29..2020-05-10; every other date in the window is labelled 'post'.
#   * Sales rows are matched to a DMA on the first five characters of the shipping postal code.
# The join is an INNER JOIN (not a LEFT JOIN) because the WHERE clause filters on fact_sales
# columns, which discards unmatched mapping rows anyway; the result set is the same either way.
# NOTE(review): if a postal code appears more than once in dma_to_zip_mapping, its sales are
# counted once per mapping row — verify the mapping is unique per postal code.
# NOTE(review): the 'pre' window starts on 04-29 here but on 04-27 in the sessions queries — confirm.
daily_sales_query = """
select
case when a.dma in ('New York') then 'test' else 'control' end as test_control_group,
case when b.happened_at_local_date between '2020-04-29' and '2020-05-10' then 'pre' else 'post' end as pre_post_period,
b.happened_at_local_date,
sum(b.gross_sales_usd) as gross_sales
from
birdfactsdev.birdfacts_analytics.dma_to_zip_mapping as a
inner join
birdfactsdev.birdfacts_analytics.fact_sales as b
on a.postal_code = left(b.shipping_postal_code, 5)
where b.happened_at_local_date between '2020-04-29' and '2020-05-24'
and b.sales_channel = 'eCommerce'
and b.profit_center_country = 'United States'
and a.dma in ('New York', 'Chicago', 'Boston')
group by 1, 2, 3
order by 1, 2, 3
;
"""

In [144]:
# Run the query and load the rows into a frame; column names are the first field
# of each entry in the cursor's description.
daily_sales = pd.DataFrame(
    data=cs.execute(daily_sales_query).fetchall(),
    columns=[meta[0] for meta in cs.description],
)

In [145]:
# Sanity check: frame dimensions and the first rows. Both expressions are displayed
# because ast_node_interactivity is set to "all" in the setup cell.
daily_sales.shape
daily_sales.head()

(52, 4)

Unnamed: 0,TEST_CONTROL_GROUP,PRE_POST_PERIOD,HAPPENED_AT_LOCAL_DATE,GROSS_SALES
0,control,post,2020-05-11,38950.0
1,control,post,2020-05-12,60100.0
2,control,post,2020-05-13,47464.0
3,control,post,2020-05-14,70819.0
4,control,post,2020-05-15,66865.0


In [146]:
# Coerce the sales column to a numeric dtype before aggregating.
# NOTE(review): presumably the connector returns Decimal objects, leaving the column as object dtype — confirm.
daily_sales['GROSS_SALES'] = daily_sales['GROSS_SALES'].pipe(pd.to_numeric)

In [147]:
# Reshape to one row per (period, date) with one gross-sales column per group
# ('control', 'test'), ordered chronologically and re-indexed from 0.
daily_sales_pivot = (
    daily_sales
    .pivot_table(
        index=['PRE_POST_PERIOD', 'HAPPENED_AT_LOCAL_DATE'],
        columns='TEST_CONTROL_GROUP',
        values='GROSS_SALES',
        aggfunc='sum',
    )
    .reset_index()
    .sort_values(by='HAPPENED_AT_LOCAL_DATE')
    .reset_index(drop=True)
)

In [148]:
# Copy the pivoted table to the system clipboard without the index column
# (presumably for pasting into a spreadsheet — confirm).
daily_sales_pivot.to_clipboard(index = False)

In [149]:
# Average daily gross sales per test/control group and pre/post period.
# The subquery builds per-group, per-day eCommerce sales totals (test = DMA 'New York',
# control = 'Chicago' + 'Boston' summed per day; pre = 2020-04-29..2020-05-10, the rest of
# the 2020-04-29..2020-05-24 window is 'post'); the outer query averages those daily totals.
# The join is an INNER JOIN (not a LEFT JOIN) because the WHERE clause filters on fact_sales
# columns, which discards unmatched mapping rows anyway. The subquery carries no ORDER BY:
# sorting rows that are about to be aggregated has no effect on the result.
# NOTE(review): if a postal code appears more than once in dma_to_zip_mapping, its sales are
# counted once per mapping row — verify the mapping is unique per postal code.
# NOTE(review): the pre window covers 12 days and the post window 14 days — confirm intended.
avg_sales_query = """
select
test_control_group,
pre_post_period,
avg(gross_sales) as avg_daily_gross_sales
from
(
select
case when a.dma in ('New York') then 'test' else 'control' end as test_control_group,
case when b.happened_at_local_date between '2020-04-29' and '2020-05-10' then 'pre' else 'post' end as pre_post_period,
b.happened_at_local_date,
sum(b.gross_sales_usd) as gross_sales
from
birdfactsdev.birdfacts_analytics.dma_to_zip_mapping as a
inner join
birdfactsdev.birdfacts_analytics.fact_sales as b
on a.postal_code = left(b.shipping_postal_code, 5)
where b.happened_at_local_date between '2020-04-29' and '2020-05-24'
and b.sales_channel = 'eCommerce'
and b.profit_center_country = 'United States'
and a.dma in ('New York', 'Chicago', 'Boston')
group by 1, 2, 3
)
group by 1, 2
order by 1, 2
"""

In [150]:
# Run the query and load the rows into a frame; column names are the first field
# of each entry in the cursor's description.
avg_sales = pd.DataFrame(
    data=cs.execute(avg_sales_query).fetchall(),
    columns=[meta[0] for meta in cs.description],
)

In [151]:
# Sanity check: frame dimensions and the first rows. Both expressions are displayed
# because ast_node_interactivity is set to "all" in the setup cell.
avg_sales.shape
avg_sales.head()

(4, 3)

Unnamed: 0,TEST_CONTROL_GROUP,PRE_POST_PERIOD,AVG_DAILY_GROSS_SALES
0,control,post,52935.56142857
1,control,pre,64250.27666667
2,test,post,60545.28285714
3,test,pre,67589.96083333


In [152]:
# Coerce the averaged sales column to a numeric dtype.
# NOTE(review): presumably the connector returns Decimal objects, leaving the column as object dtype — confirm.
avg_sales['AVG_DAILY_GROSS_SALES'] = avg_sales['AVG_DAILY_GROSS_SALES'].pipe(pd.to_numeric)

In [153]:
# Copy the averages to the system clipboard without the index column
# (presumably for pasting into a spreadsheet — confirm).
avg_sales.to_clipboard(index = False)

Sessions

In [12]:
# Daily session counts per metro for 2020-04-27..2020-05-24, labelled by group and period.
#   * The subquery counts distinct (visit_id, visitor_id, visit_start_time) combinations
#     per GEO_NETWORK_METRO and calendar day.
#   * test = either spelling of the New York metro; control = the remaining metros in the
#     filter ('Boston MA-Manchester NH', 'Chicago IL').
#   * pre = 2020-04-27..2020-05-10; every other date in the window is labelled 'post'.
# The ORDER BY sits on the outer query: an ORDER BY inside a subquery does not guarantee the
# order of the outer result. It sorts by metro and date, the same keys the subquery used to sort by.
# NOTE(review): the output has one row per metro per day but no metro column, so the control
# group has one indistinguishable row per control metro per day (they are not summed, unlike
# the sales queries) — confirm this is intended.
# NOTE(review): if both New York spellings occur on the same day they yield two 'test' rows.
daily_sessions_query = """
select
case when GEO_NETWORK_METRO in ('New York, NY', 'New York NY') then 'test' else 'control' end as test_control_group,
case when session_date between '2020-04-27' and '2020-05-10' then 'pre' else 'post' end as pre_post_period,
session_date,
num_sessions as daily_sessions
from
(
select
GEO_NETWORK_METRO,
date(visit_start_time) as session_date,
count(distinct visit_id, visitor_id, visit_start_time) as num_sessions
from
fivetran.google_analytics_360.ga_session
where date(visit_start_time)  between '2020-04-27' and '2020-05-24'
and GEO_NETWORK_METRO in ('New York, NY', 'New York NY', 'Boston MA-Manchester NH', 'Chicago IL')
group by 1, 2
)
order by GEO_NETWORK_METRO, session_date
"""

In [13]:
# Run the query and load the rows into a frame; column names are the first field
# of each entry in the cursor's description.
daily_sessions = pd.DataFrame(
    data=cs.execute(daily_sessions_query).fetchall(),
    columns=[meta[0] for meta in cs.description],
)

In [14]:
# Sanity check: frame dimensions and the first rows. Both expressions are displayed
# because ast_node_interactivity is set to "all" in the setup cell.
daily_sessions.shape
daily_sessions.head()

(84, 4)

Unnamed: 0,TEST_CONTROL_GROUP,PRE_POST_PERIOD,SESSION_DATE,DAILY_SESSIONS
0,control,pre,2020-04-27,3842
1,control,pre,2020-04-28,13879
2,control,pre,2020-04-29,8263
3,control,pre,2020-04-30,5794
4,control,pre,2020-05-01,5177


In [15]:
# Average daily sessions per test/control group and pre/post period.
#   * The subquery counts distinct (visit_id, visitor_id, visit_start_time) combinations
#     per GEO_NETWORK_METRO and calendar day for 2020-04-27..2020-05-24.
#   * test = either spelling of the New York metro; control = 'Boston MA-Manchester NH' and 'Chicago IL'.
#   * pre = 2020-04-27..2020-05-10; every other date in the window is labelled 'post'.
# The subquery carries no ORDER BY: sorting rows that are about to be aggregated has no
# effect on the result, and the outer ORDER BY fixes the final order.
# NOTE(review): avg() runs over (metro, day) rows, so the control figure is the average
# per control metro per day, not the average of the daily control-group total as in the
# sales queries — confirm which definition is intended.
# NOTE(review): if both New York spellings occur on the same day, that day contributes two
# 'test' rows to the average instead of one summed row.
avg_sessions_query = """
select
case when GEO_NETWORK_METRO in ('New York, NY', 'New York NY') then 'test' else 'control' end as test_control_group,
case when session_date between '2020-04-27' and '2020-05-10' then 'pre' else 'post' end as pre_post_period,
avg(num_sessions) as avg_daily_sessions
from
(
select
GEO_NETWORK_METRO,
date(visit_start_time) as session_date,
count(distinct visit_id, visitor_id, visit_start_time) as num_sessions
from
fivetran.google_analytics_360.ga_session
where date(visit_start_time)  between '2020-04-27' and '2020-05-24'
and GEO_NETWORK_METRO in ('New York, NY', 'New York NY', 'Boston MA-Manchester NH', 'Chicago IL')
group by 1, 2
)
group by 1, 2
order by 1, 2
;
"""

In [16]:
# Run the query and load the rows into a frame; column names are the first field
# of each entry in the cursor's description.
avg_sessions = pd.DataFrame(
    data=cs.execute(avg_sessions_query).fetchall(),
    columns=[meta[0] for meta in cs.description],
)

In [18]:
# Sanity check: frame dimensions and the first rows. Both expressions are displayed
# because ast_node_interactivity is set to "all" in the setup cell.
avg_sessions.shape
avg_sessions.head()

(4, 3)

Unnamed: 0,TEST_CONTROL_GROUP,PRE_POST_PERIOD,AVG_DAILY_SESSIONS
0,control,post,4076.035714
1,control,pre,5763.464286
2,test,post,8310.071429
3,test,pre,10716.928571


In [25]:
# Coerce the averaged sessions column to a numeric dtype.
# NOTE(review): presumably the connector returns Decimal objects, leaving the column as object dtype — confirm.
avg_sessions['AVG_DAILY_SESSIONS'] = avg_sessions['AVG_DAILY_SESSIONS'].pipe(pd.to_numeric)