In [0]:
%%sh

# Ad-hoc S3 lookups. The commented-out commands below are not executed; only
# the final `aws s3 ls` in this cell runs.
#aws s3 ls s3://b2c-prod-dca-mobile-web-to-int/oss/MOBILE_WEB_DOMAIN_BROWSER_METRICS/version=v1.0.0/granularity=MONTH/
#aws s3 ls s3://aardvark-prod-pdx-mdm-to-int/city_level/report/routine/version=1.0.0/granularity=monthly/date=2020-08-31/ --recursive --human --summarize
#aws s3 ls s3://b2c-prod-data-pipeline-unified-store-free/unified/store.product.v10/dimension/_update_date=2020-09-18/
#aws s3 ls s3://b2c-prod-dca-mobile-web-to-int/oss/MOBILE_WEB_DOMAIN_RETENTION_METRICS/version=v1.0.0/granularity=MONTH/date=2019-01-31/
#aws s3 ls s3://b2c-prod-data-pipeline-unified-constants/
#aws s3 ls s3://b2c-prod-data-pipeline-unified-store-free/unified/
#aws s3 cp s3://b2c-prod-data-pipeline-unified-usage/unified/usage.domain-referral.v6/fact/_delta_log/00000000000000000008.json -

# List what exists under the WEEK-granularity prefix of the
# MOBILE_WEB_UNIFIED_PRODUCT_METRICS v1.0.0 dataset.
aws s3 ls s3://b2c-prod-dca-mobile-web-to-int/oss/MOBILE_WEB_UNIFIED_PRODUCT_METRICS/version=v1.0.0/granularity=WEEK/

In [0]:

import traceback
from applications.auto_pipeline import schemas
from applications.common.parser import SqlParser
from applications.common.executor import SqlExecutor
from aadatapipelinecore.core.urn import Urn
from applications.common import views

# Monthly referral-metrics partition (2020-08-31) used as the transform input.
raw_path = "s3://b2c-prod-dca-mobile-web-to-int/oss/MOBILE_WEB_DOMAIN_REFERRAL_METRICS/version=v1.0.0/granularity=MONTH/date=2020-08-31/"

# basePath is set to the dataset root so Spark's partition discovery keeps the
# path-encoded partition values (granularity=..., date=...) as columns even
# though only one partition directory is read. The transform SQL selects
# `granularity` and `date` from raw_data.
raw_reader = spark.read.option(
    "basePath",
    "s3://b2c-prod-dca-mobile-web-to-int/oss/MOBILE_WEB_DOMAIN_REFERRAL_METRICS/version=v1.0.0/",
)
df = raw_reader.parquet(raw_path)

# Expose the frame to SQL under the table name the transform reads from.
df.createOrReplaceTempView("raw_data")


# Execution context and URN passed to SqlParser / SqlExecutor further down.
context = {"date": "2020-09-14"}
urn = Urn(
    namespace="aa.dna.v1",
    identifier=120200914080939459,
)
# Alternative source for the SQL text, currently disabled:
# sql_text = schemas.of_transform(urn.namespace)
# print("transfrom sql_text:\n{}".format(sql_text))

# Transform definition handed to views.init / SqlParser below. It is written in
# the project's SQL dialect, so several constructs are not plain Spark SQL:
#   * CREATE TABLE ... WITH (...) declares the target `unified_data` columns,
#     primary key, partition columns and the DIMENSIONS / METRICS column split.
#   * `--!INGEST`, `--!SET` and `--!DEBUG:SHOW` lines look like parser
#     directives rather than ordinary SQL comments -- TODO confirm in SqlParser.
#   * `domain_id(...)` and `change_to_key(...)` are called with keyword-style
#     arguments; presumably project macros that add domain_id /
#     another_domain_id and the *_key columns selected later -- verify.
# Stages, in order: raw_data_with_est de-duplicates raw_data rows and pivots
# metric_name values into est_* columns; domain_unified_data and all_key_data
# apply the two helpers above; unified_data projects the key columns and the
# metrics under their final names.
sql_text = """
CREATE TABLE unified_data (
    product_key BIGINT NOT NULL,
    referral_product_key INT NOT NULL,
    country_key INT NOT NULL,
    market_key INT NOT NULL,
    device_key INT NOT NULL,
    date_key INT NOT NULL,
    granularity_key INT NOT NULL,
    est_active_users double,
    est_usage_penetration double,
    est_total_time double,
    est_average_session_per_user double,
    est_average_session_duration double,
    est_average_time_per_user_milliseconds double,
    est_total_session_count double,
    est_share_of_sessions double,
    PRIMARY KEY(product_key, referral_product_key, country_key, market_key, device_key, date_key, granularity_key)
)
PARTITION BY COLUMNS(granularity_key, date_key, country_key, market_key, device_key)
WITH (
    NAMESPACE = 'aa.usage.domain-referral.v6',
    COALESCE = 1,
    DEDUPLICATE = True,
    DIMENSIONS = COLUMNS(product_key, referral_product_key, country_key, market_key, device_key, date_key, granularity_key),
    METRICS = COLUMNS(
        est_active_users,
        est_usage_penetration,
        est_total_time,
        est_average_session_per_user,
        est_average_session_duration,
        est_average_time_per_user_milliseconds,
        est_total_session_count,
        est_share_of_sessions
    )
);

--!INGEST;
--!SET raw_data =
--[Row(date="2018-10-31", granularity="MONTH", device_type=1, country='US', domain='google.com', referral='google.com', value=16.05759251896051, platform=2, metric_name='ADU'),
-- Row(date="2018-10-31", granularity="MONTH", device_type=1, country='US', domain='ebaumsworld.com', referral='imgur.com', value=28.34194740564347, platform=2, metric_name='ADU'),
-- Row(date="2018-10-31", granularity="MONTH", device_type=1, country='US', domain='funsubstance.com', referral='Direct Visit', value=246.6297041100929, platform=2, metric_name='ADU'),
-- Row(date="2018-10-31", granularity="MONTH", device_type=1, country='US', domain='michaelhill.com', referral='All Others', value=68.82365362584602, platform=2, metric_name='ADU'),
-- Row(date="2018-10-31", granularity="MONTH", device_type=1, country='US', domain='t3.com', referral='All Others', value=62.179323866638086, platform=2, metric_name='ADU'),
-- Row(date="2018-10-31", granularity="MONTH", device_type=1, country='US', domain='t3.com', referral='Direct Visit', value=2.8329685679626153, platform=2, metric_name='ADU'),
-- Row(date="2018-10-31", granularity="MONTH", device_type=1, country='US', domain='visitnsw.com', referral='Direct Visit', value=23.111070261179343, platform=2, metric_name='ADU'),
-- Row(date="2018-10-31", granularity="MONTH", device_type=1, country='US', domain='visitnsw.com', referral='google.com.au', value=75.7895327802973, platform=2, metric_name='ADU'),
-- Row(date="2018-10-31", granularity="MONTH", device_type=1, country='US', domain='animesgratisbr.com', referral='All Others', value=97.33982756764964, platform=2, metric_name='ADU'),
-- Row(date="2018-10-31", granularity="MONTH", device_type=1, country='US', domain='animesgratisbr.com', referral='google.com', value=300.35012986691004, platform=2, metric_name='ADU')];

WITH raw_data_with_est AS
(
    SELECT * FROM
    (
        SELECT
            platform,
            device_type,
            CASE granularity WHEN 'DAY' THEN 'daily'
                             WHEN 'WEEK' THEN 'weekly'
                             WHEN 'MONTH' THEN 'monthly' END AS granularity,
            date,
            country,
            domain,
            referral,
            metric_name,
            value
        FROM
            raw_data
        GROUP BY
            platform,
            device_type,
            granularity,
            date,
            country,
            domain,
            referral,
            metric_name,
            value
    )
    PIVOT (FIRST(value) for metric_name
    IN (
        'AU' AS est_active_users,
        'UP' AS est_usage_penetration,
        'TT' AS est_total_time,
        'AFU' AS est_average_session_per_user,
        'ADU' AS est_average_session_duration,
        'ATU' AS est_average_time_per_user_milliseconds,
        'TV' AS est_total_session_count,
        'TS' AS est_share_of_sessions
        )
    )
);
--!DEBUG:SHOW SELECT count(*) AS raw_data_with_est FROM raw_data_with_est;

WITH domain_unified_data AS
(
    SELECT domain_id(domain_name_col='domain', another_domain_col='referral') FROM raw_data_with_est
);
--!DEBUG:SHOW SELECT * FROM domain_unified_data;
--!DEBUG:SHOW SELECT count(*) AS domain_unified_data FROM domain_unified_data;

WITH all_key_data AS (
    SELECT change_to_key(country_col="country",date_col="date",granularity_col="granularity",
    platform_col="platform",device_type_col="device_type")
    FROM domain_unified_data
);
--!DEBUG:SHOW SELECT count(*) AS all_key_data FROM all_key_data;

WITH unified_data AS
(
    SELECT
        country_key,
        market_key,
        device_key,
        date_key,
        granularity_key,
        domain_id AS product_key,
        another_domain_id AS referral_product_key,
        est_active_users,
        est_usage_penetration,
        est_total_time,
        est_average_session_per_user,
        est_average_session_duration,
        est_average_time_per_user_milliseconds,
        est_total_session_count,
        est_share_of_sessions
    FROM
        all_key_data
);

--!DEBUG:SHOW SELECT * FROM unified_data;
--!DEBUG:SHOW SELECT count(*) AS unified_data FROM unified_data;

"""
# Let the project's view helper see the SQL text before it is parsed
# (presumably registers views/macros the statements rely on -- TODO confirm).
views.init(spark, sql_text)

# Parse the SQL text into executable tasks in "transform" mode.
transform_parser = SqlParser(
    spark,
    sql_text,
    parser_type="transform",
    context=context,
)
tasks = transform_parser.parse()
# print(tasks)

# Execute the parsed tasks for this URN with the same context.
transform_executor = SqlExecutor(urn, spark, tasks, "transform", context)
transform_executor.run()
print("done")

In [0]:
%%sh
# Earlier lookups, left commented out (not executed):
#aws s3 ls s3://b2c-prod-dca-mobile-web-to-int/oss/MOBILE_WEB_UNIFIED_PRODUCT_METRICS/
#aws s3 ls s3://aardvark-prod-pdx-mdm-to-int/city_level/report/routine/version=1.0.0/granularity=monthly/date=2020-08-31/country_code=AE/device_code=android-phone/
# List the store.product.v10 dimension files under _update_date=2020-09-17.
aws s3 ls s3://b2c-prod-data-pipeline-unified-store-free/unified/store.product.v10/dimension/_update_date=2020-09-17/

In [0]:
%%sh
# List the usage_retention D30 routine batch output (v3.0.0) for
# range_type=MONTH, date=2020-07-31, metric_name=RRD30, platform=1.
aws s3 ls s3://aardvark-prod-pdx-mdm-to-int/usage_retention/batch=retention_batch_D30_routine/version=v3.0.0/range_type=MONTH/date=2020-07-31/metric_name=RRD30/platform=1/

In [0]:
%%sh
# Recursively list everything under the 2018-08-31 MONTH RRD30 retention prefix
# for US, with human-readable sizes and a trailing object-count/total-size summary.
aws s3 ls s3://aardvark-prod-pdx-to-int-usage-cohorts/retention/retention_batch/v3.0.0/MONTH/2018-08-31/RRD30/US/ --summarize --recursive --human-readable

In [0]:

# S3 prefixes inspected from this notebook. Only the unified referral table is
# actually read in this cell; the others are kept for the commented-out checks.
# NOTE(review): `retetion_path` and `unidifed_referral_path` are misspelled
# ("retention", "unified"); the names are kept as-is because other cells may
# reference them.
city_level_path = "s3://aardvark-prod-pdx-mdm-to-int/city_level/report/routine/version=1.0.0/granularity=monthly/date=2020-08-31/"
product_attr_path = "s3://b2c-prod-dca-mobile-web-to-int/oss/MOBILE_WEB_UNIFIED_PRODUCT_METRICS/version=v1.0.0/granularity=MONTH/date=2020-08-31/"
mobile_web_browser_path = "s3://b2c-prod-dca-mobile-web-to-int/oss/MOBILE_WEB_DOMAIN_BROWSER_METRICS/version=v1.0.0/granularity=MONTH/date=2020-08-31/"
mobile_web_referral_path = "s3://b2c-prod-dca-mobile-web-to-int/oss/MOBILE_WEB_DOMAIN_REFERRAL_METRICS/version=v1.0.0/granularity=MONTH/date=2020-08-31/"
mobile_web_outbound_path = "s3://b2c-prod-dca-mobile-web-to-int/oss/MOBILE_WEB_DOMAIN_OUTBOUND_METRICS/version=v1.0.0/granularity=MONTH/date=2020-08-31/"
domain_path = "s3://b2c-prod-data-pipeline-unified-store-free/unified/store.product.v10/dimension/_update_date=2020-09-18/"
retetion_path = "s3://b2c-prod-dca-mobile-web-to-int/oss/MOBILE_WEB_DOMAIN_RETENTION_METRICS/version=v1.0.0/granularity=MONTH/date=2019-01-31/"

# Partition directory of the unified referral table for monthly / 2020-08-31.
unidifed_referral_path = "s3://b2c-prod-data-pipeline-unified-usage/unified/usage.domain-referral.v6/fact/granularity=monthly/date=2020-08-31/"
# Root of the same Delta table, i.e. the directory that contains _delta_log.
unified_referral_table_path = "s3://b2c-prod-data-pipeline-unified-usage/unified/usage.domain-referral.v6/fact/"

#df = spark.read.option('basePath', 's3://b2c-prod-dca-mobile-web-to-int/oss/MOBILE_WEB_DOMAIN_RETENTION_METRICS/version=v1.0.0/').parquet(retetion_path).show()
#df = spark.read.parquet(unidifed_referral_path)
#df.show(10)

# Delta tables should be loaded from the table root and narrowed with a
# predicate on the partition columns; pointing load() at a partition directory
# is flagged by Delta as incorrect usage. The column names and values in the
# filter mirror the directory names in unidifed_referral_path.
df = (
    spark.read.format("delta")
    .load(unified_referral_table_path)
    .where("granularity = 'monthly' AND date = '2020-08-31'")
)
# Row count of the selected partition; shown as the cell output.
df.count()
#df.printSchema()
#df.groupBy("_identifier").count().orderBy("count", ascending=False).show(10)
#df.distinct().count()
#df.createOrReplaceTempView("temp_table_ls")
#spark.sql("select * from temp_table_ls limit 10").collect()