In [0]:


import datetime
import pandas as pd

def get_date_list(begin_date, end_date, freq):
    """Return the dates between two bounds as 'YYYY-MM-DD' strings.

    Args:
        begin_date: Start of the range, passed to ``pd.date_range`` as ``start``.
        end_date: End of the range, passed to ``pd.date_range`` as ``end``.
        freq: pandas frequency alias (e.g. "D", "W-SAT") controlling the step.

    Returns:
        list[str]: One formatted date string per timestamp produced by
        ``pd.date_range``; empty when the range yields no timestamps.
    """
    timestamps = pd.date_range(start=begin_date, end=end_date, freq=freq)
    formatted_dates = []
    for stamp in timestamps:
        formatted_dates.append(stamp.strftime('%Y-%m-%d'))
    return formatted_dates


def get_date_granularity_mapping_list(begin_date, end_date):
    """Build the per-granularity date lists for a date window.

    Despite the name, the result is a dict, not a list.

    Args:
        begin_date: Start of the window, forwarded to ``get_date_list``.
        end_date: End of the window, forwarded to ``get_date_list``.

    Returns:
        dict: Keys "daily", "weekly" and "monthly", each mapped to the list of
        'YYYY-MM-DD' strings that ``get_date_list`` returns for the pandas
        frequency aliases "D", "W-SAT" and "M" respectively.
    """
    # (granularity label, pandas frequency alias) pairs, in output order.
    freq_by_granularity = (
        ("daily", "D"),
        ("weekly", "W-SAT"),
        ("monthly", "M"),
    )
    return {
        label: get_date_list(begin_date, end_date, freq_alias)
        for label, freq_alias in freq_by_granularity
    }

# Date window handed to get_date_granularity_mapping_list below.
begin_date = datetime.datetime.strptime("2018-12-30", "%Y-%m-%d")
end_date = datetime.datetime.strptime("2021-07-15", "%Y-%m-%d")

# Granularity label -> value substituted into the `granularity=` segment of
# the input S3 path by the processing loop.
range_type_mapping = {
    "daily": "DAY",
    "monthly": "MONTH",
    "weekly": "WEEK",
}

# Granularity label -> list of 'YYYY-MM-DD' date strings inside the window.
DATE_DICT = get_date_granularity_mapping_list(begin_date, end_date)

# Register the domain whitelist as temp view "white"; the aggregation query
# below restricts results to domains whose name appears in it.
spark.read.parquet("s3://b2c-prod-data-pipeline-qa/aa.tom.s6usage/adhoc/domain_list/2021-07-08/").createOrReplaceTempView("white")


# For every (granularity, date) pair: load that partition, aggregate the
# non-zero, non-null 'AU' metric rows for whitelisted domains into a single
# summary row, and append that row to the output location.
# NOTE: the write uses mode="append", so re-running this cell presumably adds
# a second copy of every summary row — clear the output prefix first if that
# is not wanted.
for granularity in DATE_DICT:
    for date in DATE_DICT[granularity]:
        try:
            s3path = "s3://b2c-prod-dca-mobile-web-to-int/oss/MOBILE_WEB_DOMAIN_BASIC_METRICS/version=ios_diamond_2.0.0/granularity={range_type}/date={date}/".format(range_type=range_type_mapping[granularity], date=date)
            # The view name is reused on each iteration, so the query always
            # runs against the partition that was loaded last.
            spark.read.parquet(s3path).createOrReplaceTempView("test_mw_basic_kpi")
            df = spark.sql("select '{granularity}' as granularity, '{date}' as date, count(distinct(domain)) as count_app_id, count(distinct(country)) as count_country, count(distinct(device_type)) as count_device_type, sum(value) as sum from test_mw_basic_kpi where metric_name='AU' and  domain in (select domain_name from white) and value <> 0 and value is not null".format(date=date, granularity=granularity))
            df.write.format("parquet").save("s3://b2c-prod-data-pipeline-qa/aa.tom.s6usage/2021-07-08/mw_basic_kpi/", mode="append") #append
            print("PASS on {} {}".format(granularity, date))
        except Exception as e:
            # Best-effort: one failing partition must not abort the whole run,
            # but report the cause so failures can be told apart and diagnosed
            # instead of being silently discarded.
            print("ERROR on {} {}: {!r}".format(granularity, date, e))


In [0]:


# Preview the parquet data stored under the basic_kpi/ prefix.
# NOTE(review): the processing loop earlier in this notebook writes to
# .../aa.tom.s6usage/2021-07-08/mw_basic_kpi/, which is a different prefix
# from the one read here — confirm this is the intended location.
spark.read.parquet("s3://b2c-prod-data-pipeline-qa/aa.tom.s6usage/basic_kpi/").show()


In [0]:
%%sh

# "aa.usage.basic-kpi.v6", s3://aardvark-prod-pdx-mdm-to-int/basic_kpi/version=ios_diamond_1.0.0/
# "aa.usage.audience.v6", s3://aardvark-prod-pdx-mdm-to-int/to_tech/audience/version=ios_diamond_1.0.0/
# "aa.usage.seg-by-age-gender.v6", s3://aardvark-prod-pdx-mdm-to-int/to_tech/demographics/version=ios_diamond_1.0.0/
# "aa.usage.app-cross-app.v6",  s3://aardvark-prod-pdx-mdm-to-int/to_tech/crossapps/version=ios_diamond_1.0.0/
# "aa.usage.seg-by-product.v6", s3://aardvark-prod-pdx-mdm-to-int/to_tech/appused/version=ios_diamond_1.0.0/


# List what exists under the basic_kpi prefix for one range_type at a time.
# Only the MONTH listing is active; the DAY (single date/platform) and WEEK
# variants are kept commented out as alternatives.
# aws s3 ls s3://aardvark-prod-pdx-mdm-to-int/basic_kpi/version=ios_diamond_1.0.0/range_type=DAY/date=2018-06-03/platform=2/
aws s3 ls s3://aardvark-prod-pdx-mdm-to-int/basic_kpi/version=ios_diamond_1.0.0/range_type=MONTH/
# aws s3 ls s3://aardvark-prod-pdx-mdm-to-int/basic_kpi/version=ios_diamond_1.0.0/range_type=WEEK/

In [0]:
%%sh
