In [0]:


import datetime
import pandas as pd

def get_date_list(begin_date, end_date, freq):
    """Return the dates of a pandas date range as 'YYYY-MM-DD' strings.

    Args:
        begin_date: Start of the range, passed to ``pd.date_range(start=...)``.
        end_date: End of the range, passed to ``pd.date_range(end=...)``.
        freq: Frequency alias passed to ``pd.date_range(freq=...)``.

    Returns:
        list[str]: One formatted date per entry produced by ``pd.date_range``;
        empty when no date of the given frequency falls inside the range.
    """
    timestamps = pd.date_range(start=begin_date, end=end_date, freq=freq)
    # Format the whole index at once instead of element by element.
    return timestamps.strftime('%Y-%m-%d').tolist()


def get_date_granularity_mapping_list(begin_date, end_date):
    """Build the per-granularity date lists for the given range.

    Args:
        begin_date: Start of the range, forwarded to ``get_date_list``.
        end_date: End of the range, forwarded to ``get_date_list``.

    Returns:
        dict: ``{"weekly": [...], "monthly": [...]}`` where each value is the
        list of 'YYYY-MM-DD' strings returned by ``get_date_list`` for the
        "W-SAT" and "M" frequency aliases respectively.
    """
    # NOTE(review): newer pandas releases appear to deprecate the "M" alias in
    # favour of "ME" — confirm against the pandas version in use.
    granularity_freqs = (("weekly", "W-SAT"), ("monthly", "M"))
    return {
        name: get_date_list(begin_date, end_date, alias)
        for name, alias in granularity_freqs
    }

# Inclusive date window to summarise.
# NOTE(review): both bounds are 2021-06-03, which is neither a Saturday nor a
# month end, so the "W-SAT" and "M" date lists come back empty and the loop
# below has nothing to process — confirm the intended window.
begin_date = datetime.datetime.strptime("2021-06-03", '%Y-%m-%d')
end_date = datetime.datetime.strptime("2021-06-03", '%Y-%m-%d')


DATE_DICT = get_date_granularity_mapping_list(begin_date, end_date)
# Maps a granularity key to the value used in the `range_type=` path segment.
range_type_mapping = {"daily":"DAY","monthly":"MONTH","weekly":"WEEK"}

# Source partition path, filled in per (range_type, date).
AUDIENCE_SOURCE_TEMPLATE = "s3://aardvark-prod-pdx-mdm-to-int/to_tech/audience/version=fix_crow_1.0.0/range_type={range_type}/date={date}/"
# Destination that every per-date summary row is appended to.
AUDIENCE_SUMMARY_PATH = "s3://b2c-prod-data-pipeline-qa/aa.tom.s6usage/2021-08-11/audience/"
# One-row summary: distinct counts per dimension plus the sum of IDX.
AUDIENCE_SUMMARY_SQL = (
    "select  '{granularity}' as granularity, '{date}' as date, "
    "count(distinct(app_id)) as count_app_id, "
    "count(distinct(country)) as count_country, "
    "count(distinct(device_type)) as count_device_type,"
    "count(distinct(age)) as count_age, "
    "count(distinct(gender)) as count_gender, "
    "sum(IDX) as sum from test_audience"
)


for granularity in DATE_DICT:
    if not DATE_DICT[granularity]:
        # Make an empty date list visible instead of silently doing nothing.
        print("SKIP {}: no dates between {:%Y-%m-%d} and {:%Y-%m-%d}".format(granularity, begin_date, end_date))
        continue

    for date in DATE_DICT[granularity]:
        try:
            s3path = AUDIENCE_SOURCE_TEMPLATE.format(range_type=range_type_mapping[granularity], date=date)
            spark.read.parquet(s3path).createOrReplaceTempView("test_audience")
            df = spark.sql(AUDIENCE_SUMMARY_SQL.format(date=date, granularity=granularity))
            df.write.format("parquet").save(AUDIENCE_SUMMARY_PATH, mode="append") #append
            print("PASS on {} {}".format(granularity, date))
        except Exception as e:
            # Best-effort: keep going with the remaining dates, but report the
            # cause so a failure can be diagnosed from the cell output.
            print("ERROR on {} {}: {!r}".format(granularity, date, e))

In [0]:


# "aa.usage.basic-kpi.v6", s3://aardvark-prod-pdx-mdm-to-int/basic_kpi/version=ios_diamond_1.0.0/ day week month
# "aa.usage.audience.v6", s3://aardvark-prod-pdx-mdm-to-int/to_tech/audience/version=ios_diamond_1.0.0/ month
# "aa.usage.seg-by-age-gender.v6", s3://aardvark-prod-pdx-mdm-to-int/to_tech/demographics/version=ios_diamond_1.0.0/ month week
# "aa.usage.app-cross-app.v6", s3://aardvark-prod-pdx-mdm-to-int/to_tech/crossapps/version=ios_diamond_1.0.0/ month
# "aa.usage.seg-by-product.v6", s3://aardvark-prod-pdx-mdm-to-int/to_tech/appused/version=ios_diamond_1.0.0/ month week

# Preview the first 3 rows of each summary dataset, in the original order.
for preview_name in ("basic_kpi", "audience", "demographics", "crossapps", "appused"):
    spark.read.parquet("s3://b2c-prod-data-pipeline-qa/aa.tom.s6usage/{}/".format(preview_name)).show(3)

# Run date whose parquet summaries are exported to CSV.
date='2021-07-16'

# Datasets to export, in the order they are processed.
EXPORT_DATASETS = (
    "basic_kpi",
    "audience",
    "demographics",
    "crossapps",
    "appused",
    "domain_outbound",
    "domain_referral",
    "test_app_cross_domain",
    "test_domain_cross_app",
    "test_domain_cross_domain",
    "test_domain_unified_attribution",
    "mw_basic_kpi",
)

# Read each dataset's parquet output for `date` and write it back as CSV with a
# header row under the adhoc/ prefix. coalesce(1) reduces the data to a single
# partition before the write.
for export_name in EXPORT_DATASETS:
    export_source = "s3://b2c-prod-data-pipeline-qa/aa.tom.s6usage/{}/{}/".format(date, export_name)
    export_target = "s3://b2c-prod-data-pipeline-qa/aa.tom.s6usage/adhoc/{}/{}.csv".format(date, export_name)
    spark.read.parquet(export_source).coalesce(1).write.csv(export_target, header='true')


