In [0]:

from pyspark.sql.functions import sum
from pyspark.sql.functions import desc
import datetime
from dateutil.relativedelta import relativedelta


# Run parameters read by the comparison loop further down this cell.
# Both window bounds are parsed with the same day-level format.
_DAY_FORMAT = '%Y-%m-%d'
start_date, end_date = (
    datetime.datetime.strptime(day_text, _DAY_FORMAT)
    for day_text in ('2020-01-01', '2020-01-31')
)
# Roll-up level under test; get_date_range handles 'weekly', 'monthly',
# 'quarterly' and 'yearly'.
granularity = 'quarterly'


def get_date_range(granularity, start_date):
    """Return the end of the window that begins at ``start_date``.

    Args:
        granularity: One of 'weekly', 'monthly', 'quarterly' or 'yearly'.
        start_date: datetime marking the beginning of the window.

    Returns:
        ``start_date`` advanced by one week, one month, three months or
        twelve months respectively.

    Raises:
        ValueError: If ``granularity`` is not one of the supported values.
    """
    if granularity == 'weekly':
        step = relativedelta(weeks=1)
    elif granularity == 'monthly':
        step = relativedelta(months=1)
    elif granularity == 'quarterly':
        step = relativedelta(months=3)
    elif granularity == 'yearly':
        step = relativedelta(months=12)
    else:
        # Fail loudly instead of handing back a date that never advances.
        raise ValueError("unsupported granularity: {!r}".format(granularity))
    # Return the advanced date, not start_date itself: the caller assigns the
    # result back to its loop variable and loops until it reaches the end date.
    return start_date + step


# Window by window, re-aggregate the daily category facts and compare them
# with the stored pre-aggregated facts for the configured granularity.
from aadatapipelinecore.core.utils.retry import retry

UNIFIED_FACT_ROOT = "s3://b2c-prod-data-pipeline-unified-store-paid/unified/"
DAILY_FACT_PATH = UNIFIED_FACT_ROOT + "store.app-est-category-load.v3/fact/"
CATEGORY_FACT_PATH = UNIFIED_FACT_ROOT + "store.app-est-category.v3/fact/"
PRE_AGGR_FACT_PATH = UNIFIED_FACT_ROOT + "store.app-est-category-pre-aggr.v3/fact/"
RESULT_PATH = "s3://b2c-prod-data-pipeline-qa/aa.usage/result_store_dump_v1_count_0608/daily/"

DIFF_COLUMNS = ("app_id, category_id, country_code, device_code, "
                "free_app_download, paid_app_download, revenue")
DIFF_SQL = "select {cols} from {left} except all select {cols} from {right}"
DIFF_SAMPLE_ROWS = 20


def write_test_result(result_df):
    """Append a DataFrame of mismatching rows to the QA result location."""
    # NOTE(review): the diff frames built in the loop below do not contain a
    # `type` column, so partitionBy=["type"] looks like it will be rejected
    # by Spark -- confirm the intended schema of this result table.
    result_df.write.format("delta").save(
        RESULT_PATH,
        mode="append",
        partitionBy=["type"])


start = start_date
while start < end_date:
    end = get_date_range(granularity, start)
    if end <= start:
        # A window that does not advance would make this loop spin forever.
        raise RuntimeError(
            "get_date_range did not advance past {} for granularity {!r}".format(start, granularity))

    # Interpolate plain YYYY-MM-DD values. str(datetime) renders
    # 'YYYY-MM-DD 00:00:00', which does not compare equal to a date value
    # when Spark compares it as a string (Spark 2.x casts dates to strings).
    # Presumably `date` holds day-level values -- TODO confirm.
    start_day = start.strftime('%Y-%m-%d')
    end_day = end.strftime('%Y-%m-%d')

    # NOTE(review): `between` includes both bounds, so the daily window also
    # contains the `end` day itself -- confirm that is the intended range.
    category_daily_df = spark.read.format("delta").load(DAILY_FACT_PATH).where(
        "granularity='daily' and date between '{}' and '{}'".format(start_day, end_day))

    # `sum` is pyspark.sql.functions.sum (imported at the top of the cell),
    # not the Python builtin.
    agg_df = category_daily_df.groupBy('app_id', 'country_code', 'device_code', 'category_id').agg(
        sum('est_free_app_download').alias('free_app_download'),
        sum('est_paid_app_download').alias('paid_app_download'),
        sum('est_revenue').alias('revenue'))
    agg_df.createOrReplaceTempView("agg_df")

    # Weekly and monthly facts are read from store.app-est-category.v3; every
    # other granularity is read from store.app-est-category-pre-aggr.v3.
    if granularity in ('weekly', 'monthly'):
        pre_agg_path = CATEGORY_FACT_PATH
    else:
        pre_agg_path = PRE_AGGR_FACT_PATH
    pre_agg_df = spark.read.format("delta").load(pre_agg_path).where(
        "granularity='{}' and date='{}' and data_stage='final'".format(granularity, end_day))
    pre_agg_df.createOrReplaceTempView("pre_agg_df")

    # Rows present on one side but not the other, in both directions.
    diff_df1 = spark.sql(DIFF_SQL.format(cols=DIFF_COLUMNS, left="agg_df", right="pre_agg_df"))
    diff_df2 = spark.sql(DIFF_SQL.format(cols=DIFF_COLUMNS, left="pre_agg_df", right="agg_df"))
    diff_count1 = diff_df1.count()
    diff_count2 = diff_df2.count()

    if diff_count1 != 0 or diff_count2 != 0:
        # Report first so the verdict is visible even if persisting fails.
        print("Store Category Test FAIL!!!!! date: {}, diff_count1: {}, diff_count2: {}".format(end, diff_count1, diff_count2))

        # limit() keeps a DataFrame; take() returns a plain list of Rows,
        # which has no `.write` attribute.
        if diff_count1 != 0:
            df_write_result = diff_df1.limit(DIFF_SAMPLE_ROWS)
        else:
            df_write_result = diff_df2.limit(DIFF_SAMPLE_ROWS)
        retry(write_test_result, (df_write_result,), {}, interval=10)
    else:
        print("Store Category Test PASS! date: {}, diff_count1: {}, diff_count2: {}".format(end, diff_count1, diff_count2))
    start = end

In [0]:
%%sh
# List the objects under the granularity=weekly prefix of the
# store.app-est-category.v3 fact location.
aws s3 ls s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category.v3/fact/granularity=weekly/

In [0]:
%%sh
# Same listing command as the preceding cell: objects under the
# granularity=weekly prefix of the store.app-est-category.v3 fact location.
# NOTE(review): this cell duplicates the previous one verbatim.
aws s3 ls s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category.v3/fact/granularity=weekly/

In [0]:
%%sh
