In [0]:

import datetime
from dateutil.relativedelta import relativedelta
from pyspark.sql import Row
from aadatapipelinecore.core.utils.retry import retry

# Translates the unified granularity name into the range_type token that
# appears in the routine (raw) S3 path.
raw_granularity_dict = {
    'daily': 'DAY',
    'weekly': 'WEEK',
    'monthly': 'MONTH',
}
# Module-level result accumulator (not appended to by the functions in this cell).
test_result = list()


def last_day_of_month(check_month):
    """Return the last calendar day of the month that contains ``check_month``.

    Day 28 exists in every month, so day 28 plus four days always lands in
    the following month; subtracting that landing day-of-month steps back to
    the final day of the original month.
    """
    anchor = check_month.replace(day=28)
    spill_over = anchor + datetime.timedelta(days=4)
    return spill_over - datetime.timedelta(days=spill_over.day)


def get_monthly_date_list():
    """Build one ``Row`` per month-end date inside the hard-coded monthly window.

    Returns:
        list: ``Row`` objects, each wrapping a single ``'%Y-%m-%d'`` string.
    """
    result = []
    # Plain decimal literals: the zero-padded spelling (e.g. ``02``) is legacy
    # octal notation, a SyntaxError on Python 3 and invalid for 08/09 on Python 2.
    end = datetime.date(2020, 2, 29)
    start = datetime.date(2019, 10, 31)
    while start <= end:
        # Snap to the month end on every pass: after the +1 month step the
        # cursor may sit before the last day of a longer month.
        start = last_day_of_month(start)
        result.append(Row(start.strftime('%Y-%m-%d')))
        start += relativedelta(months=1)
    return result


def get_weekly_date_list():
    """Build one ``Row`` per week inside the hard-coded weekly window.

    Returns:
        list: ``Row`` objects, each wrapping one ``'%Y-%m-%d'`` string, from
        the start date to the end date (inclusive) in seven-day steps.
    """
    # Plain decimal literals: the zero-padded spelling (e.g. ``03``) is legacy
    # octal notation, a SyntaxError on Python 3 and invalid for 08/09 on Python 2.
    end = datetime.date(2020, 3, 28)
    start = datetime.date(2019, 10, 5)
    result = []
    while start <= end:
        result.append(Row(start.strftime('%Y-%m-%d')))
        start += relativedelta(weeks=1)
    return result


def get_daily_date_list():
    """Build one ``Row`` per day inside the hard-coded daily window.

    Returns:
        list: ``Row`` objects, each wrapping one ``'%Y-%m-%d'`` string, from
        the start date to the end date (inclusive).
    """
    # Plain decimal literals: the zero-padded spelling (e.g. ``03``) is legacy
    # octal notation, a SyntaxError on Python 3 and invalid for 08/09 on Python 2.
    end = datetime.date(2020, 3, 28)
    start = datetime.date(2019, 11, 22)
    result = []
    while start <= end:
        result.append(Row(start.strftime('%Y-%m-%d')))
        start += relativedelta(days=1)
    return result


def get_path_date_list(granularity):
    """Return ``(month, date)`` string pairs for the requested granularity.

    Args:
        granularity: One of ``'daily'``, ``'weekly'`` or ``'monthly'``.

    Returns:
        list: Tuples ``('YYYY-MM', 'YYYY-MM-DD')``, one per generated date.

    Raises:
        ValueError: If ``granularity`` is not supported. Previously this
            surfaced as an ``UnboundLocalError`` on ``collect_date``.
    """
    # Mutually exclusive branches, so exactly one generator runs.
    if granularity == 'daily':
        collect_date = get_daily_date_list()
    elif granularity == 'weekly':
        collect_date = get_weekly_date_list()
    elif granularity == 'monthly':
        collect_date = get_monthly_date_list()
    else:
        raise ValueError('Unsupported granularity: {!r}'.format(granularity))
    # Field 0 of each entry is the 'YYYY-MM-DD' string; its first seven
    # characters form the 'YYYY-MM' month prefix.
    return [(x[0][:7], x[0]) for x in collect_date]


def check_usage_routine_v1_completeness(_granularity, date_list):
    """Compare routine and unified v1 row counts per date and print the verdict.

    Args:
        _granularity: Unified granularity name; also a key of
            ``raw_granularity_dict`` for the routine path.
        date_list: Sequence of ``(month, date)`` pairs; only the date at
            index 1 is used.
    """
    unified_template = ('s3://b2c-prod-data-pipeline-unified-usage/unified/'
                        'usage.basic-kpi.v1/fact/granularity={unified_granularity}/date={unified_date}/')
    raw_template = ('s3://aardvark-prod-pdx-mdm-to-int/basic_kpi/version=v3.0.0/'
                    'range_type={raw_granularity}/date={raw_date}/')
    for entry in date_list:
        day = entry[1]
        unified_location = unified_template.format(unified_granularity=_granularity, unified_date=day)
        raw_location = raw_template.format(raw_granularity=raw_granularity_dict[_granularity], raw_date=day)
        # Both sides are read as Parquet; the routine side is counted first.
        routine_count = spark.read.parquet(raw_location).count()
        v1_count = spark.read.parquet(unified_location).count()
        if routine_count == v1_count:
            print('Completeness Test Pass! date: {} '.format(day))
        else:
            print('Completeness Test Fail!!! routine data: {}, unified data: {}, date: {}'.format(
                routine_count, v1_count, day))


# Run the completeness check for each listed granularity. 'pass' is printed
# once the loop finishes, regardless of the individual per-date verdicts.
granularity_list = ["daily"]
for granularity in granularity_list:
    check_usage_routine_v1_completeness(granularity, get_path_date_list(granularity))
print('pass')

In [0]:
%%sh
# List the contents of the routine basic_kpi v3.0.0 DAY prefix.
aws s3 ls s3://aardvark-prod-pdx-mdm-to-int/basic_kpi/version=v3.0.0/range_type=DAY/

In [0]:

# Row count of the routine v3.0.0 DAY partition for 2019-11-23.
df = spark.read.parquet(
    's3://aardvark-prod-pdx-mdm-to-int/basic_kpi/version=v3.0.0/range_type=DAY/date=2019-11-23/')
print(df.count())

In [0]:

# Row count of the routine v3.0.0 DAY partition for 2019-11-24.
df = spark.read.parquet(
    's3://aardvark-prod-pdx-mdm-to-int/basic_kpi/version=v3.0.0/range_type=DAY/date=2019-11-24/')
print(df.count())

In [0]:

import datetime
from dateutil.relativedelta import relativedelta
from pyspark.sql import Row
from aadatapipelinecore.core.utils.retry import retry

# Translates the unified granularity name into the range_type token that
# appears in the routine (raw) S3 path.
raw_granularity_dict = {
    'daily': 'DAY',
    'weekly': 'WEEK',
    'monthly': 'MONTH',
}
# Module-level accumulator: the completeness check appends one tuple per date.
test_result = list()


def last_day_of_month(check_month):
    """Return the last calendar day of the month that contains ``check_month``.

    Day 28 exists in every month, so day 28 plus four days always lands in
    the following month; subtracting that landing day-of-month steps back to
    the final day of the original month.
    """
    anchor = check_month.replace(day=28)
    spill_over = anchor + datetime.timedelta(days=4)
    return spill_over - datetime.timedelta(days=spill_over.day)


def get_monthly_date_list():
    """Build one ``Row`` per month-end date inside the hard-coded monthly window.

    Returns:
        list: ``Row`` objects, each wrapping a single ``'%Y-%m-%d'`` string.
    """
    result = []
    # Plain decimal literals: the zero-padded spelling (e.g. ``02``) is legacy
    # octal notation, a SyntaxError on Python 3 and invalid for 08/09 on Python 2.
    end = datetime.date(2020, 2, 29)
    start = datetime.date(2019, 10, 31)
    while start <= end:
        # Snap to the month end on every pass: after the +1 month step the
        # cursor may sit before the last day of a longer month.
        start = last_day_of_month(start)
        result.append(Row(start.strftime('%Y-%m-%d')))
        start += relativedelta(months=1)
    return result


def get_weekly_date_list():
    """Build one ``Row`` per week inside the hard-coded weekly window.

    Returns:
        list: ``Row`` objects, each wrapping one ``'%Y-%m-%d'`` string, from
        the start date to the end date (inclusive) in seven-day steps.
    """
    # Plain decimal literals: the zero-padded spelling (e.g. ``03``) is legacy
    # octal notation, a SyntaxError on Python 3 and invalid for 08/09 on Python 2.
    end = datetime.date(2020, 3, 28)
    start = datetime.date(2019, 10, 5)
    result = []
    while start <= end:
        result.append(Row(start.strftime('%Y-%m-%d')))
        start += relativedelta(weeks=1)
    return result


def get_daily_date_list():
    """Build one ``Row`` per day inside the hard-coded daily window.

    Returns:
        list: ``Row`` objects, each wrapping one ``'%Y-%m-%d'`` string, from
        the start date to the end date (inclusive).
    """
    # Plain decimal literals: the zero-padded spelling (e.g. ``06``) is legacy
    # octal notation, a SyntaxError on Python 3 and invalid for 08/09 on Python 2.
    end = datetime.date(2020, 6, 20)
    start = datetime.date(2020, 6, 14)
    result = []
    while start <= end:
        result.append(Row(start.strftime('%Y-%m-%d')))
        start += relativedelta(days=1)
    return result


def get_path_date_list(granularity):
    """Return ``(month, date)`` string pairs for the requested granularity.

    Args:
        granularity: One of ``'daily'``, ``'weekly'`` or ``'monthly'``.

    Returns:
        list: Tuples ``('YYYY-MM', 'YYYY-MM-DD')``, one per generated date.

    Raises:
        ValueError: If ``granularity`` is not supported. Previously this
            surfaced as an ``UnboundLocalError`` on ``collect_date``.
    """
    # Mutually exclusive branches, so exactly one generator runs.
    if granularity == 'daily':
        collect_date = get_daily_date_list()
    elif granularity == 'weekly':
        collect_date = get_weekly_date_list()
    elif granularity == 'monthly':
        collect_date = get_monthly_date_list()
    else:
        raise ValueError('Unsupported granularity: {!r}'.format(granularity))
    # Field 0 of each entry is the 'YYYY-MM-DD' string; its first seven
    # characters form the 'YYYY-MM' month prefix.
    return [(x[0][:7], x[0]) for x in collect_date]


def check_usage_routine_v1_completeness(_granularity, date_list):
    """
        Compare routine and unified row counts per date and persist the outcome.

        For every date the routine partition and the unified
        usage.basic-kpi.v5 partition are counted, a pass/fail line is printed
        and one row is appended to the module-level ``test_result`` list. The
        accumulated list is then appended to a Delta table partitioned by
        ``type``.

        date_list:
                [(month1,day1), (month1,day2), (month2,day1), (month2,day2)]
        sample:
            [('2015-12', '2015-12-27'), ('2015-12', '2015-12-28'),
            ('2016-12', '2016-12-27'), ('2016-12', '2016-12-28')]
    """
    v1_path = 's3://b2c-prod-data-pipeline-unified-usage/unified/' \
              'usage.basic-kpi.v5/fact/granularity={unified_granularity}/date={unified_date}/'
    routine_path = 's3://aardvark-prod-pdx-mdm-to-int/basic_kpi/version=1.0.0/' \
                   'range_type={raw_granularity}/date={raw_date}/'
    for date in date_list:
        v1_path_parse = v1_path.format(unified_granularity=_granularity, unified_date=date[1])
        routine_path_parse = routine_path.format(raw_granularity=raw_granularity_dict[_granularity], raw_date=date[1])
        routine_count = spark.read.parquet(routine_path_parse).count()
        # NOTE(review): other cells in this notebook read usage.basic-kpi.v5
        # through the Delta reader; confirm a plain Parquet read is intended.
        v1_count = spark.read.parquet(v1_path_parse).count()
        if routine_count != v1_count:
            print('Completeness Test Fail!!! routine data: {}, unified data: {}, date: {}'.format(
                routine_count, v1_count, date[1]))
        else:
            print('Completeness Test Pass! date: {} '.format(date[1]))
        # Field order must match the schema below (type, date, routine_count,
        # v1_count). The previous order stored the counts under 'date' and
        # 'routine_count' and the date string under 'v1_count'.
        test_result.append((_granularity, date[1], routine_count, v1_count))
    df_write_result = spark.createDataFrame(test_result, schema=['type', 'date', 'routine_count', 'v1_count'])

    from aadatapipelinecore.core.utils.retry import retry

    # NOTE(review): a table already written with the old misaligned layout has
    # different column types; appending corrected rows to it may be rejected
    # by Delta schema enforcement -- use a fresh path or rewrite that table.
    def write_test_result(df_write_result):
        df_write_result.write.format("delta").save(
            "s3://b2c-prod-data-pipeline-qa/aa.usage/result_usage_routine_v1_count_0629/daily/",
            mode="append",
            partitionBy=["type"])
    retry(write_test_result, (df_write_result,), {}, interval=10)


# Run the completeness check for each listed granularity. 'pass' is printed
# once the loop finishes, regardless of the individual per-date verdicts.
granularity_list = ["daily"]
for granularity in granularity_list:
    check_usage_routine_v1_completeness(granularity, get_path_date_list(granularity))
print('pass')


In [0]:
%%sh
# List the contents of the unified usage.basic-kpi.v5 daily fact prefix.
aws s3 ls s3://b2c-prod-data-pipeline-unified-usage/unified/usage.basic-kpi.v5/fact/granularity=daily/


In [0]:

import datetime
from dateutil.relativedelta import relativedelta
from pyspark.sql import Row
from aadatapipelinecore.core.utils.retry import retry

# Translates the unified granularity name into the range_type token that
# appears in the routine (raw) S3 path.
raw_granularity_dict = {
    'daily': 'DAY',
    'weekly': 'WEEK',
    'monthly': 'MONTH',
}
# Module-level accumulator: the completeness check appends one tuple per date.
test_result = list()


def last_day_of_month(check_month):
    """Return the last calendar day of the month that contains ``check_month``.

    Day 28 exists in every month, so day 28 plus four days always lands in
    the following month; subtracting that landing day-of-month steps back to
    the final day of the original month.
    """
    anchor = check_month.replace(day=28)
    spill_over = anchor + datetime.timedelta(days=4)
    return spill_over - datetime.timedelta(days=spill_over.day)


def get_monthly_date_list():
    """Build one ``Row`` per month-end date inside the hard-coded monthly window.

    Returns:
        list: ``Row`` objects, each wrapping a single ``'%Y-%m-%d'`` string.
    """
    result = []
    # Plain decimal literals: the zero-padded spelling (e.g. ``02``) is legacy
    # octal notation, a SyntaxError on Python 3 and invalid for 08/09 on Python 2.
    end = datetime.date(2020, 2, 29)
    start = datetime.date(2019, 10, 31)
    while start <= end:
        # Snap to the month end on every pass: after the +1 month step the
        # cursor may sit before the last day of a longer month.
        start = last_day_of_month(start)
        result.append(Row(start.strftime('%Y-%m-%d')))
        start += relativedelta(months=1)
    return result


def get_weekly_date_list():
    """Build one ``Row`` per week inside the hard-coded weekly window.

    Start and end are the same date here, so exactly one entry is produced.

    Returns:
        list: ``Row`` objects, each wrapping one ``'%Y-%m-%d'`` string.
    """
    # Plain decimal literals: the zero-padded spelling (e.g. ``06``) is legacy
    # octal notation, a SyntaxError on Python 3 and invalid for 08/09 on Python 2.
    end = datetime.date(2020, 6, 20)
    start = datetime.date(2020, 6, 20)
    result = []
    while start <= end:
        result.append(Row(start.strftime('%Y-%m-%d')))
        start += relativedelta(weeks=1)
    return result


def get_daily_date_list():
    """Build one ``Row`` per day inside the hard-coded daily window.

    Returns:
        list: ``Row`` objects, each wrapping one ``'%Y-%m-%d'`` string, from
        the start date to the end date (inclusive).
    """
    # Plain decimal literals: the zero-padded spelling (e.g. ``06``) is legacy
    # octal notation, a SyntaxError on Python 3 and invalid for 08/09 on Python 2.
    end = datetime.date(2020, 6, 20)
    start = datetime.date(2020, 6, 14)
    result = []
    while start <= end:
        result.append(Row(start.strftime('%Y-%m-%d')))
        start += relativedelta(days=1)
    return result


def get_path_date_list(granularity):
    """Return ``(month, date)`` string pairs for the requested granularity.

    Args:
        granularity: One of ``'daily'``, ``'weekly'`` or ``'monthly'``.

    Returns:
        list: Tuples ``('YYYY-MM', 'YYYY-MM-DD')``, one per generated date.

    Raises:
        ValueError: If ``granularity`` is not supported. Previously this
            surfaced as an ``UnboundLocalError`` on ``collect_date``.
    """
    # Mutually exclusive branches, so exactly one generator runs.
    if granularity == 'daily':
        collect_date = get_daily_date_list()
    elif granularity == 'weekly':
        collect_date = get_weekly_date_list()
    elif granularity == 'monthly':
        collect_date = get_monthly_date_list()
    else:
        raise ValueError('Unsupported granularity: {!r}'.format(granularity))
    # Field 0 of each entry is the 'YYYY-MM-DD' string; its first seven
    # characters form the 'YYYY-MM' month prefix.
    return [(x[0][:7], x[0]) for x in collect_date]


def check_usage_routine_v1_completeness(_granularity, date_list):
    """
        Compare routine and unified row counts per date and persist the outcome.

        For every date the routine partition and the unified
        usage.basic-kpi.v5 partition are counted, a pass/fail line is printed
        and one row is appended to the module-level ``test_result`` list. The
        accumulated list is then appended to a Delta table partitioned by
        ``type``.

        date_list:
                [(month1,day1), (month1,day2), (month2,day1), (month2,day2)]
        sample:
            [('2015-12', '2015-12-27'), ('2015-12', '2015-12-28'),
            ('2016-12', '2016-12-27'), ('2016-12', '2016-12-28')]
    """
    v1_path = 's3://b2c-prod-data-pipeline-unified-usage/unified/' \
              'usage.basic-kpi.v5/fact/granularity={unified_granularity}/date={unified_date}/'
    routine_path = 's3://aardvark-prod-pdx-mdm-to-int/basic_kpi/version=1.0.0/' \
                   'range_type={raw_granularity}/date={raw_date}/'
    for date in date_list:
        v1_path_parse = v1_path.format(unified_granularity=_granularity, unified_date=date[1])
        routine_path_parse = routine_path.format(raw_granularity=raw_granularity_dict[_granularity], raw_date=date[1])
        routine_count = spark.read.parquet(routine_path_parse).count()
        # NOTE(review): other cells in this notebook read usage.basic-kpi.v5
        # through the Delta reader; confirm a plain Parquet read is intended.
        v1_count = spark.read.parquet(v1_path_parse).count()
        if routine_count != v1_count:
            print('Completeness Test Fail!!! routine data: {}, unified data: {}, date: {}'.format(
                routine_count, v1_count, date[1]))
        else:
            print('Completeness Test Pass! date: {} '.format(date[1]))
        # Field order must match the schema below (type, date, routine_count,
        # v1_count). The previous order stored the counts under 'date' and
        # 'routine_count' and the date string under 'v1_count'.
        test_result.append((_granularity, date[1], routine_count, v1_count))
    df_write_result = spark.createDataFrame(test_result, schema=['type', 'date', 'routine_count', 'v1_count'])

    from aadatapipelinecore.core.utils.retry import retry

    # NOTE(review): a table already written with the old misaligned layout has
    # different column types; appending corrected rows to it may be rejected
    # by Delta schema enforcement -- use a fresh path or rewrite that table.
    def write_test_result(df_write_result):
        df_write_result.write.format("delta").save(
            "s3://b2c-prod-data-pipeline-qa/aa.usage/result_usage_routine_v1_count_0629/weekly/",
            mode="append",
            partitionBy=["type"])
    retry(write_test_result, (df_write_result,), {}, interval=10)


# Run the completeness check for each listed granularity. 'pass' is printed
# once the loop finishes, regardless of the individual per-date verdicts.
granularity_list = ["weekly"]
for granularity in granularity_list:
    check_usage_routine_v1_completeness(granularity, get_path_date_list(granularity))
print('pass')

In [0]:

# Row count of the routine WEEK partition for 2020-06-20.
# The routine export is read as plain Parquet everywhere else in this
# notebook, so the Parquet reader is used here instead of the Delta reader.
# A discarded Delta query against store.app-est-category-load.v3 was removed:
# its result was never used, and it referenced `start`/`end`, which are not
# defined before this cell (NameError on a fresh kernel).
df = spark.read.parquet('s3://aardvark-prod-pdx-mdm-to-int/basic_kpi/version=1.0.0/range_type=WEEK/date=2020-06-20/')
print(df.count())

In [0]:

# The weekly result table is written with format("delta") by the check cell,
# so it is read back through the Delta reader; a plain Parquet read bypasses
# the table's transaction log.
df = spark.read.format("delta").load('s3://b2c-prod-data-pipeline-qa/aa.usage/result_usage_routine_v1_count_0629/weekly/')
df.orderBy('date').show()

In [0]:

import datetime
from dateutil.relativedelta import relativedelta
from pyspark.sql import Row
from aadatapipelinecore.core.utils.retry import retry

# Translates the unified granularity name into the range_type token that
# appears in the routine (raw) S3 path.
raw_granularity_dict = {
    'daily': 'DAY',
    'weekly': 'WEEK',
    'monthly': 'MONTH',
}
# Module-level accumulator: the completeness check appends one tuple per date.
test_result = list()


def last_day_of_month(check_month):
    """Return the last calendar day of the month that contains ``check_month``.

    Day 28 exists in every month, so day 28 plus four days always lands in
    the following month; subtracting that landing day-of-month steps back to
    the final day of the original month.
    """
    anchor = check_month.replace(day=28)
    spill_over = anchor + datetime.timedelta(days=4)
    return spill_over - datetime.timedelta(days=spill_over.day)


def get_monthly_date_list():
    """Build one ``Row`` per month-end date inside the hard-coded monthly window.

    Start and end are the same month end here, so exactly one entry is produced.

    Returns:
        list: ``Row`` objects, each wrapping a single ``'%Y-%m-%d'`` string.
    """
    result = []
    # Plain decimal literals: the zero-padded spelling (e.g. ``05``) is legacy
    # octal notation, a SyntaxError on Python 3 and invalid for 08/09 on Python 2.
    end = datetime.date(2020, 5, 31)
    start = datetime.date(2020, 5, 31)
    while start <= end:
        # Snap to the month end on every pass: after the +1 month step the
        # cursor may sit before the last day of a longer month.
        start = last_day_of_month(start)
        result.append(Row(start.strftime('%Y-%m-%d')))
        start += relativedelta(months=1)
    return result


def get_weekly_date_list():
    """Build one ``Row`` per week inside the hard-coded weekly window.

    Returns:
        list: ``Row`` objects, each wrapping one ``'%Y-%m-%d'`` string, from
        the start date to the end date (inclusive) in seven-day steps.
    """
    # Plain decimal literals: the zero-padded spelling (e.g. ``03``) is legacy
    # octal notation, a SyntaxError on Python 3 and invalid for 08/09 on Python 2.
    end = datetime.date(2020, 3, 28)
    start = datetime.date(2019, 10, 5)
    result = []
    while start <= end:
        result.append(Row(start.strftime('%Y-%m-%d')))
        start += relativedelta(weeks=1)
    return result


def get_daily_date_list():
    """Build one ``Row`` per day inside the hard-coded daily window.

    Returns:
        list: ``Row`` objects, each wrapping one ``'%Y-%m-%d'`` string, from
        the start date to the end date (inclusive).
    """
    # Plain decimal literals: the zero-padded spelling (e.g. ``06``) is legacy
    # octal notation, a SyntaxError on Python 3 and invalid for 08/09 on Python 2.
    end = datetime.date(2020, 6, 20)
    start = datetime.date(2020, 6, 14)
    result = []
    while start <= end:
        result.append(Row(start.strftime('%Y-%m-%d')))
        start += relativedelta(days=1)
    return result


def get_path_date_list(granularity):
    """Return ``(month, date)`` string pairs for the requested granularity.

    Args:
        granularity: One of ``'daily'``, ``'weekly'`` or ``'monthly'``.

    Returns:
        list: Tuples ``('YYYY-MM', 'YYYY-MM-DD')``, one per generated date.

    Raises:
        ValueError: If ``granularity`` is not supported. Previously this
            surfaced as an ``UnboundLocalError`` on ``collect_date``.
    """
    # Mutually exclusive branches, so exactly one generator runs.
    if granularity == 'daily':
        collect_date = get_daily_date_list()
    elif granularity == 'weekly':
        collect_date = get_weekly_date_list()
    elif granularity == 'monthly':
        collect_date = get_monthly_date_list()
    else:
        raise ValueError('Unsupported granularity: {!r}'.format(granularity))
    # Field 0 of each entry is the 'YYYY-MM-DD' string; its first seven
    # characters form the 'YYYY-MM' month prefix.
    return [(x[0][:7], x[0]) for x in collect_date]


def check_usage_routine_v1_completeness(_granularity, date_list):
    """
        Compare routine and unified row counts per date and collect the outcome.

        For every date the routine Parquet partition and the unified
        usage.basic-kpi.v5 Delta partition are counted, both counts and a
        pass/fail line are printed, and one row is appended to the
        module-level ``test_result`` list.

        date_list:
                [(month1,day1), (month1,day2), (month2,day1), (month2,day2)]
        sample:
            [('2015-12', '2015-12-27'), ('2015-12', '2015-12-28'),
            ('2016-12', '2016-12-27'), ('2016-12', '2016-12-28')]

        Returns the DataFrame built from ``test_result`` (it was previously
        built and discarded); nothing is written to storage in this cell.
    """
    v1_path = 's3://b2c-prod-data-pipeline-unified-usage/unified/' \
              'usage.basic-kpi.v5/fact/granularity={unified_granularity}/date={unified_date}/'
    routine_path = 's3://aardvark-prod-pdx-mdm-to-int/basic_kpi/version=1.0.0/' \
                   'range_type={raw_granularity}/date={raw_date}/'
    for date in date_list:
        v1_path_parse = v1_path.format(unified_granularity=_granularity, unified_date=date[1])
        routine_path_parse = routine_path.format(raw_granularity=raw_granularity_dict[_granularity], raw_date=date[1])
        routine_count = spark.read.parquet(routine_path_parse).count()
        v1_count = spark.read.format("delta").load(v1_path_parse).count()
        print('{} {}'.format(v1_count, routine_count))
        if routine_count != v1_count:
            print('Completeness Test Fail!!! routine data: {}, unified data: {}, date: {}'.format(
                routine_count, v1_count, date[1]))
        else:
            print('Completeness Test Pass! date: {} '.format(date[1]))
        # Field order must match the schema below (type, date, routine_count,
        # v1_count). The previous order stored the counts under 'date' and
        # 'routine_count' and the date string under 'v1_count'.
        test_result.append((_granularity, date[1], routine_count, v1_count))
    df_write_result = spark.createDataFrame(test_result, schema=['type', 'date', 'routine_count', 'v1_count'])
    return df_write_result



# Run the completeness check for each listed granularity. 'pass' is printed
# once the loop finishes, regardless of the individual per-date verdicts.
granularity_list = ["monthly"]
for granularity in granularity_list:
    check_usage_routine_v1_completeness(granularity, get_path_date_list(granularity))
print('pass')

In [0]:

import datetime
from dateutil.relativedelta import relativedelta
from pyspark.sql import Row
from aadatapipelinecore.core.utils.retry import retry

# Translates the unified granularity name into the range_type token that
# appears in the routine (raw) S3 path.
raw_granularity_dict = {
    'daily': 'DAY',
    'weekly': 'WEEK',
    'monthly': 'MONTH',
}
# Module-level accumulator: the completeness check appends one tuple per date.
test_result = list()


def last_day_of_month(check_month):
    """Return the last calendar day of the month that contains ``check_month``.

    Day 28 exists in every month, so day 28 plus four days always lands in
    the following month; subtracting that landing day-of-month steps back to
    the final day of the original month.
    """
    anchor = check_month.replace(day=28)
    spill_over = anchor + datetime.timedelta(days=4)
    return spill_over - datetime.timedelta(days=spill_over.day)


def get_monthly_date_list():
    """Build one ``Row`` per month-end date inside the hard-coded monthly window.

    Returns:
        list: ``Row`` objects, each wrapping a single ``'%Y-%m-%d'`` string.
    """
    result = []
    # Plain decimal literals: the zero-padded spelling (e.g. ``02``) is legacy
    # octal notation, a SyntaxError on Python 3 and invalid for 08/09 on Python 2.
    end = datetime.date(2020, 2, 29)
    start = datetime.date(2019, 10, 31)
    while start <= end:
        # Snap to the month end on every pass: after the +1 month step the
        # cursor may sit before the last day of a longer month.
        start = last_day_of_month(start)
        result.append(Row(start.strftime('%Y-%m-%d')))
        start += relativedelta(months=1)
    return result


def get_weekly_date_list():
    """Build one ``Row`` per week inside the hard-coded weekly window.

    Returns:
        list: ``Row`` objects, each wrapping one ``'%Y-%m-%d'`` string, from
        the start date to the end date (inclusive) in seven-day steps.
    """
    # Plain decimal literals: the zero-padded spelling (e.g. ``03``) is legacy
    # octal notation, a SyntaxError on Python 3 and invalid for 08/09 on Python 2.
    end = datetime.date(2020, 3, 28)
    start = datetime.date(2019, 10, 5)
    result = []
    while start <= end:
        result.append(Row(start.strftime('%Y-%m-%d')))
        start += relativedelta(weeks=1)
    return result


def get_daily_date_list():
    """Build one ``Row`` per day inside the hard-coded daily window.

    Returns:
        list: ``Row`` objects, each wrapping one ``'%Y-%m-%d'`` string, from
        the start date to the end date (inclusive).
    """
    # Plain decimal literals: the zero-padded spelling (e.g. ``06``) is legacy
    # octal notation, a SyntaxError on Python 3 and invalid for 08/09 on Python 2.
    end = datetime.date(2020, 6, 27)
    start = datetime.date(2020, 5, 24)
    result = []
    while start <= end:
        result.append(Row(start.strftime('%Y-%m-%d')))
        start += relativedelta(days=1)
    return result


def get_path_date_list(granularity):
    """Return ``(month, date)`` string pairs for the requested granularity.

    Args:
        granularity: One of ``'daily'``, ``'weekly'`` or ``'monthly'``.

    Returns:
        list: Tuples ``('YYYY-MM', 'YYYY-MM-DD')``, one per generated date.

    Raises:
        ValueError: If ``granularity`` is not supported. Previously this
            surfaced as an ``UnboundLocalError`` on ``collect_date``.
    """
    # Mutually exclusive branches, so exactly one generator runs.
    if granularity == 'daily':
        collect_date = get_daily_date_list()
    elif granularity == 'weekly':
        collect_date = get_weekly_date_list()
    elif granularity == 'monthly':
        collect_date = get_monthly_date_list()
    else:
        raise ValueError('Unsupported granularity: {!r}'.format(granularity))
    # Field 0 of each entry is the 'YYYY-MM-DD' string; its first seven
    # characters form the 'YYYY-MM' month prefix.
    return [(x[0][:7], x[0]) for x in collect_date]


def check_usage_routine_v1_completeness(_granularity, date_list):
    """
        Compare routine and unified row counts per date and persist the outcome.

        For every date the routine Parquet partition and the unified
        usage.basic-kpi.v5 Delta partition are counted, a pass/fail line is
        printed and one row is appended to the module-level ``test_result``
        list. The accumulated list then overwrites a Delta table partitioned
        by ``type``.

        date_list:
                [(month1,day1), (month1,day2), (month2,day1), (month2,day2)]
        sample:
            [('2015-12', '2015-12-27'), ('2015-12', '2015-12-28'),
            ('2016-12', '2016-12-27'), ('2016-12', '2016-12-28')]
    """
    v1_path = 's3://b2c-prod-data-pipeline-unified-usage/unified/' \
              'usage.basic-kpi.v5/fact/granularity={unified_granularity}/date={unified_date}/'
    routine_path = 's3://aardvark-prod-pdx-mdm-to-int/basic_kpi/version=1.0.0/' \
                   'range_type={raw_granularity}/date={raw_date}/'
    for date in date_list:
        v1_path_parse = v1_path.format(unified_granularity=_granularity, unified_date=date[1])
        routine_path_parse = routine_path.format(raw_granularity=raw_granularity_dict[_granularity], raw_date=date[1])
        routine_count = spark.read.parquet(routine_path_parse).count()
        v1_count = spark.read.format("delta").load(v1_path_parse).count()
        if routine_count != v1_count:
            print('Completeness Test Fail!!! routine data: {}, unified data: {}, date: {}'.format(
                routine_count, v1_count, date[1]))
        else:
            print('Completeness Test Pass! date: {} '.format(date[1]))
        # Field order must match the schema below (type, date, routine_count,
        # v1_count). The previous order stored the counts under 'date' and
        # 'routine_count' and the date string under 'v1_count'.
        test_result.append((_granularity, date[1], routine_count, v1_count))
    df_write_result = spark.createDataFrame(test_result, schema=['type', 'date', 'routine_count', 'v1_count'])

    from aadatapipelinecore.core.utils.retry import retry

    def write_test_result(df_write_result):
        # overwriteSchema lets this overwrite replace a table that was written
        # with the old misaligned column types.
        df_write_result.write.format("delta").option("overwriteSchema", "true").save(
            "s3://b2c-prod-data-pipeline-qa/aa.usage/result_usage_routine_v1_count_0709/daily/",
            mode="overwrite",
            partitionBy=["type"])
    retry(write_test_result, (df_write_result,), {}, interval=10)


# Run the completeness check for each listed granularity. 'pass' is printed
# once the loop finishes, regardless of the individual per-date verdicts.
granularity_list = ["daily"]
for granularity in granularity_list:
    check_usage_routine_v1_completeness(granularity, get_path_date_list(granularity))
print('pass')

In [0]:

import datetime
from dateutil.relativedelta import relativedelta
from pyspark.sql import Row
from aadatapipelinecore.core.utils.retry import retry

# Translates the unified granularity name into the range_type token that
# appears in the routine (raw) S3 path.
raw_granularity_dict = {
    'daily': 'DAY',
    'weekly': 'WEEK',
    'monthly': 'MONTH',
}
# Module-level accumulator: the completeness check appends one tuple per date.
test_result = list()


def last_day_of_month(check_month):
    """Return the last calendar day of the month that contains ``check_month``.

    Day 28 exists in every month, so day 28 plus four days always lands in
    the following month; subtracting that landing day-of-month steps back to
    the final day of the original month.
    """
    anchor = check_month.replace(day=28)
    spill_over = anchor + datetime.timedelta(days=4)
    return spill_over - datetime.timedelta(days=spill_over.day)


def get_monthly_date_list():
    """Build one ``Row`` per month-end date inside the hard-coded monthly window.

    Returns:
        list: ``Row`` objects, each wrapping a single ``'%Y-%m-%d'`` string.
    """
    result = []
    # Plain decimal literals: the zero-padded spelling (e.g. ``02``) is legacy
    # octal notation, a SyntaxError on Python 3 and invalid for 08/09 on Python 2.
    end = datetime.date(2020, 2, 29)
    start = datetime.date(2019, 10, 31)
    while start <= end:
        # Snap to the month end on every pass: after the +1 month step the
        # cursor may sit before the last day of a longer month.
        start = last_day_of_month(start)
        result.append(Row(start.strftime('%Y-%m-%d')))
        start += relativedelta(months=1)
    return result


def get_weekly_date_list():
    """Build one ``Row`` per week inside the hard-coded weekly window.

    Returns:
        list: ``Row`` objects, each wrapping one ``'%Y-%m-%d'`` string, from
        the start date to the end date (inclusive) in seven-day steps.
    """
    # Plain decimal literals: the zero-padded spelling (e.g. ``06``) is legacy
    # octal notation, a SyntaxError on Python 3 and invalid for 08/09 on Python 2.
    end = datetime.date(2020, 6, 27)
    start = datetime.date(2020, 5, 30)
    result = []
    while start <= end:
        result.append(Row(start.strftime('%Y-%m-%d')))
        start += relativedelta(weeks=1)
    return result


def get_daily_date_list():
    """Build one ``Row`` per day inside the hard-coded daily window.

    Returns:
        list: ``Row`` objects, each wrapping one ``'%Y-%m-%d'`` string, from
        the start date to the end date (inclusive).
    """
    # Plain decimal literals: the zero-padded spelling (e.g. ``06``) is legacy
    # octal notation, a SyntaxError on Python 3 and invalid for 08/09 on Python 2.
    end = datetime.date(2020, 6, 20)
    start = datetime.date(2020, 5, 24)
    result = []
    while start <= end:
        result.append(Row(start.strftime('%Y-%m-%d')))
        start += relativedelta(days=1)
    return result


def get_path_date_list(granularity):
    """Return ``(month, date)`` string pairs for the requested granularity.

    Args:
        granularity: One of ``'daily'``, ``'weekly'`` or ``'monthly'``.

    Returns:
        list: Tuples ``('YYYY-MM', 'YYYY-MM-DD')``, one per generated date.

    Raises:
        ValueError: If ``granularity`` is not supported. Previously this
            surfaced as an ``UnboundLocalError`` on ``collect_date``.
    """
    # Mutually exclusive branches, so exactly one generator runs.
    if granularity == 'daily':
        collect_date = get_daily_date_list()
    elif granularity == 'weekly':
        collect_date = get_weekly_date_list()
    elif granularity == 'monthly':
        collect_date = get_monthly_date_list()
    else:
        raise ValueError('Unsupported granularity: {!r}'.format(granularity))
    # Field 0 of each entry is the 'YYYY-MM-DD' string; its first seven
    # characters form the 'YYYY-MM' month prefix.
    return [(x[0][:7], x[0]) for x in collect_date]


def check_usage_routine_v1_completeness(_granularity, date_list):
    """
        Compare routine and unified row counts per date and persist the outcome.

        For every date the routine Parquet partition and the unified
        usage.basic-kpi.v5 Delta partition are counted, a pass/fail line is
        printed and one row is appended to the module-level ``test_result``
        list. The accumulated list then overwrites a Delta table partitioned
        by ``type``.

        date_list:
                [(month1,day1), (month1,day2), (month2,day1), (month2,day2)]
        sample:
            [('2015-12', '2015-12-27'), ('2015-12', '2015-12-28'),
            ('2016-12', '2016-12-27'), ('2016-12', '2016-12-28')]
    """
    v1_path = 's3://b2c-prod-data-pipeline-unified-usage/unified/' \
              'usage.basic-kpi.v5/fact/granularity={unified_granularity}/date={unified_date}/'
    routine_path = 's3://aardvark-prod-pdx-mdm-to-int/basic_kpi/version=1.0.0/' \
                   'range_type={raw_granularity}/date={raw_date}/'
    for date in date_list:
        v1_path_parse = v1_path.format(unified_granularity=_granularity, unified_date=date[1])
        routine_path_parse = routine_path.format(raw_granularity=raw_granularity_dict[_granularity], raw_date=date[1])
        routine_count = spark.read.parquet(routine_path_parse).count()
        v1_count = spark.read.format("delta").load(v1_path_parse).count()
        if routine_count != v1_count:
            print('Completeness Test Fail!!! routine data: {}, unified data: {}, date: {}'.format(
                routine_count, v1_count, date[1]))
        else:
            print('Completeness Test Pass! date: {} '.format(date[1]))
        # Field order must match the schema below (type, date, routine_count,
        # v1_count). The previous order stored the counts under 'date' and
        # 'routine_count' and the date string under 'v1_count'.
        test_result.append((_granularity, date[1], routine_count, v1_count))
    df_write_result = spark.createDataFrame(test_result, schema=['type', 'date', 'routine_count', 'v1_count'])

    from aadatapipelinecore.core.utils.retry import retry

    def write_test_result(df_write_result):
        # overwriteSchema lets this overwrite replace a table that was written
        # with the old misaligned column types.
        df_write_result.write.format("delta").option("overwriteSchema", "true").save(
            "s3://b2c-prod-data-pipeline-qa/aa.usage/result_usage_routine_v1_count_0709/weekly/",
            mode="overwrite",
            partitionBy=["type"])
    retry(write_test_result, (df_write_result,), {}, interval=10)


# Run the completeness check for each listed granularity. 'pass' is printed
# once the loop finishes, regardless of the individual per-date verdicts.
granularity_list = ["weekly"]
for granularity in granularity_list:
    check_usage_routine_v1_completeness(granularity, get_path_date_list(granularity))
print('pass')

In [0]:

import datetime
from dateutil.relativedelta import relativedelta
from pyspark.sql import Row
from aadatapipelinecore.core.utils.retry import retry

# Translates the unified granularity name into the range_type token that
# appears in the routine (raw) S3 path.
raw_granularity_dict = {
    'daily': 'DAY',
    'weekly': 'WEEK',
    'monthly': 'MONTH',
}
# Module-level accumulator: the completeness check appends one tuple per date.
test_result = list()


def last_day_of_month(check_month):
    """Return the last calendar day of the month that contains ``check_month``.

    Day 28 exists in every month, so day 28 plus four days always lands in
    the following month; subtracting that landing day-of-month steps back to
    the final day of the original month.
    """
    anchor = check_month.replace(day=28)
    spill_over = anchor + datetime.timedelta(days=4)
    return spill_over - datetime.timedelta(days=spill_over.day)


def get_monthly_date_list():
    """Build one ``Row`` per month-end date inside the hard-coded monthly window.

    Returns:
        list: ``Row`` objects, each wrapping a single ``'%Y-%m-%d'`` string.
    """
    result = []
    # Plain decimal literals: the zero-padded spelling (e.g. ``06``) is legacy
    # octal notation, a SyntaxError on Python 3 and invalid for 08/09 on Python 2.
    end = datetime.date(2020, 6, 27)
    start = datetime.date(2020, 5, 31)
    while start <= end:
        # Snap to the month end on every pass: after the +1 month step the
        # cursor may sit before the last day of a longer month.
        start = last_day_of_month(start)
        result.append(Row(start.strftime('%Y-%m-%d')))
        start += relativedelta(months=1)
    return result


def get_weekly_date_list():
    """Build one ``Row`` per week inside the hard-coded weekly window.

    Returns:
        list: ``Row`` objects, each wrapping one ``'%Y-%m-%d'`` string, from
        the start date to the end date (inclusive) in seven-day steps.
    """
    # Plain decimal literals: the zero-padded spelling (e.g. ``06``) is legacy
    # octal notation, a SyntaxError on Python 3 and invalid for 08/09 on Python 2.
    end = datetime.date(2020, 6, 20)
    start = datetime.date(2020, 5, 30)
    result = []
    while start <= end:
        result.append(Row(start.strftime('%Y-%m-%d')))
        start += relativedelta(weeks=1)
    return result


def get_daily_date_list():
    """Build one ``Row`` per day inside the hard-coded daily window.

    Returns:
        list: ``Row`` objects, each wrapping one ``'%Y-%m-%d'`` string, from
        the start date to the end date (inclusive).
    """
    # Plain decimal literals: the zero-padded spelling (e.g. ``06``) is legacy
    # octal notation, a SyntaxError on Python 3 and invalid for 08/09 on Python 2.
    end = datetime.date(2020, 6, 20)
    start = datetime.date(2020, 5, 24)
    result = []
    while start <= end:
        result.append(Row(start.strftime('%Y-%m-%d')))
        start += relativedelta(days=1)
    return result


def get_path_date_list(granularity):
    """Return ``(month, date)`` string pairs for the requested granularity.

    Args:
        granularity: One of ``'daily'``, ``'weekly'`` or ``'monthly'``.

    Returns:
        list: Tuples ``('YYYY-MM', 'YYYY-MM-DD')``, one per generated date.

    Raises:
        ValueError: If ``granularity`` is not supported. Previously this
            surfaced as an ``UnboundLocalError`` on ``collect_date``.
    """
    # Mutually exclusive branches, so exactly one generator runs.
    if granularity == 'daily':
        collect_date = get_daily_date_list()
    elif granularity == 'weekly':
        collect_date = get_weekly_date_list()
    elif granularity == 'monthly':
        collect_date = get_monthly_date_list()
    else:
        raise ValueError('Unsupported granularity: {!r}'.format(granularity))
    # Field 0 of each entry is the 'YYYY-MM-DD' string; its first seven
    # characters form the 'YYYY-MM' month prefix.
    return [(x[0][:7], x[0]) for x in collect_date]


def check_usage_routine_v1_completeness(_granularity, date_list):
    """
        Compare routine and unified row counts per date and persist the outcome.

        For every date the routine Parquet partition and the unified
        usage.basic-kpi.v5 Delta partition are counted, a pass/fail line is
        printed and one row is appended to the module-level ``test_result``
        list. The accumulated list then overwrites a Delta table partitioned
        by ``type``.

        date_list:
                [(month1,day1), (month1,day2), (month2,day1), (month2,day2)]
        sample:
            [('2015-12', '2015-12-27'), ('2015-12', '2015-12-28'),
            ('2016-12', '2016-12-27'), ('2016-12', '2016-12-28')]
    """
    v1_path = 's3://b2c-prod-data-pipeline-unified-usage/unified/' \
              'usage.basic-kpi.v5/fact/granularity={unified_granularity}/date={unified_date}/'
    routine_path = 's3://aardvark-prod-pdx-mdm-to-int/basic_kpi/version=1.0.0/' \
                   'range_type={raw_granularity}/date={raw_date}/'
    for date in date_list:
        v1_path_parse = v1_path.format(unified_granularity=_granularity, unified_date=date[1])
        routine_path_parse = routine_path.format(raw_granularity=raw_granularity_dict[_granularity], raw_date=date[1])
        routine_count = spark.read.parquet(routine_path_parse).count()
        v1_count = spark.read.format("delta").load(v1_path_parse).count()
        if routine_count != v1_count:
            print('Completeness Test Fail!!! routine data: {}, unified data: {}, date: {}'.format(
                routine_count, v1_count, date[1]))
        else:
            print('Completeness Test Pass! date: {} '.format(date[1]))
        # Field order must match the schema below (type, date, routine_count,
        # v1_count). The previous order stored the counts under 'date' and
        # 'routine_count' and the date string under 'v1_count'.
        test_result.append((_granularity, date[1], routine_count, v1_count))
    df_write_result = spark.createDataFrame(test_result, schema=['type', 'date', 'routine_count', 'v1_count'])

    from aadatapipelinecore.core.utils.retry import retry

    def write_test_result(df_write_result):
        # overwriteSchema lets this overwrite replace a table that was written
        # with the old misaligned column types.
        df_write_result.write.format("delta").option("overwriteSchema", "true").save(
            "s3://b2c-prod-data-pipeline-qa/aa.usage/result_usage_routine_v1_count_0709/monthly/",
            mode="overwrite",
            partitionBy=["type"])
    retry(write_test_result, (df_write_result,), {}, interval=10)


# Run the completeness check for each listed granularity. 'pass' is printed
# once the loop finishes, regardless of the individual per-date verdicts.
granularity_list = ["monthly"]
for granularity in granularity_list:
    check_usage_routine_v1_completeness(granularity, get_path_date_list(granularity))
print('pass')

In [0]:

# Dump the stored monthly comparison rows, de-duplicated and ordered by the
# v1_count column, one line per row.
df = spark.read.format("delta").load(
    's3://b2c-prod-data-pipeline-qa/aa.usage/result_usage_routine_v1_count_0709/monthly/')
result = df.distinct().orderBy('v1_count').collect()
for row in result:
    # Spacing mirrors the Python 2 multi-item print: a space precedes each tab
    # and none follows it.
    print('{} \t{} \t{}'.format(row['date'], row['routine_count'], row['v1_count']))

In [0]:

# Per-date count of app_id in the unified daily fact table for the window below.
start = '2020-05-24'
end = '2020-06-20'
df = (
    spark.read.format("delta")
    .load('s3://b2c-prod-data-pipeline-unified-usage/unified/usage.basic-kpi.v5/fact/')
    .where("granularity='daily' and date between '{}' and '{}'".format(start, end))
)
df.createOrReplaceTempView("df")
result = spark.sql("select date, count(app_id) as count from df group by date order by date asc").collect()
for r in result:
    # Spacing mirrors the Python 2 multi-item print: a space precedes the tab
    # and none follows it.
    print('{} \t{}'.format(r['date'], r['count']))

In [0]:

# Refresh the application code for this Spark session before importing from it.
# NOTE(review): presumably this stages the bundle under
# /tmp/zeppelin_application_code -- confirm against update_application_code.
from bdce.common.utils import update_application_code
update_application_code(spark, role="BDP-PROD-APP-INT-QA", application_name="zidong-application-autopipeline")

# Reload dependencies from the temporary location.
spark.sparkContext.addPyFile("/tmp/zeppelin_application_code/libs/python/dependencies.zip")
# Alternative (disabled): load the bundle from the installed application path.
# spark.sparkContext.addPyFile("/home/hadoop/bdp/application/libs/python/dependencies.zip")
# Imported only after addPyFile has run.
# NOTE(review): presumably aaplproxy ships inside dependencies.zip -- confirm.
import aaplproxy

In [0]:

from aadatapipelinecore.core.urn import Urn
import psycopg2
from dateutil.relativedelta import relativedelta
from pyspark.sql.functions import count
from pyspark.sql import Row
from pyspark.sql import functions
from aaplproxy.connection import ClusterConnection
from conf import settings
from aadatapipelinecore.core.loader.plproxy import build_db_settings
import datetime as d
import datetime
from datetime import timedelta


import os

# Module-level result list (not appended to by the code in this cell).
test_result = []
PG_AA_HOSTS = [('internal-aa-prod-plproxy-internal-2-31838298.us-east-1.elb.amazonaws.com', 7432)]
PG_AA_NAME = 'cohort'
PG_AA_ACCESS_ID = 'app_bdp_usage_qa'
# The password is read from the environment instead of being committed to the
# notebook. The previously embedded value should be treated as leaked and
# rotated. An unset variable yields an empty password here.
# NOTE(review): confirm how the interpreter environment should provide
# PG_AA_SECRET_KEY (interpreter settings, secrets manager, ...).
PG_AA_SECRET_KEY = os.environ.get('PG_AA_SECRET_KEY', '')

# libpq-style connection string built from the first configured host.
aa_dsn = (
    "dbname='{db}' user='{user}' password='{password}' "
    "host='{host}' port='{port}'".format(
        db=PG_AA_NAME,
        user=PG_AA_ACCESS_ID,
        host=PG_AA_HOSTS[0][0],
        password=PG_AA_SECRET_KEY,
        port=PG_AA_HOSTS[0][1]
    )
)

# Urn for the QA database check; it is not referenced by the other code
# visible in this cell.
urn = Urn(
    namespace="app-qa.db-check.v1",
    owner="app_qa"
)

# Query template with two positional placeholders, filled via str.format:
#   1) the table suffix, giving mu.app_<suffix>
#   2) the date literal to filter on
# The inner query yields one row per (app_id, store_id, device_id) group for
# that date, and the outer query counts those rows.
sql = """select count(uniqlo_id) from plproxy.execute_select_nestloop($proxy$ 
    select max(app_id) as uniqlo_id
    from mu.app_{}
    where 
        date='{}'
    group by
    app_id,
    store_id,
    device_id
$proxy$) t (uniqlo_id BIGINT);"""


def last_day_of_month(check_month):
    """Return the last calendar day of the month that contains ``check_month``.

    Day 28 exists in every month, so day 28 plus four days always lands in
    the following month; subtracting that landing day-of-month steps back to
    the final day of the original month.
    """
    anchor = check_month.replace(day=28)
    spill_over = anchor + datetime.timedelta(days=4)
    return spill_over - datetime.timedelta(days=spill_over.day)


def get_monthly_date_list():
    """Build one ``Row`` per month-end date inside the hard-coded monthly window.

    Returns:
        list: ``Row`` objects, each wrapping a single ``'%Y-%m-%d'`` string.
    """
    result = []
    # Plain decimal literals: the zero-padded spelling (e.g. ``06``) is legacy
    # octal notation, a SyntaxError on Python 3 and invalid for 08/09 on Python 2.
    end = datetime.date(2020, 6, 27)
    start = datetime.date(2020, 5, 31)
    while start <= end:
        # Snap to the month end on every pass: after the +1 month step the
        # cursor may sit before the last day of a longer month.
        start = last_day_of_month(start)
        result.append(Row(start.strftime('%Y-%m-%d')))
        start += relativedelta(months=1)
    return result


def get_weekly_date_list():
    """Build one ``Row`` per week inside the hard-coded weekly window.

    Returns:
        list: ``Row`` objects, each wrapping one ``'%Y-%m-%d'`` string, from
        the start date to the end date (inclusive) in seven-day steps.
    """
    # Plain decimal literals: the zero-padded spelling (e.g. ``06``) is legacy
    # octal notation, a SyntaxError on Python 3 and invalid for 08/09 on Python 2.
    end = datetime.date(2020, 6, 27)
    start = datetime.date(2020, 5, 30)
    result = []
    while start <= end:
        result.append(Row(start.strftime('%Y-%m-%d')))
        start += relativedelta(weeks=1)
    return result


def get_daily_date_list():
    """Build one ``Row`` per day inside the hard-coded daily window.

    Returns:
        list: ``Row`` objects, each wrapping one ``'%Y-%m-%d'`` string, from
        the start date to the end date (inclusive).
    """
    # Plain decimal literals: the zero-padded spelling (e.g. ``06``) is legacy
    # octal notation, a SyntaxError on Python 3 and invalid for 08/09 on Python 2.
    end = datetime.date(2020, 6, 27)
    start = datetime.date(2020, 5, 24)
    result = []
    while start <= end:
        result.append(Row(start.strftime('%Y-%m-%d')))
        start += relativedelta(days=1)
    return result


def get_path_date_list(gran):
    """Return the generated date rows for the requested granularity.

    Args:
        gran: One of ``'daily'``, ``'weekly'`` or ``'monthly'``.

    Returns:
        list: The list produced by the matching ``get_*_date_list`` helper,
        unchanged.

    Raises:
        ValueError: If ``gran`` is not supported. Previously this surfaced
            as an ``UnboundLocalError`` on ``collect_date``.
    """
    # Mutually exclusive branches, so exactly one generator runs.
    if gran == 'daily':
        collect_date = get_daily_date_list()
    elif gran == 'weekly':
        collect_date = get_weekly_date_list()
    elif gran == 'monthly':
        collect_date = get_monthly_date_list()
    else:
        raise ValueError('Unsupported granularity: {!r}'.format(gran))
    return collect_date


def query(dsn, sql):
    """Run ``sql`` on a fresh autocommit connection and return all rows.

    Args:
        dsn: libpq connection string passed to ``psycopg2.connect``.
        sql: Complete SQL statement to execute.

    Returns:
        list: The rows returned by ``cursor.fetchall()``.
    """
    conn = psycopg2.connect(dsn)
    try:
        # With autocommit on there is no transaction to commit afterwards.
        conn.autocommit = True
        with conn.cursor() as cur:
            cur.execute(sql)
            return cur.fetchall()
    finally:
        # psycopg2's ``with connection`` block only ends the transaction and
        # leaves the connection open, so close it explicitly; otherwise every
        # call leaks one connection.
        conn.close()


def check_usage_plproxy_dump_completeness(_granularity, date_list):
    """Print the plproxy count for every date of one granularity.

    Args:
        _granularity: Table suffix substituted into the ``sql`` template.
        date_list: Sequence whose entries carry the date string at index 0.
    """
    for entry in date_list:
        day = entry[0]
        statement = sql.format(_granularity, day)
        plproxy_count = query(aa_dsn, statement)
        # Spacing mirrors the Python 2 multi-item print: a space precedes the
        # tab and none follows it.
        print('{} \t{}'.format(day, plproxy_count[0][0]))


# For each granularity: print its name, print the per-date plproxy counts,
# then print 'pass' (unconditionally, once that granularity's loop is done).
granularity_list = ["daily", 'weekly', 'monthly']
for granularity in granularity_list:
    print(granularity)
    check_usage_plproxy_dump_completeness(granularity, get_path_date_list(granularity))
    print('pass')


In [0]:
%%sh
# Per-date row counts of monthly usage_basic_kpi_fact_v6 rows in the window below.
# The password is no longer embedded in the notebook: CITUS_QA_PGPASSWORD must
# be provided by the interpreter environment, and the cell aborts if it is
# missing. The previously committed password should be rotated.
PGPASSWORD="${CITUS_QA_PGPASSWORD:?CITUS_QA_PGPASSWORD must be set in the environment}" psql -h 10.2.6.141  -U citus_bdp_prod_app_int_qa -d aa_store_db -p 5432 << EOF
set search_path=usage;
select date, count(1) from usage_basic_kpi_fact_v6 where granularity='monthly' and date between '2020-05-24' and '2020-06-27' group by date order by date asc;
EOF

In [0]:
%%sh
# Sample three daily usage_basic_kpi_fact_v6 rows for 2020-07-03.
# The password is no longer embedded in the notebook: CITUS_QA_PGPASSWORD must
# be provided by the interpreter environment, and the cell aborts if it is
# missing. The previously committed password should be rotated.
PGPASSWORD="${CITUS_QA_PGPASSWORD:?CITUS_QA_PGPASSWORD must be set in the environment}" psql -h 10.2.6.141  -U citus_bdp_prod_app_int_qa -d aa_store_db -p 5432 << EOF
set search_path=usage;
select * from usage_basic_kpi_fact_v6 where granularity='daily' and date between '2020-07-03' and '2020-07-03' limit 3;
EOF

In [0]:
%%sh
# Count the (app_id, store_id, device_id) groups in mu.app_daily for 2020-07-03
# through plproxy.
# The password is no longer embedded in the notebook: PG_AA_SECRET_KEY must be
# provided by the interpreter environment, and the cell aborts if it is
# missing. The previously committed password should be rotated.
PGPASSWORD="${PG_AA_SECRET_KEY:?PG_AA_SECRET_KEY must be set in the environment}" psql -h internal-aa-prod-plproxy-internal-2-31838298.us-east-1.elb.amazonaws.com -U app_bdp_usage_qa -d cohort -p 7432 << EOF
select count(uniqlo_id) from plproxy.execute_select_nestloop(\$proxy\$ 
    select max(app_id) as uniqlo_id
    from mu.app_daily
    where 
        date='2020-07-03'
    group by
    app_id,
    store_id,
    device_id
\$proxy\$) t (uniqlo_id BIGINT);
EOF

In [0]:

# Routine DAY partition for 2020-07-03: row counts for platform=2, then
# platform=1, then the whole partition.
routine_path = 's3://aardvark-prod-pdx-mdm-to-int/basic_kpi/version=1.0.0/range_type=DAY/date=2020-07-03/'
df = spark.read.parquet(routine_path)
print(df.filter("platform=2").count())
print(df.filter("platform=1").count())
print(df.count())

In [0]:

# Unified weekly rows for the single date 2020-06-20: row counts for
# device_code values starting with 'ios', then with 'android'.
start = '2020-06-20'
end = '2020-06-20'
df = (
    spark.read.format("delta")
    .load('s3://b2c-prod-data-pipeline-unified-usage/unified/usage.basic-kpi.v5/fact/')
    .where("granularity='weekly' and date between '{}' and '{}'".format(start, end))
)
print(df.filter("device_code like 'ios%'").count())
print(df.filter("device_code like 'android%'").count())

In [0]:
%%sh
# List the contents of the store.app-est-dna-log-pre-aggr.v1 fact prefix.
aws s3 ls s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-dna-log-pre-aggr.v1/fact/

In [0]:
%%sh
