In [0]:

import datetime
from dateutil.relativedelta import relativedelta
from pyspark.sql.utils import AnalysisException
from pyspark.sql.functions import count
from pyspark.sql import Row
from aadatapipelinecore.core.utils.retry import retry


# Maps the integer `kpi` code of a legacy dump row to the name of the column that is
# checked for that KPI in the unified dataset (see the `kpi_mapping[row["kpi"]]` lookups
# in this cell). Codes 13, 16 and 19 have no entry, so looking them up raises KeyError.
kpi_mapping = {1: "est_average_active_users", 2: "est_average_session_per_user", 3: "est_average_session_duration",
               4: "est_install_penetration", 5: "est_average_active_days", 6: "est_percentage_active_days",
               7: "est_average_bytes_per_user", 8: "est_average_time_per_user", 9: "est_usage_penetration",
               10: "est_open_rate", 11: "est_total_time", 12: "est_share_of_category_time", 14: "est_total_sessions",
               15: "est_share_of_category_session", 17: "est_average_bytes_per_session",
               18: "est_share_of_category_bytes", 20: "est_percent_of_wifi_total", 21: "est_mb_per_second",
               22: "est_panel_size", 23: "est_installs", 24: "est_average_active_users_country_share",
               25: "est_installs_country_share", 26: "est_audience_index", 27: "est_audience_percentage",
               28: "est_cross_product_affinity"}


def write_test_result(df_write_result):
    """Append ``df_write_result`` to the daily count-check result location in Delta format.

    The output is partitioned by the ``type`` column.
    """
    target = "s3://b2c-prod-data-pipeline-qa/aa.usage/result_usage_dump_unified_v1_daily_count_0511/daily/"
    (df_write_result.write
        .format("delta")
        .mode("append")
        .partitionBy("type")
        .save(target))


def get_monthly_date_list():
    """Return the month-end dates from 2013-01-31 through 2020-01-31.

    Returns:
        A list of single-field ``Row`` objects, each holding one 'YYYY-MM-DD' string,
        in ascending order.
    """
    result = []
    # Plain int literals: a leading zero (``01``) is a SyntaxError on Python 3, and
    # ``08``/``09`` are invalid octal even on Python 2.
    end = datetime.date(2020, 1, 31)
    start = datetime.date(2013, 1, 31)
    while start <= end:
        result.append(Row(start.strftime('%Y-%m-%d')))
        # ``day=31`` is clamped to the length of the target month, so each step lands on the
        # last day of the next month. A bare ``months=1`` step would stick on the 28th once
        # it has passed through February.
        start += relativedelta(months=1, day=31)
    return result


def get_weekly_date_list():
    """Return every 7th date from 2013-01-12 through 2020-02-15.

    Returns:
        A list of single-field ``Row`` objects, each holding one 'YYYY-MM-DD' string,
        in ascending order.
    """
    result = []
    # Plain int literals: a leading zero (``01``) is a SyntaxError on Python 3, and
    # ``08``/``09`` are invalid octal even on Python 2.
    end = datetime.date(2020, 2, 15)
    start = datetime.date(2013, 1, 12)
    while start <= end:
        result.append(Row(start.strftime('%Y-%m-%d')))
        start += relativedelta(weeks=1)
    return result


def get_daily_date_list():
    """Return every date from 2015-12-27 through 2020-02-15.

    Returns:
        A list of single-field ``Row`` objects, each holding one 'YYYY-MM-DD' string,
        in ascending order.
    """
    result = []
    # Plain int literal for the month: a leading zero (``02``) is a SyntaxError on Python 3,
    # and ``08``/``09`` are invalid octal even on Python 2.
    end = datetime.date(2020, 2, 15)
    start = datetime.date(2015, 12, 27)
    while start <= end:
        result.append(Row(start.strftime('%Y-%m-%d')))
        start += relativedelta(days=1)
    return result


def get_path_date_list(granularity):
    """Group the date strings of ``granularity`` by calendar month.

    Args:
        granularity: 'daily', 'weekly' or 'monthly'.

    Returns:
        A list of ``(month, [date, ...])`` tuples sorted by month ascending, where ``month``
        is the 'YYYY-MM' prefix of the 'YYYY-MM-DD' date strings it groups.

    Raises:
        ValueError: if ``granularity`` is not one of the supported values.
    """
    if granularity == 'daily':
        collect_date = get_daily_date_list()
    elif granularity == 'weekly':
        collect_date = get_weekly_date_list()
    elif granularity == 'monthly':
        collect_date = get_monthly_date_list()
    else:
        # Fail with a clear message instead of an UnboundLocalError further down.
        raise ValueError("unsupported granularity: {!r}".format(granularity))

    date_list = {}
    for x in collect_date:
        # ``in``-style grouping instead of dict.has_key, which no longer exists on Python 3.
        date_list.setdefault(x[0][:7], []).append(x[0])
    # Zero-padded 'YYYY-MM' keys sort chronologically as plain strings.
    return sorted(date_list.items(), key=lambda item: item[0])


def check_au_app_data_count(_granularity, date_list):
    """Compare per-KPI row counts between the legacy dump and the unified dataset.

    For every ``(month, [date, ...])`` entry the month partition of the legacy dump is opened
    once. For each date the dump rows are counted per ``kpi`` and each count is compared with
    the number of non-null values in the matching unified column. Mismatches are printed and
    all comparisons of the month are appended to the result table via ``write_test_result``.

    Args:
        _granularity: granularity name substituted into both S3 paths and stored as ``type``.
        date_list: list of ``(month, [date, ...])`` tuples, as built by ``get_path_date_list``.
    """
    raw_path = "s3://b2c-prod-data-pipeline-unified-usage/unified/usage.legacy-mu_app.v1/fact/granularity={raw_granularity}/" \
               "month={raw_month}/"
    unified_path = "s3://b2c-prod-data-pipeline-unified-usage/unified/usage.basic-kpi.v1/fact/" \
                   "granularity={unified_granularity}/date={unified_date}/"
    for month, days in date_list:
        test_result = []
        raw_df = spark.read.parquet(raw_path.format(raw_month=month, raw_granularity=_granularity))
        for day in days:
            try:
                raw_count_with_kpi = raw_df.filter(
                    "date='{}'".format(day)).select("kpi", "app_id").groupBy(
                    "kpi").agg(count("kpi")).collect()
            except AnalysisException as e:
                # Still best effort (the rest of the month is skipped), but say why instead
                # of dropping the dates silently.
                print("Skipping remaining dates of month={}: {}".format(month, e))
                break
            if raw_count_with_kpi:
                # Open the unified partition once per date instead of once per KPI row. It is
                # only opened when there is something to compare, as before.
                unified_df = spark.read.parquet(
                    unified_path.format(unified_date=day, unified_granularity=_granularity))
                for row in raw_count_with_kpi:
                    kpi_column = kpi_mapping[row["kpi"]]
                    raw_count = row["count(kpi)"]
                    unified_count = unified_df.filter(
                        "{} is not null".format(kpi_column)).select(kpi_column).count()
                    if raw_count != unified_count:
                        print('Count Test Wrong !!!! raw data: {}, unified data: {}, date: {}, KPI {}'.format(
                            raw_count, unified_count, day, kpi_column))
                    test_result.append((_granularity, day, raw_count, unified_count, kpi_column))
            print("date={} test complete!".format(day))
        if test_result:
            # createDataFrame cannot infer a schema from an empty list, so only write months
            # that produced at least one comparison.
            df_write_result = spark.createDataFrame(
                test_result, schema=['type', 'date', 'dump', 'unified_v1', 'kpi'])
            write_test_result(df_write_result)
        print("month={} Test complete!".format(month))



# Run the count check for each configured granularity.
# NOTE: 'pass' is printed unconditionally once the loop finishes; it does not mean that
# every count matched.
granularity_list = ["daily"]
for granularity in granularity_list:
    check_au_app_data_count(granularity, get_path_date_list(granularity))
print 'pass'


In [0]:
%%sh
# List the objects under the range_type=DAY prefix of the basic_kpi version=v3.0.0 location.
aws s3 ls 's3://aardvark-prod-pdx-mdm-to-int/basic_kpi/version=v3.0.0/range_type=DAY/'

In [0]:

import datetime
from dateutil.relativedelta import relativedelta
from pyspark.sql.utils import AnalysisException
from pyspark.sql.functions import count
from pyspark.sql import Row
from aadatapipelinecore.core.utils.retry import retry


# Maps the integer `kpi` code of a legacy dump row to the name of the column that is
# checked for that KPI in the unified dataset (see the `kpi_mapping[row["kpi"]]` lookups
# in this cell). Codes 13, 16 and 19 have no entry, so looking them up raises KeyError.
kpi_mapping = {1: "est_average_active_users", 2: "est_average_session_per_user", 3: "est_average_session_duration",
               4: "est_install_penetration", 5: "est_average_active_days", 6: "est_percentage_active_days",
               7: "est_average_bytes_per_user", 8: "est_average_time_per_user", 9: "est_usage_penetration",
               10: "est_open_rate", 11: "est_total_time", 12: "est_share_of_category_time", 14: "est_total_sessions",
               15: "est_share_of_category_session", 17: "est_average_bytes_per_session",
               18: "est_share_of_category_bytes", 20: "est_percent_of_wifi_total", 21: "est_mb_per_second",
               22: "est_panel_size", 23: "est_installs", 24: "est_average_active_users_country_share",
               25: "est_installs_country_share", 26: "est_audience_index", 27: "est_audience_percentage",
               28: "est_cross_product_affinity"}


def last_day_of_month(check_month):
    """Return the last calendar day of the month that ``check_month`` falls in."""
    # Day 28 exists in every month, and four days later is always in the following month.
    in_following_month = check_month.replace(day=28) + datetime.timedelta(days=4)
    # Stepping back by that date's day-of-month lands on the final day of the original month.
    overshoot = datetime.timedelta(days=in_following_month.day)
    return in_following_month - overshoot


def get_monthly_date_list():
    """Return the month-end dates from 2013-01-31 through 2020-01-31.

    Returns:
        A list of single-field ``Row`` objects, each holding one 'YYYY-MM-DD' string,
        in ascending order.
    """
    result = []
    # Plain int literals: a leading zero (``01``) is a SyntaxError on Python 3, and
    # ``08``/``09`` are invalid octal even on Python 2.
    end = datetime.date(2020, 1, 31)
    start = datetime.date(2013, 1, 31)
    while start <= end:
        # Adding one month clamps to the shorter month (e.g. the 28th after February), so
        # snap back to the real month end before recording the date.
        start = last_day_of_month(start)
        result.append(Row(start.strftime('%Y-%m-%d')))
        start += relativedelta(months=1)
    return result


def get_weekly_date_list():
    """Return every 7th date from 2013-01-12 through 2020-02-15.

    Returns:
        A list of single-field ``Row`` objects, each holding one 'YYYY-MM-DD' string,
        in ascending order.
    """
    result = []
    # Plain int literals: a leading zero (``01``) is a SyntaxError on Python 3, and
    # ``08``/``09`` are invalid octal even on Python 2.
    end = datetime.date(2020, 2, 15)
    start = datetime.date(2013, 1, 12)
    while start <= end:
        result.append(Row(start.strftime('%Y-%m-%d')))
        start += relativedelta(weeks=1)
    return result


def get_daily_date_list():
    """Return every date from 2015-12-27 through 2020-02-15.

    Returns:
        A list of single-field ``Row`` objects, each holding one 'YYYY-MM-DD' string,
        in ascending order.
    """
    result = []
    # Plain int literal for the month: a leading zero (``02``) is a SyntaxError on Python 3,
    # and ``08``/``09`` are invalid octal even on Python 2.
    end = datetime.date(2020, 2, 15)
    start = datetime.date(2015, 12, 27)
    while start <= end:
        result.append(Row(start.strftime('%Y-%m-%d')))
        start += relativedelta(days=1)
    return result


def get_path_date_list(granularity):
    """Group the date strings of ``granularity`` by calendar month.

    Args:
        granularity: 'daily', 'weekly' or 'monthly'.

    Returns:
        A list of ``(month, [date, ...])`` tuples sorted by month ascending, where ``month``
        is the 'YYYY-MM' prefix of the 'YYYY-MM-DD' date strings it groups.

    Raises:
        ValueError: if ``granularity`` is not one of the supported values.
    """
    if granularity == 'daily':
        collect_date = get_daily_date_list()
    elif granularity == 'weekly':
        collect_date = get_weekly_date_list()
    elif granularity == 'monthly':
        collect_date = get_monthly_date_list()
    else:
        # Fail with a clear message instead of an UnboundLocalError further down.
        raise ValueError("unsupported granularity: {!r}".format(granularity))

    dates_by_month = {}
    for x in collect_date:
        month_key = x[0][:7]
        # Membership test instead of dict.has_key, which no longer exists on Python 3.
        if month_key in dates_by_month:
            dates_by_month[month_key].append(x[0])
        else:
            dates_by_month[month_key] = [x[0]]
    # Zero-padded 'YYYY-MM' keys sort chronologically as plain strings.
    return sorted(dates_by_month.items(), key=lambda item: item[0])


def check_usage_dump_v1_completeness(_granularity, date_list):
    """Compare per-KPI row counts of the legacy dump with the unified basic-kpi v1 dataset.

    For each date the dump rows are counted per ``kpi`` and every count is compared with the
    number of non-null values in the matching unified column. Each mismatch is printed; the
    "Pass" line for a date is printed only when none of its KPIs mismatched.

    Args:
        _granularity: granularity name substituted into both S3 paths.
        date_list: list of ``(month, [date, ...])`` tuples, as built by ``get_path_date_list``.
    """
    dump_path = "s3://b2c-prod-data-pipeline-unified-usage/unified/usage.legacy-mu_app.v1/fact/granularity={raw_granularity}/" \
                "month={raw_month}/"
    v1_path = "s3://b2c-prod-data-pipeline-unified-usage/unified/usage.basic-kpi.v1/fact/" \
              "granularity={unified_granularity}/date={unified_date}/"
    for month, days in date_list:
        dump_df = spark.read.parquet(dump_path.format(raw_month=month, raw_granularity=_granularity))
        for day in days:
            v1_df = spark.read.parquet(v1_path.format(unified_date=day, unified_granularity=_granularity))

            dump_kpi_count = dump_df.filter(
                "date='{}'".format(day)).select("kpi", "app_id").groupBy(
                "kpi").agg(count("kpi")).collect()

            day_passed = True
            for row in dump_kpi_count:
                kpi_column = kpi_mapping[row["kpi"]]
                dump_count = row["count(kpi)"]
                v1_count = v1_df.filter("{} is not null".format(kpi_column)).select(kpi_column).count()
                if dump_count != v1_count:
                    day_passed = False
                    print('Completeness Test Fail!!!! dump data: {}, v1 data: {}, date: {}, KPI {}'.format(
                        dump_count, v1_count, day, kpi_column))
            # Only report a pass when no KPI of this date mismatched. Previously the pass
            # line followed the failure lines as well.
            if day_passed:
                print("Completeness Test Pass! date: {}".format(day))



# Run the completeness check for each configured granularity.
# NOTE: 'pass' is printed unconditionally once the loop finishes; it does not mean that
# every count matched.
granularity_list = ["daily"]
for granularity in granularity_list:
    check_usage_dump_v1_completeness(granularity, get_path_date_list(granularity))
print 'pass'


In [0]:

import datetime
from dateutil.relativedelta import relativedelta
from pyspark.sql.utils import AnalysisException
from pyspark.sql.functions import count
from pyspark.sql import Row
from aadatapipelinecore.core.utils.retry import retry


# Maps the integer `kpi` code of a legacy dump row to the name of the column that is
# checked for that KPI in the unified dataset (see the `kpi_mapping[row["kpi"]]` lookups
# in this cell). Codes 13, 16 and 19 have no entry, so looking them up raises KeyError.
kpi_mapping = {1: "est_average_active_users", 2: "est_average_session_per_user", 3: "est_average_session_duration",
               4: "est_install_penetration", 5: "est_average_active_days", 6: "est_percentage_active_days",
               7: "est_average_bytes_per_user", 8: "est_average_time_per_user", 9: "est_usage_penetration",
               10: "est_open_rate", 11: "est_total_time", 12: "est_share_of_category_time", 14: "est_total_sessions",
               15: "est_share_of_category_session", 17: "est_average_bytes_per_session",
               18: "est_share_of_category_bytes", 20: "est_percent_of_wifi_total", 21: "est_mb_per_second",
               22: "est_panel_size", 23: "est_installs", 24: "est_average_active_users_country_share",
               25: "est_installs_country_share", 26: "est_audience_index", 27: "est_audience_percentage",
               28: "est_cross_product_affinity"}


def last_day_of_month(check_month):
    """Return the final day of the month containing ``check_month``."""
    # The 28th plus four days is guaranteed to fall in the next month.
    some_day_next_month = check_month.replace(day=28) + datetime.timedelta(days=4)
    first_of_next_month = some_day_next_month.replace(day=1)
    return first_of_next_month - datetime.timedelta(days=1)


def get_monthly_date_list():
    """Return the month-end dates from 2013-01-31 through 2020-01-31.

    Returns:
        A list of single-field ``Row`` objects, each holding one 'YYYY-MM-DD' string,
        in ascending order.
    """
    result = []
    # Plain int literals: a leading zero (``01``) is a SyntaxError on Python 3, and
    # ``08``/``09`` are invalid octal even on Python 2.
    end = datetime.date(2020, 1, 31)
    start = datetime.date(2013, 1, 31)
    while start <= end:
        # Adding one month clamps to the shorter month (e.g. the 28th after February), so
        # snap back to the real month end before recording the date.
        start = last_day_of_month(start)
        result.append(Row(start.strftime('%Y-%m-%d')))
        start += relativedelta(months=1)
    return result


def get_weekly_date_list():
    """Return every 7th date from 2013-01-12 through 2020-02-15.

    Returns:
        A list of single-field ``Row`` objects, each holding one 'YYYY-MM-DD' string,
        in ascending order.
    """
    result = []
    # Plain int literals: a leading zero (``01``) is a SyntaxError on Python 3, and
    # ``08``/``09`` are invalid octal even on Python 2.
    end = datetime.date(2020, 2, 15)
    start = datetime.date(2013, 1, 12)
    while start <= end:
        result.append(Row(start.strftime('%Y-%m-%d')))
        start += relativedelta(weeks=1)
    return result


def get_daily_date_list():
    """Return every date from 2015-12-27 through 2020-02-15.

    Returns:
        A list of single-field ``Row`` objects, each holding one 'YYYY-MM-DD' string,
        in ascending order.
    """
    result = []
    # Plain int literal for the month: a leading zero (``02``) is a SyntaxError on Python 3,
    # and ``08``/``09`` are invalid octal even on Python 2.
    end = datetime.date(2020, 2, 15)
    start = datetime.date(2015, 12, 27)
    while start <= end:
        result.append(Row(start.strftime('%Y-%m-%d')))
        start += relativedelta(days=1)
    return result


def get_path_date_list(granularity):
    """Group the date strings of ``granularity`` by calendar month.

    Args:
        granularity: 'daily', 'weekly' or 'monthly'.

    Returns:
        A list of ``(month, [date, ...])`` tuples sorted by month ascending, where ``month``
        is the 'YYYY-MM' prefix of the 'YYYY-MM-DD' date strings it groups.

    Raises:
        ValueError: if ``granularity`` is not one of the supported values.
    """
    if granularity == 'daily':
        collect_date = get_daily_date_list()
    elif granularity == 'weekly':
        collect_date = get_weekly_date_list()
    elif granularity == 'monthly':
        collect_date = get_monthly_date_list()
    else:
        # Fail with a clear message instead of an UnboundLocalError further down.
        raise ValueError("unsupported granularity: {!r}".format(granularity))

    date_list = {}
    for x in collect_date:
        # setdefault replaces the dict.has_key branch; has_key no longer exists on Python 3.
        date_list.setdefault(x[0][:7], []).append(x[0])
    # Zero-padded 'YYYY-MM' keys sort chronologically as plain strings.
    return sorted(date_list.items(), key=lambda item: item[0])


def check_usage_dump_v1_completeness(_granularity, date_list):
    """Compare per-KPI row counts of the legacy dump with the unified basic-kpi v1 dataset.

    For each date the dump rows are counted per ``kpi`` and every count is compared with the
    number of non-null values in the matching unified column. Each mismatch is printed; the
    "Pass" line for a date is printed only when none of its KPIs mismatched.

    Args:
        _granularity: granularity name substituted into both S3 paths.
        date_list: list of ``(month, [date, ...])`` tuples, as built by ``get_path_date_list``.
    """
    dump_path = "s3://b2c-prod-data-pipeline-unified-usage/unified/usage.legacy-mu_app.v1/fact/granularity={raw_granularity}/" \
                "month={raw_month}/"
    v1_path = "s3://b2c-prod-data-pipeline-unified-usage/unified/usage.basic-kpi.v1/fact/" \
              "granularity={unified_granularity}/date={unified_date}/"
    for month, days in date_list:
        dump_df = spark.read.parquet(dump_path.format(raw_month=month, raw_granularity=_granularity))
        for day in days:
            dump_kpi_count = dump_df.filter(
                "date='{}'".format(day)).select("kpi", "app_id").groupBy(
                "kpi").agg(count("kpi")).collect()

            day_passed = True
            if dump_kpi_count:
                # Open the unified partition once per date instead of once per KPI row. It is
                # only opened when there is something to compare, as before.
                v1_df = spark.read.parquet(
                    v1_path.format(unified_date=day, unified_granularity=_granularity))
                for row in dump_kpi_count:
                    kpi_column = kpi_mapping[row["kpi"]]
                    dump_count = row["count(kpi)"]
                    v1_count = v1_df.filter(
                        "{} is not null".format(kpi_column)).select(kpi_column).count()
                    if dump_count != v1_count:
                        day_passed = False
                        print('Completeness Test Fail!!!! dump data: {}, v1 data: {}, date: {}, KPI {}'.format(
                            dump_count, v1_count, day, kpi_column))
            # Only report a pass when no KPI of this date mismatched. Previously the pass
            # line followed the failure lines as well.
            if day_passed:
                print("Completeness Test Pass! date: {}".format(day))



# Run the completeness check for each configured granularity.
# NOTE: 'pass' is printed unconditionally once the loop finishes; it does not mean that
# every count matched.
granularity_list = ["daily"]
for granularity in granularity_list:
    check_usage_dump_v1_completeness(granularity, get_path_date_list(granularity))
print 'pass'


In [0]:
%%sh
# List the contents of the granularity=monthly fact prefix of app-tech.usage.basic-kpi.v3.
aws s3 ls s3://b2c-prod-data-pipeline-unified-usage/unified/app-tech.usage.basic-kpi.v3/fact/granularity=monthly/

In [0]:

import datetime
from dateutil.relativedelta import relativedelta
from pyspark.sql.utils import AnalysisException
from pyspark.sql.functions import count
from pyspark.sql import Row
from aadatapipelinecore.core.utils.retry import retry


# Maps the integer `kpi` code of a legacy dump row to the name of the column that is
# checked for that KPI in the unified dataset (see the `kpi_mapping[row["kpi"]]` lookups
# in this cell). Codes 13, 16 and 19 have no entry, so looking them up raises KeyError.
kpi_mapping = {1: "est_average_active_users", 2: "est_average_session_per_user", 3: "est_average_session_duration",
               4: "est_install_penetration", 5: "est_average_active_days", 6: "est_percentage_active_days",
               7: "est_average_bytes_per_user", 8: "est_average_time_per_user", 9: "est_usage_penetration",
               10: "est_open_rate", 11: "est_total_time", 12: "est_share_of_category_time", 14: "est_total_sessions",
               15: "est_share_of_category_session", 17: "est_average_bytes_per_session",
               18: "est_share_of_category_bytes", 20: "est_percent_of_wifi_total", 21: "est_mb_per_second",
               22: "est_panel_size", 23: "est_installs", 24: "est_average_active_users_country_share",
               25: "est_installs_country_share", 26: "est_audience_index", 27: "est_audience_percentage",
               28: "est_cross_product_affinity"}


def last_day_of_month(check_month):
    """Return the month-end date for the month of ``check_month``."""
    four_days = datetime.timedelta(days=4)
    # From the 28th, four more days always crosses into the next month; its day number is
    # exactly how far past the month end we are.
    spill = check_month.replace(day=28) + four_days
    return spill - datetime.timedelta(days=spill.day)


def get_monthly_date_list():
    """Return the month-end dates from 2020-01-31 through 2020-04-30.

    Each date is wrapped in a single-field ``Row`` as a 'YYYY-MM-DD' string.
    """
    first, last = datetime.date(2020, 1, 31), datetime.date(2020, 4, 30)
    rows = []
    cursor = first
    while cursor <= last:
        # Snap to the month end first: the one-month step below clamps to shorter months.
        cursor = last_day_of_month(cursor)
        rows.append(Row(cursor.strftime('%Y-%m-%d')))
        cursor = cursor + relativedelta(months=1)
    return rows


def get_weekly_date_list():
    """Return every 7th date from 2020-01-04 through 2020-05-23.

    Each date is wrapped in a single-field ``Row`` as a 'YYYY-MM-DD' string.
    """
    first, last = datetime.date(2020, 1, 4), datetime.date(2020, 5, 23)
    # Number of whole weeks that fit between the bounds, plus the starting date itself.
    week_count = (last - first).days // 7 + 1
    return [
        Row((first + relativedelta(weeks=week)).strftime('%Y-%m-%d'))
        for week in range(week_count)
    ]


def get_daily_date_list():
    """Return every date from 2015-12-27 through 2020-02-15.

    Each date is wrapped in a single-field ``Row`` as a 'YYYY-MM-DD' string.
    """
    first, last = datetime.date(2015, 12, 27), datetime.date(2020, 2, 15)
    # Both bounds are inclusive, hence the extra day.
    day_count = (last - first).days + 1
    return [
        Row((first + relativedelta(days=offset)).strftime('%Y-%m-%d'))
        for offset in range(day_count)
    ]


def get_path_date_list(gran):
    """Group the date strings of granularity ``gran`` by calendar month.

    Args:
        gran: 'daily', 'weekly' or 'monthly'.

    Returns:
        A list of ``(month, [date, ...])`` tuples sorted by month ascending, where ``month``
        is the 'YYYY-MM' prefix of the 'YYYY-MM-DD' date strings it groups.

    Raises:
        ValueError: if ``gran`` is not one of the supported values.
    """
    if gran == 'daily':
        collect_date = get_daily_date_list()
    elif gran == 'weekly':
        collect_date = get_weekly_date_list()
    elif gran == 'monthly':
        collect_date = get_monthly_date_list()
    else:
        # Fail with a clear message instead of an UnboundLocalError further down.
        raise ValueError("unsupported granularity: {!r}".format(gran))

    date_list = {}
    for x in collect_date:
        date_list.setdefault(x[0][:7], []).append(x[0])
    # Zero-padded 'YYYY-MM' keys sort chronologically as plain strings.
    return sorted(date_list.items(), key=lambda d: d[0])


def check_usage_dump_v1_completeness(_granularity, date_list):
    """Compare per-KPI row counts of the legacy dump with the basic-kpi v3 dataset and store them.

    For each date the dump rows are counted per ``kpi`` and every count is compared with the
    number of non-null values in the matching unified column. Each mismatch is printed; the
    "Pass" line for a date is printed only when none of its KPIs mismatched. All comparisons
    of a month are appended to the Delta result location, partitioned by ``type``.

    Args:
        _granularity: granularity name substituted into both S3 paths and stored as ``type``.
        date_list: list of ``(month, [day1, day2, ...])`` tuples, e.g.
            ``[('2015-12', ['2015-12-27', '2015-12-28', '2015-12-29', '2015-12-30', '2015-12-31'])]``.
    """
    dump_path = "s3://b2c-prod-data-pipeline-unified-usage/unified/usage.legacy-mu_app.v1/fact/" \
                "granularity={raw_granularity}/month={raw_month}/"
    v1_path = "s3://b2c-prod-data-pipeline-unified-usage/unified/app-tech.usage.basic-kpi.v3/fact/" \
              "granularity={unified_granularity}/date={unified_date}/"

    # Defined once per call rather than once per month.
    # NOTE(review): the target directory is fixed to ".../monthly/" whatever _granularity is;
    # confirm that is intended before running other granularities.
    def write_test_result(result_df):
        result_df.write.format("delta").save(
            "s3://b2c-prod-data-pipeline-qa/aa.usage/result_usage_dump_v1_count_0608/monthly/",
            mode="append",
            partitionBy=["type"])

    for month, days in date_list:
        test_result = []
        dump_df = spark.read.parquet(dump_path.format(raw_month=month, raw_granularity=_granularity))
        for day in days:
            dump_kpi_count = dump_df.filter(
                "date='{}'".format(day)).select("kpi", "app_id").groupBy(
                "kpi").agg(count("kpi")).collect()

            day_passed = True
            if dump_kpi_count:
                # Open the unified partition once per date instead of once per KPI row. It is
                # only opened when there is something to compare, as before.
                v1_df = spark.read.parquet(
                    v1_path.format(unified_date=day, unified_granularity=_granularity))
                for row in dump_kpi_count:
                    kpi_column = kpi_mapping[row["kpi"]]
                    dump_count = row["count(kpi)"]
                    v1_count = v1_df.filter(
                        "{} is not null".format(kpi_column)).select(kpi_column).count()
                    if dump_count != v1_count:
                        day_passed = False
                        print('Completeness Test Fail!!!! dump data: {}, v1 data: {}, date: {}, KPI {}'.format(
                            dump_count, v1_count, day, kpi_column))
                    test_result.append((_granularity, day, kpi_column, dump_count, v1_count))
            # Only report a pass when no KPI of this date mismatched.
            if day_passed:
                print("Completeness Test Pass! date: {}".format(day))

        if test_result:
            # createDataFrame cannot infer a schema from an empty list, so only write months
            # that produced at least one comparison.
            df_write_result = spark.createDataFrame(
                test_result, schema=['type', 'date', 'kpi', 'dump_count', 'unified_v1_count'])
            # `retry` is the helper imported at the top of this cell; presumably it re-invokes
            # the writer on failure, waiting `interval` between attempts - TODO confirm.
            retry(write_test_result, (df_write_result,), {}, interval=10)
        print("month={} Test complete!".format(month))


# Run the completeness check for each configured granularity.
# NOTE: 'pass' is printed unconditionally once the loop finishes; it does not mean that
# every count matched.
granularity_list = ["monthly"]
for granularity in granularity_list:
    check_usage_dump_v1_completeness(granularity, get_path_date_list(granularity))
print 'pass'


In [0]:

import datetime
import numpy as np
from dateutil.relativedelta import relativedelta
from pyspark.sql import Row
from pyspark.sql import functions


# Module-level accumulator: check_usage_dump_v1_mapping appends one
# (granularity, mismatch_count, date, kpi_column) tuple per KPI comparison.
TEST_RESULT = []
COUNTRY_CODE_MAPPING_BY_MARKET_CODE = {
    'google-play': {1: 'AU', 2: 'CA', 3: 'CN', 4: 'DE', 5: 'ES', 6: 'FR', 7: 'GB', 8: 'IT', 9: 'JP', 10: 'US',
                    11: 'BE', 12: 'CH', 13: 'CL', 14: 'ZA', 15: 'VN', 16: 'HK', 17: 'AR', 18: 'BR', 19: 'IN',
                    20: 'FI', 21: 'ID', 22: 'RU', 23: 'NL', 24: 'MY', 25: 'TR', 26: 'MX', 27: 'KR', 28: 'PL',
                    29: 'TH', 30: 'TW', 31: 'PH', 32: 'SG', 33: 'EG', 34: 'SE', 35: 'AT', 36: 'CZ', 37: 'HU',
                    38: 'DK', 39: 'IE', 40: 'IL', 41: 'NZ', 42: 'NO', 43: 'PT', 44: 'RO', 45: 'SK', 46: 'GR',
                    47: 'BG', 48: 'UA', 49: 'AE', 50: 'KW', 51: 'SA', 52: 'CO', 53: 'KZ', 54: 'PK', 55: 'IQ',
                    56: 'PE', 57: 'MA', 58: 'BY', 59: 'DZ', 60: 'VE', 61: 'AZ', 62: 'EC', 63: 'JO', 64: 'CR',
                    65: 'LB', 66: 'BD', 67: 'GT', 68: 'RS', 69: 'DO', 70: 'IR', 71: 'OM', 72: 'BO', 73: 'QA',
                    74: 'NG', 75: 'SV', 76: 'KH', 77: 'PA', 78: 'LT', 79: 'TN', 80: 'HR', 81: 'JM', 82: 'LK',
                    83: 'HN', 84: 'PR', 85: 'UY', 86: 'LV', 87: 'BA', 88: 'KG', 89: 'PY', 90: 'MD', 91: 'NP',
                    92: 'TZ', 93: 'BH', 94: 'GH', 95: 'KE', 96: 'SI', 97: 'AM', 98: 'UZ', 99: 'TT', 100: 'MK',
                    101: 'YE', 102: 'MO', 103: 'LU', 1000: 'WW'},
    'apple-store': {143441: 'US', 143442: 'FR', 143443: 'DE', 143444: 'GB', 143445: 'AT', 143446: 'BE', 143447: 'FI',
                    143448: 'GR', 143449: 'IE', 143450: 'IT', 143451: 'LU', 143452: 'NL', 143453: 'PT', 143454: 'ES',
                    143455: 'CA', 143456: 'SE', 143457: 'NO', 143458: 'DK', 143459: 'CH', 143460: 'AU', 143461: 'NZ',
                    143462: 'JP', 143463: 'HK', 143464: 'SG', 143465: 'CN', 143466: 'KR', 143467: 'IN', 143468: 'MX',
                    143469: 'RU', 143470: 'TW', 143471: 'VN', 143472: 'ZA', 143473: 'MY', 143474: 'PH', 143475: 'TH',
                    143476: 'ID', 143477: 'PK', 143478: 'PL', 143479: 'SA', 143480: 'TR', 143481: 'AE', 143482: 'HU',
                    143483: 'CL', 143484: 'NP', 143485: 'PA', 143486: 'LK', 143487: 'RO', 143489: 'CZ', 143491: 'IL',
                    143492: 'UA', 143493: 'KW', 143494: 'HR', 143495: 'CR', 143496: 'SK', 143497: 'LB', 143498: 'QA',
                    143499: 'SI', 143501: 'CO', 143502: 'VE', 143503: 'BR', 143504: 'GT', 143505: 'AR', 143506: 'SV',
                    143507: 'PE', 143508: 'DO', 143509: 'EC', 143510: 'HN', 143511: 'JM', 143512: 'NI', 143513: 'PY',
                    143514: 'UY', 143515: 'MO', 143516: 'EG', 143517: 'KZ', 143518: 'EE', 143519: 'LV', 143520: 'LT',
                    143521: 'MT', 143523: 'MD', 143524: 'AM', 143525: 'BW', 143526: 'BG', 143528: 'JO', 143529: 'KE',
                    143530: 'MK', 143531: 'MG', 143532: 'ML', 143533: 'MU', 143534: 'NE', 143535: 'SN', 143536: 'TN',
                    143537: 'UG', 143538: 'AI', 143539: 'BS', 143540: 'AG', 143541: 'BB', 143542: 'BM', 143543: 'VG',
                    143544: 'KY', 143545: 'DM', 143546: 'GD', 143547: 'MS', 143548: 'KN', 143549: 'LC', 143550: 'VC',
                    143551: 'TT', 143552: 'TC', 143553: 'GY', 143554: 'SR', 143555: 'BZ', 143556: 'BO', 143557: 'CY',
                    143558: 'IS', 143559: 'BH', 143560: 'BN', 143561: 'NG', 143562: 'OM', 143563: 'DZ', 143564: 'AO',
                    143565: 'BY', 143566: 'UZ', 143568: 'AZ', 143571: 'YE', 143572: 'TZ', 143573: 'GH', 143575: 'AL',
                    143576: 'BJ', 143577: 'BT', 143578: 'BF', 143579: 'KH', 143580: 'CV', 143581: 'TD', 143582: 'CG',
                    143583: 'FJ', 143584: 'GM', 143585: 'GW', 143586: 'KG', 143587: 'LA', 143588: 'LR', 143589: 'MW',
                    143590: 'MR', 143591: 'FM', 143592: 'MN', 143593: 'MZ', 143594: 'NA', 143595: 'PW', 143597: 'PG',
                    143598: 'ST', 143599: 'SC', 143600: 'SL', 143601: 'SB', 143602: 'SZ', 143603: 'TJ', 143604: 'TM',
                    143605: 'ZW', 0: 'WW'},
    'amazon-store': {
        'android-all': {
            'UK': 'GB',
        }
    }
}
# Maps the integer `kpi` code of a legacy dump row to the unified column name that the
# dump's `estimate` column is renamed to and compared against. Codes 13, 16 and 19 have no
# entry, so looking them up raises KeyError.
KPI_MAPPING = {1: "est_average_active_users", 2: "est_average_session_per_user", 3: "est_average_session_duration",
               4: "est_install_penetration", 5: "est_average_active_days", 6: "est_percentage_active_days",
               7: "est_average_bytes_per_user", 8: "est_average_time_per_user", 9: "est_usage_penetration",
               10: "est_open_rate", 11: "est_total_time", 12: "est_share_of_category_time", 14: "est_total_sessions",
               15: "est_share_of_category_session", 17: "est_average_bytes_per_session",
               18: "est_share_of_category_bytes", 20: "est_percent_of_wifi_total", 21: "est_mb_per_second",
               22: "est_panel_size", 23: "est_installs", 24: "est_average_active_users_country_share",
               25: "est_installs_country_share", 26: "est_audience_index", 27: "est_audience_percentage",
               28: "est_cross_product_affinity"}
# Maps the numeric `device_id` of a dump row to the `device_code` string it is compared as.
DEVICE_ID_CODE_MAPPING = {
    1001: 'android-phone',
    1002: 'android-tablet',
    2001: 'ios-phone',
    2002: 'ios-tablet'
}
# Aliases for the per-market store-id -> country-code tables. These are the same dict objects
# as the entries of COUNTRY_CODE_MAPPING_BY_MARKET_CODE, not copies, so mutating one mutates
# the other.
ANDROID_COUNTRY_ID_CODES = COUNTRY_CODE_MAPPING_BY_MARKET_CODE['google-play']
IOS_COUNTRY_ID_CODES = COUNTRY_CODE_MAPPING_BY_MARKET_CODE['apple-store']


def _sample_indices(count, population_size):
    """Return up to ``count`` distinct random indices drawn from ``range(population_size)``."""
    size = min(count, population_size)
    if size <= 0:
        return []
    return np.random.choice(population_size, size=size, replace=False).tolist()


def sample_date(month, day, date_list):
    """Randomly pick months, and dates within each picked month, from ``date_list``.

    Sampling is done without replacement, so the same month (or the same date inside a
    month) is never returned twice and therefore never checked twice by the caller. When
    fewer entries exist than requested, all of them are returned.

    Args:
        month: number of months to pick.
        day: number of dates to pick inside each picked month.
        date_list: list of ``(month, [date, ...])`` tuples.

    Returns:
        A list of ``(month, [date, ...])`` tuples holding the sampled dates; an empty list
        when ``date_list`` is empty.
    """
    sample_date_list = []
    for m in _sample_indices(month, len(date_list)):
        month_key, days = date_list[m]
        picked_days = [days[d] for d in _sample_indices(day, len(days))]
        sample_date_list.append((month_key, picked_days))
    return sample_date_list


def _merge_dicts(dict1, dict2):
    dict1.update(dict2)
    return dict1


def write_test_result(result_df):
    """Append ``result_df`` to the mapping-check result location in Delta format.

    The output is partitioned by the ``type`` column.
    """
    target = "s3://b2c-prod-data-pipeline-qa/aa.usage/result_usage_dump_unified_v1_mapping_0512/"
    (result_df.write
        .format("delta")
        .mode("append")
        .partitionBy("type")
        .save(target))


def last_day_of_month(check_month):
    """Return the last day of the month in which ``check_month`` lies."""
    # Every month has a 28th; 28th + 4 days is always a date in the next month.
    anchor = check_month.replace(day=28)
    next_month = anchor + datetime.timedelta(days=4)
    # Subtracting next_month's own day number rewinds to the end of the original month.
    days_past_month_end = next_month.day
    return next_month - datetime.timedelta(days=days_past_month_end)


def get_monthly_date_list():
    """Return the month-end dates from 2020-01-31 through 2020-04-30.

    Each date is wrapped in a single-field ``Row`` as a 'YYYY-MM-DD' string.
    """
    one_month = relativedelta(months=1)
    upper_bound = datetime.date(2020, 4, 30)
    current = datetime.date(2020, 1, 31)
    month_ends = []
    while not current > upper_bound:
        # The one-month step clamps to shorter months, so normalise to the month end first.
        current = last_day_of_month(current)
        month_ends.append(Row(current.strftime('%Y-%m-%d')))
        current = current + one_month
    return month_ends


def get_weekly_date_list():
    """Return every 7th date from 2020-01-04 through 2020-05-23.

    Each date is wrapped in a single-field ``Row`` as a 'YYYY-MM-DD' string.
    """
    one_week = relativedelta(weeks=1)
    upper_bound = datetime.date(2020, 5, 23)
    current = datetime.date(2020, 1, 4)
    weekly_rows = []
    while not current > upper_bound:
        weekly_rows.append(Row(current.strftime('%Y-%m-%d')))
        current = current + one_week
    return weekly_rows


def get_daily_date_list():
    """Return every date from 2020-01-01 through 2020-05-23.

    Each date is wrapped in a single-field ``Row`` as a 'YYYY-MM-DD' string.
    """
    one_day = relativedelta(days=1)
    upper_bound = datetime.date(2020, 5, 23)
    current = datetime.date(2020, 1, 1)
    daily_rows = []
    while not current > upper_bound:
        daily_rows.append(Row(current.strftime('%Y-%m-%d')))
        current = current + one_day
    return daily_rows


def get_path_date_list(gran):
    """Group the date strings of granularity ``gran`` by calendar month.

    Args:
        gran: 'daily', 'weekly' or 'monthly'.

    Returns:
        A list of ``(month, [date, ...])`` tuples sorted by month ascending, where ``month``
        is the 'YYYY-MM' prefix of the 'YYYY-MM-DD' date strings it groups.

    Raises:
        ValueError: if ``gran`` is not one of the supported values.
    """
    if gran == 'daily':
        collect_date = get_daily_date_list()
    elif gran == 'weekly':
        collect_date = get_weekly_date_list()
    elif gran == 'monthly':
        collect_date = get_monthly_date_list()
    else:
        # Fail with a clear message instead of an UnboundLocalError further down.
        raise ValueError("unsupported granularity: {!r}".format(gran))

    dates_by_month = {}
    for x in collect_date:
        dates_by_month.setdefault(x[0][:7], []).append(x[0])
    # Zero-padded 'YYYY-MM' keys sort chronologically as plain strings.
    return sorted(dates_by_month.items(), key=lambda d: d[0])


def check_usage_dump_v1_mapping(_granularity, date_list):
    """Spot-check that legacy dump estimates equal the values in the basic-kpi v3 dataset.

    Three months (one date each) are sampled from ``date_list`` via ``sample_date``. For each
    sampled date and each KPI present in the dump, the ``(app_id, device_code, country_code,
    value)`` rows of both sides are subtracted from one another in both directions. The
    larger of the two difference counts is printed and appended to ``TEST_RESULT``.

    Args:
        _granularity: granularity name substituted into both S3 paths.
        date_list: list of ``(month, [day1, day2, ...])`` tuples, e.g.
            ``[('2015-12', ['2015-12-27', '2015-12-28', '2015-12-29', '2015-12-30', '2015-12-31'])]``.
    """
    dump_path = "s3://b2c-prod-data-pipeline-unified-usage/unified/usage.legacy-mu_app.v1/fact/" \
                "granularity={raw_granularity}/month={raw_month}/"
    v1_path = "s3://b2c-prod-data-pipeline-unified-usage/unified/app-tech.usage.basic-kpi.v3/fact/" \
              "granularity={unified_granularity}/date={unified_date}/"

    # Build the store_id -> country_code lookup once, as a fresh dict. Merging inside the UDF
    # repeated the work for every row and updated IOS_COUNTRY_ID_CODES in place.
    country_code_by_store_id = dict(IOS_COUNTRY_ID_CODES)
    country_code_by_store_id.update(ANDROID_COUNTRY_ID_CODES)
    to_device_code = functions.UserDefinedFunction(lambda x: DEVICE_ID_CODE_MAPPING[x])
    to_country_code = functions.UserDefinedFunction(lambda x: country_code_by_store_id[x])

    for month, days in sample_date(3, 1, date_list):
        dump_df = spark.read.parquet(dump_path.format(raw_month=month, raw_granularity=_granularity))
        # Translate the dump's numeric ids into the code strings used on the unified side.
        dump_df = (
            dump_df
            .withColumn('device_id', to_device_code(dump_df['device_id']))
            .withColumn('store_id', to_country_code(dump_df['store_id']))
            .withColumnRenamed('device_id', 'device_code')
            .withColumnRenamed('store_id', 'country_code')
        )
        for day in days:
            dump_df_date_filtered = dump_df.filter(
                "date='{}'".format(day)).select("app_id", "device_code", "country_code", "kpi", "estimate")
            kpi_list = dump_df_date_filtered.select('kpi').distinct().collect()

            v1_df = spark.read.parquet(v1_path.format(unified_date=day, unified_granularity=_granularity))
            for row in kpi_list:
                kpi_column = KPI_MAPPING[row["kpi"]]
                # Shape both sides as (app_id, device_code, country_code, <kpi column>) so
                # that subtract() compares them column by column.
                dump_kpi_df = dump_df_date_filtered.filter("kpi='{}'".format(row["kpi"]))\
                    .withColumnRenamed('estimate', kpi_column).drop('kpi')
                v1_kpi_df = v1_df.filter(
                    "{} is not null".format(kpi_column)).select(
                        'app_id', 'device_code', 'country_code', kpi_column)
                subtract_count = v1_kpi_df.subtract(dump_kpi_df).count()
                subtract_count_reverse = dump_kpi_df.subtract(v1_kpi_df).count()
                mismatch_count = max(subtract_count, subtract_count_reverse)
                if subtract_count != 0 or subtract_count_reverse != 0:
                    print("Accuracy Test Wrong!!! granularity: {} , subtract_count: {}, date: {}, kpi: {}".format(
                        _granularity, mismatch_count, day, kpi_column))
                else:
                    print("{} {} {}".format(mismatch_count, day, kpi_column))
                TEST_RESULT.append((_granularity, mismatch_count, day, kpi_column))
            print("date={} test complete!".format(day))


# Run the sampled mapping check for each configured granularity.
# NOTE: 'pass' is printed unconditionally once the loop finishes; it does not mean that
# every comparison matched.
if __name__ == '__main__':
    granularity_list = ["monthly"]
    for granularity in granularity_list:
        check_usage_dump_v1_mapping(granularity, get_path_date_list(granularity))
    print 'pass'

In [0]:

from pyspark.sql.functions import sum
df = spark.read.parquet('s3://b2c-prod-data-pipeline-qa/aa.usage/result_usage_dump_v1_count_0608/daily/').groupBy('date').agg(sum('dump_count').alias('dump_count'), sum('unified_v1_count').alias('unified_v1_count')).orderBy('date').collect()
for row in df:
    print row['date'],'\t',row['dump_count'],'\t',row['unified_v1_count']

In [0]:

from pyspark.sql.functions import sum
df = spark.read.parquet('s3://b2c-prod-data-pipeline-qa/aa.usage/result_usage_dump_v1_count_0608/monthly/').groupBy('date').agg(sum('dump_count').alias('dump_count'), sum('unified_v1_count').alias('unified_v1_count')).orderBy('date').collect()
for row in df:
    print row['date'],'\t',row['dump_count'],'\t',row['unified_v1_count']

In [0]:

import datetime
from dateutil.relativedelta import relativedelta
from pyspark.sql.utils import AnalysisException
from pyspark.sql.functions import count
from pyspark.sql import Row
from aadatapipelinecore.core.utils.retry import retry


# Maps the numeric KPI id found in the legacy dump to the unified column name.
# Ids 13, 16 and 19 have no entry.
kpi_mapping = {
    1: "est_average_active_users",
    2: "est_average_session_per_user",
    3: "est_average_session_duration",
    4: "est_install_penetration",
    5: "est_average_active_days",
    6: "est_percentage_active_days",
    7: "est_average_bytes_per_user",
    8: "est_average_time_per_user",
    9: "est_usage_penetration",
    10: "est_open_rate",
    11: "est_total_time",
    12: "est_share_of_category_time",
    14: "est_total_sessions",
    15: "est_share_of_category_session",
    17: "est_average_bytes_per_session",
    18: "est_share_of_category_bytes",
    20: "est_percent_of_wifi_total",
    21: "est_mb_per_second",
    22: "est_panel_size",
    23: "est_installs",
    24: "est_average_active_users_country_share",
    25: "est_installs_country_share",
    26: "est_audience_index",
    27: "est_audience_percentage",
    28: "est_cross_product_affinity",
}


def last_day_of_month(check_month):
    """Return the last calendar day of the month that contains ``check_month``."""
    # Day 28 exists in every month, so 28th + 4 days always falls in the next month.
    in_next_month = check_month.replace(day=28) + datetime.timedelta(days=4)
    # The day before the 1st of that next month is the last day we are after.
    first_of_next_month = in_next_month.replace(day=1)
    return first_of_next_month - datetime.timedelta(days=1)


def get_monthly_date_list():
    """Return one Row per month end, for the months 2020-01 through 2020-04.

    Each Row carries the month's last day as a 'YYYY-MM-DD' string at index 0.
    """
    final_day = datetime.date(2020, 4, 30)
    cursor = datetime.date(2020, 1, 31)
    rows = []
    while cursor <= final_day:
        # Snap to the month end first: adding one month to e.g. Jan 31 lands
        # on Feb 29, and later months would otherwise stay on the 29th.
        cursor = last_day_of_month(cursor)
        rows.append(Row(cursor.strftime('%Y-%m-%d')))
        cursor += relativedelta(months=1)
    return rows


def get_weekly_date_list():
    """Return one Row per week from 2020-01-04 through 2020-05-23.

    Dates advance in 7-day steps; each Row carries the date as a
    'YYYY-MM-DD' string at index 0.
    """
    final_day = datetime.date(2020, 5, 23)
    cursor = datetime.date(2020, 1, 4)
    rows = []
    while cursor <= final_day:
        rows.append(Row(cursor.strftime('%Y-%m-%d')))
        cursor += relativedelta(weeks=1)
    return rows


def get_daily_date_list():
    """Return one Row per day from 2018-04-01 through 2019-12-31 (inclusive).

    Each Row carries the date as a 'YYYY-MM-DD' string at index 0.
    """
    final_day = datetime.date(2019, 12, 31)
    cursor = datetime.date(2018, 4, 1)
    rows = []
    while cursor <= final_day:
        rows.append(Row(cursor.strftime('%Y-%m-%d')))
        cursor += relativedelta(days=1)
    return rows


def get_path_date_list(gran):
    """Group the dates of the requested granularity by their 'YYYY-MM' month.

    Args:
        gran: one of 'daily', 'weekly' or 'monthly'.

    Returns:
        A list of ``(month, [day, ...])`` tuples sorted by month, where
        ``month`` is 'YYYY-MM' and every ``day`` is a 'YYYY-MM-DD' string.

    Raises:
        ValueError: if ``gran`` is not a supported granularity (previously
            this surfaced as an UnboundLocalError on ``collect_date``).
    """
    if gran == 'daily':
        collect_date = get_daily_date_list()
    elif gran == 'weekly':
        collect_date = get_weekly_date_list()
    elif gran == 'monthly':
        collect_date = get_monthly_date_list()
    else:
        raise ValueError("unsupported granularity: {!r}".format(gran))

    days_by_month = {}
    for date_row in collect_date:
        day = date_row[0]
        days_by_month.setdefault(day[:7], []).append(day)
    # Zero-padded 'YYYY-MM' keys sort chronologically as plain strings.
    return sorted(days_by_month.items(), key=lambda item: item[0])


def check_usage_dump_v1_completeness(_granularity, date_list):
    """
    Compare per-KPI row counts between the legacy dump and the unified v1 data.

    For every day, the dump rows are counted per ``kpi`` id and compared with
    the number of non-null values in the matching unified v1 column (resolved
    through the module-level ``kpi_mapping``). Mismatches are printed, and one
    result row per (day, KPI) is appended to the QA result Delta table after
    each month.

    date_list:
            [(month,[day1,day2,day3])]
    sample:
        [('2015-12', ['2015-12-27', '2015-12-28', '2015-12-29', '2015-12-30', '2015-12-31'])]

    Raises:
        KeyError: if the dump contains a kpi id that is missing from ``kpi_mapping``.
    """
    from aadatapipelinecore.core.utils.retry import retry

    dump_path = "s3://b2c-prod-data-pipeline-unified-usage/unified/usage.legacy-mu_app.v1/fact/" \
                "granularity={raw_granularity}/month={raw_month}/"
    v1_path = "s3://b2c-prod-data-pipeline-unified-usage/unified/usage.basic-kpi.v1/fact/" \
              "granularity={unified_granularity}/date={unified_date}/"

    def write_test_result(result_df):
        # Append: this runs once per month, so overwriting would discard the
        # results of every previously processed month.
        result_df.write.format("delta").save(
            "s3://b2c-prod-data-pipeline-qa/aa.usage/result_usage_dump_v1_count_0616/daily/",
            mode="append",
            partitionBy=["type"])

    for month_day_list_tuple in date_list:
        month, days = month_day_list_tuple[0], month_day_list_tuple[1]
        test_result = []
        dump_df = spark.read.parquet(dump_path.format(raw_month=month, raw_granularity=_granularity))
        for day in days:
            dump_kpi_count = dump_df.filter(
                "date='{}'".format(day)).select("kpi", "app_id").groupBy(
                "kpi").agg(count("kpi")).collect()

            # Opened lazily, and only once per day instead of once per KPI.
            v1_df = None
            day_passed = True
            for row in dump_kpi_count:
                if v1_df is None:
                    v1_df = spark.read.parquet(
                        v1_path.format(unified_date=day, unified_granularity=_granularity))
                kpi_column = kpi_mapping[row["kpi"]]
                dump_count = row["count(kpi)"]
                v1_count = v1_df.filter("{} is not null".format(kpi_column)).select(kpi_column).count()
                if dump_count != v1_count:
                    day_passed = False
                    print('Completeness Test Fail!!!! dump data: {}, v1 data: {}, date: {}, KPI {}'.format(
                        dump_count, v1_count, day, kpi_column))
                test_result.append((_granularity, day, kpi_column, dump_count, v1_count))
            # Report a pass only when no KPI of that day mismatched.
            if day_passed:
                print("Completeness Test Pass! date: {}".format(day))

        # createDataFrame cannot infer a schema from an empty list, so skip
        # the write when the month produced no rows.
        if test_result:
            df_write_result = spark.createDataFrame(
                test_result, schema=['type', 'date', 'kpi', 'dump_count', 'unified_v1_count'])
            retry(write_test_result, (df_write_result,), {}, interval=10)
        print("month={} Test complete!".format(month))


# Run the dump-vs-v1 completeness check for every configured granularity.
granularity_list = ["daily"]
for granularity in granularity_list:
    check_usage_dump_v1_completeness(granularity, get_path_date_list(granularity))
print('pass')

In [0]:

from pyspark.sql.functions import count
from pyspark.sql import Row
from pyspark.sql import functions
import datetime
from dateutil.relativedelta import relativedelta


# Result rows collected by check_usage_unified_v1_v3_completeness.
test_result = []
# Maps a phone/tablet device code to its platform-wide "-all" device code.
device_code_agg_mapping = {
    'android-phone': 'android-all',
    'android-tablet': 'android-all',
    'ios-phone': 'ios-all',
    'ios-tablet': 'ios-all',
}


def last_day_of_month(check_month):
    """Return the last calendar day of the month that contains ``check_month``."""
    # Day 28 exists in every month, so 28th + 4 days always falls in the next month.
    in_next_month = check_month.replace(day=28) + datetime.timedelta(days=4)
    # The day before the 1st of that next month is the last day we are after.
    first_of_next_month = in_next_month.replace(day=1)
    return first_of_next_month - datetime.timedelta(days=1)


def get_monthly_date_list():
    """Return one Row per month end, for the months 2020-01 through 2020-04.

    Each Row carries the month's last day as a 'YYYY-MM-DD' string at index 0.
    """
    final_day = datetime.date(2020, 4, 30)
    cursor = datetime.date(2020, 1, 31)
    rows = []
    while cursor <= final_day:
        # Snap to the month end first: adding one month to e.g. Jan 31 lands
        # on Feb 29, and later months would otherwise stay on the 29th.
        cursor = last_day_of_month(cursor)
        rows.append(Row(cursor.strftime('%Y-%m-%d')))
        cursor += relativedelta(months=1)
    return rows


def get_weekly_date_list():
    """Return one Row per week from 2020-01-04 through 2020-05-23.

    Dates advance in 7-day steps; each Row carries the date as a
    'YYYY-MM-DD' string at index 0.
    """
    final_day = datetime.date(2020, 5, 23)
    cursor = datetime.date(2020, 1, 4)
    rows = []
    while cursor <= final_day:
        rows.append(Row(cursor.strftime('%Y-%m-%d')))
        cursor += relativedelta(weeks=1)
    return rows


def get_daily_date_list():
    """Return one Row per day from 2018-04-01 through 2019-12-31 (inclusive).

    Each Row carries the date as a 'YYYY-MM-DD' string at index 0.
    """
    final_day = datetime.date(2019, 12, 31)
    cursor = datetime.date(2018, 4, 1)
    rows = []
    while cursor <= final_day:
        rows.append(Row(cursor.strftime('%Y-%m-%d')))
        cursor += relativedelta(days=1)
    return rows


def get_path_date_list(granularity):
    """Pair every date of the requested granularity with its 'YYYY-MM' month.

    Args:
        granularity: one of 'daily', 'weekly' or 'monthly'.

    Returns:
        A list of ``(month, day)`` tuples in generation order, where ``month``
        is 'YYYY-MM' and ``day`` is a 'YYYY-MM-DD' string.

    Raises:
        ValueError: if ``granularity`` is not supported (previously this
            surfaced as an UnboundLocalError on ``collect_date``).
    """
    if granularity == 'daily':
        collect_date = get_daily_date_list()
    elif granularity == 'weekly':
        collect_date = get_weekly_date_list()
    elif granularity == 'monthly':
        collect_date = get_monthly_date_list()
    else:
        raise ValueError("unsupported granularity: {!r}".format(granularity))
    return [(date_row[0][:7], date_row[0]) for date_row in collect_date]


def check_usage_unified_v1_v3_completeness(_granularity, date_list):
    """
    Compare total row counts of the unified v1 (parquet) and v3 (delta) data.

    Every date's counts are printed with a pass/fail verdict. The rows of this
    call are appended to the QA result Delta table and also recorded in the
    module-level ``test_result`` list.

    date_list:
            [(month1,day1), (month1,day2), (month2,day1), (month2,day2)]
    sample:
        [('2015-12', '2015-12-27'), ('2015-12', '2015-12-28'),
        ('2016-12', '2016-12-27'), ('2016-12', '2016-12-28')]
    """
    from aadatapipelinecore.core.utils.retry import retry

    unified_v1_path = 's3://b2c-prod-data-pipeline-unified-usage/unified/usage.basic-kpi.v1/fact/' \
                      'granularity={v1_granularity}/date={v1_date}/'
    unified_v3_path = 's3://b2c-prod-data-pipeline-unified-usage/unified/usage.basic-kpi.v3/fact' \
                      '/granularity={v3_granularity}/date={v3_date}/'

    # Rows produced by THIS call. Writing the shared module-level list instead
    # would re-append the rows of earlier calls to the result table.
    run_result = []
    for date in date_list:
        day = date[1]
        unified_v1_count = spark.read.parquet(
            unified_v1_path.format(v1_granularity=_granularity, v1_date=day)).count()
        unified_v3_count = spark.read.format("delta").load(
            unified_v3_path.format(v3_granularity=_granularity, v3_date=day)).count()

        if unified_v1_count != unified_v3_count:
            print('Completeness Test FAIL!!!! unified_v1 data: {}, unified_v3 data: {}, date: {}'.format(
                unified_v1_count, unified_v3_count, day))
        else:
            print('Completeness Test Pass! unified_v1 data: {}, unified_v3 data: {}, date: {}'.format(
                unified_v1_count, unified_v3_count, day))
        record = (_granularity, unified_v1_count, unified_v3_count, day)
        run_result.append(record)
        # Keep the module-level log populated as before.
        test_result.append(record)

    # createDataFrame cannot infer a schema from an empty list.
    if not run_result:
        return
    df_write_result = spark.createDataFrame(run_result, schema=['type', 'raw_count', 'unified_count', 'date'])

    def write_test_result(df_write_result):
        df_write_result.write.format("delta").save(
            "s3://b2c-prod-data-pipeline-qa/aa.usage/result_usage_unified_v1_v3_count_0616/daily/",
            mode="append",
            partitionBy=["type"])
    retry(write_test_result, (df_write_result,), {}, interval=10)


# Run the v1-vs-v3 completeness check for every configured granularity.
granularity_list = ["daily"]
for granularity in granularity_list:
    check_usage_unified_v1_v3_completeness(granularity, get_path_date_list(granularity))
print('pass')


In [0]:

from pyspark.sql.functions import sum
df = spark.read.parquet('s3://b2c-prod-data-pipeline-qa/aa.usage/result_usage_dump_v1_count_0616/daily/')
result = df.distinct().groupBy('date').agg(sum('dump_count').alias('dump_count'),sum('unified_v1_count').alias('unified_v1_count')).orderBy('date').collect()
for row in result:
    print row['date'],'\t',row['dump_count'],'\t',row['unified_v1_count']

In [0]:

df = spark.read.parquet('s3://b2c-prod-data-pipeline-qa/aa.usage/result_usage_unified_v1_v3_count_0616/daily/').orderBy('date').collect()
for row in df:
    print row['date'],'\t',int(row['raw_count']),'\t',row['unified_count']

In [0]:

from pyspark.sql.functions import count
from pyspark.sql import Row
from pyspark.sql import functions
import datetime
from dateutil.relativedelta import relativedelta


# Result rows collected by check_usage_unified_v1_v3_completeness.
test_result = []
# Maps a phone/tablet device code to its platform-wide "-all" device code.
device_code_agg_mapping = {
    'android-phone': 'android-all',
    'android-tablet': 'android-all',
    'ios-phone': 'ios-all',
    'ios-tablet': 'ios-all',
}


def last_day_of_month(check_month):
    """Return the last calendar day of the month that contains ``check_month``."""
    # Day 28 exists in every month, so 28th + 4 days always falls in the next month.
    in_next_month = check_month.replace(day=28) + datetime.timedelta(days=4)
    # The day before the 1st of that next month is the last day we are after.
    first_of_next_month = in_next_month.replace(day=1)
    return first_of_next_month - datetime.timedelta(days=1)


def get_monthly_date_list():
    """Return one Row per month end, for the months 2020-01 through 2020-04.

    Each Row carries the month's last day as a 'YYYY-MM-DD' string at index 0.
    """
    final_day = datetime.date(2020, 4, 30)
    cursor = datetime.date(2020, 1, 31)
    rows = []
    while cursor <= final_day:
        # Snap to the month end first: adding one month to e.g. Jan 31 lands
        # on Feb 29, and later months would otherwise stay on the 29th.
        cursor = last_day_of_month(cursor)
        rows.append(Row(cursor.strftime('%Y-%m-%d')))
        cursor += relativedelta(months=1)
    return rows


def get_weekly_date_list():
    """Return one Row per week from 2020-01-04 through 2020-05-23.

    Dates advance in 7-day steps; each Row carries the date as a
    'YYYY-MM-DD' string at index 0.
    """
    final_day = datetime.date(2020, 5, 23)
    cursor = datetime.date(2020, 1, 4)
    rows = []
    while cursor <= final_day:
        rows.append(Row(cursor.strftime('%Y-%m-%d')))
        cursor += relativedelta(weeks=1)
    return rows


def get_daily_date_list():
    """Return one Row per day from 2015-12-27 through 2018-03-31 (inclusive).

    Each Row carries the date as a 'YYYY-MM-DD' string at index 0.
    """
    final_day = datetime.date(2018, 3, 31)
    cursor = datetime.date(2015, 12, 27)
    rows = []
    while cursor <= final_day:
        rows.append(Row(cursor.strftime('%Y-%m-%d')))
        cursor += relativedelta(days=1)
    return rows


def get_path_date_list(granularity):
    """Pair every date of the requested granularity with its 'YYYY-MM' month.

    Args:
        granularity: one of 'daily', 'weekly' or 'monthly'.

    Returns:
        A list of ``(month, day)`` tuples in generation order, where ``month``
        is 'YYYY-MM' and ``day`` is a 'YYYY-MM-DD' string.

    Raises:
        ValueError: if ``granularity`` is not supported (previously this
            surfaced as an UnboundLocalError on ``collect_date``).
    """
    if granularity == 'daily':
        collect_date = get_daily_date_list()
    elif granularity == 'weekly':
        collect_date = get_weekly_date_list()
    elif granularity == 'monthly':
        collect_date = get_monthly_date_list()
    else:
        raise ValueError("unsupported granularity: {!r}".format(granularity))
    return [(date_row[0][:7], date_row[0]) for date_row in collect_date]


def check_usage_unified_v1_v3_completeness(_granularity, date_list):
    """
    Compare total row counts of the unified v1 (parquet) and v3 (delta) data.

    Every date's counts are printed with a pass/fail verdict. The rows of this
    call are appended to the QA result Delta table and also recorded in the
    module-level ``test_result`` list.

    date_list:
            [(month1,day1), (month1,day2), (month2,day1), (month2,day2)]
    sample:
        [('2015-12', '2015-12-27'), ('2015-12', '2015-12-28'),
        ('2016-12', '2016-12-27'), ('2016-12', '2016-12-28')]
    """
    from aadatapipelinecore.core.utils.retry import retry

    unified_v1_path = 's3://b2c-prod-data-pipeline-unified-usage/unified/usage.basic-kpi.v1/fact/' \
                      'granularity={v1_granularity}/date={v1_date}/'
    unified_v3_path = 's3://b2c-prod-data-pipeline-unified-usage/unified/usage.basic-kpi.v3/fact' \
                      '/granularity={v3_granularity}/date={v3_date}/'

    # Rows produced by THIS call. Writing the shared module-level list instead
    # would re-append the rows of earlier calls to the result table.
    run_result = []
    for date in date_list:
        day = date[1]
        unified_v1_count = spark.read.parquet(
            unified_v1_path.format(v1_granularity=_granularity, v1_date=day)).count()
        unified_v3_count = spark.read.format("delta").load(
            unified_v3_path.format(v3_granularity=_granularity, v3_date=day)).count()

        if unified_v1_count != unified_v3_count:
            print('Completeness Test FAIL!!!! unified_v1 data: {}, unified_v3 data: {}, date: {}'.format(
                unified_v1_count, unified_v3_count, day))
        else:
            print('Completeness Test Pass! unified_v1 data: {}, unified_v3 data: {}, date: {}'.format(
                unified_v1_count, unified_v3_count, day))
        record = (_granularity, unified_v1_count, unified_v3_count, day)
        run_result.append(record)
        # Keep the module-level log populated as before.
        test_result.append(record)

    # createDataFrame cannot infer a schema from an empty list.
    if not run_result:
        return
    df_write_result = spark.createDataFrame(run_result, schema=['type', 'raw_count', 'unified_count', 'date'])

    def write_test_result(df_write_result):
        df_write_result.write.format("delta").save(
            "s3://b2c-prod-data-pipeline-qa/aa.usage/result_usage_unified_v1_v3_count_0616/daily/",
            mode="append",
            partitionBy=["type"])
    retry(write_test_result, (df_write_result,), {}, interval=10)


# Run the v1-vs-v3 completeness check for every configured granularity.
granularity_list = ["daily"]
for granularity in granularity_list:
    check_usage_unified_v1_v3_completeness(granularity, get_path_date_list(granularity))
print('pass')

In [0]:

from bdce.common.utils import update_application_code
# NOTE(review): presumably refreshes the application code bundle under
# /tmp/zeppelin_application_code for the given role/application; confirm
# against bdce.common.utils.
update_application_code(spark, role="BDP-PROD-APP-INT-QA", application_name="zidong-application-autopipeline")

# Reload dependencies from the temp location: register the dependency archive
# with the SparkContext so that its modules can be imported.
spark.sparkContext.addPyFile("/tmp/zeppelin_application_code/libs/python/dependencies.zip")
# Alternative archive location, currently disabled:
# spark.sparkContext.addPyFile("/home/hadoop/bdp/application/libs/python/dependencies.zip")
# NOTE(review): imported only after addPyFile, presumably because aaplproxy is
# shipped inside dependencies.zip; confirm. Keep this statement order.
import aaplproxy

In [0]:

import datetime
from dateutil.relativedelta import relativedelta
from pyspark.sql.utils import AnalysisException
from pyspark.sql.functions import count
from pyspark.sql import Row
from aadatapipelinecore.core.utils.retry import retry


# Maps the numeric KPI id found in the legacy dump to the unified column name.
# Ids 13, 16 and 19 have no entry.
kpi_mapping = {
    1: "est_average_active_users",
    2: "est_average_session_per_user",
    3: "est_average_session_duration",
    4: "est_install_penetration",
    5: "est_average_active_days",
    6: "est_percentage_active_days",
    7: "est_average_bytes_per_user",
    8: "est_average_time_per_user",
    9: "est_usage_penetration",
    10: "est_open_rate",
    11: "est_total_time",
    12: "est_share_of_category_time",
    14: "est_total_sessions",
    15: "est_share_of_category_session",
    17: "est_average_bytes_per_session",
    18: "est_share_of_category_bytes",
    20: "est_percent_of_wifi_total",
    21: "est_mb_per_second",
    22: "est_panel_size",
    23: "est_installs",
    24: "est_average_active_users_country_share",
    25: "est_installs_country_share",
    26: "est_audience_index",
    27: "est_audience_percentage",
    28: "est_cross_product_affinity",
}


def last_day_of_month(check_month):
    """Return the last calendar day of the month that contains ``check_month``."""
    # Day 28 exists in every month, so 28th + 4 days always falls in the next month.
    in_next_month = check_month.replace(day=28) + datetime.timedelta(days=4)
    # The day before the 1st of that next month is the last day we are after.
    first_of_next_month = in_next_month.replace(day=1)
    return first_of_next_month - datetime.timedelta(days=1)


def get_monthly_date_list():
    """Return one Row per month end, for the months 2020-01 through 2020-04.

    Each Row carries the month's last day as a 'YYYY-MM-DD' string at index 0.
    """
    final_day = datetime.date(2020, 4, 30)
    cursor = datetime.date(2020, 1, 31)
    rows = []
    while cursor <= final_day:
        # Snap to the month end first: adding one month to e.g. Jan 31 lands
        # on Feb 29, and later months would otherwise stay on the 29th.
        cursor = last_day_of_month(cursor)
        rows.append(Row(cursor.strftime('%Y-%m-%d')))
        cursor += relativedelta(months=1)
    return rows


def get_weekly_date_list():
    """Return one Row per week from 2020-01-04 through 2020-05-23.

    Dates advance in 7-day steps; each Row carries the date as a
    'YYYY-MM-DD' string at index 0.
    """
    final_day = datetime.date(2020, 5, 23)
    cursor = datetime.date(2020, 1, 4)
    rows = []
    while cursor <= final_day:
        rows.append(Row(cursor.strftime('%Y-%m-%d')))
        cursor += relativedelta(weeks=1)
    return rows


def get_daily_date_list():
    """Return one Row per day from 2015-12-27 through 2018-03-31 (inclusive).

    Each Row carries the date as a 'YYYY-MM-DD' string at index 0.
    """
    final_day = datetime.date(2018, 3, 31)
    cursor = datetime.date(2015, 12, 27)
    rows = []
    while cursor <= final_day:
        rows.append(Row(cursor.strftime('%Y-%m-%d')))
        cursor += relativedelta(days=1)
    return rows


def get_path_date_list(gran):
    """Group the dates of the requested granularity by their 'YYYY-MM' month.

    Args:
        gran: one of 'daily', 'weekly' or 'monthly'.

    Returns:
        A list of ``(month, [day, ...])`` tuples sorted by month, where
        ``month`` is 'YYYY-MM' and every ``day`` is a 'YYYY-MM-DD' string.

    Raises:
        ValueError: if ``gran`` is not a supported granularity (previously
            this surfaced as an UnboundLocalError on ``collect_date``).
    """
    if gran == 'daily':
        collect_date = get_daily_date_list()
    elif gran == 'weekly':
        collect_date = get_weekly_date_list()
    elif gran == 'monthly':
        collect_date = get_monthly_date_list()
    else:
        raise ValueError("unsupported granularity: {!r}".format(gran))

    days_by_month = {}
    for date_row in collect_date:
        day = date_row[0]
        days_by_month.setdefault(day[:7], []).append(day)
    # Zero-padded 'YYYY-MM' keys sort chronologically as plain strings.
    return sorted(days_by_month.items(), key=lambda item: item[0])


def check_usage_dump_v1_completeness(_granularity, date_list):
    """
    Compare per-KPI row counts between the legacy dump and the unified v1 data.

    For every day, the dump rows are counted per ``kpi`` id and compared with
    the number of non-null values in the matching unified v1 column (resolved
    through the module-level ``kpi_mapping``). Mismatches are printed, and one
    result row per (day, KPI) is appended to the QA result Delta table after
    each month.

    date_list:
            [(month,[day1,day2,day3])]
    sample:
        [('2015-12', ['2015-12-27', '2015-12-28', '2015-12-29', '2015-12-30', '2015-12-31'])]

    Raises:
        KeyError: if the dump contains a kpi id that is missing from ``kpi_mapping``.
    """
    from aadatapipelinecore.core.utils.retry import retry

    dump_path = "s3://b2c-prod-data-pipeline-unified-usage/unified/usage.legacy-mu_app.v1/fact/" \
                "granularity={raw_granularity}/month={raw_month}/"
    v1_path = "s3://b2c-prod-data-pipeline-unified-usage/unified/usage.basic-kpi.v1/fact/" \
              "granularity={unified_granularity}/date={unified_date}/"

    def write_test_result(result_df):
        # Append: this runs once per month, so earlier months must be kept.
        result_df.write.format("delta").save(
            "s3://b2c-prod-data-pipeline-qa/aa.usage/result_usage_dump_v1_count_0616/daily/",
            mode="append",
            partitionBy=["type"])

    for month_day_list_tuple in date_list:
        month, days = month_day_list_tuple[0], month_day_list_tuple[1]
        test_result = []
        dump_df = spark.read.parquet(dump_path.format(raw_month=month, raw_granularity=_granularity))
        for day in days:
            dump_kpi_count = dump_df.filter(
                "date='{}'".format(day)).select("kpi", "app_id").groupBy(
                "kpi").agg(count("kpi")).collect()

            # Opened lazily, and only once per day instead of once per KPI.
            v1_df = None
            day_passed = True
            for row in dump_kpi_count:
                if v1_df is None:
                    v1_df = spark.read.parquet(
                        v1_path.format(unified_date=day, unified_granularity=_granularity))
                kpi_column = kpi_mapping[row["kpi"]]
                dump_count = row["count(kpi)"]
                v1_count = v1_df.filter("{} is not null".format(kpi_column)).select(kpi_column).count()
                if dump_count != v1_count:
                    day_passed = False
                    print('Completeness Test Fail!!!! dump data: {}, v1 data: {}, date: {}, KPI {}'.format(
                        dump_count, v1_count, day, kpi_column))
                test_result.append((_granularity, day, kpi_column, dump_count, v1_count))
            # Report a pass only when no KPI of that day mismatched.
            if day_passed:
                print("Completeness Test Pass! date: {}".format(day))

        # createDataFrame cannot infer a schema from an empty list, so skip
        # the write when the month produced no rows.
        if test_result:
            df_write_result = spark.createDataFrame(
                test_result, schema=['type', 'date', 'kpi', 'dump_count', 'unified_v1_count'])
            retry(write_test_result, (df_write_result,), {}, interval=10)
        print("month={} Test complete!".format(month))


# Run the dump-vs-v1 completeness check for every configured granularity.
granularity_list = ["daily"]
for granularity in granularity_list:
    check_usage_dump_v1_completeness(granularity, get_path_date_list(granularity))
print('pass')

In [0]:

import datetime
from dateutil.relativedelta import relativedelta
from pyspark.sql import Row
from pyspark.sql import functions


def get_daily_date_list():
    """Return one Row per day from 2020-05-17 through 2020-05-17 (a single day).

    Each Row carries the date as a 'YYYY-MM-DD' string at index 0.
    """
    final_day = datetime.date(2020, 5, 17)
    cursor = datetime.date(2020, 5, 17)
    rows = []
    while cursor <= final_day:
        rows.append(Row(cursor.strftime('%Y-%m-%d')))
        cursor += relativedelta(days=1)
    return rows


date_list = get_daily_date_list()
for date in date_list:
    print date[0]
    plproxy_path = 's3://b2c-prod-data-pipeline-unified-usage/unified/usage.basic-kpi.v3/fact/granularity=daily/date={}/'.format(date[0])
    routine_path = 's3://b2c-prod-data-pipeline-unified-usage/unified/usage.basic-kpi.v5/fact/granularity=daily/date={}/'.format(date[0])
    plproxy_df = spark.read.format("delta").load(plproxy_path)
    routine_df = spark.read.format("delta").load(routine_path)
    kpi_mapping_dict = {1: 'est_average_active_users', 2: 'est_average_session_per_user', 3: 'est_average_session_duration',
                        4: 'est_install_penetration', 5: 'est_average_active_days', 6: 'est_percentage_active_days',
                        7: 'est_average_bytes_per_user', 8: 'est_average_time_per_user', 9: 'est_usage_penetration',
                        10: 'est_open_rate', 17: 'est_average_bytes_per_session', 23: 'est_installs', 24: 'est_share_of_users',
                        25: 'est_share_of_installs'}
    for i,kpi in kpi_mapping_dict.items():
        plproxy_count = plproxy_df.filter("est_average_active_users!=0 and {}!=0".format(kpi)).count()
        # routine_count =  routine_df.filter("est_average_active_users!=0 and {}!=0".format(kpi)).count()
        plproxy_df = plproxy_df.withColumn(kpi, functions.UserDefinedFunction(lambda x: format(x, '.0f'))(plproxy_df[kpi]))
        routine_df = routine_df.withColumn(kpi, functions.UserDefinedFunction(lambda x: format(x, '.0f'))(routine_df[kpi]))
        plproxy_df.createOrReplaceTempView("plproxy_df")
        routine_df.createOrReplaceTempView("routine_df")
        subtract_count = plproxy_df.filter("est_average_active_users!=0 and {}!=0".format(kpi)).select('app_id', 'country_code', 'device_code', kpi).subtract(routine_df.filter("est_average_active_users!=0 and {}!=0".format(kpi)).select('app_id', 'country_code', 'device_code', kpi)).count()
        print kpi, '\t', float(subtract_count) / plproxy_count
        if subtract_count!=0:
            print float(subtract_count) / plproxy_count
            sql = """
                select
                    plproxy_df.app_id, plproxy_df.device_code, plproxy_df.country_code,
                    plproxy_df.{kpi} as v3_{kpi},
                    v5.{kpi} as v5_{kpi}
                from plproxy_df inner join
                    (select app_id, device_code, country_code, {kpi}
                     from routine_df
                     ) AS v5
                on (plproxy_df.app_id=v5.app_id) and (plproxy_df.device_code=v5.device_code) and (plproxy_df.country_code=v5.country_code) and (plproxy_df.{kpi}!=v5.{kpi})
                """.format(kpi=kpi)
            unified_v1 = spark.sql(sql)
            unified_v1 = unified_v1= unified_v1.withColumn('difference', unified_v1['v3_{}'.format(kpi)] - unified_v1['v5_{}'.format(kpi)])
            unified_v1.show(3)

In [0]:

import datetime
from dateutil.relativedelta import relativedelta
from pyspark.sql.utils import AnalysisException
from pyspark.sql.functions import count
from pyspark.sql import Row
from aadatapipelinecore.core.utils.retry import retry


# Maps the numeric KPI id found in the legacy dump to the unified column name.
# Ids 13, 16 and 19 have no entry.
kpi_mapping = {
    1: "est_average_active_users",
    2: "est_average_session_per_user",
    3: "est_average_session_duration",
    4: "est_install_penetration",
    5: "est_average_active_days",
    6: "est_percentage_active_days",
    7: "est_average_bytes_per_user",
    8: "est_average_time_per_user",
    9: "est_usage_penetration",
    10: "est_open_rate",
    11: "est_total_time",
    12: "est_share_of_category_time",
    14: "est_total_sessions",
    15: "est_share_of_category_session",
    17: "est_average_bytes_per_session",
    18: "est_share_of_category_bytes",
    20: "est_percent_of_wifi_total",
    21: "est_mb_per_second",
    22: "est_panel_size",
    23: "est_installs",
    24: "est_average_active_users_country_share",
    25: "est_installs_country_share",
    26: "est_audience_index",
    27: "est_audience_percentage",
    28: "est_cross_product_affinity",
}


def last_day_of_month(check_month):
    """Return the last calendar day of the month that contains ``check_month``."""
    # Day 28 exists in every month, so 28th + 4 days always falls in the next month.
    in_next_month = check_month.replace(day=28) + datetime.timedelta(days=4)
    # The day before the 1st of that next month is the last day we are after.
    first_of_next_month = in_next_month.replace(day=1)
    return first_of_next_month - datetime.timedelta(days=1)


def get_monthly_date_list():
    """Return one Row per month end, for the months 2020-01 through 2020-04.

    Each Row carries the month's last day as a 'YYYY-MM-DD' string at index 0.
    """
    final_day = datetime.date(2020, 4, 30)
    cursor = datetime.date(2020, 1, 31)
    rows = []
    while cursor <= final_day:
        # Snap to the month end first: adding one month to e.g. Jan 31 lands
        # on Feb 29, and later months would otherwise stay on the 29th.
        cursor = last_day_of_month(cursor)
        rows.append(Row(cursor.strftime('%Y-%m-%d')))
        cursor += relativedelta(months=1)
    return rows


def get_weekly_date_list():
    """Return one Row per week from 2020-01-04 through 2020-05-23.

    Dates advance in 7-day steps; each Row carries the date as a
    'YYYY-MM-DD' string at index 0.
    """
    final_day = datetime.date(2020, 5, 23)
    cursor = datetime.date(2020, 1, 4)
    rows = []
    while cursor <= final_day:
        rows.append(Row(cursor.strftime('%Y-%m-%d')))
        cursor += relativedelta(weeks=1)
    return rows


def get_daily_date_list():
    """Return one Row per day from 2020-01-01 through 2020-05-23 (inclusive).

    Each Row carries the date as a 'YYYY-MM-DD' string at index 0.
    """
    final_day = datetime.date(2020, 5, 23)
    cursor = datetime.date(2020, 1, 1)
    rows = []
    while cursor <= final_day:
        rows.append(Row(cursor.strftime('%Y-%m-%d')))
        cursor += relativedelta(days=1)
    return rows


def get_path_date_list(gran):
    """Group the dates of the requested granularity by their 'YYYY-MM' month.

    Args:
        gran: one of 'daily', 'weekly' or 'monthly'.

    Returns:
        A list of ``(month, [day, ...])`` tuples sorted by month, where
        ``month`` is 'YYYY-MM' and every ``day`` is a 'YYYY-MM-DD' string.

    Raises:
        ValueError: if ``gran`` is not a supported granularity (previously
            this surfaced as an UnboundLocalError on ``collect_date``).
    """
    if gran == 'daily':
        collect_date = get_daily_date_list()
    elif gran == 'weekly':
        collect_date = get_weekly_date_list()
    elif gran == 'monthly':
        collect_date = get_monthly_date_list()
    else:
        raise ValueError("unsupported granularity: {!r}".format(gran))

    days_by_month = {}
    for date_row in collect_date:
        day = date_row[0]
        days_by_month.setdefault(day[:7], []).append(day)
    # Zero-padded 'YYYY-MM' keys sort chronologically as plain strings.
    return sorted(days_by_month.items(), key=lambda item: item[0])


def check_usage_dump_v1_completeness(_granularity, date_list):
    """
    Compare per-KPI row counts between the legacy dump and the unified v1 data.

    For every day, the dump rows are counted per ``kpi`` id and compared with
    the number of non-null values in the matching unified v1 column (resolved
    through the module-level ``kpi_mapping``). Mismatches are printed, and one
    result row per (day, KPI) is appended to the QA result Delta table after
    each month.

    date_list:
            [(month,[day1,day2,day3])]
    sample:
        [('2015-12', ['2015-12-27', '2015-12-28', '2015-12-29', '2015-12-30', '2015-12-31'])]

    Raises:
        KeyError: if the dump contains a kpi id that is missing from ``kpi_mapping``.
    """
    from aadatapipelinecore.core.utils.retry import retry

    dump_path = "s3://b2c-prod-data-pipeline-unified-usage/unified/usage.legacy-mu_app.v1/fact/" \
                "granularity={raw_granularity}/month={raw_month}/"
    v1_path = "s3://b2c-prod-data-pipeline-unified-usage/unified/usage.basic-kpi.v1/fact/" \
              "granularity={unified_granularity}/date={unified_date}/"

    def write_test_result(result_df):
        # Append: this runs once per month, so earlier months must be kept.
        result_df.write.format("delta").save(
            "s3://b2c-prod-data-pipeline-qa/aa.usage/result_usage_dump_v1_count_0616/daily/",
            mode="append",
            partitionBy=["type"])

    for month_day_list_tuple in date_list:
        month, days = month_day_list_tuple[0], month_day_list_tuple[1]
        test_result = []
        dump_df = spark.read.parquet(dump_path.format(raw_month=month, raw_granularity=_granularity))
        for day in days:
            dump_kpi_count = dump_df.filter(
                "date='{}'".format(day)).select("kpi", "app_id").groupBy(
                "kpi").agg(count("kpi")).collect()

            # Opened lazily, and only once per day instead of once per KPI.
            v1_df = None
            day_passed = True
            for row in dump_kpi_count:
                if v1_df is None:
                    v1_df = spark.read.parquet(
                        v1_path.format(unified_date=day, unified_granularity=_granularity))
                kpi_column = kpi_mapping[row["kpi"]]
                dump_count = row["count(kpi)"]
                v1_count = v1_df.filter("{} is not null".format(kpi_column)).select(kpi_column).count()
                if dump_count != v1_count:
                    day_passed = False
                    print('Completeness Test Fail!!!! dump data: {}, v1 data: {}, date: {}, KPI {}'.format(
                        dump_count, v1_count, day, kpi_column))
                test_result.append((_granularity, day, kpi_column, dump_count, v1_count))
            # Report a pass only when no KPI of that day mismatched.
            if day_passed:
                print("Completeness Test Pass! date: {}".format(day))

        # createDataFrame cannot infer a schema from an empty list, so skip
        # the write when the month produced no rows.
        if test_result:
            df_write_result = spark.createDataFrame(
                test_result, schema=['type', 'date', 'kpi', 'dump_count', 'unified_v1_count'])
            retry(write_test_result, (df_write_result,), {}, interval=10)
        print("month={} Test complete!".format(month))


# Run the dump-vs-v1 completeness check for every configured granularity.
granularity_list = ["daily"]
for granularity in granularity_list:
    check_usage_dump_v1_completeness(granularity, get_path_date_list(granularity))
print('pass')

In [0]:

# Spot-check one app/country/device on 2020-05-17: show est_average_active_users
# first from the v5 output, then from the v3 output.
for routine_path_template in (
        's3://b2c-prod-data-pipeline-unified-usage/unified/usage.basic-kpi.v5/fact/granularity=daily/date={}/',
        's3://b2c-prod-data-pipeline-unified-usage/unified/usage.basic-kpi.v3/fact/granularity=daily/date={}/'):
    routine_path = routine_path_template.format('2020-05-17')
    routine_df = spark.read.format("delta").load(routine_path)
    routine_df.filter("app_id=20600000000425 and country_code='WW' and device_code='android-phone'").select('est_average_active_users').show(3)

In [0]:

from aadatapipelinecore.core.urn import Urn
import psycopg2
from pyspark.sql import Row
from dateutil.relativedelta import relativedelta
from aaplproxy.connection import ClusterConnection
from conf import settings
from aadatapipelinecore.core.loader.plproxy import build_db_settings
import datetime as d
import datetime
from datetime import timedelta


import os

# Result rows collected by check_v3_db_completeness.
test_result = []
PG_AA_HOSTS = [('10.2.6.141', 5432)]
PG_AA_NAME = 'aa_store_db'
PG_AA_ACCESS_ID = 'citus_bdp_prod_app_int_qa'
# The database password must not be stored in the notebook. It is read from
# the PG_AA_SECRET_KEY environment variable of the interpreter process.
# NOTE(review): the password that used to be hard-coded here should be rotated.
PG_AA_SECRET_KEY = os.environ.get('PG_AA_SECRET_KEY')
if not PG_AA_SECRET_KEY:
    raise RuntimeError("PG_AA_SECRET_KEY environment variable must be set to the database password")

# libpq-style connection string for the first configured host.
aa_dsn = (
    "dbname='{db}' user='{user}' password='{password}' "
    "host='{host}' port='{port}'".format(
        db=PG_AA_NAME,
        user=PG_AA_ACCESS_ID,
        host=PG_AA_HOSTS[0][0],
        password=PG_AA_SECRET_KEY,
        port=PG_AA_HOSTS[0][1]
    )
)

urn = Urn(
    namespace="app-qa.db-check.v1",
    owner="app_qa"
)
# Row count of one day; placeholders are the table suffix and the date.
sql = """select count(*) from store.usage_basic_kpi_fact_v6_p_{} where date='{}';"""


def query(dsn, sql):
    """Run ``sql`` against the database at ``dsn`` and return all result rows.

    Args:
        dsn: libpq connection string passed to ``psycopg2.connect``.
        sql: a complete SQL statement that produces a result set.

    Returns:
        The list of row tuples from ``cursor.fetchall()``.

    The connection is closed explicitly: psycopg2's ``with connection`` block
    only ends the transaction and leaves the connection open, which leaked one
    connection per call.
    """
    conn = psycopg2.connect(dsn)
    try:
        # Autocommit: every statement is committed on its own, so no explicit
        # commit is required.
        conn.autocommit = True
        with conn.cursor() as cur:
            cur.execute(sql)
            return cur.fetchall()
    finally:
        conn.close()


def get_daily_date_list():
    """
    Build one Row per calendar day from 2015-12-27 through 2018-03-31 (inclusive).

    :return: list of ``Row`` objects, each holding a single 'YYYY-MM-DD' string.
    """
    first_day = datetime.date(2015, 12, 27)
    last_day = datetime.date(2018, 3, 31)
    rows = []
    current = first_day
    while not current > last_day:
        rows.append(Row(current.strftime('%Y-%m-%d')))
        current = current + relativedelta(days=1)
    return rows


def get_path_date_list(gran):
    """
    Group the dates for a granularity by month.

    :param gran: granularity name; only 'daily' is available in this cell.
    :return: list of ``(month, [date, ...])`` tuples sorted by month, where
             month is 'YYYY-MM' and each date is 'YYYY-MM-DD', e.g.
             ``[('2015-12', ['2015-12-27', '2015-12-28'])]``.
    :raises ValueError: if ``gran`` is not a supported granularity (previously
             this surfaced as an unbound-local error on ``collect_date``).
    """
    if gran != 'daily':
        raise ValueError(
            "Unsupported granularity: {!r}; only 'daily' is available here".format(gran))

    by_month = {}
    for x in get_daily_date_list():
        # x[0] is the 'YYYY-MM-DD' string; its first 7 characters are the month key.
        by_month.setdefault(x[0][:7], []).append(x[0])
    # Zero-padded 'YYYY-MM' keys sort chronologically as plain strings.
    return sorted(by_month.items(), key=lambda item: item[0])


def check_v3_db_completeness(date_list, graularity):
    """
    Compare per-day row counts between the Postgres fact table and the unified delta data.

    For every day, the count returned by the module-level ``sql`` template (run
    through ``query`` against ``aa_dsn``) is compared with the row count of the
    delta dataset for that granularity and date. A pass/fail line is printed per
    day and all comparisons from this call are appended to the QA delta table.

    date_list:
            [(month,[day1,day2,day3])]
    sample:
        [('2015-12', ['2015-12-27', '2015-12-28', '2015-12-29', '2015-12-30', '2015-12-31'])]
    graularity:
        granularity name used in the delta path and stored in the 'type' column.

    Each comparison is still appended to the module-level ``test_result``, but
    only the rows produced by this call are written. Building the DataFrame from
    the shared list would re-append earlier rows on every later call.

    NOTE(review): names say "v3" but the path read is usage.basic-kpi.v5 - confirm
    which version is intended.
    """
    from aadatapipelinecore.core.utils.retry import retry

    # Loop-invariant path template, hoisted out of the per-day loop.
    v3_path = 's3://b2c-prod-data-pipeline-unified-usage/unified/usage.basic-kpi.v5/fact/' \
              'granularity={}/date={}/'

    def write_test_result(df_write_result):
        df_write_result.write.format("delta").save(
            "s3://b2c-prod-data-pipeline-qa/aa.usage/result_usage_v3_db_count_0616/daily/",
            mode="append",
            partitionBy=["type"])

    run_result = []
    for month in date_list:
        # 'YYYY-MM' -> 'YYYYMM', the partition suffix of the fact table.
        m = month[0][:4] + month[0][5:8]
        for day in month[1]:
            result = query(aa_dsn, sql.format(m, day))
            db_count = result[0][0]

            v3_count = spark.read.format("delta").load(v3_path.format(graularity, day)).count()

            if db_count != v3_count:
                print("Completeness Test Fail!!!! date: {}, v3: {}, db: {}".format(day, v3_count, db_count))
            else:
                print("Completeness Test Pass! date : {}".format(day))
            record = (graularity, day, v3_count, db_count)
            run_result.append(record)
            test_result.append(record)

    df_write_result = spark.createDataFrame(run_result, schema=['type', 'date', 'v3_count', 'db_count'])
    retry(write_test_result, (df_write_result,), {}, interval=10)


# Run the DB-vs-delta completeness check for each listed granularity
# (only "daily" is configured here).
graularity_list = ["daily"]
for graularity in graularity_list:
    check_v3_db_completeness(get_path_date_list(graularity), graularity)

In [0]:
%%sh
# Open psql against aa_store_db, switch to the `store` schema and run \d.
# NOTE(review): the database password is written inline here; it should come
# from the environment or a secrets store, and this value should be rotated.
PGPASSWORD='wZw8cfBuuklIskVG' psql -h 10.2.6.141 -p 5432 -U citus_bdp_prod_app_int_qa -d aa_store_db << EOF
SET search_path=store;
\d
EOF


In [0]:

import datetime
from dateutil.relativedelta import relativedelta
from pyspark.sql.utils import AnalysisException
from pyspark.sql.functions import count
from pyspark.sql import Row
from aadatapipelinecore.core.utils.retry import retry


# Maps the numeric `kpi` id found in the dump data to the column name that is
# checked in the unified v1 data. Ids 13, 16 and 19 have no entry.
kpi_mapping = {
    1: "est_average_active_users",
    2: "est_average_session_per_user",
    3: "est_average_session_duration",
    4: "est_install_penetration",
    5: "est_average_active_days",
    6: "est_percentage_active_days",
    7: "est_average_bytes_per_user",
    8: "est_average_time_per_user",
    9: "est_usage_penetration",
    10: "est_open_rate",
    11: "est_total_time",
    12: "est_share_of_category_time",
    14: "est_total_sessions",
    15: "est_share_of_category_session",
    17: "est_average_bytes_per_session",
    18: "est_share_of_category_bytes",
    20: "est_percent_of_wifi_total",
    21: "est_mb_per_second",
    22: "est_panel_size",
    23: "est_installs",
    24: "est_average_active_users_country_share",
    25: "est_installs_country_share",
    26: "est_audience_index",
    27: "est_audience_percentage",
    28: "est_cross_product_affinity",
}


def last_day_of_month(check_month):
    """
    Return the last day of the month that ``check_month`` falls in.

    :param check_month: a ``datetime.date`` (or ``datetime.datetime``).
    :return: same type as the input, moved to the final day of its month.
    """
    # Day 28 exists in every month and 28 + 4 days always lands in the next month.
    in_following_month = check_month.replace(day=28) + datetime.timedelta(days=4)
    # Stepping back by that day-of-month number lands on the previous month's last day.
    overshoot = datetime.timedelta(days=in_following_month.day)
    return in_following_month - overshoot


def get_monthly_date_list():
    """
    Build one Row per month-end date from 2020-01-31 through 2020-04-30.

    :return: list of ``Row`` objects, each holding a single 'YYYY-MM-DD' string.
    """
    final_month_end = datetime.date(2020, 4, 30)
    cursor = datetime.date(2020, 1, 31)
    rows = []
    while cursor <= final_month_end:
        # Snap to the month end first: adding a month can leave the cursor on
        # an earlier day of the next month.
        cursor = last_day_of_month(cursor)
        rows.append(Row(cursor.strftime('%Y-%m-%d')))
        cursor = cursor + relativedelta(months=1)
    return rows


def get_weekly_date_list():
    """
    Build one Row every seven days from 2020-01-04 through 2020-05-23 (inclusive).

    :return: list of ``Row`` objects, each holding a single 'YYYY-MM-DD' string.
    """
    first_week = datetime.date(2020, 1, 4)
    last_week = datetime.date(2020, 5, 23)
    rows = []
    current = first_week
    while not current > last_week:
        rows.append(Row(current.strftime('%Y-%m-%d')))
        current = current + relativedelta(weeks=1)
    return rows


def get_daily_date_list():
    """
    Build one Row per calendar day from 2020-01-01 through 2020-05-23 (inclusive).

    :return: list of ``Row`` objects, each holding a single 'YYYY-MM-DD' string.
    """
    first_day = datetime.date(2020, 1, 1)
    last_day = datetime.date(2020, 5, 23)
    rows = []
    current = first_day
    while not current > last_day:
        rows.append(Row(current.strftime('%Y-%m-%d')))
        current = current + relativedelta(days=1)
    return rows


def get_path_date_list(gran):
    """
    Group the dates for a granularity by month.

    :param gran: one of 'daily', 'weekly' or 'monthly'.
    :return: list of ``(month, [date, ...])`` tuples sorted by month, where
             month is 'YYYY-MM' and each date is 'YYYY-MM-DD', e.g.
             ``[('2015-12', ['2015-12-27', '2015-12-28'])]``.
    :raises ValueError: if ``gran`` is not a supported granularity (previously
             this surfaced as an unbound-local error on ``collect_date``).
    """
    builders = {
        'daily': get_daily_date_list,
        'weekly': get_weekly_date_list,
        'monthly': get_monthly_date_list,
    }
    if gran not in builders:
        raise ValueError(
            "Unsupported granularity: {!r}; expected one of {}".format(gran, sorted(builders)))

    by_month = {}
    for x in builders[gran]():
        # x[0] is the 'YYYY-MM-DD' string; its first 7 characters are the month key.
        by_month.setdefault(x[0][:7], []).append(x[0])
    # Zero-padded 'YYYY-MM' keys sort chronologically as plain strings.
    return sorted(by_month.items(), key=lambda item: item[0])


def check_usage_dump_v1_completeness(_granularity, date_list):
    """
    Compare per-KPI row counts between the legacy dump and the unified v1 data.

    For each month the dump parquet for that month is read once. For each day
    the dump rows are counted per ``kpi`` id and compared with the number of
    non-null values of the mapped column (``kpi_mapping``) in the unified v1
    parquet for that day. Results are appended to the QA delta table once per month.

    _granularity:
        granularity name used in both paths and stored in the 'type' column.
    date_list:
            [(month,[day1,day2,day3])]
    sample:
        [('2015-12', ['2015-12-27', '2015-12-28', '2015-12-29', '2015-12-30', '2015-12-31'])]

    The "Pass" line is printed only for days on which every KPI count matched;
    it used to be printed for every day, even right after a "Fail" line.
    """
    from aadatapipelinecore.core.utils.retry import retry

    # Loop-invariant path templates.
    dump_path = "s3://b2c-prod-data-pipeline-unified-usage/unified/usage.legacy-mu_app.v1/fact/" \
                "granularity={raw_granularity}/month={raw_month}/"
    v1_path = "s3://b2c-prod-data-pipeline-unified-usage/unified/usage.basic-kpi.v1/fact/" \
              "granularity={unified_granularity}/date={unified_date}/"

    def write_test_result(result_df):
        result_df.write.format("delta").save(
            "s3://b2c-prod-data-pipeline-qa/aa.usage/result_usage_dump_v1_count_0616/daily/",
            mode="append",
            partitionBy=["type"])

    for month_day_list_tuple in date_list:
        test_result = []
        dump_path_parse = dump_path.format(raw_month=month_day_list_tuple[0], raw_granularity=_granularity)
        dump_df = spark.read.parquet(dump_path_parse)
        for day in month_day_list_tuple[1]:
            dump_kpi_count = dump_df.filter(
                "date='{}'".format(day)).select("kpi", "app_id").groupBy(
                "kpi").agg(count("kpi")).collect()

            v1_path_parse = v1_path.format(unified_date=day, unified_granularity=_granularity)
            # Opened lazily and reused for every KPI of the day; still never
            # opened when the dump has no rows for that day.
            v1_df = None
            day_passed = True
            for row in dump_kpi_count:
                kpi_column = kpi_mapping[row["kpi"]]
                dump_count = row["count(kpi)"]
                if v1_df is None:
                    v1_df = spark.read.parquet(v1_path_parse)
                v1_count = v1_df.filter(
                    "{} is not null".format(kpi_column)).select(kpi_column).count()
                if dump_count != v1_count:
                    day_passed = False
                    print('Completeness Test Fail!!!! dump data: {}, v1 data: {}, date: {}, KPI {}'.format(
                        dump_count, v1_count, day, kpi_column))
                test_result.append((_granularity, day, kpi_column, dump_count, v1_count))
            if day_passed:
                print("Completeness Test Pass! date: {}".format(day))

        df_write_result = spark.createDataFrame(test_result,
                                                schema=['type', 'date', 'kpi', 'dump_count', 'unified_v1_count'])
        retry(write_test_result, (df_write_result,), {}, interval=10)
        print("month={} Test complete!".format(month_day_list_tuple[0]))


# Run the dump-vs-v1 completeness check for each listed granularity
# (only "daily" is configured here).
granularity_list = ["daily"]
for granularity in granularity_list:
    check_usage_dump_v1_completeness(granularity, get_path_date_list(granularity))
# Printed once the loop finishes without raising; mismatches are only printed
# inside the check, so this line does not mean that every comparison matched.
print 'pass'


In [0]:
%%sh
# List what exists under the monthly granularity prefix of the unified basic-kpi v1 fact data.
aws s3 ls s3://b2c-prod-data-pipeline-unified-usage/unified/usage.basic-kpi.v1/fact/granularity=monthly/

In [0]:

import datetime
from dateutil.relativedelta import relativedelta
from pyspark.sql.utils import AnalysisException
from pyspark.sql.functions import count
from pyspark.sql import Row
from aadatapipelinecore.core.utils.retry import retry


# Maps the numeric `kpi` id found in the dump data to the column name that is
# checked in the unified v1 data. Ids 13, 16 and 19 have no entry.
kpi_mapping = {
    1: "est_average_active_users",
    2: "est_average_session_per_user",
    3: "est_average_session_duration",
    4: "est_install_penetration",
    5: "est_average_active_days",
    6: "est_percentage_active_days",
    7: "est_average_bytes_per_user",
    8: "est_average_time_per_user",
    9: "est_usage_penetration",
    10: "est_open_rate",
    11: "est_total_time",
    12: "est_share_of_category_time",
    14: "est_total_sessions",
    15: "est_share_of_category_session",
    17: "est_average_bytes_per_session",
    18: "est_share_of_category_bytes",
    20: "est_percent_of_wifi_total",
    21: "est_mb_per_second",
    22: "est_panel_size",
    23: "est_installs",
    24: "est_average_active_users_country_share",
    25: "est_installs_country_share",
    26: "est_audience_index",
    27: "est_audience_percentage",
    28: "est_cross_product_affinity",
}


def last_day_of_month(check_month):
    """
    Return the last day of the month that ``check_month`` falls in.

    :param check_month: a ``datetime.date`` (or ``datetime.datetime``).
    :return: same type as the input, moved to the final day of its month.
    """
    # Day 28 exists in every month and 28 + 4 days always lands in the next month.
    in_following_month = check_month.replace(day=28) + datetime.timedelta(days=4)
    # Stepping back by that day-of-month number lands on the previous month's last day.
    overshoot = datetime.timedelta(days=in_following_month.day)
    return in_following_month - overshoot


def get_monthly_date_list():
    """
    Build one Row per month-end date from 2013-01-31 through 2020-04-30.

    :return: list of ``Row`` objects, each holding a single 'YYYY-MM-DD' string.
    """
    final_month_end = datetime.date(2020, 4, 30)
    cursor = datetime.date(2013, 1, 31)
    rows = []
    while cursor <= final_month_end:
        # Snap to the month end first: adding a month can leave the cursor on
        # an earlier day of the next month.
        cursor = last_day_of_month(cursor)
        rows.append(Row(cursor.strftime('%Y-%m-%d')))
        cursor = cursor + relativedelta(months=1)
    return rows


def get_weekly_date_list():
    """
    Build one Row every seven days from 2020-01-04 through 2020-05-23 (inclusive).

    :return: list of ``Row`` objects, each holding a single 'YYYY-MM-DD' string.
    """
    first_week = datetime.date(2020, 1, 4)
    last_week = datetime.date(2020, 5, 23)
    rows = []
    current = first_week
    while not current > last_week:
        rows.append(Row(current.strftime('%Y-%m-%d')))
        current = current + relativedelta(weeks=1)
    return rows


def get_daily_date_list():
    """
    Build one Row per calendar day from 2020-01-01 through 2020-05-23 (inclusive).

    :return: list of ``Row`` objects, each holding a single 'YYYY-MM-DD' string.
    """
    first_day = datetime.date(2020, 1, 1)
    last_day = datetime.date(2020, 5, 23)
    rows = []
    current = first_day
    while not current > last_day:
        rows.append(Row(current.strftime('%Y-%m-%d')))
        current = current + relativedelta(days=1)
    return rows


def get_path_date_list(gran):
    """
    Group the dates for a granularity by month.

    :param gran: one of 'daily', 'weekly' or 'monthly'.
    :return: list of ``(month, [date, ...])`` tuples sorted by month, where
             month is 'YYYY-MM' and each date is 'YYYY-MM-DD', e.g.
             ``[('2015-12', ['2015-12-27', '2015-12-28'])]``.
    :raises ValueError: if ``gran`` is not a supported granularity (previously
             this surfaced as an unbound-local error on ``collect_date``).
    """
    builders = {
        'daily': get_daily_date_list,
        'weekly': get_weekly_date_list,
        'monthly': get_monthly_date_list,
    }
    if gran not in builders:
        raise ValueError(
            "Unsupported granularity: {!r}; expected one of {}".format(gran, sorted(builders)))

    by_month = {}
    for x in builders[gran]():
        # x[0] is the 'YYYY-MM-DD' string; its first 7 characters are the month key.
        by_month.setdefault(x[0][:7], []).append(x[0])
    # Zero-padded 'YYYY-MM' keys sort chronologically as plain strings.
    return sorted(by_month.items(), key=lambda item: item[0])


def check_usage_dump_v1_completeness(_granularity, date_list):
    """
    Compare per-KPI row counts between the legacy dump and the unified v1 data.

    For each month the dump parquet for that month is read once. For each date
    the dump rows are counted per ``kpi`` id and compared with the number of
    non-null values of the mapped column (``kpi_mapping``) in the unified v1
    parquet for that date. Results are appended to the QA delta table once per month.

    _granularity:
        granularity name used in both paths and stored in the 'type' column.
    date_list:
            [(month,[day1,day2,day3])]
    sample:
        [('2015-12', ['2015-12-27', '2015-12-28', '2015-12-29', '2015-12-30', '2015-12-31'])]

    The "Pass" line is printed only for dates on which every KPI count matched;
    it used to be printed for every date, even right after a "Fail" line.
    """
    from aadatapipelinecore.core.utils.retry import retry

    # Loop-invariant path templates.
    dump_path = "s3://b2c-prod-data-pipeline-unified-usage/unified/usage.legacy-mu_app.v1/fact/" \
                "granularity={raw_granularity}/month={raw_month}/"
    v1_path = "s3://b2c-prod-data-pipeline-unified-usage/unified/usage.basic-kpi.v1/fact/" \
              "granularity={unified_granularity}/date={unified_date}/"

    def write_test_result(result_df):
        result_df.write.format("delta").save(
            "s3://b2c-prod-data-pipeline-qa/aa.usage/result_usage_dump_v1_count_0616/monthly/",
            mode="append",
            partitionBy=["type"])

    for month_day_list_tuple in date_list:
        test_result = []
        dump_path_parse = dump_path.format(raw_month=month_day_list_tuple[0], raw_granularity=_granularity)
        dump_df = spark.read.parquet(dump_path_parse)
        for day in month_day_list_tuple[1]:
            dump_kpi_count = dump_df.filter(
                "date='{}'".format(day)).select("kpi", "app_id").groupBy(
                "kpi").agg(count("kpi")).collect()

            v1_path_parse = v1_path.format(unified_date=day, unified_granularity=_granularity)
            # Opened lazily and reused for every KPI of the date; still never
            # opened when the dump has no rows for that date.
            v1_df = None
            day_passed = True
            for row in dump_kpi_count:
                kpi_column = kpi_mapping[row["kpi"]]
                dump_count = row["count(kpi)"]
                if v1_df is None:
                    v1_df = spark.read.parquet(v1_path_parse)
                v1_count = v1_df.filter(
                    "{} is not null".format(kpi_column)).select(kpi_column).count()
                if dump_count != v1_count:
                    day_passed = False
                    print('Completeness Test Fail!!!! dump data: {}, v1 data: {}, date: {}, KPI {}'.format(
                        dump_count, v1_count, day, kpi_column))
                test_result.append((_granularity, day, kpi_column, dump_count, v1_count))
            if day_passed:
                print("Completeness Test Pass! date: {}".format(day))

        df_write_result = spark.createDataFrame(test_result,
                                                schema=['type', 'date', 'kpi', 'dump_count', 'unified_v1_count'])
        retry(write_test_result, (df_write_result,), {}, interval=10)
        print("month={} Test complete!".format(month_day_list_tuple[0]))


# Run the dump-vs-v1 completeness check for each listed granularity
# (only "monthly" is configured here).
granularity_list = ["monthly"]
for granularity in granularity_list:
    check_usage_dump_v1_completeness(granularity, get_path_date_list(granularity))
# Printed once the loop finishes without raising; mismatches are only printed
# inside the check, so this line does not mean that every comparison matched.
print 'pass'