In [0]:

# Calls the project helper `update_application_code` with the notebook's `spark`
# session for the "jji-application" application under the QA role.
# NOTE(review): the helper is opaque from here; presumably it (re)deploys the
# application code that later `applications.*` imports rely on — confirm.
from bdce.common.utils import update_application_code
update_application_code(
    spark, role="BDP-PROD-APP-INT-QA", application_name="jji-application"
)


In [0]:

# Same helper call as the previous cell, but for the
# "zidong-application-autopipeline" application.
# NOTE(review): it is not visible here whether this replaces or supplements the
# code loaded by the earlier "jji-application" call — confirm.
from bdce.common.utils import update_application_code
update_application_code(spark, role="BDP-PROD-APP-INT-QA", application_name="zidong-application-autopipeline")

# reload dependencies from temp
# Registers the dependency archive from the /tmp location with the SparkContext;
# the line below it is the alternative (disabled) location under /home/hadoop.
spark.sparkContext.addPyFile("/tmp/zeppelin_application_code/libs/python/dependencies.zip")
# spark.sparkContext.addPyFile("/home/hadoop/bdp/application/libs/python/dependencies.zip")
# NOTE(review): presumably `aaplproxy` is shipped inside dependencies.zip, which is
# why it is imported only after addPyFile — confirm.
import aaplproxy

In [0]:

from applications.db_check_v1.common.db_check_utils import etl_skip


In [0]:

from aaplproxy.connection import ClusterConnection

In [0]:

import random
import datetime
from applications.db_check_v1.common.db_check_utils import _get_date_from_refresh_routing_config, etl_skip
from applications.db_check_v1.common.constants import query
from applications.db_check_v1.common.db_check_utils import query_df
from applications.db_check_v1.common.base_test import PipelineTest
from applications.db_check_v1.common.utils import string_to_datetime, datetime_to_string
from pyspark.sql.types import DoubleType
from pyspark.sql import functions
from pyspark.sql.functions import lit, coalesce
from pyspark.sql.types import LongType
import os

# Connection settings for the Citus usage database: a list of (host, port) pairs,
# the database name and the login role.
CITUS_USAGE_HOSTS = [('10.2.10.254', 5432)]
CITUS_USAGE_NAME = 'aa_store_db'
CITUS_USAGE_ACCESS_ID = 'citus_bdp_prod_app_int_qa'
# SECURITY: a database password must not live in notebook source. The value can now
# be supplied through the CITUS_USAGE_SECRET_KEY environment variable; the literal
# is kept only as a backward-compatible fallback.
# TODO: rotate this credential and remove the fallback.
CITUS_USAGE_SECRET_KEY = os.environ.get('CITUS_USAGE_SECRET_KEY', 'wZw8cfBuuklIskVG')

# Raw `platform` value (int key) -> raw `device_type` value (str key) -> device code.
DEVICE_CODE_MAPPING = {
    1: {'1': 'android-phone', '2': 'android-tablet'},
    2: {'1': 'ios-phone', '2': 'ios-tablet'}}

# Granularity name used in the database -> `range_type` value used in the raw S3 path.
GRANULARITY_IN_RAW_PATH_MAPPING = {
    "daily": "DAY",
    "weekly": "WEEK",
    "monthly": "MONTH"
}

# Key/value DSN built from the first entry of CITUS_USAGE_HOSTS.
CITUS_DSN = (
    "dbname='{db}' user='{user}' password='{password}' "
    "host='{host}' port='{port}'".format(
        db=CITUS_USAGE_NAME,
        user=CITUS_USAGE_ACCESS_ID,
        host=CITUS_USAGE_HOSTS[0][0],
        password=CITUS_USAGE_SECRET_KEY,
        port=CITUS_USAGE_HOSTS[0][1]
    )
)


class UsageRoutineRawData(object):
    """
    Reader for the raw basic-KPI parquet data delivered by Data Foundation.
    """
    # Path template; `granularity` is the raw range type (e.g. DAY) and `date` the partition date.
    _raw_s3_path = 's3://aardvark-prod-pdx-mdm-to-int/basic_kpi/version=1.0.0/range_type={granularity}/date={date}'

    def __init__(self, spark):
        # Spark session used for every read issued by this object.
        self.spark = spark

    def get(self, granularity, date):
        """
        Load the raw parquet partition for one granularity and date.

        :param granularity: key of GRANULARITY_IN_RAW_PATH_MAPPING (e.g. "daily")
        :param date: value substituted into the ``date=`` path segment
        :return: Raw data from DF team
        :rtype: pyspark.sql.DataFrame
        """
        reader = self.spark.read
        range_type = GRANULARITY_IN_RAW_PATH_MAPPING[granularity]
        location = self._raw_s3_path.format(granularity=range_type, date=date)
        return reader.parquet(location)


class TestUsageRoutineRawCompleteness(PipelineTest):
    """
    Row-count comparison between the routine raw parquet data and the Citus fact table.

    This base class defines no ``test_*`` method. Subclasses set ``granularity`` in
    ``setUp``, provide ``check_date`` as a class attribute and call
    ``check_routine_raw_completeness`` from their own test method.
    """
    table_name = 'usage.usage_basic_kpi_fact_v6'
    db_name = 'usage'

    def setUp(self):
        # super(PipelineTest, self).setUp()
        self.check_date = None
        self.granularity = None

    def check_routine_raw_completeness(self):
        """
        Assert that the raw row count equals the Citus row count for each checked date.

        For the ``daily`` granularity the check date and the six days before it are
        verified; any other granularity verifies ``check_date`` only.
        NOTE(review): ``self.spark`` is not set in this class; presumably it is
        provided by PipelineTest — confirm.
        """
        date_list = [self.check_date]

        if self.granularity == 'daily':
            date = string_to_datetime(self.check_date)
            weekly_day_nums = 7
            date_list = [datetime_to_string(date - datetime.timedelta(days=x)) for x in range(weekly_day_nums)]

        for date in date_list:
            routine_df = UsageRoutineRawData(self.spark).get(self.granularity, date)
            routine_count = routine_df.count()

            citus_db_count = self.get_citus_db_count(date)
            # Pre-formatted single argument: prints the same text under the Python 2
            # print statement and the Python 3 print function.
            print("%s %s" % (routine_count, citus_db_count))

            self.assertEqual(routine_count, citus_db_count[0][0],
                             msg="found count mismatch when compare usage routine raw and citus db. "
                                 "granularity is {}, date is {}, raw count is:{}, citus db count is:{}".format(
                                    self.granularity, date, routine_count, citus_db_count[0][0]))

    def get_citus_db_count(self, date):
        """
        Run a ``count(1)`` query on ``table_name`` for ``date`` and the current granularity.

        :param date: date value placed into the ``where`` clause
        :return: whatever ``query`` returns; the caller reads the count at ``[0][0]``
        """
        sql = """select count(1) as cnt from {table_name} where date='{date}' and granularity='{granularity}';
        """.format(table_name=self.table_name, date=date, granularity=self.granularity)
        result = query(CITUS_DSN, sql)
        return result


class TestUsageRoutineRawCompletenessDaily(TestUsageRoutineRawCompleteness):
    """Completeness check bound to the ``daily`` granularity."""

    # (cron expression, integer) pair passed to _get_date_from_refresh_routing_config.
    # NOTE(review): the meaning of the integer is not visible here — presumably a day offset; confirm.
    trigger_date_config = ("0 12 * * 5", 6)
    # Evaluated once, when the class body executes.
    check_date = _get_date_from_refresh_routing_config(trigger_date_config)

    def setUp(self):
        """Select the granularity used by the inherited check."""
        self.granularity = "daily"

    # NOTE(review): unlike the weekly/monthly variants this test is not wrapped in @etl_skip().
    def test_routine_raw_completeness_daily(self):
        """Run the inherited row-count comparison."""
        self.check_routine_raw_completeness()


class TestUsageRoutineRawCompletenessWeekly(TestUsageRoutineRawCompleteness):
    """Completeness check bound to the ``weekly`` granularity."""

    # (cron expression, integer) pair passed to _get_date_from_refresh_routing_config.
    # NOTE(review): the meaning of the integer is not visible here — confirm.
    trigger_date_config = ("0 12 * * 5", 6)
    # Evaluated once, when the class body executes.
    check_date = _get_date_from_refresh_routing_config(trigger_date_config)

    def setUp(self):
        """Select the granularity used by the inherited check."""
        self.granularity = "weekly"

    @etl_skip()
    def test_routine_raw_completeness_weekly(self):
        """Run the inherited row-count comparison (subject to ``etl_skip``)."""
        self.check_routine_raw_completeness()


class TestUsageRoutineRawCompletenessMonthly(TestUsageRoutineRawCompleteness):
    """Completeness check bound to the ``monthly`` granularity."""

    # (cron expression, integer) pair passed to _get_date_from_refresh_routing_config.
    # NOTE(review): the cron string carries a trailing space; kept as-is — confirm the parser tolerates it.
    trigger_date_config = ("0 12 6 * * ", 6)
    # Evaluated once, when the class body executes.
    check_date = _get_date_from_refresh_routing_config(trigger_date_config)

    def setUp(self):
        """Select the granularity used by the inherited check."""
        self.granularity = "monthly"

    @etl_skip()
    def test_routine_raw_completeness_monthly(self):
        """Run the inherited row-count comparison (subject to ``etl_skip``)."""
        self.check_routine_raw_completeness()


class TestUsageRoutineRawAccuracy(PipelineTest):
    """
    Row-level comparison between the transformed routine raw data and the Citus fact table.

    This base class defines no ``test_*`` method; subclasses set ``granularity`` and
    ``check_date`` and call ``check_routine_raw_accuracy``.
    """
    table_name = 'usage.usage_basic_kpi_fact_v6'
    # NOTE(review): not referenced in this class; the raw data is read through UsageRoutineRawData.
    raw_path = 's3://aardvark-prod-pdx-mdm-to-int/basic_kpi/version=1.0.0/range_type={granularity}/date={date}'
    db_name = 'usage'

    def setUp(self):
        # NOTE(review): super() is given PipelineTest rather than this class, so the
        # lookup starts *after* PipelineTest in the MRO and PipelineTest.setUp itself
        # is not called — confirm this is intended.
        super(PipelineTest, self).setUp()
        self.check_date = None
        self.granularity = None

    def check_routine_raw_accuracy(self):
        """
        Rebuild the fact rows from raw data and assert they match the Citus rows exactly.

        The raw frame is renamed/derived in three stages, joined with its own 'WW'
        rows to add the worldwide columns, null-filled with 0, and then compared with
        the Citus rows via ``subtract`` in both directions.
        """
        routine_df = UsageRoutineRawData(self.spark).get(self.granularity, self.check_date)
        # Stage 1: derive device_code from (platform, device_type), rename the raw metric
        # columns to their est_* names and add null placeholder columns.
        unified_v1 = (
            routine_df
            .withColumn('device_code', functions.UserDefinedFunction(
                lambda x, y: DEVICE_CODE_MAPPING[x][y])(routine_df['platform'], routine_df['device_type']))
            .withColumnRenamed('country', 'country_code')
            .withColumn('app_id', routine_df['app_id'].cast(LongType()))
            .withColumnRenamed('AU', 'est_average_active_users')
            .withColumnRenamed('AFU', 'est_average_session_per_user')
            .withColumnRenamed('ADU', 'est_average_session_duration')
            .withColumnRenamed('IP', 'est_install_penetration')
            .withColumnRenamed('AAD', 'est_average_active_days')
            .withColumnRenamed('PAD', 'est_percentage_active_days')
            .withColumnRenamed('MBPU', 'est_average_bytes_per_user')
            .withColumnRenamed('ATU', 'est_average_time_per_user')
            .withColumnRenamed('UP', 'est_usage_penetration')
            .withColumnRenamed('OR', 'est_open_rate')
            .withColumnRenamed('MBPS', 'est_average_bytes_per_session')
            .withColumnRenamed('MBWFT', 'est_percent_of_wifi_total')
            .withColumnRenamed('MBS', 'est_mb_per_second')
            .withColumnRenamed('IS', 'est_installs')
            .withColumnRenamed('SOU', 'est_average_active_users_country_share')
            .withColumnRenamed('SOI', 'est_installs_country_share')
            .withColumn('est_share_of_category_time', lit(None).cast(DoubleType()))
            .withColumn('est_share_of_category_session', lit(None).cast(DoubleType()))
            .withColumn('est_share_of_category_bytes', lit(None).cast(DoubleType()))
            .withColumn('est_panel_size', lit(None).cast(DoubleType()))
            .drop('device_type')
            .drop('platform')
        )

        # Stage 2: drop unused columns (including est_panel_size added above), derive
        # est_total_time and scale the time/duration columns by 1000.
        # Column expressions are taken from the stage-1 frame, so statement order matters.
        unified_v1 = (
            unified_v1
            .drop("est_mb_per_second", "est_share_of_device_time", "est_share_of_device_session",
                  "est_share_of_device_mb", "est_panel_size")
            .withColumn('est_total_time',
                        unified_v1['est_average_active_users'] * unified_v1['est_average_time_per_user'] / 60)
            .withColumn('est_average_time_per_user', coalesce(unified_v1['est_average_time_per_user'],
                                                              unified_v1['est_average_session_duration'] *
                                                              unified_v1['est_average_session_per_user']) * 1000)
            .withColumn('est_average_session_duration', unified_v1['est_average_session_duration'] * 1000)
        )
        # Stage 3: scale the byte columns by 1024 * 1024 and derive install base,
        # population and total sessions from the stage-2 frame.
        unified_v1 = (
            unified_v1
            .withColumn('est_average_bytes_per_session', unified_v1['est_average_bytes_per_session'] * 1024 * 1024)
            .withColumn('est_average_bytes_per_user', unified_v1['est_average_bytes_per_user'] * 1024 * 1024)
            .withColumn('est_share_of_category_bytes', unified_v1['est_share_of_category_bytes'] * 1024 * 1024)
            .withColumn('est_install_base',
                        unified_v1['est_install_penetration'] *
                        unified_v1['est_average_active_users'] / unified_v1['est_usage_penetration'])
            .withColumn('est_population',
                        unified_v1['est_average_active_users'] / unified_v1['est_usage_penetration'])
            .withColumnRenamed('est_average_active_users_country_share', 'est_share_of_users')
            .withColumnRenamed('est_installs_country_share', 'est_share_of_installs')
            .withColumn('est_total_sessions',
                        unified_v1['est_average_active_users'] * unified_v1['est_average_session_per_user'])
        )
        # Self-join on (app_id, device_code) against the country_code = 'WW' rows to
        # attach the worldwide active-user and install figures to every row.
        unified_v1.createOrReplaceTempView("v1_df")
        unified_v3 = self.spark.sql("""
                    select
                        v1_df.*,
                        ww.est_average_active_users as est_average_active_users_worldwide,
                        ww.est_installs as est_installs_worldwide
                    from v1_df left join
                        (select device_code, app_id, est_installs, est_average_active_users
                         from v1_df where country_code ='WW'
                         ) AS ww
                    on (v1_df.app_id=ww.app_id) and (v1_df.device_code = ww.device_code)
                    """)
        unified_v3 = unified_v3.na.fill(0)

        # Load the Citus rows and drop the columns that have no counterpart in the rebuilt frame.
        result = self.get_citus_db_df()
        citus_db_df = self.spark.createDataFrame(result)
        citus_db_df = citus_db_df.drop('date').drop('granularity').drop('_disable_idx_4_query')

        # Compare in both directions; each side is projected onto the other's column order first.
        diff_count1 = citus_db_df.select(unified_v3.columns).subtract(unified_v3).count()
        diff_count2 = unified_v3.select(citus_db_df.columns).subtract(citus_db_df).count()
        self.assertTrue(diff_count1 == 0 and diff_count2 == 0,
                        msg="fount mismatch when compare usage routine raw and citus db.granularity is {}, "
                            "date is {}, diff_count1 is:{}, diff_count2 is:{}".format(
                                self.granularity, self.check_date, diff_count1, diff_count2))

    def get_citus_db_df(self):
        """
        Fetch all Citus rows of ``table_name`` for ``check_date`` and the current granularity.

        :return: whatever ``query_df`` returns; the caller passes it to ``spark.createDataFrame``
        """
        sql = """select * from {table_name} where date='{date}' and granularity='{granularity}';
        """.format(table_name=self.table_name, date=self.check_date, granularity=self.granularity)
        result = query_df(CITUS_DSN, sql)
        return result


class TestUsageRoutineRawAccuracyDaily(TestUsageRoutineRawAccuracy):
    """Accuracy check bound to the ``daily`` granularity."""

    # (cron expression, integer) pair; the integer is drawn once from [6, 12] when the
    # class body executes, so the checked date can differ between kernel runs.
    # NOTE(review): no seed is set — presumably intentional sampling of dates; confirm.
    trigger_date_config = ("0 12 * * 5", random.randint(6, 12))
    check_date = _get_date_from_refresh_routing_config(trigger_date_config)

    def setUp(self):
        """Select the granularity used by the inherited check."""
        self.granularity = "daily"

    @etl_skip()
    def test_routine_raw_accuracy_daily(self):
        """Run the inherited row-level comparison (subject to ``etl_skip``)."""
        self.check_routine_raw_accuracy()


class TestUsageRoutineRawAccuracyWeekly(TestUsageRoutineRawAccuracy):
    """Accuracy check bound to the ``weekly`` granularity."""

    # (cron expression, integer) pair passed to _get_date_from_refresh_routing_config.
    # NOTE(review): the meaning of the integer is not visible here — confirm.
    trigger_date_config = ("0 12 * * 5", 6)
    # Evaluated once, when the class body executes.
    check_date = _get_date_from_refresh_routing_config(trigger_date_config)

    def setUp(self):
        """Select the granularity used by the inherited check."""
        self.granularity = "weekly"

    @etl_skip()
    def test_routine_raw_accuracy_weekly(self):
        """Run the inherited row-level comparison (subject to ``etl_skip``)."""
        self.check_routine_raw_accuracy()


class TestUsageRoutineRawAccuracyMonthly(TestUsageRoutineRawAccuracy):
    """Accuracy check bound to the ``monthly`` granularity."""

    # (cron expression, integer) pair passed to _get_date_from_refresh_routing_config.
    # NOTE(review): the cron string carries a trailing space; kept as-is — confirm the parser tolerates it.
    trigger_date_config = ("0 12 6 * * ", 6)
    # Evaluated once, when the class body executes.
    check_date = _get_date_from_refresh_routing_config(trigger_date_config)

    def setUp(self):
        """Select the granularity used by the inherited check."""
        self.granularity = "monthly"

    @etl_skip()
    def test_routine_raw_accuracy_monthly(self):
        """Run the inherited row-level comparison (subject to ``etl_skip``)."""
        self.check_routine_raw_accuracy()


# `unittest` is not imported by this cell or any earlier one, so on a fresh kernel the
# lines below raised NameError; import it here to make the cell self-contained.
import unittest

# Build and run the suite. Only the daily completeness case is enabled; the other
# cases are kept as commented-out toggles.
suite = unittest.TestSuite()
suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestUsageRoutineRawCompletenessDaily))
# suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestUsageRoutineRawCompletenessWeekly))
# suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestUsageRoutineRawCompletenessMonthly))
# suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestUsageRoutineRawAccuracyDaily))
# suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestUsageRoutineRawAccuracyWeekly))
# suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestUsageRoutineRawAccuracyMonthly))
runner = unittest.TextTestRunner()
runner.run(suite)


In [0]:
%%sh
# Ad-hoc psql check: counts the monthly rows dated 2020-06-30 in usage.usage_basic_kpi_fact_v6.
# NOTE(review): the password is passed inline through PGPASSWORD and the host
# (10.2.6.141) differs from the one in CITUS_USAGE_HOSTS (10.2.10.254) — confirm both.
PGPASSWORD='wZw8cfBuuklIskVG' psql -h 10.2.6.141  -U citus_bdp_prod_app_int_qa -d aa_store_db -p 5432 << EOF 
set search_path=usage;
select count(1) from usage_basic_kpi_fact_v6 where granularity='monthly' and date='2020-06-30';
EOF

In [0]:

import random
import datetime
from applications.db_check_v1.common.db_check_utils import _get_date_from_refresh_routing_config, etl_skip
from applications.db_check_v1.common.constants import query
from applications.db_check_v1.common.db_check_utils import query_df
from applications.db_check_v1.common.base_test import PipelineTest
from applications.db_check_v1.common.utils import string_to_datetime, datetime_to_string
from pyspark.sql.types import DoubleType
from pyspark.sql import functions
from pyspark.sql.functions import lit, coalesce
from pyspark.sql.types import LongType
import os

# Connection settings for the Citus usage database: a list of (host, port) pairs,
# the database name and the login role.
CITUS_USAGE_HOSTS = [('10.2.10.254', 5432)]
CITUS_USAGE_NAME = 'aa_store_db'
CITUS_USAGE_ACCESS_ID = 'citus_bdp_prod_app_int_qa'
# SECURITY: a database password must not live in notebook source. The value can now
# be supplied through the CITUS_USAGE_SECRET_KEY environment variable; the literal
# is kept only as a backward-compatible fallback.
# TODO: rotate this credential and remove the fallback.
CITUS_USAGE_SECRET_KEY = os.environ.get('CITUS_USAGE_SECRET_KEY', 'wZw8cfBuuklIskVG')

# Raw `platform` value (int key) -> raw `device_type` value (str key) -> device code.
DEVICE_CODE_MAPPING = {
    1: {'1': 'android-phone', '2': 'android-tablet'},
    2: {'1': 'ios-phone', '2': 'ios-tablet'}}

# Granularity name used in the database -> `range_type` value used in the raw S3 path.
GRANULARITY_IN_RAW_PATH_MAPPING = {
    "daily": "DAY",
    "weekly": "WEEK",
    "monthly": "MONTH"
}

# Key/value DSN built from the first entry of CITUS_USAGE_HOSTS.
CITUS_DSN = (
    "dbname='{db}' user='{user}' password='{password}' "
    "host='{host}' port='{port}'".format(
        db=CITUS_USAGE_NAME,
        user=CITUS_USAGE_ACCESS_ID,
        host=CITUS_USAGE_HOSTS[0][0],
        password=CITUS_USAGE_SECRET_KEY,
        port=CITUS_USAGE_HOSTS[0][1]
    )
)


class UsageRoutineRawData(object):
    """
    Reader for the raw basic-KPI parquet data delivered by Data Foundation.
    """
    # Path template; `granularity` is the raw range type (e.g. DAY) and `date` the partition date.
    _raw_s3_path = 's3://aardvark-prod-pdx-mdm-to-int/basic_kpi/version=1.0.0/range_type={granularity}/date={date}'

    def __init__(self, spark):
        # Spark session used for every read issued by this object.
        self.spark = spark

    def get(self, granularity, date):
        """
        Load the raw parquet partition for one granularity and date.

        :param granularity: key of GRANULARITY_IN_RAW_PATH_MAPPING (e.g. "daily")
        :param date: value substituted into the ``date=`` path segment
        :return: Raw data from DF team
        :rtype: pyspark.sql.DataFrame
        """
        reader = self.spark.read
        range_type = GRANULARITY_IN_RAW_PATH_MAPPING[granularity]
        location = self._raw_s3_path.format(granularity=range_type, date=date)
        return reader.parquet(location)


class TestUsageRoutineRawCompleteness(PipelineTest):
    """
    Row-count comparison between the routine raw parquet data and the Citus fact table.

    This base class defines no ``test_*`` method. Subclasses set ``granularity`` in
    ``setUp``, provide ``check_date`` as a class attribute and call
    ``check_routine_raw_completeness`` from their own test method.
    """
    table_name = 'usage.usage_basic_kpi_fact_v6'
    db_name = 'usage'

    def setUp(self):
        # super(PipelineTest, self).setUp()
        self.check_date = None
        self.granularity = None

    def check_routine_raw_completeness(self):
        """
        Assert that the raw row count equals the Citus row count for each checked date.

        For the ``daily`` granularity the check date and the six days before it are
        verified; any other granularity verifies ``check_date`` only.
        NOTE(review): ``self.spark`` is not set in this class; presumably it is
        provided by PipelineTest — confirm.
        """
        date_list = [self.check_date]

        if self.granularity == 'daily':
            date = string_to_datetime(self.check_date)
            weekly_day_nums = 7
            date_list = [datetime_to_string(date - datetime.timedelta(days=x)) for x in range(weekly_day_nums)]

        for date in date_list:
            routine_df = UsageRoutineRawData(self.spark).get(self.granularity, date)
            routine_count = routine_df.count()

            citus_db_count = self.get_citus_db_count(date)
            # Pre-formatted single argument: prints the same text under the Python 2
            # print statement and the Python 3 print function.
            print("%s %s" % (routine_count, citus_db_count))

            self.assertEqual(routine_count, citus_db_count[0][0],
                             msg="found count mismatch when compare usage routine raw and citus db. "
                                 "granularity is {}, date is {}, raw count is:{}, citus db count is:{}".format(
                                    self.granularity, date, routine_count, citus_db_count[0][0]))

    def get_citus_db_count(self, date):
        """
        Run a ``count(1)`` query on ``table_name`` for ``date`` and the current granularity.

        :param date: date value placed into the ``where`` clause
        :return: whatever ``query`` returns; the caller reads the count at ``[0][0]``
        """
        sql = """select count(1) as cnt from {table_name} where date='{date}' and granularity='{granularity}';
        """.format(table_name=self.table_name, date=date, granularity=self.granularity)
        result = query(CITUS_DSN, sql)
        return result


class TestUsageRoutineRawCompletenessDaily(TestUsageRoutineRawCompleteness):
    """Completeness check bound to the ``daily`` granularity."""

    # (cron expression, integer) pair passed to _get_date_from_refresh_routing_config.
    # NOTE(review): the meaning of the integer is not visible here — presumably a day offset; confirm.
    trigger_date_config = ("0 12 * * 5", 6)
    # Evaluated once, when the class body executes.
    check_date = _get_date_from_refresh_routing_config(trigger_date_config)

    def setUp(self):
        """Select the granularity used by the inherited check."""
        self.granularity = "daily"

    # NOTE(review): unlike the weekly/monthly variants this test is not wrapped in @etl_skip().
    def test_routine_raw_completeness_daily(self):
        """Run the inherited row-count comparison."""
        self.check_routine_raw_completeness()


class TestUsageRoutineRawCompletenessWeekly(TestUsageRoutineRawCompleteness):
    """Completeness check bound to the ``weekly`` granularity."""

    # (cron expression, integer) pair passed to _get_date_from_refresh_routing_config.
    # NOTE(review): the meaning of the integer is not visible here — confirm.
    trigger_date_config = ("0 12 * * 5", 6)
    # Evaluated once, when the class body executes.
    check_date = _get_date_from_refresh_routing_config(trigger_date_config)

    def setUp(self):
        """Select the granularity used by the inherited check."""
        self.granularity = "weekly"

    @etl_skip()
    def test_routine_raw_completeness_weekly(self):
        """Run the inherited row-count comparison (subject to ``etl_skip``)."""
        self.check_routine_raw_completeness()


class TestUsageRoutineRawCompletenessMonthly(TestUsageRoutineRawCompleteness):
    """Completeness check bound to the ``monthly`` granularity."""

    # (cron expression, integer) pair passed to _get_date_from_refresh_routing_config.
    # NOTE(review): the cron string carries a trailing space; kept as-is — confirm the parser tolerates it.
    trigger_date_config = ("0 12 6 * * ", 6)
    # Evaluated once, when the class body executes.
    check_date = _get_date_from_refresh_routing_config(trigger_date_config)

    def setUp(self):
        """Select the granularity used by the inherited check."""
        self.granularity = "monthly"

    @etl_skip()
    def test_routine_raw_completeness_monthly(self):
        """Run the inherited row-count comparison (subject to ``etl_skip``)."""
        self.check_routine_raw_completeness()


# `unittest` is not imported by this cell, so running it on a fresh kernel raised
# NameError; import it here to make the cell self-contained.
import unittest

# Build and run the suite. Only the daily completeness case is enabled; the other
# lines are kept as commented-out toggles.
# NOTE(review): the TestUsageRoutineRawAccuracy* classes named below are not defined in this cell.
suite = unittest.TestSuite()
suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestUsageRoutineRawCompletenessDaily))
# suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestUsageRoutineRawCompletenessWeekly))
# suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestUsageRoutineRawCompletenessMonthly))
# suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestUsageRoutineRawAccuracyDaily))
# suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestUsageRoutineRawAccuracyWeekly))
# suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestUsageRoutineRawAccuracyMonthly))
runner = unittest.TextTestRunner()
runner.run(suite)

In [0]:

import datetime
import croniter
import random
from applications.db_check_v1.common.db_check_utils import _get_date_from_refresh_routing_config
from applications.db_check_v1.common.utils import string_to_datetime, datetime_to_string
def _get_pre_date_from_refresh_routing_config(config):
    """
    Return the previous firing time of the cron schedule in ``config[0]``.

    :param config: sequence whose first element is a cron expression
    :return: ``datetime.datetime`` of the schedule's previous occurrence before UTC now
    """
    cron_expression = config[0]
    # here use UTC now
    utc_now = datetime.datetime.utcnow()
    schedule_iter = croniter.croniter(cron_expression, utc_now)
    return schedule_iter.get_prev(datetime.datetime)
# Debug scratch: shows how the "skip" condition evaluates right now.
# Prints use a single pre-formatted argument so the output is the same under the
# Python 2 print statement and the Python 3 print function.
print(datetime.datetime.utcnow())
trigger_date_config = ("0 12 * * 5", 6)
pre_etl_date = _get_pre_date_from_refresh_routing_config(trigger_date_config)
check_date = _get_date_from_refresh_routing_config(trigger_date_config)
print("%s %s" % (pre_etl_date, check_date))
delta_days = 1
# True when the previous scheduled trigger lies more than `delta_days` days in the past.
skipped_condition = datetime.datetime.utcnow() - pre_etl_date > datetime.timedelta(days=delta_days)
print(skipped_condition)


In [0]:

import unittest
from applications.db_check_v1.common.db_check_utils import _get_date_from_refresh_routing_config
from applications.db_check_v1.common.constants import query, citus_settings
from pyspark.sql.types import DoubleType
from pyspark.sql.types import LongType
from applications.db_check_v1.common.db_check_utils import query_df, etl_skip
from pyspark.sql import functions
from pyspark.sql.functions import lit, coalesce
from applications.db_check_v1.common.table_common_info import urn


class TestUsageRoutineRawCompleteness(unittest.TestCase):
    """
    Row-count comparison between the routine raw parquet data and the Citus fact table.

    NOTE(review): the shared check is itself named ``test_*``, so the unittest loader
    also collects it in every subclass and the comparison runs twice per subclass
    (once directly, once through the subclass wrapper) — confirm whether intended.
    """
    table_name = 'usage.usage_basic_kpi_fact_v6'
    raw_path = 's3://aardvark-prod-pdx-mdm-to-int/basic_kpi/version=1.0.0/range_type={granularity}/date={date}'
    db_name = 'usage'

    def setUp(self):
        # granularity type is list, like ['DAY', 'daily']
        self.granularity = None
        self.check_date = None

    def test_routine_raw_completeness(self):
        """Assert that the raw parquet row count equals the Citus row count."""
        reader = spark.read
        raw_location = self.raw_path.format(granularity=self.granularity[0], date=self.check_date)
        routine_count = reader.parquet(raw_location).count()

        citus_rows = self.get_citus_db_count()
        citus_count = citus_rows[0][0]

        failure_message = (
            "fount count mismatch when compare usage routine raw and citus db. "
            "granularity is {}, date is {}, raw count is:{}, citus db count is:{}"
        ).format(self.granularity[1], self.check_date, routine_count, citus_count)
        self.assertEqual(routine_count, citus_count, msg=failure_message)

    def get_citus_db_count(self):
        """
        Run a ``count(1)`` query on ``table_name`` for ``check_date`` and the DB granularity name.

        :return: whatever ``query`` returns; the caller reads the count at ``[0][0]``
        """
        sql = """select count(1) as cnt from {table_name} where date='{date}' and granularity='{granularity}';
        """.format(table_name=self.table_name, date=self.check_date, granularity=self.granularity[1])
        return query(citus_settings(self.db_name), sql)


class TestUsageRoutineRawCompletenessDaily(TestUsageRoutineRawCompleteness):
    """Completeness check for the daily granularity, pinned to a fixed date."""

    # Only consumed by the disabled dynamic check_date line below.
    trigger_date_config = ("* * * * *", 2)
    # Hard-coded date; the dynamic lookup is kept as a commented-out alternative.
    check_date = '2020-06-14'
    # check_date = _get_date_from_refresh_routing_config(trigger_date_config)

    def setUp(self):
        """Set the [raw range type, DB granularity name] pair."""
        self.granularity = ['DAY', 'daily']

    def test_routine_raw_completeness_daily(self):
        """Delegate to the shared row-count comparison."""
        self.test_routine_raw_completeness()


class TestUsageRoutineRawCompletenessWeekly(TestUsageRoutineRawCompleteness):
    """Completeness check for the weekly granularity."""

    # (cron expression, integer) pair passed to _get_date_from_refresh_routing_config.
    # NOTE(review): the meaning of the integer is not visible here — confirm.
    trigger_date_config = ("* * * * *", 2)
    # Evaluated once, when the class body executes.
    check_date = _get_date_from_refresh_routing_config(trigger_date_config)

    def setUp(self):
        """Set the [raw range type, DB granularity name] pair."""
        self.granularity = ['WEEK', 'weekly']

    def test_routine_raw_completeness_weekly(self):
        """Delegate to the shared row-count comparison."""
        self.test_routine_raw_completeness()


class TestUsageRoutineRawCompletenessMonthly(TestUsageRoutineRawCompleteness):
    """Completeness check for the monthly granularity."""

    # (cron expression, integer) pair passed to _get_date_from_refresh_routing_config.
    # NOTE(review): the meaning of the integer is not visible here — confirm.
    trigger_date_config = ("* * * * *", 2)
    # Evaluated once, when the class body executes.
    check_date = _get_date_from_refresh_routing_config(trigger_date_config)

    def setUp(self):
        """Set the [raw range type, DB granularity name] pair."""
        self.granularity = ['MONTH', 'monthly']

    def test_routine_raw_completeness_monthly(self):
        """Delegate to the shared row-count comparison."""
        self.test_routine_raw_completeness()


class TestUsageRoutineRawAccuracy(unittest.TestCase):
    """
    Row-level comparison between the transformed routine raw data and the Citus fact table.

    NOTE(review): the shared check is itself named ``test_*``, so the unittest loader
    also collects it in every subclass and the comparison runs twice per subclass —
    confirm whether intended.
    """
    table_name = 'usage.usage_basic_kpi_fact_v6'
    raw_path = 's3://aardvark-prod-pdx-mdm-to-int/basic_kpi/version=1.0.0/range_type={granularity}/date={date}'
    db_name = 'usage'
    # Raw `platform` value (int key) -> raw `device_type` value (str key) -> device code.
    device_code_mapping = {
            1: {'1': 'android-phone', '2': 'android-tablet'},
            2: {'1': 'ios-phone', '2': 'ios-tablet'}}

    def setUp(self):
        self.check_date = None
        self.granularity = None  # granularity type is list, like ['DAY', 'daily']

    def test_routine_raw_accuracy(self):
        """
        Rebuild the fact rows from raw data and assert they match the Citus rows exactly.

        The raw frame is renamed/derived in three stages, joined with its own 'WW'
        rows to add the worldwide columns, null-filled with 0, and then compared with
        the Citus rows via ``subtract`` in both directions.
        """
        print(self.device_code_mapping)
        # Bind the mapping to a local so the UDF closure captures only the dict and
        # not the whole TestCase instance (`self`).
        device_code_mapping = self.device_code_mapping
        routine_df = spark.read.parquet(self.raw_path.format(
            granularity=self.granularity[0], date=self.check_date))
        # Stage 1: derive device_code, rename raw metric columns, add null placeholders.
        unified_v1 = (
            routine_df
            .withColumn('device_code', functions.UserDefinedFunction(
                lambda x, y: device_code_mapping[x][y])(routine_df['platform'], routine_df['device_type']))
            .withColumnRenamed('country', 'country_code')
            .withColumn('app_id', routine_df['app_id'].cast(LongType()))
            .withColumnRenamed('AU', 'est_average_active_users')
            .withColumnRenamed('AFU', 'est_average_session_per_user')
            .withColumnRenamed('ADU', 'est_average_session_duration')
            .withColumnRenamed('IP', 'est_install_penetration')
            .withColumnRenamed('AAD', 'est_average_active_days')
            .withColumnRenamed('PAD', 'est_percentage_active_days')
            .withColumnRenamed('MBPU', 'est_average_bytes_per_user')
            .withColumnRenamed('ATU', 'est_average_time_per_user')
            .withColumnRenamed('UP', 'est_usage_penetration')
            .withColumnRenamed('OR', 'est_open_rate')
            .withColumnRenamed('MBPS', 'est_average_bytes_per_session')
            .withColumnRenamed('MBWFT', 'est_percent_of_wifi_total')
            .withColumnRenamed('MBS', 'est_mb_per_second')
            .withColumnRenamed('IS', 'est_installs')
            .withColumnRenamed('SOU', 'est_average_active_users_country_share')
            .withColumnRenamed('SOI', 'est_installs_country_share')
            .withColumn('est_share_of_category_time', lit(None).cast(DoubleType()))
            .withColumn('est_share_of_category_session', lit(None).cast(DoubleType()))
            .withColumn('est_share_of_category_bytes', lit(None).cast(DoubleType()))
            .withColumn('est_panel_size', lit(None).cast(DoubleType()))
            .drop('device_type')
            .drop('platform')
        )

        # Stage 2: drop unused columns, derive est_total_time and scale the
        # time/duration columns by 1000. Column expressions come from the stage-1
        # frame, so statement order matters.
        unified_v1 = (
            unified_v1
            .drop("est_mb_per_second", "est_share_of_device_time", "est_share_of_device_session",
                  "est_share_of_device_mb", "est_panel_size")
            .withColumn('est_total_time',
                        unified_v1['est_average_active_users'] * unified_v1['est_average_time_per_user'] / 60)
            .withColumn('est_average_time_per_user', coalesce(unified_v1['est_average_time_per_user'],
                                                              unified_v1['est_average_session_duration'] *
                                                              unified_v1['est_average_session_per_user']) * 1000)
            .withColumn('est_average_session_duration', unified_v1['est_average_session_duration'] * 1000)
        )
        # Stage 3: scale byte columns by 1024 * 1024 and derive install base,
        # population and total sessions.
        unified_v1 = (
            unified_v1
            .withColumn('est_average_bytes_per_session', unified_v1['est_average_bytes_per_session'] * 1024 * 1024)
            .withColumn('est_average_bytes_per_user', unified_v1['est_average_bytes_per_user'] * 1024 * 1024)
            .withColumn('est_share_of_category_bytes', unified_v1['est_share_of_category_bytes'] * 1024 * 1024)
            .withColumn('est_install_base',
                        unified_v1['est_install_penetration'] *
                        unified_v1['est_average_active_users'] / unified_v1['est_usage_penetration'])
            .withColumn('est_population',
                        unified_v1['est_average_active_users'] / unified_v1['est_usage_penetration'])
            .withColumnRenamed('est_average_active_users_country_share', 'est_share_of_users')
            .withColumnRenamed('est_installs_country_share', 'est_share_of_installs')
            .withColumn('est_total_sessions',
                        unified_v1['est_average_active_users'] * unified_v1['est_average_session_per_user'])
        )
        # Self-join on (app_id, device_code) against the country_code = 'WW' rows to
        # attach the worldwide figures to every row.
        unified_v1.createOrReplaceTempView("v1_df")
        unified_v3 = spark.sql("""
                    select
                        v1_df.*,
                        ww.est_average_active_users as est_average_active_users_worldwide,
                        ww.est_installs as est_installs_worldwide
                    from v1_df left join
                        (select device_code, app_id, est_installs, est_average_active_users
                         from v1_df where country_code ='WW'
                         ) AS ww
                    on (v1_df.app_id=ww.app_id) and (v1_df.device_code = ww.device_code)
                    """)
        unified_v3 = unified_v3.na.fill(0)

        # Load the Citus rows and drop the columns with no counterpart in the rebuilt frame.
        result = self.get_citus_db_df()
        citus_db_df = spark.createDataFrame(result)
        citus_db_df = citus_db_df.drop('date').drop('granularity').drop('_disable_idx_4_query')

        # Compare in both directions; each side is projected onto the other's column order first.
        diff_count1 = citus_db_df.select(unified_v3.columns).subtract(unified_v3).count()
        diff_count2 = unified_v3.select(citus_db_df.columns).subtract(citus_db_df).count()
        print("%s %s" % (diff_count1, diff_count2))
        # The message previously hard-coded "daily" for every granularity; report the real one.
        self.assertTrue(diff_count1 == 0 and diff_count2 == 0,
                        msg="found mismatch when compare usage routine raw and citus db.granularity is {}, "
                            "date is {}, diff_count1 is:{}, diff_count2 is:{}".format(
                                self.granularity[1], self.check_date, diff_count1, diff_count2))

    def get_citus_db_df(self):
        """
        Fetch all Citus rows of ``table_name`` for ``check_date`` and the DB granularity name.

        :return: whatever ``query_df`` returns; the caller passes it to ``spark.createDataFrame``
        """
        sql = """select * from {table_name} where date='{date}' and granularity='{granularity}';
        """.format(table_name=self.table_name, date=self.check_date, granularity=self.granularity[1])
        result = query_df(citus_settings(self.db_name), sql)
        return result


class TestUsageRoutineRawAccuracyDaily(TestUsageRoutineRawAccuracy):
    """Accuracy check for the daily granularity, pinned to a fixed date."""

    # Only consumed by the disabled dynamic check_date line below.
    trigger_date_config = ("* * * * *", 2)
    # Hard-coded date; the dynamic lookup is kept as a commented-out alternative.
    check_date = '2020-06-14'
    # check_date = _get_date_from_refresh_routing_config(trigger_date_config)

    def setUp(self):
        """Set the [raw range type, DB granularity name] pair."""
        self.granularity = ['DAY', 'daily']

    def test_routine_raw_accuracy_daily(self):
        """Delegate to the shared row-level comparison."""
        self.test_routine_raw_accuracy()


class TestUsageRoutineRawAccuracyWeekly(TestUsageRoutineRawAccuracy):
    """Accuracy check for the weekly granularity."""

    # (cron expression, integer) pair passed to _get_date_from_refresh_routing_config.
    # NOTE(review): the meaning of the integer is not visible here — confirm.
    trigger_date_config = ("* * * * *", 2)
    # Evaluated once, when the class body executes.
    check_date = _get_date_from_refresh_routing_config(trigger_date_config)

    def setUp(self):
        """Set the [raw range type, DB granularity name] pair."""
        self.granularity = ['WEEK', 'weekly']

    def test_routine_raw_accuracy_weekly(self):
        """Delegate to the shared row-level comparison."""
        self.test_routine_raw_accuracy()


class TestUsageRoutineRawAccuracyMonthly(TestUsageRoutineRawAccuracy):
    """Accuracy check for the monthly granularity."""

    # (cron expression, integer) pair passed to _get_date_from_refresh_routing_config.
    # NOTE(review): the meaning of the integer is not visible here — confirm.
    trigger_date_config = ("* * * * *", 2)
    # Evaluated once, when the class body executes.
    check_date = _get_date_from_refresh_routing_config(trigger_date_config)

    def setUp(self):
        """Set the [raw range type, DB granularity name] pair."""
        self.granularity = ['MONTH', 'monthly']

    def test_routine_raw_accuracy_monthly(self):
        """Delegate to the shared row-level comparison."""
        self.test_routine_raw_accuracy()

# Run only the daily accuracy case with the plain-text runner.
runner = unittest.TextTestRunner()
suite = unittest.TestSuite()
suite.addTests(
    unittest.TestLoader().loadTestsFromTestCase(TestUsageRoutineRawAccuracyDaily)
)
runner.run(suite)


In [0]:

import unittest
from applications.db_check_v1.common.db_check_utils import _get_date_from_refresh_routing_config
from applications.db_check_v1.common.constants import query, citus_settings
from conf.settings import PG_USAGE_HOSTS, PG_USAGE_NAME, PG_USAGE_ACCESS_ID, PG_USAGE_SECRET_KEY
from applications.db_check_v1.common.table_common_info import urn


# Key/value DSN for the plproxy usage database, built from the first
# (host, port) entry of PG_USAGE_HOSTS and the PG_USAGE_* settings.
plproxy_dsn = "dbname='{db}' user='{user}' password='{password}' host='{host}' port='{port}'".format(
    db=PG_USAGE_NAME,
    user=PG_USAGE_ACCESS_ID,
    password=PG_USAGE_SECRET_KEY,
    host=PG_USAGE_HOSTS[0][0],
    port=PG_USAGE_HOSTS[0][1],
)


class TestUsageRoutinePlproxyCompleteness(unittest.TestCase):
    """
    Row-count comparison between the plproxy ``mu.app_<granularity>`` table and the Citus fact table.

    NOTE(review): the shared check is itself named ``test_*``, so the unittest loader
    also collects it in every subclass and the comparison runs twice per subclass —
    confirm whether intended.
    """
    citus_table_name = 'usage.usage_basic_kpi_fact_v6'
    db_name = 'usage'
    # `granularity` is filled with the DB granularity name (e.g. "daily").
    plproxy_table_name = 'mu.app_{granularity}'

    def setUp(self):
        self.check_date = None
        self.granularity = None  # granularity type is list, like ['DAY', 'daily']

    def test_routine_plproxy_completeness(self):
        """Assert that the plproxy group count equals the Citus row count."""
        plproxy_db_count = self.get_plproxy_db_count()
        citus_db_count = self.get_citus_db_count()

        # Report the scalar plproxy count (previously the whole result list was formatted).
        self.assertEqual(plproxy_db_count[0][0], citus_db_count[0][0],
                         msg="found count mismatch when compare usage routine plproxy and citus db. "
                             "granularity is {}, date is {}, plproxy count is:{}, citus db count is:{}".format(
                             self.granularity[1], self.check_date, plproxy_db_count[0][0], citus_db_count[0][0]))

    def get_citus_db_count(self):
        """
        Run a ``count(1)`` query on the Citus table for ``check_date`` and the DB granularity name.

        :return: whatever ``query`` returns; the caller reads the count at ``[0][0]``
        """
        sql = """select count(1) as cnt from {table_name} where date='{date}' and granularity='{granularity}';
        """.format(table_name=self.citus_table_name, date=self.check_date, granularity=self.granularity[1])
        result = query(citus_settings(self.db_name), sql)
        return result

    def get_plproxy_db_count(self):
        """
        Count the rows returned by the proxied query, which yields one row per
        (app_id, store_id, device_id) group for ``check_date``.

        :return: whatever ``query`` returns; the caller reads the count at ``[0][0]``
        """
        sql = """select count(uniqlo_id) from plproxy.execute_select_nestloop($proxy$ 
                    select max(app_id) as uniqlo_id
                from {table_name}
                where 
                date='{date}'
                group by
                app_id,
                store_id,
                device_id
            $proxy$) t (uniqlo_id BIGINT);""".format(table_name=self.plproxy_table_name.format(
            granularity=self.granularity[1]), date=self.check_date)
        result = query(plproxy_dsn, sql)
        return result


class TestUsageRoutinePlproxyCompletenessDaily(TestUsageRoutinePlproxyCompleteness):
    """Daily variant of the plproxy/citus completeness check."""

    # Evaluated once, when the class body runs.
    # NOTE(review): presumably (cron schedule, days delta) - confirm against
    # _get_date_from_refresh_routing_config.
    trigger_date_config = ("* * * * *", 2)
    check_date = _get_date_from_refresh_routing_config(trigger_date_config)

    def setUp(self):
        """Select the daily granularity.

        The base setUp is not called; it would set ``self.check_date`` to
        None and hide the class-level value.
        """
        raw_name, table_suffix = 'DAY', 'daily'
        self.granularity = [raw_name, table_suffix]

    def test_routine_plproxy_completeness_daily(self):
        """Run the shared completeness check with the daily settings."""
        shared_check = self.test_routine_plproxy_completeness
        shared_check()


class TestUsageRoutinePlproxyCompletenessWeekly(TestUsageRoutinePlproxyCompleteness):
    """Weekly variant of the plproxy/citus completeness check."""

    # Evaluated once, when the class body runs.
    # NOTE(review): presumably (cron schedule, days delta) - confirm against
    # _get_date_from_refresh_routing_config.
    trigger_date_config = ("* * * * *", 2)
    check_date = _get_date_from_refresh_routing_config(trigger_date_config)

    def setUp(self):
        """Select the weekly granularity.

        The base setUp is not called; it would set ``self.check_date`` to
        None and hide the class-level value.
        """
        raw_name, table_suffix = 'WEEK', 'weekly'
        self.granularity = [raw_name, table_suffix]

    def test_routine_plproxy_completeness_weekly(self):
        """Run the shared completeness check with the weekly settings."""
        shared_check = self.test_routine_plproxy_completeness
        shared_check()


class TestUsageRoutinePlproxyCompletenessMonthly(TestUsageRoutinePlproxyCompleteness):
    """Monthly variant of the plproxy/citus completeness check."""

    # Evaluated once, when the class body runs.
    # NOTE(review): presumably (cron schedule, days delta) - confirm against
    # _get_date_from_refresh_routing_config.
    trigger_date_config = ("* * * * *", 2)
    check_date = _get_date_from_refresh_routing_config(trigger_date_config)

    def setUp(self):
        """Select the monthly granularity.

        The base setUp is not called; it would set ``self.check_date`` to
        None and hide the class-level value.
        """
        raw_name, table_suffix = 'MONTH', 'monthly'
        self.granularity = [raw_name, table_suffix]

    def test_routine_plproxy_completeness_monthly(self):
        """Run the shared completeness check with the monthly settings."""
        shared_check = self.test_routine_plproxy_completeness
        shared_check()
        

# Run the completeness check with a plain text runner. Only the daily test
# case is added here; the weekly and monthly classes are defined above but
# are not part of this suite.
suite = unittest.TestSuite()
suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestUsageRoutinePlproxyCompletenessDaily))
runner = unittest.TextTestRunner()
runner.run(suite)

In [0]:
%%sh
# List what is stored under the daily app-est-dna-log fact prefix.
aws s3 ls s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-dna-log.v1/fact/granularity=daily/

In [0]:
%%sh
# List what is stored under the weekly app-est-dna-log pre-aggregation fact prefix.
aws s3 ls s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-dna-log-pre-aggr.v1/fact/granularity=weekly/

In [0]:

import datetime
from dateutil.relativedelta import relativedelta


def get_date_list():
    """Collect one Row per month between 2020-01-31 and 2020-04-30.

    Each iteration passes the current date through ``last_day_of_month``,
    wraps its ``%Y-%m-%d`` string in a ``Row`` and then advances by one month.
    ``last_day_of_month`` and ``Row`` are not defined in this cell.

    :return: list of Row objects, each holding one date string
    """
    cursor = datetime.date(2020, 1, 31)
    final_day = datetime.date(2020, 4, 30)
    rows = []
    while cursor <= final_day:
        cursor = last_day_of_month(cursor)
        rows.append(Row(datetime.datetime.strftime(cursor, '%Y-%m-%d')))
        cursor += relativedelta(months=1)
    return rows
    
    
# Preview two daily partitions: the dates are joined into a "date={a,b}" glob,
# and basePath is set to the daily prefix for the read.
date_range = ['2020-06-18', '2020-06-19']
spark.read.option("basePath", "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-dna-log.v1/fact/granularity=daily/").parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-dna-log.v1/fact/granularity=daily/date={%s}/" % ",".join(date_range)).show()

In [0]:

from pyspark.sql import types as T
from pyspark.sql import functions as F
from aadatapipelinecore.core.urn import Urn
from aadatapipelinecore.core.pipeline import type_
from applications.common.parser import SqlParser
from applications.common.executor import SqlExecutor
from applications.auto_pipeline.transform import _view
# Register the bundled dependencies with the Spark context before importing aaplproxy.
# NOTE(review): presumably aaplproxy is shipped inside dependencies.zip - confirm.
spark.sparkContext.addPyFile("/home/hadoop/bdp/application/libs/python/dependencies.zip")
import aaplproxy


def run(spark, raw_data, sql_text, dry_run=True):
    """Parse ``sql_text`` and execute the resulting tasks as a TRANSFORM event.

    ``raw_data`` is modified in place: its "source" and "options" entries are
    removed and the contents of "options" are merged into the top level. The
    same dict is then used as the context for parsing and execution.

    :param spark: Spark session handed to the view helper, parser and executor
    :param raw_data: dict with at least "namespace", "source" and "options"
    :param sql_text: SQL text to parse
    :param dry_run: use DryRunSqlExecutor when true, SqlExecutor otherwise
    """
    urn = Urn(namespace=raw_data["namespace"])
    sources = raw_data.pop("source")
    options = raw_data.pop("options")
    raw_data.update(options)
    _view(spark, sql_text, None, sources)
    parsed_tasks = SqlParser(spark, sql_text, raw_data).parse()
    executor_cls = DryRunSqlExecutor if dry_run else SqlExecutor
    executor_cls(urn, spark, parsed_tasks, type_.EventType.TRANSFORM, raw_data).run()
    
    
class DryRunSqlExecutor(SqlExecutor):
    """SqlExecutor variant whose ``_verify_tasks`` step does nothing."""

    def _verify_tasks(self):
        """Replace the inherited ``_verify_tasks`` with a no-op."""
        return None


# The seven daily partitions (2020-06-14 .. 2020-06-20) that are summed below.
date_range = ['2020-06-14', '2020-06-15', '2020-06-16', '2020-06-17', '2020-06-18', '2020-06-19', '2020-06-20']

# Load those daily partitions, cache them and expose them to SQL as "daily_df".
spark.read.option("basePath", "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-dna-log.v1/fact/granularity=daily/").parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-dna-log.v1/fact/granularity=daily/date={%s}/" % ",".join(date_range)).cache().createOrReplaceTempView("daily_df")

# Statement handed to run(): sums the est_* metrics of daily_df per
# (app, device, country, publisher, company, parent company) key.
# NOTE(review): the spark.sql queries further down read "daily_agg_df", so
# presumably run() registers the CTE as a view of that name - confirm.
sql_text ="""
WITH daily_agg_df AS (
    SELECT app_id, device_code, country_code, publisher_id, company_id, parent_company_id,
    sum(est_free_app_download) as est_free_app_download,
    sum(est_paid_app_download) as est_paid_app_download,
    sum(est_revenue) as est_revenue,
    sum(est_organic_download) as est_organic_download,
    sum(est_paid_download) as est_paid_download
    from daily_df
    group by
    app_id, device_code, country_code, publisher_id, company_id, parent_company_id
);
"""


test_date='2020-06-20'
# NOTE(review): `namespace` is not referenced again in this cell; ingest_msg
# repeats the same literal.
namespace = "aa.store.market-size.v1"
# Message passed to run(); its single source, named "unified_df", points at
# the weekly pre-aggregated partition for test_date.
ingest_msg = {
    "namespace": "aa.store.market-size.v1",
    "job_type": "routine",
    "options":{},
    "source": [
        {
            "name":"unified_df",
            "data_encoding": "parquet",
            "compression": "gzip",
            "path": ["s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-dna-log-pre-aggr.v1/fact/granularity=weekly/date={}".format(test_date)],
        }    
    ]
}
# dry_run keeps its default (True), so DryRunSqlExecutor is used.
run(spark, ingest_msg, sql_text)
# Rows of unified_df without an exact counterpart in daily_agg_df.
diff_df1 = spark.sql("""select app_id, country_code, device_code, publisher_id, company_id, parent_company_id, est_free_app_download, est_paid_app_download,                 est_revenue, est_paid_download, est_organic_download
                                from unified_df 
                            except all 
                            select app_id, country_code, device_code, publisher_id, company_id, parent_company_id, est_free_app_download, est_paid_app_download, est_revenue, est_paid_download, est_organic_download
                                from daily_agg_df""")
# The opposite direction: rows of daily_agg_df without a counterpart in unified_df.
diff_df2 = spark.sql("""select app_id, country_code, device_code, publisher_id, company_id, parent_company_id, est_free_app_download, est_paid_app_download,                 est_revenue, est_paid_download, est_organic_download
                                from daily_agg_df 
                            except all 
                            select app_id, country_code, device_code, publisher_id, company_id, parent_company_id, est_free_app_download, est_paid_app_download, est_revenue, est_paid_download, est_organic_download
                                from unified_df""")
diff_df1.show()
diff_df2.show()


In [0]:

from pyspark.sql import types as T
from pyspark.sql import functions as F
from aadatapipelinecore.core.urn import Urn
from aadatapipelinecore.core.pipeline import type_
from applications.common.parser import SqlParser
from applications.common.executor import SqlExecutor
from applications.auto_pipeline.transform import _view
# Register the bundled dependencies with the Spark context before importing aaplproxy.
# NOTE(review): presumably aaplproxy is shipped inside dependencies.zip - confirm.
spark.sparkContext.addPyFile("/home/hadoop/bdp/application/libs/python/dependencies.zip")
import aaplproxy


def run(spark, raw_data, sql_text, dry_run=True):
    """Parse ``sql_text`` and execute the resulting tasks as a TRANSFORM event.

    ``raw_data`` is modified in place: its "source" and "options" entries are
    removed and the contents of "options" are merged into the top level. The
    same dict is then used as the context for parsing and execution.

    :param spark: Spark session handed to the view helper, parser and executor
    :param raw_data: dict with at least "namespace", "source" and "options"
    :param sql_text: SQL text to parse
    :param dry_run: use DryRunSqlExecutor when true, SqlExecutor otherwise
    """
    urn = Urn(namespace=raw_data["namespace"])
    sources = raw_data.pop("source")
    options = raw_data.pop("options")
    raw_data.update(options)
    _view(spark, sql_text, None, sources)
    parsed_tasks = SqlParser(spark, sql_text, raw_data).parse()
    executor_cls = DryRunSqlExecutor if dry_run else SqlExecutor
    executor_cls(urn, spark, parsed_tasks, type_.EventType.TRANSFORM, raw_data).run()
    
    
class DryRunSqlExecutor(SqlExecutor):
    """SqlExecutor variant whose ``_verify_tasks`` step does nothing."""

    def _verify_tasks(self):
        """Replace the inherited ``_verify_tasks`` with a no-op."""
        return None


# NOTE(review): date_range is not used below; the date filters use the
# literals '2020-06-14' and '2020-06-20' directly.
date_range = ['2020-06-14', '2020-06-15', '2020-06-16', '2020-06-17', '2020-06-18', '2020-06-19', '2020-06-20']
granularity = 'weekly'
daily_category_path = "s3://b2c-prod-data-pipeline-unified-store-paid/unified/" \
                     "store.app-est-category-load.v3/fact/"
agg_category_path = "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-pre-aggr.v1/fact/"
# Daily rows for 2020-06-14 .. 2020-06-20, exposed to SQL as "daily_df".
spark.read.format("delta").load(daily_category_path).where(
        "granularity='daily' and date between '{}' and '{}'".format(
            '2020-06-14', '2020-06-20')).createOrReplaceTempView("daily_df")
# Pre-aggregated rows of the chosen granularity dated 2020-06-20, exposed as "unified_df".
spark.read.format("delta").load(agg_category_path).where(
    "granularity='{}' and date between '{}' and '{}'".format(
        granularity, '2020-06-20', '2020-06-20')).createOrReplaceTempView("unified_df")
# Sum the daily est_* metrics per (app, category, device, country).
sql_text = """
    SELECT app_id, category_id, device_code, country_code,
    sum(est_free_app_download) as est_free_app_download,
    sum(est_paid_app_download) as est_paid_app_download,
    sum(est_revenue) as est_revenue,
    sum(est_organic_download) as est_organic_download,
    sum(est_paid_download) as est_paid_download
    from daily_df
    group by
    app_id, device_code, country_code, category_id
"""

spark.sql(sql_text).createOrReplaceTempView("daily_agg_df")
# Rows of daily_agg_df without an exact counterpart in unified_df.
# NOTE(review): est_organic_download and est_paid_download are aggregated
# above but are not part of either comparison below.
diff_df1 = spark.sql("""select app_id, country_code, device_code, category_id,
                            est_free_app_download, est_paid_app_download,
                            est_revenue
                            from daily_agg_df
                            except all
                        select app_id, country_code, device_code, category_id,
                            est_free_app_download, est_paid_app_download,
                            est_revenue
                            from unified_df""")
# The opposite direction: rows of unified_df without a counterpart in daily_agg_df.
diff_df2 = spark.sql("""select app_id, country_code, device_code, category_id,
                            est_free_app_download, est_paid_app_download,
                            est_revenue
                            from unified_df
                            except all
                        select app_id, country_code, device_code, category_id,
                            est_free_app_download, est_paid_app_download,
                            est_revenue
                            from daily_agg_df""")
diff_df1.show()
diff_df2.show()

In [0]:

# Load the seven daily partitions again and look at one specific
# (app, country, device, publisher) combination.
date_range = ['2020-06-14', '2020-06-15', '2020-06-16', '2020-06-17', '2020-06-18', '2020-06-19', '2020-06-20']
spark.read.option("basePath", "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-dna-log.v1/fact/granularity=daily/").parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-dna-log.v1/fact/granularity=daily/date={%s}/" % ",".join(date_range)).cache().createOrReplaceTempView("daily_df")
# Raw daily rows for that combination.
spark.sql("select * from daily_df where app_id=443948765 and country_code='HR' and device_code='ios-phone' and publisher_id=390200178").show()
# The same combination after summing the est_* metrics over the loaded days.
spark.sql("""SELECT app_id, device_code, country_code, publisher_id, company_id, parent_company_id,
    sum(est_free_app_download) as est_free_app_download,
    sum(est_paid_app_download) as est_paid_app_download,
    sum(est_revenue) as est_revenue,
    sum(est_organic_download) as est_organic_download,
    sum(est_paid_download) as est_paid_download
    from daily_df
    group by
    app_id, device_code, country_code, publisher_id, company_id, parent_company_id""").filter("app_id=443948765 and country_code='HR' and device_code='ios-phone' and publisher_id=390200178").show()

In [0]:

# Preview the daily category rows for a single day; test_date holds the
# inclusive [start, end] bounds of the date filter.
daily_category_path = "s3://b2c-prod-data-pipeline-unified-store-paid/unified/" \
                     "store.app-est-category-load.v3/fact/"
test_date = ['2020-07-06', '2020-07-06']
daily_df = spark.read.format("delta").load(daily_category_path).where("granularity='daily' and date between '{}' and '{}'".format(test_date[0], test_date[1]))
daily_df.show()

In [0]:
%%sh
# List what is stored under the weekly category pre-aggregation prefix for 2020-06-20.
aws s3 ls s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-pre-aggr.v1/fact/granularity=weekly/date=2020-06-20/

In [0]:

from bdce.common.utils import update_application_code
# NOTE(review): presumably this syncs the code of the named application into
# /tmp/zeppelin_application_code, which is loaded below - confirm.
update_application_code(spark, role="BDP-PROD-APP-INT-QA", application_name="qa-auto-pipeline-fiona-1")

# reload dependencies from temp
spark.sparkContext.addPyFile("/tmp/zeppelin_application_code/libs/python/dependencies.zip")
# spark.sparkContext.addPyFile("/home/hadoop/bdp/application/libs/python/dependencies.zip")
import aaplproxy
# The following cell defines its own PipelineTest class, which replaces this
# imported name once that cell has run.
from applications.auto_pipeline.temp_script.utils.base_test import PipelineTest

In [0]:

import aaplproxy
import datetime
import unittest
from dateutil.relativedelta import relativedelta
import croniter

from aadatapipelinecore.core.utils.commandline import env
from aadatapipelinecore.core.utils.encode import activate_system_utf8
from aadatapipelinecore.core.utils.spark import create_spark


class PipelineTest(unittest.TestCase):
    """Base test case that derives its check date from a cron-style config.

    Subclasses must set ``trigger_date_config`` to ``(cron_schedule,
    days_delta)``. The tuple is unpacked while the test case is being
    constructed, so leaving it as None fails in ``__init__``.
    """
    trigger_date_config = None
    trigger_datetime = None
    prev_etl_datetime = None
    # Not read anywhere inside this class.
    only_check_in_24hr = False

    def __init__(self, methodName='runTest', trigger_datetime=None):
        """Resolve the trigger time, the check date and the previous ETL time.

        :param methodName: name of the test method to run
        :param trigger_datetime: moment the test is considered triggered;
            the current UTC time is used when it is falsy
        """
        super(PipelineTest, self).__init__(methodName)
        if not trigger_datetime:
            trigger_datetime = datetime.datetime.utcnow()
        self.trigger_datetime = trigger_datetime
        check_moment = self._get_check_date_from_routing_config(self.trigger_datetime)
        self.check_date_str = check_moment.strftime("%Y-%m-%d")
        # Same value under a second name, for compatibility with send email.
        self.check_date = self.check_date_str
        self.prev_etl_datetime = self._get_pre_etl_completed_date()

    def setUp(self):
        """Verify that the configuration is complete before each test."""
        super(PipelineTest, self).setUp()
        self._verify_config()

    @classmethod
    def setUpClass(cls):
        """Prepare UTF-8 handling and create the Spark session shared by the class."""
        super(PipelineTest, cls).setUpClass()
        activate_system_utf8()
        env(PYTHONIOENCODING='utf8')
        session = create_spark()
        cls.spark = session
        cls.sc = session.sparkContext

    def _verify_config(self):
        """Assert that every attribute the tests rely on has been set."""
        required = (
            'trigger_date_config',
            'trigger_datetime',
            'prev_etl_datetime',
            'check_date_str',
            'check_date',
        )
        for attribute in required:
            self.assertIsNotNone(getattr(self, attribute))

    def _get_check_date_from_routing_config(self, trigger_datetime):
        """Return the previous scheduled run time minus ``days_delta`` days.

        The previous run is the latest time matching the cron schedule of
        ``trigger_date_config`` before ``trigger_datetime``.

        Examples:
            config = ("0 9 * * *", 1), trigger is 2019-10-27 8:00
            -> previous scheduled run is 2019-10-26 9:00
            -> the result falls on 2019-10-25

            config = ("0 7 * * *", 1), trigger is 2019-10-27 8:00
            -> previous scheduled run is 2019-10-27 7:00
            -> the result falls on 2019-10-26

        Cron fields, in order:
            1. minute            0 to 59, or * (no specific value)
            2. hour              0 to 23, or * for any value. All times UTC.
            3. day of the month  1 to 31, or * (no specific value)
            4. month             1 to 12, or * (no specific value)
            5. day of the week   0 to 7 (0 and 7 both represent Sunday), or *

        :param trigger_datetime: the test trigger date
        :return: the value croniter yields for the previous run, shifted back
            by ``days_delta`` days; the caller formats it as "%Y-%m-%d"
        """
        schedule, days_delta = self.trigger_date_config
        previous_run = croniter.croniter(schedule, trigger_datetime).get_prev(datetime.datetime)
        return previous_run - datetime.timedelta(days=days_delta)

    def _get_pre_etl_completed_date(self):
        """Return the previous scheduled run time before ``self.trigger_datetime``."""
        schedule, _unused_delta = self.trigger_date_config
        schedule_iter = croniter.croniter(schedule, self.trigger_datetime)
        return schedule_iter.get_prev(datetime.datetime)

def string_to_datetime(date_str, convert_format=None):
    """Parse ``date_str`` into a datetime.

    :param date_str: text to parse
    :param convert_format: strptime pattern; "%Y-%m-%d" is used when falsy
    :return: the parsed datetime.datetime
    """
    pattern = convert_format or "%Y-%m-%d"
    return datetime.datetime.strptime(date_str, pattern)


def datetime_to_string(date_time, convert_format="%Y-%m-%d"):
    """Format ``date_time`` with the given strftime pattern.

    :param date_time: value to format
    :param convert_format: strftime pattern, "%Y-%m-%d" by default
    :return: the formatted string
    """
    rendered = datetime.datetime.strftime(date_time, convert_format)
    return rendered


def get_date_list(granularity, check_date):
    """Return the inclusive [start, end] date strings of the period ending on check_date.

    The start date depends on the granularity:
      * weekly    - six days before check_date
      * monthly   - first day of check_date's month
      * quarterly - first day of the month two months before check_date
      * yearly    - first day of the month eleven months before check_date

    :param granularity: one of 'weekly', 'monthly', 'quarterly', 'yearly'
    :param check_date: period end date as a "%Y-%m-%d" string
    :return: two-item list of "%Y-%m-%d" strings, [start, end]
    :raises ValueError: if granularity is not one of the supported values
    """
    end = string_to_datetime(check_date)
    if granularity == 'weekly':
        start = end - relativedelta(days=6)
    elif granularity == 'monthly':
        start = end.replace(day=1)
    elif granularity == 'quarterly':
        start = (end - relativedelta(months=2)).replace(day=1)
    elif granularity == 'yearly':
        start = (end - relativedelta(months=11)).replace(day=1)
    else:
        # Fail with a clear message instead of reaching the return statement
        # with no range computed.
        raise ValueError("unsupported granularity: {!r}".format(granularity))
    return [datetime_to_string(start), datetime_to_string(end)]


def test_store_category_pre_aggr(spark, test_date, granularity):
    """Build the two-way difference between re-aggregated daily data and the pre-aggregation.

    Registers three temp views on ``spark``: "daily_df" (daily rows in the
    date range), "unified_df" (pre-aggregated rows of ``granularity`` in the
    same range) and "daily_agg_df" (daily rows summed per app, category,
    device and country).

    :param spark: Spark session
    :param test_date: [start, end] date strings, both inclusive
    :param granularity: granularity value used to filter the pre-aggregated data
    :return: (rows only in daily_agg_df, rows only in unified_df)
    """
    daily_category_path = "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-load.v3/fact/"
    agg_category_path = "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-pre-aggr.v1/fact/"

    daily_filter = "granularity='daily' and date between '{}' and '{}'".format(test_date[0], test_date[1])
    daily_rows = spark.read.format("delta").load(daily_category_path).where(daily_filter)
    daily_rows.cache().createOrReplaceTempView("daily_df")

    agg_filter = "granularity='{}' and date between '{}' and '{}'".format(
        granularity, test_date[0], test_date[1])
    agg_rows = spark.read.format("delta").load(agg_category_path).where(agg_filter)
    agg_rows.cache().createOrReplaceTempView("unified_df")

    sql_text = """
        SELECT app_id, category_id, device_code, country_code,
        sum(est_free_app_download) as est_free_app_download,
        sum(est_paid_app_download) as est_paid_app_download,
        sum(est_revenue) as est_revenue,
        sum(est_organic_download) as est_organic_download,
        sum(est_paid_download) as est_paid_download
        from daily_df
        group by
        app_id, device_code, country_code, category_id
    """
    daily_agg = spark.sql(sql_text)
    daily_agg.createOrReplaceTempView("daily_agg_df")

    # Re-aggregated daily rows that are missing from the pre-aggregation.
    daily_minus_agg_sql = """select app_id, country_code, device_code, category_id,
                                est_free_app_download, est_paid_app_download,
                                est_revenue
                                from daily_agg_df
                                except all
                            select app_id, country_code, device_code, category_id,
                                est_free_app_download, est_paid_app_download,
                                est_revenue
                                from unified_df"""
    # Pre-aggregated rows that are missing from the re-aggregated daily data.
    agg_minus_daily_sql = """select app_id, country_code, device_code, category_id,
                                est_free_app_download, est_paid_app_download,
                                est_revenue
                                from unified_df
                                except all
                            select app_id, country_code, device_code, category_id,
                                est_free_app_download, est_paid_app_download,
                                est_revenue
                                from daily_agg_df"""
    diff_df1 = spark.sql(daily_minus_agg_sql)
    diff_df2 = spark.sql(agg_minus_daily_sql)
    return diff_df1, diff_df2


class TestStoreCategoryPreAggr(PipelineTest):

    def setUp(self):
        super(TestStoreCategoryPreAggr, self).setUp()
        self.check_date = None
        self.granularity = None

    def check_store_category_pre_aggr_accuracy(self):
        print self.check_date
        date_list = get_date_list(self.granularity, self.check_date)
        print date_list
        diff_df1, diff_df2 = test_store_category_pre_aggr(self.spark, date_list, self.granularity)
        self.assertTrue(diff_df1.count() == 0, "daily agg except pre agg is not empty. date: {}".format(
            self.check_date))
        self.assertTrue(diff_df2.count() == 0, "pre agg except daily agg is not empty. date: {}".format(
            self.check_date))


class TestStoreCategoryPreAggrWeekly(TestStoreCategoryPreAggr):
    """Weekly category pre-aggregation check pinned to 2020-06-20."""
    check_date = '2020-06-20'
    trigger_date_config = ("0 12 * * *", 8)

    def setUp(self):
        """Pin granularity and check date without calling the parent setUp.

        The parent setUp would reset both values to None.
        """
        self.check_date = '2020-06-20'
        self.granularity = "weekly"

    def test_store_category_pre_agg_accuracy_weekly(self):
        """Run the shared accuracy check for the weekly settings."""
        run_check = self.check_store_category_pre_aggr_accuracy
        run_check()


class TestStoreCategoryPreAggrMonthly(TestStoreCategoryPreAggr):
    """Monthly category pre-aggregation check pinned to 2020-05-31."""
    check_date = '2020-05-31'
    trigger_date_config = ("0 12 * * *", 8)

    def setUp(self):
        """Pin granularity and check date without calling the parent setUp.

        The parent setUp would reset both values to None.
        """
        self.check_date = '2020-05-31'
        self.granularity = "monthly"

    def test_store_category_pre_agg_accuracy_monthly(self):
        """Run the shared accuracy check for the monthly settings."""
        run_check = self.check_store_category_pre_aggr_accuracy
        run_check()


class TestStoreCategoryPreAggrQuarterly(TestStoreCategoryPreAggr):
    """Quarterly category pre-aggregation check for the period ending 2020-03-31."""
    trigger_date_config = ("0 12 * * *", 8)
    check_date = '2020-03-31'

    def setUp(self):
        """Pin granularity and check date without calling the parent setUp.

        The parent setUp would reset both values to None.
        """
        self.granularity = "quarterly"
        # Keep this equal to the class-level check_date. get_date_list starts
        # the quarterly range two months before the check date, so a date of
        # 2020-05-31 would compare March-May rather than January-March.
        self.check_date = '2020-03-31'

    def test_store_category_pre_agg_accuracy_quarterly(self):
        """Run the shared accuracy check for the quarterly settings."""
        self.check_store_category_pre_aggr_accuracy()


class TestStoreCategoryPreAggrYearly(TestStoreCategoryPreAggr):
    """Yearly category pre-aggregation check pinned to 2019-12-31."""
    check_date = '2019-12-31'
    trigger_date_config = ("0 12 * * *", 8)

    def setUp(self):
        """Pin granularity and check date without calling the parent setUp.

        The parent setUp would reset both values to None.
        """
        self.check_date = '2019-12-31'
        self.granularity = "yearly"

    def test_store_category_pre_agg_accuracy_yearly(self):
        """Run the shared accuracy check for the yearly settings."""
        run_check = self.check_store_category_pre_aggr_accuracy
        run_check()


# Run the category pre-aggregation checks with a plain text runner. Only the
# weekly test case is active; the other granularities are commented out.
suite = unittest.TestSuite()
suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestStoreCategoryPreAggrWeekly))
# suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestStoreCategoryPreAggrMonthly))
# suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestStoreCategoryPreAggrQuarterly))
# suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestStoreCategoryPreAggrYearly))
runner = unittest.TextTestRunner()
runner.run(suite)


In [0]:

from bdce.common.utils import update_application_code
# NOTE(review): presumably this syncs the code of the named application into
# /tmp/zeppelin_application_code, which is loaded below - confirm.
update_application_code(spark, role="BDP-PROD-APP-INT-QA", application_name="zidong-application-autopipeline")

# Register the dependencies from the temp location before importing from them.
spark.sparkContext.addPyFile("/tmp/zeppelin_application_code/libs/python/dependencies.zip")
import aaplproxy
from applications.auto_pipeline.temp_script.utils.base_test import PipelineTest

In [0]:

from applications.auto_pipeline.temp_script.utils.base_test import PipelineTest

In [0]:

import datetime
import unittest
import aaplproxy
from dateutil.relativedelta import relativedelta
from applications.auto_pipeline.temp_script.utils.base_test import PipelineTest


def string_to_datetime(date_str, convert_format=None):
    """Parse ``date_str`` into a datetime.

    :param date_str: text to parse
    :param convert_format: strptime pattern; "%Y-%m-%d" is used when falsy
    :return: the parsed datetime.datetime
    """
    pattern = convert_format or "%Y-%m-%d"
    return datetime.datetime.strptime(date_str, pattern)


def datetime_to_string(date_time, convert_format="%Y-%m-%d"):
    """Format ``date_time`` with the given strftime pattern.

    :param date_time: value to format
    :param convert_format: strftime pattern, "%Y-%m-%d" by default
    :return: the formatted string
    """
    rendered = datetime.datetime.strftime(date_time, convert_format)
    return rendered


def get_date_list(granularity, check_date):
    """Return the inclusive [start, end] date strings of the period ending on check_date.

    The start date depends on the granularity:
      * weekly    - six days before check_date
      * monthly   - first day of check_date's month
      * quarterly - first day of the month two months before check_date
      * yearly    - first day of the month eleven months before check_date

    :param granularity: one of 'weekly', 'monthly', 'quarterly', 'yearly'
    :param check_date: period end date as a "%Y-%m-%d" string
    :return: two-item list of "%Y-%m-%d" strings, [start, end]
    :raises ValueError: if granularity is not one of the supported values
    """
    end = string_to_datetime(check_date)
    if granularity == 'weekly':
        start = end - relativedelta(days=6)
    elif granularity == 'monthly':
        start = end.replace(day=1)
    elif granularity == 'quarterly':
        start = (end - relativedelta(months=2)).replace(day=1)
    elif granularity == 'yearly':
        start = (end - relativedelta(months=11)).replace(day=1)
    else:
        # Fail with a clear message instead of reaching the return statement
        # with no range computed.
        raise ValueError("unsupported granularity: {!r}".format(granularity))
    return [datetime_to_string(start), datetime_to_string(end)]


def test_store_est_pre_aggr(spark, test_date, granularity):
    print granularity, test_date
    daily_est_path = "s3://b2c-prod-data-pipeline-unified-store-paid/unified/" \
                     "store.app-est-dna-log.v1/fact/"
    agg_est_path = "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-dna-log-pre-aggr.v1/fact/"
    
    spark.read.format("delta").load(daily_est_path).where(
        "granularity='daily' and date between '{}' and '{}'".format(
            test_date[0], test_date[1])).cache().createOrReplaceTempView("daily_df")
    spark.read.format("delta").load(agg_est_path).where(
        "granularity='{}' and date between '{}' and '{}'".format(
            granularity, test_date[0], test_date[1])).cache().createOrReplaceTempView("unified_df")
    
    sql_text = """
        SELECT app_id, device_code, country_code, publisher_id, company_id, parent_company_id,
        sum(est_free_app_download) as est_free_app_download,
        sum(est_paid_app_download) as est_paid_app_download,
        sum(est_revenue) as est_revenue,
        sum(est_organic_download) as est_organic_download,
        sum(est_paid_download) as est_paid_download
        from daily_df
        group by
        app_id, device_code, country_code, publisher_id, company_id, parent_company_id
    """
    
    spark.sql(sql_text).createOrReplaceTempView("daily_agg_df")
    
    diff_df1 = spark.sql("""select app_id, country_code, device_code, publisher_id, company_id,
                                parent_company_id, est_free_app_download, est_paid_app_download,
                                est_revenue, est_paid_download, est_organic_download
                                from daily_agg_df
                                except all
                            select app_id, country_code, device_code, publisher_id, company_id, 
                                parent_company_id, est_free_app_download, est_paid_app_download,
                                est_revenue, est_paid_download, est_organic_download
                                from unified_df""")
    diff_df2 = spark.sql("""select app_id, country_code, device_code, publisher_id, company_id,
                                parent_company_id, est_free_app_download, est_paid_app_download,
                                est_revenue, est_paid_download, est_organic_download
                                from unified_df
                                except all
                            select app_id, country_code, device_code, publisher_id, company_id,
                                parent_company_id, est_free_app_download, est_paid_app_download,
                                est_revenue, est_paid_download, est_organic_download
                                from daily_agg_df""")
    diff_df1.show()
    diff_df2.show()
    return diff_df1, diff_df2


class TestStoreEstPreAggr(PipelineTest):
    """Shared accuracy check for the store estimate pre-aggregation.

    Concrete subclasses provide ``granularity`` and ``check_date``.
    """
    # Path template with {granularity} and {date} placeholders; the methods
    # of this class do not read it.
    agg_est_path = ("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-dna-log-pre-aggr.v1/fact/"
                    "granularity={granularity}/date={date}")

    def setUp(self):
        """Run the parent setUp, then clear the per-test settings."""
        super(TestStoreEstPreAggr, self).setUp()
        self.check_date, self.granularity = None, None

    def check_store_est_pre_aggr_accuracy(self):
        """Assert that daily re-aggregation and pre-aggregation agree in both directions."""
        date_list = get_date_list(self.granularity, self.check_date)
        diff_df1, diff_df2 = test_store_est_pre_aggr(self.spark, date_list, self.granularity)
        expectations = (
            (diff_df1, "daily agg except pre agg is not empty. date: {}"),
            (diff_df2, "pre agg except daily agg is not empty. date: {}"),
        )
        # The second difference is only counted when the first one is empty.
        for leftover, template in expectations:
            self.assertTrue(leftover.count() == 0, template.format(self.check_date))


class TestStoreEstPreAggrWeekly(TestStoreEstPreAggr):
    """Weekly estimate pre-aggregation check pinned to 2020-06-20."""
    check_date = '2020-06-20'
    trigger_date_config = ("0 12 * * *", 8)

    def setUp(self):
        """Pin granularity and check date without calling the parent setUp.

        The parent setUp would reset both values to None.
        """
        self.check_date = '2020-06-20'
        self.granularity = "weekly"

    def test_store_est_pre_agg_accuracy_weekly(self):
        """Run the shared accuracy check for the weekly settings."""
        run_check = self.check_store_est_pre_aggr_accuracy
        run_check()


class TestStoreEstPreAggrMonthly(TestStoreEstPreAggr):
    """Monthly estimate pre-aggregation check pinned to 2020-05-31."""
    check_date = '2020-05-31'
    trigger_date_config = ("0 12 * * *", 8)

    def setUp(self):
        """Pin granularity and check date without calling the parent setUp.

        The parent setUp would reset both values to None.
        """
        self.check_date = '2020-05-31'
        self.granularity = "monthly"

    def test_store_est_pre_agg_accuracy_monthly(self):
        """Run the shared accuracy check for the monthly settings."""
        run_check = self.check_store_est_pre_aggr_accuracy
        run_check()


class TestStoreEstPreAggrQuarterly(TestStoreEstPreAggr):
    """Quarterly estimate pre-aggregation check pinned to 2020-03-31."""
    check_date = '2020-03-31'
    trigger_date_config = ("0 12 * * *", 8)

    def setUp(self):
        """Pin granularity and check date without calling the parent setUp.

        The parent setUp would reset both values to None.
        """
        self.check_date = '2020-03-31'
        self.granularity = "quarterly"

    def test_store_est_pre_agg_accuracy_quarterly(self):
        """Run the shared accuracy check for the quarterly settings."""
        run_check = self.check_store_est_pre_aggr_accuracy
        run_check()


class TestStoreEstPreAggrYearly(TestStoreEstPreAggr):
    """Yearly estimate pre-aggregation check pinned to 2019-12-31."""
    check_date = '2019-12-31'
    trigger_date_config = ("0 12 * * *", 8)

    def setUp(self):
        """Pin granularity and check date without calling the parent setUp.

        The parent setUp would reset both values to None.
        """
        self.check_date = '2019-12-31'
        self.granularity = "yearly"

    def test_store_est_pre_agg_accuracy_yearly(self):
        """Run the shared accuracy check for the yearly settings."""
        run_check = self.check_store_est_pre_aggr_accuracy
        run_check()
        

# Run the estimate pre-aggregation checks for all four granularities with a
# plain text runner.
suite = unittest.TestSuite()
suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestStoreEstPreAggrWeekly))
suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestStoreEstPreAggrMonthly))
suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestStoreEstPreAggrQuarterly))
suite.addTests(unittest.TestLoader().loadTestsFromTestCase(TestStoreEstPreAggrYearly))
runner = unittest.TextTestRunner()
runner.run(suite)

In [0]:

agg_est_path = "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-dna-log-pre-aggr.v1/fact/"
print spark.read.format("delta").load(agg_est_path).where(
        "granularity='weekly' and date between '{}' and '{}'".format(
            '2020-06-20', '2020-06-20')).count()spark.read.format("delta").load(agg_est_path).where(
        "granularity='weekly' and date between '{}' and '{}'".format(
            '2020-06-20', '2020-06-20')).count()

In [0]:

# Expose the daily category rows for 2020-06-14 .. 2020-06-20 as "daily_df".
# daily_category_path is not assigned in this cell, so it has to come from a
# cell that ran earlier.
spark.read.format("delta").load(daily_category_path).where(
        "granularity='daily' and date between '{}' and '{}'".format(
            '2020-06-14', '2020-06-20')).createOrReplaceTempView("daily_df")