In [0]:
%md
store download revenue est/category daily 
aa.store.app-est.v4
aa.store.app-est-category.v4

store download revenue preload est/category daily
aa.store.app-est-dna-log.v1
aa.store.app-est-category-load.v4

store download attr daily
aa.store.download-attribution.v4


store download attr preload daily
aa.store.download-attribution-dna-log.v2
aa.store.download-attribution-category-load.v4


In [0]:


# Preview the 2020-07-18 daily partition of two unified store datasets.
# NOTE(review): the notes cell lists app-est-category-load as v4, but v3 is read here -- confirm which is intended.
spark.read.parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-dna-log.v1/fact/granularity=daily/date=2020-07-18/").show()
spark.read.parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-load.v3/fact/granularity=daily/date=2020-07-18/").show()


In [0]:
%%sh

# List the S3 entries matching the 2020-07-18 daily partition prefix of store.app-est-dna-log.v1.
aws s3 ls s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-dna-log.v1/fact/granularity=daily/date=2020-07-18




In [0]:

import unittest
import datetime
from pyspark.sql import types as T
from pyspark.sql import functions as F

from aadatapipelinecore.core.urn import Urn
from aadatapipelinecore.core.pipeline import type_
spark.sparkContext.addPyFile("/home/hadoop/bdp/application/libs/python/dependencies.zip")

from applications.auto_pipeline.temp_script.utils.base_test import PipelineTest
from applications.common.parser import SqlParser
from applications.common.executor import SqlExecutor
from applications.auto_pipeline.transform import _view
from aadatapipelinecore.core.utils.spark import eject_all_caches
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import aaplproxy
from aadatapipelinecore.core.utils import email
from applications.auto_pipeline.temp_script.utils.html_report_test_runner import HTMLTestRunner



class DryRunSqlExecutor(SqlExecutor):
    """``SqlExecutor`` subclass whose ``_verify_tasks`` hook does nothing.

    NOTE(review): presumably this skips the base class's task verification so
    the SQL can be executed against ad-hoc temp views -- confirm against
    ``SqlExecutor``.
    """

    def _verify_tasks(self):
        """Deliberately empty override; returns ``None``."""
        return None


def run(spark, raw_data, sql_text, dry_run=True):
    """Parse ``sql_text`` into tasks and execute them as a TRANSFORM event.

    :param spark: active SparkSession.
    :param raw_data: message dict; must contain ``"namespace"``, ``"source"``
        and ``"options"``. The dict is no longer modified: the original
        implementation popped ``"source"``/``"options"`` out of the caller's
        dict, so passing the same message twice raised ``KeyError``.
    :param sql_text: SQL text handed to ``_view`` and ``SqlParser``.
    :param dry_run: when true (default) the tasks run through
        ``DryRunSqlExecutor``; otherwise through ``SqlExecutor``.
    :raises KeyError: if ``raw_data`` lacks one of the required keys.
    """
    urn = Urn(namespace=raw_data["namespace"])

    # Shallow copy: the context is built by removing "source"/"options" and
    # merging the option values in, which must not leak back to the caller.
    context = dict(raw_data)
    source_data_list = context.pop("source")
    context.update(context.pop("options"))

    _view(spark, sql_text, None, source_data_list)
    tasks = SqlParser(spark, sql_text, context).parse()

    executor_cls = DryRunSqlExecutor if dry_run else SqlExecutor
    executor_cls(urn, spark, tasks, type_.EventType.TRANSFORM, context).run()


# SQL text passed to run() by test_daily_download_attr_with_est. It reads the
# views `download_attribution`, `store_unified` and
# `test_unified_download_attribution`, and the names `compare_data_raw` /
# `compare_data_unified` defined at the end are queried afterwards via spark.sql.
# Steps, in order:
#   1. download_attribution   - adds the share cast to decimal(36,20).
#   2. download_attribution_1 - remaps device codes (android-phone -> android-all,
#                               ios-phone -> ios-tablet; any other device yields NULL).
#   3. download_attribution_2 - distinct union of the remapped rows and the original
#                               rows, excluding android-phone.
#      NOTE(review): the first branch uses the decimal-cast share while the second
#      uses the uncast column -- confirm this is intended.
#   4. union_data             - store_unified left-joined to the attribution rows; the
#                               `is not null` filter keeps only matched rows.
#   5. calculate_data_prepare / caculate_data - organic_download is
#                               round((1 - share) * (free + paid)); paid_download is the
#                               remainder. ("caculate_data" is spelled the same way at
#                               its definition and its use.)
#   6. compare_data_raw / compare_data_unified - the two column-aligned result sets
#                               that get diffed.
sql_download_attribution_with_est = """

    WITH download_attribution AS (
        select *, CAST(est_non_organic_download_share as decimal(36,20)) as new_est_non_organic_download_share from download_attribution
    );

    WITH download_attribution_1 AS (

     select device_code, country_code, granularity, date, product_id, new_est_non_organic_download_share, 
        case when device_code='android-phone' THEN 'android-all' 
             when device_code='ios-phone' THEN 'ios-tablet' END AS new_device_code from download_attribution
    );

    WITH download_attribution_2
    AS (
        select distinct * from ( select new_device_code as device_code, country_code, granularity, date, product_id, new_est_non_organic_download_share as est_non_organic_download_share from download_attribution_1 
        union all
        select device_code, country_code, granularity, date, product_id, est_non_organic_download_share from download_attribution ) as t1 where device_code!='android-phone'

    );

    WITH union_data AS (
    select *, store_unified.device_code as unified_device_code , store_unified.country_code as unified_country_code
    from store_unified left join download_attribution_2
    on store_unified.device_code=download_attribution_2.device_code and
    store_unified.country_code=UPPER(download_attribution_2.country_code) and
    store_unified.app_id=download_attribution_2.product_id
    where est_non_organic_download_share is not null

    );


    WITH calculate_data_prepare AS (
    select app_id, coalesce(est_free_app_download, 0) as free_app_download,  coalesce(est_paid_app_download, 0 )  as paid_app_download, est_revenue as revenue, unified_device_code, unified_country_code, est_non_organic_download_share from union_data where not (est_free_app_download is null and est_paid_app_download is null) 
    );


    WITH caculate_data AS (
    select app_id, free_app_download, paid_app_download, revenue, unified_device_code, unified_country_code, est_non_organic_download_share, 
    round (( 1 - est_non_organic_download_share ) *(free_app_download+paid_app_download)) as organic_download, ( free_app_download+paid_app_download - ROUND(( 1- est_non_organic_download_share) * (free_app_download+paid_app_download))) as paid_download    from calculate_data_prepare
    );


    WITH compare_data_raw AS (
    select app_id,free_app_download, paid_app_download, organic_download, paid_download, unified_device_code as device_code, unified_country_code as country_code from caculate_data
    );


    WITH compare_data_unified AS (
    select app_id,coalesce(est_free_app_download, 0 ) as free_app_download,  coalesce(est_paid_app_download, 0 ) as paid_app_download,  est_organic_download as organic_download, est_paid_download as paid_download, device_code,  country_code from test_unified_download_attribution
    );



    """


def test_daily_download_attr_with_est(spark, test_date_list):
    """Diff recomputed daily download attribution against the unified table.

    Registers three temp views -- ``download_attribution`` (weekly routine
    parquet folder named after ``test_date_list[-1]``),
    ``test_unified_download_attribution`` and ``store_unified`` (delta tables
    restricted to daily rows for the given dates) -- runs
    ``sql_download_attribution_with_est`` through ``run`` and compares
    ``compare_data_raw`` with ``compare_data_unified`` in both directions.

    :param spark: active SparkSession.
    :param test_date_list: non-empty list of date strings; every entry goes
        into the ``date in (...)`` filter and the last one selects the weekly
        attribution folder.
    :return: ``(raw_except_unified, unified_except_raw, raw_count,
        unified_count)`` where the first two are DataFrames and the counts
        are the ``take(1)`` row lists of ``select count(*)``.
    """
    print(test_date_list)

    weekly_root = "s3://b2c-mktint-prod-dca-kpi/download_attribution/week_and_month_routine/v1.0.0/WEEK/"
    weekly_folder = "s3://b2c-mktint-prod-dca-kpi/download_attribution/week_and_month_routine/v1.0.0/WEEK/{}/*/".format(
        test_date_list[-1])
    spark.read.option("basePath", weekly_root).parquet(weekly_folder).createOrReplaceTempView(
        "download_attribution")

    # Same row filter for both unified delta tables.
    daily_filter = "granularity='daily' and  date in ('{}') ".format("','".join(test_date_list))

    spark.read.format("delta").load(
        "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.download-attribution-dna-log.v1/fact/"
    ).where(daily_filter).createOrReplaceTempView("test_unified_download_attribution")

    spark.read.format("delta").load(
        "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-dna-log.v1/fact/"
    ).where(daily_filter).createOrReplaceTempView("store_unified")

    ingest_msg = {
        "namespace": "aa.store.market-size.v1",
        "job_type": "routine",
        "options": {},
        "source": [
            {
            }
        ]
    }
    run(spark, ingest_msg, sql_download_attribution_with_est)

    # Fixed: the original query read "... compare_data_raw where except select ...",
    # i.e. a WHERE with no predicate in front of the EXCEPT set operator.
    qa_df_1 = spark.sql("select * from compare_data_raw except select * from compare_data_unified")
    qa_df_2 = spark.sql("select * from compare_data_unified except select * from compare_data_raw")
    count_1 = spark.sql("select count(*) from compare_data_raw ").take(1)
    count_2 = spark.sql("select count(*) from compare_data_unified ").take(1)
    qa_df_1.show()
    qa_df_2.show()
    return qa_df_1, qa_df_2, count_1, count_2


# Single address that receives the QA report e-mail.
DEFAULT_RECIPIENTS = "fzhang@appannie.com"


def send_db_check_email(title, text_context, key=None):
    """Send ``text_context`` with ``title`` to ``DEFAULT_RECIPIENTS`` via ``email.send``.

    :param title: first argument forwarded to ``email.send`` (presumably the subject).
    :param text_context: second argument forwarded to ``email.send`` (presumably the body).
    :param key: accepted for interface compatibility; not used.
    """
    email.send(title, text_context, [DEFAULT_RECIPIENTS],
               sender='dev-qa-data-quality@appannie.com')


def get_last_week_date(check_date):
    """Return the seven dates of the week ending on the latest Saturday.

    The week ends on the most recent Saturday on or before ``check_date`` and
    starts on the Sunday six days earlier.

    :param check_date: a ``datetime.date`` (or ``datetime.datetime``).
    :return: list of seven ``str()``-formatted dates, oldest first.
    """
    # isoweekday() is Monday=1 .. Sunday=7, so (isoweekday + 1) % 7 is the
    # number of days elapsed since the latest Saturday (0 on a Saturday).
    days_since_saturday = (check_date.isoweekday() + 1) % 7
    week_end = check_date - datetime.timedelta(days=days_since_saturday)
    return [str(week_end - datetime.timedelta(days=back)) for back in range(6, -1, -1)]


class demoTest(PipelineTest):
    """Checks that recomputed download attribution matches the unified table.

    ``trigger_date_config`` is read by ``PipelineTest``; its exact meaning is
    not visible in this notebook.
    """
    trigger_date_config = ("0 12 * * *", 8)

    def test_daily_unified_est_category_data(self):
        """Assert both EXCEPT diffs are empty and the row counts agree.

        The checked week is the one returned by ``get_last_week_date`` for
        2020-07-05. The date is written without leading zeros: ``07``/``05``
        are octal literals in Python 2 (``08``/``09`` would not even parse)
        and a syntax error in Python 3.
        """
        raw_except_unified, unified_except_raw, count_1, count_2 = test_daily_download_attr_with_est(
            self.spark, get_last_week_date(datetime.date(2020, 7, 5)))

        self.assertTrue(raw_except_unified.rdd.isEmpty(), "raw except unified is not empty")
        self.assertTrue(unified_except_raw.rdd.isEmpty(), "unified except raw is not empty")

        # currently store_est has duplicate data...
        self.assertEqual(count_1, count_2, " count is not same")


def send_message(spark):
    """Run ``demoTest``, write the HTML report to disk and e-mail it.

    The report goes to ``/tmp/db_check.log``; the mail title ends in
    ``Passed`` when no result entry has a leading code of 1 or 2 (presumably
    the runner's failure/error codes) and ``Failed`` otherwise.

    :param spark: not used by this function.
    """
    report_path = "/tmp/db_check.log"

    with open(report_path, "w") as report_out:
        test_suite = unittest.TestSuite()
        test_suite.addTests(unittest.TestLoader().loadTestsFromTestCase(demoTest))
        html_runner = HTMLTestRunner(
            stream=report_out,
            title='Transform Test Report',
            description='This db_check the report output by Tech Team.'
        )
        outcomes = html_runner.run(test_suite).result
        failed_count = sum(1 for outcome in outcomes if outcome[0] == 1 or outcome[0] == 2)

    verdict = "Passed" if failed_count == 0 else "Failed"

    with open(report_path, 'r') as report_in:
        str_today = datetime.date.today().strftime("%Y-%m-%d")
        title = "store daily check - store download attribution daily - " + str_today + " - " + verdict
        send_db_check_email(title, report_in.read())

send_message(spark)

In [0]:

def get_last_week_date(check_date):
    """Return the eight dates from one Saturday to the next.

    The span ends on the latest Saturday on or before ``check_date`` and
    starts on the Saturday seven days earlier; both ends are included.

    :param check_date: a ``datetime.date`` (or ``datetime.datetime``).
    :return: list of eight ``str()``-formatted dates, oldest first.
    """
    # isoweekday() is Monday=1 .. Sunday=7, so (isoweekday + 1) % 7 is the
    # number of days elapsed since the latest Saturday (0 on a Saturday).
    days_since_saturday = (check_date.isoweekday() + 1) % 7
    end = check_date - datetime.timedelta(days=days_since_saturday)
    start = end - datetime.timedelta(days=7)
    return [str(start + datetime.timedelta(days=offset)) for offset in range(8)]


# Fixed: the original call passed no argument and raised TypeError.
print(get_last_week_date(datetime.date.today()))

In [0]:

import unittest
import datetime
from pyspark.sql import types as T
from pyspark.sql import functions as F

from aadatapipelinecore.core.urn import Urn
from aadatapipelinecore.core.pipeline import type_
spark.sparkContext.addPyFile("/home/hadoop/bdp/application/libs/python/dependencies.zip")
spark.sparkContext.addPyFile("/home/hadoop/bdp/application/libs/python/dependencies.zip")

from applications.auto_pipeline.temp_script.utils.base_test import PipelineTest
from applications.common.parser import SqlParser
from applications.common.executor import SqlExecutor
from applications.auto_pipeline.transform import _view
from aadatapipelinecore.core.utils.spark import eject_all_caches
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
import aaplproxy
from aadatapipelinecore.core.utils import email
from applications.auto_pipeline.temp_script.utils.html_report_test_runner import HTMLTestRunner


class DryRunSqlExecutor(SqlExecutor):
    """``SqlExecutor`` subclass whose ``_verify_tasks`` hook does nothing.

    NOTE(review): presumably this skips the base class's task verification so
    the SQL can be executed against ad-hoc temp views -- confirm against
    ``SqlExecutor``.
    """

    def _verify_tasks(self):
        """Deliberately empty override; returns ``None``."""
        return None


def run(spark, raw_data, sql_text, dry_run=True):
    """Parse ``sql_text`` into tasks and execute them as a TRANSFORM event.

    :param spark: active SparkSession.
    :param raw_data: message dict; must contain ``"namespace"``, ``"source"``
        and ``"options"``. The dict is no longer modified: the original
        implementation popped ``"source"``/``"options"`` out of the caller's
        dict, so passing the same message twice raised ``KeyError``.
    :param sql_text: SQL text handed to ``_view`` and ``SqlParser``.
    :param dry_run: when true (default) the tasks run through
        ``DryRunSqlExecutor``; otherwise through ``SqlExecutor``.
    :raises KeyError: if ``raw_data`` lacks one of the required keys.
    """
    urn = Urn(namespace=raw_data["namespace"])

    # Shallow copy: the context is built by removing "source"/"options" and
    # merging the option values in, which must not leak back to the caller.
    context = dict(raw_data)
    source_data_list = context.pop("source")
    context.update(context.pop("options"))

    _view(spark, sql_text, None, source_data_list)
    tasks = SqlParser(spark, sql_text, context).parse()

    executor_cls = DryRunSqlExecutor if dry_run else SqlExecutor
    executor_cls(urn, spark, tasks, type_.EventType.TRANSFORM, context).run()


# SQL text passed to run() by test_daily_download_attr_with_est. It reads the
# views `download_attribution`, `store_unified` and
# `test_unified_download_attribution`, and the names `compare_data_raw` /
# `compare_data_unified` defined at the end are queried afterwards via spark.sql.
# In this variant the two compare_* selects carry only app_id, the free/paid
# download columns, device_code and country_code; the organic_download and
# paid_download values computed in `caculate_data` are not compared.
# Steps, in order:
#   1. download_attribution   - adds the share cast to decimal(36,20).
#   2. download_attribution_1 - remaps device codes (android-phone -> android-all,
#                               ios-phone -> ios-tablet; any other device yields NULL).
#   3. download_attribution_2 - distinct union of the remapped rows and the original
#                               rows, excluding android-phone.
#   4. union_data             - store_unified left-joined to the attribution rows; the
#                               `is not null` filter keeps only matched rows.
#   5. calculate_data_prepare / caculate_data - organic_download is
#                               round((1 - share) * (free + paid)); paid_download is the
#                               remainder.
#   6. compare_data_raw / compare_data_unified - the two result sets that get diffed.
sql_download_attribution_with_est = """

    WITH download_attribution AS (
        select *, CAST(est_non_organic_download_share as decimal(36,20)) as new_est_non_organic_download_share from download_attribution
    );

    WITH download_attribution_1 AS (

     select device_code, country_code, granularity, date, product_id, new_est_non_organic_download_share, 
        case when device_code='android-phone' THEN 'android-all' 
             when device_code='ios-phone' THEN 'ios-tablet' END AS new_device_code from download_attribution
    );

    WITH download_attribution_2
    AS (
        select distinct * from ( select new_device_code as device_code, country_code, granularity, date, product_id, new_est_non_organic_download_share as est_non_organic_download_share from download_attribution_1 
        union all
        select device_code, country_code, granularity, date, product_id, est_non_organic_download_share from download_attribution ) as t1 where device_code!='android-phone'

    );

    WITH union_data AS (
    select *, store_unified.device_code as unified_device_code , store_unified.country_code as unified_country_code
    from store_unified left join download_attribution_2
    on store_unified.device_code=download_attribution_2.device_code and
    store_unified.country_code=UPPER(download_attribution_2.country_code) and
    store_unified.app_id=download_attribution_2.product_id
    where est_non_organic_download_share is not null

    );


    WITH calculate_data_prepare AS (
    select app_id, coalesce(est_free_app_download, 0) as free_app_download,  coalesce(est_paid_app_download, 0 )  as paid_app_download, est_revenue as revenue, unified_device_code, unified_country_code, est_non_organic_download_share from union_data where not (est_free_app_download is null and est_paid_app_download is null) 
    );


    WITH caculate_data AS (
    select app_id, free_app_download, paid_app_download, revenue, unified_device_code, unified_country_code, est_non_organic_download_share, 
    round (( 1 - est_non_organic_download_share ) *(free_app_download+paid_app_download)) as organic_download, ( free_app_download+paid_app_download - ROUND(( 1- est_non_organic_download_share) * (free_app_download+paid_app_download))) as paid_download    from calculate_data_prepare
    );
    

 



    WITH compare_data_raw AS (
    select app_id,free_app_download, paid_app_download, unified_device_code as device_code, unified_country_code as country_code from caculate_data
    );


    WITH compare_data_unified AS (
    select app_id,coalesce(est_free_app_download, 0 ) as free_app_download,  coalesce(est_paid_app_download, 0 ) as paid_app_download,  device_code,  country_code from test_unified_download_attribution
    );



    """


# Build `test_path`: one entry per Saturday found in [start, end), each a tuple of
#   ([weekly download-attribution folder for that Saturday],
#    [dates of that week as YYYY-MM-DD strings, Saturday first],
#    [daily app-est partition paths for the same dates]).
# Days after the last Saturday in the range stay in `dates` and are not used.
# Written with range()/print() so the cell runs unchanged on Python 2 and 3.
start = "2020-06-28"
end = "2020-07-05"
# end = "2012-05-01"
real_date1 = datetime.date(*[int(x) for x in start.split('-')])
real_date2 = datetime.date(*[int(x) for x in end.split('-')])
date_range = real_date2 - real_date1
dates = list()
sar_list = list()
for days in range(date_range.days):
    current_day = real_date1 + datetime.timedelta(days)
    dates.append(current_day)
    if current_day.weekday() == 5:  # weekday() == 5 is Saturday
        # Drain the accumulated days, newest first, into this Saturday's bucket.
        temp = list()
        while dates:
            temp.append(dates.pop())
        sar_list.append({current_day: temp})

test_path = list()

for x in sar_list:
    for key, item in x.items():
        test_path.append(
            (
                [
                    "s3://b2c-mktint-prod-dca-kpi/download_attribution/week_and_month_routine/v1.0.0/WEEK/{}/*/".format(
                        key)],
                [i.strftime("%Y-%m-%d") for i in item],
                [
                    "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-dna-log.v1/fact/granularity=daily/date={}".format(
                        i) for i in item]
            )
        )
print(test_path[0][0][0])
print(test_path[0][1])
def test_daily_download_attr_with_est(spark, test_date):
    """Diff recomputed daily download attribution against the unified table.

    Inputs come from the module-level ``test_path[0]`` entry: its first
    element is the weekly download-attribution folder and its second the list
    of dates used in the ``date in (...)`` filter. Three temp views are
    registered (``download_attribution``,
    ``test_unified_download_attribution``, ``store_unified``), then
    ``sql_download_attribution_with_est`` is run through ``run`` and
    ``compare_data_raw`` is compared with ``compare_data_unified`` in both
    directions.

    :param spark: active SparkSession.
    :param test_date: not used; kept so existing callers keep working.
    :return: ``(raw_except_unified, unified_except_raw, raw_count,
        unified_count)`` where the first two are DataFrames and the counts
        are the ``take(1)`` row lists of ``select count(*)``.
    """
    weekly_folder = test_path[0][0][0]
    week_dates = test_path[0][1]

    spark.read.option(
        "basePath",
        "s3://b2c-mktint-prod-dca-kpi/download_attribution/week_and_month_routine/v1.0.0/WEEK/"
    ).parquet(weekly_folder).createOrReplaceTempView("download_attribution")

    # Same row filter for both unified delta tables.
    daily_filter = "granularity='daily' and  date in ('{}') ".format("','".join(week_dates))

    spark.read.format("delta").load(
        "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.download-attribution-dna-log.v1/fact/"
    ).where(daily_filter).createOrReplaceTempView("test_unified_download_attribution")

    spark.read.format("delta").load(
        "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-dna-log.v1/fact/"
    ).where(daily_filter).createOrReplaceTempView("store_unified")

    print(test_path[0][2][0])
    ingest_msg = {
        "namespace": "aa.store.market-size.v1",
        "job_type": "routine",
        "options": {},
        "source": [
            {
                # "data_encoding": "parquet",
                # "compression": "gzip",
                # "name": "store_unified",
                # "path": test_path[0][2]
            }
        ]
    }
    run(spark, ingest_msg, sql_download_attribution_with_est)

    # Fixed: the original query read "... compare_data_raw where except select ...",
    # i.e. a WHERE with no predicate in front of the EXCEPT set operator.
    qa_df_1 = spark.sql("select * from compare_data_raw except select * from compare_data_unified")
    qa_df_2 = spark.sql("select * from compare_data_unified except select * from compare_data_raw")
    count_1 = spark.sql("select count(*) from compare_data_raw ").take(1)
    count_2 = spark.sql("select count(*) from compare_data_unified ").take(1)
    qa_df_1.show()
    qa_df_2.show()
    return qa_df_1, qa_df_2, count_1, count_2

# Single address that receives the QA report e-mail.
DEFAULT_RECIPIENTS = "fzhang@appannie.com"

# Ad-hoc run executed when the cell is evaluated; the returned DataFrames and
# counts are discarded. The date string is not read by the function this cell
# defines (it takes its inputs from `test_path`).
test_daily_download_attr_with_est(spark, "2020-06-27")
def send_db_check_email(title, text_context, key=None):
    """Send ``text_context`` with ``title`` to ``DEFAULT_RECIPIENTS`` via ``email.send``.

    ``key`` is accepted but not used.
    """
    default_recipients = [DEFAULT_RECIPIENTS]
    email.send(title, text_context, default_recipients, sender='dev-qa-data-quality@appannie.com')



def get_last_week_date(check_date):
    """Return the eight dates from one Saturday to the next.

    The span ends on the latest Saturday on or before ``check_date`` and
    starts on the Saturday seven days earlier; both ends are included.

    Two defects of the original are fixed: when ``check_date`` was itself a
    Saturday the local ``sat`` was never assigned, so the function raised
    ``UnboundLocalError``; and the list it built was never returned.

    :param check_date: a ``datetime.date`` (or ``datetime.datetime``).
    :return: list of eight ``str()``-formatted dates, oldest first.
    """
    # isoweekday() is Monday=1 .. Sunday=7, so (isoweekday + 1) % 7 is the
    # number of days elapsed since the latest Saturday (0 on a Saturday).
    days_since_saturday = (check_date.isoweekday() + 1) % 7
    end = check_date - datetime.timedelta(days=days_since_saturday)
    start = end - datetime.timedelta(days=7)
    return [str(start + datetime.timedelta(days=offset)) for offset in range(8)]


class demoTest(PipelineTest):
    """Checks that recomputed download attribution matches the unified table.

    ``trigger_date_config`` is read by ``PipelineTest``; its exact meaning is
    not visible in this notebook.
    """
    trigger_date_config = ("0 12 * * *", 8)

    def test_daily_unified_est_category_data(self):
        """Assert both EXCEPT diffs are empty and the row counts agree.

        The unused ``trigger_datetime`` local of the original was dropped.
        NOTE(review): the date string passed below is not read by the
        ``test_daily_download_attr_with_est`` defined in this cell -- confirm
        which week is meant to be checked.
        """
        raw_except_unified, unified_except_raw, count_1, count_2 = test_daily_download_attr_with_est(
            self.spark, "2020-06-20")

        self.assertTrue(raw_except_unified.rdd.isEmpty(), "raw except unified is not empty")
        self.assertTrue(unified_except_raw.rdd.isEmpty(), "unified except raw is not empty")
        self.assertEqual(count_1, count_2, " count is not same")



def send_message(spark):
    """Run ``demoTest``, write the HTML report to disk and e-mail it.

    The report goes to ``/tmp/db_check.log``; the mail title ends in
    ``Passed`` when no result entry has a leading code of 1 or 2 (presumably
    the runner's failure/error codes) and ``Failed`` otherwise.

    NOTE(review): the title says "store est/category daily" although this
    cell checks download attribution -- confirm the wording.

    :param spark: not used by this function.
    """
    report_path = "/tmp/db_check.log"

    with open(report_path, "w") as report_out:
        test_suite = unittest.TestSuite()
        test_suite.addTests(unittest.TestLoader().loadTestsFromTestCase(demoTest))
        html_runner = HTMLTestRunner(
            stream=report_out,
            title='Transform Test Report',
            description='This db_check the report output by Tech Team.'
        )
        outcomes = html_runner.run(test_suite).result
        failed_count = sum(1 for outcome in outcomes if outcome[0] == 1 or outcome[0] == 2)

    verdict = "Passed" if failed_count == 0 else "Failed"

    with open(report_path, 'r') as report_in:
        str_today = datetime.date.today().strftime("%Y-%m-%d")
        title = "store daily check - store est/category daily - " + str_today + " - " + verdict
        send_db_check_email(title, report_in.read())


send_message(spark)



#   WITH caculate_data AS (
#     select app_id,free_app_download,paid_app_download,revenue, unified_device_code, unified_country_code, est_non_organic_download_share from calculate_data_prepare
#     );


In [0]:

import datetime

# Written as 7 rather than 07: a leading zero makes an octal literal in
# Python 2 (08/09 would not parse) and is a syntax error in Python 3.
check_date = datetime.date(2020, 7, 11)  # datetime.date.today()
print(check_date)


def get_last_week_date(check_date):
    """Return the seven dates of the week ending on the latest Saturday.

    The week ends on the most recent Saturday on or before ``check_date`` and
    starts on the Sunday six days earlier.

    :param check_date: a ``datetime.date`` (or ``datetime.datetime``).
    :return: list of seven ``str()``-formatted dates, oldest first.
    """
    # isoweekday() is Monday=1 .. Sunday=7, so (isoweekday + 1) % 7 is the
    # number of days elapsed since the latest Saturday (0 on a Saturday).
    days_since_saturday = (check_date.isoweekday() + 1) % 7
    week_end = check_date - datetime.timedelta(days=days_since_saturday)
    return [str(week_end - datetime.timedelta(days=back)) for back in range(6, -1, -1)]


print(get_last_week_date(check_date))

In [0]:

# Spot-check one app's 'WW' rows in the unified download-attribution view.
# Relies on the temp view registered by test_daily_download_attr_with_est in an earlier cell.
spark.sql("select * from test_unified_download_attribution where app_id=20600006439056 and country_code='WW'").show()

In [0]:

# Trace app 289560144 through the inputs and the comparison output.
# All three views must already exist from an earlier run of test_daily_download_attr_with_est.
spark.sql("select * from store_unified where app_id=289560144 and country_code='WW'").show()
spark.sql("select * from download_attribution where product_id=289560144 ").show()
spark.sql("select * from compare_data_unified where app_id=289560144").show()

In [0]:

# Same app as the previous cell, looked up in the unified download-attribution view ('WW' rows only).
spark.sql("select * from test_unified_download_attribution where app_id=289560144 and country_code='WW'").show()

In [0]:

# Row count and preview of the 2020-06-21 daily partition of the unified download-attribution table.
# NOTE(review): other cells load this table with format("delta"); reading the partition
# directory as plain parquet may not reflect the Delta transaction log -- confirm.
# spark.sql("select * from test_unified_download_attribution limit 2 ").show()
print spark.read.parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.download-attribution-dna-log.v1/fact/granularity=daily/date=2020-06-21/").count()
spark.read.parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.download-attribution-dna-log.v1/fact/granularity=daily/date=2020-06-21/").show()


In [0]:

# Helper imported from aadatapipelinecore.core.utils.spark; presumably drops everything cached in this session -- confirm.
eject_all_caches(spark)


In [0]:
%%sh

# List the entries under the daily-granularity prefix of store.download-attribution.v4.
aws s3 ls s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.download-attribution.v4/fact/granularity=daily/



In [0]:

import unittest
import datetime
from pyspark.sql import types as T
from pyspark.sql import functions as F

from aadatapipelinecore.core.urn import Urn
from aadatapipelinecore.core.pipeline import type_

from applications.auto_pipeline.temp_script.utils.base_test import PipelineTest
from applications.common.parser import SqlParser
from applications.common.executor import SqlExecutor
from applications.auto_pipeline.transform import _view
from aadatapipelinecore.core.utils.spark import eject_all_caches
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
spark.sparkContext.addPyFile("/home/hadoop/bdp/application/libs/python/dependencies.zip")
import aaplproxy
from aadatapipelinecore.core.utils import email
from applications.auto_pipeline.temp_script.utils.html_report_test_runner import HTMLTestRunner


class DryRunSqlExecutor(SqlExecutor):
    """``SqlExecutor`` subclass whose ``_verify_tasks`` hook does nothing.

    NOTE(review): presumably this skips the base class's task verification so
    the SQL can be executed against ad-hoc temp views -- confirm against
    ``SqlExecutor``.
    """

    def _verify_tasks(self):
        """Deliberately empty override; returns ``None``."""
        return None


def run(spark, raw_data, sql_text, dry_run=True):
    """Parse ``sql_text`` into tasks and run them as a TRANSFORM event.

    :param spark: Spark session handed to ``_view``, ``SqlParser`` and the executor.
    :param raw_data: dict providing at least the keys ``"namespace"``,
        ``"source"`` and ``"options"``. It is NOT modified: the function works
        on a shallow copy, so the same dict can be passed to several calls.
    :param sql_text: SQL text passed to ``_view`` and to ``SqlParser``.
    :param dry_run: when true (the default) the tasks are run with
        ``DryRunSqlExecutor``; otherwise with ``SqlExecutor``.
    :raises KeyError: if ``raw_data`` lacks one of the required keys.
    """
    # Work on a copy: popping "source"/"options" from the caller's dict made a
    # second call with the same dict fail with KeyError.
    context = dict(raw_data)
    urn = Urn(namespace=context["namespace"])
    source_data_list = context.pop("source")
    # Flatten the "options" entries into the top level of the context.
    context.update(context.pop("options"))
    _view(spark, sql_text, None, source_data_list)
    tasks = SqlParser(spark, sql_text, context).parse()
    executor_cls = DryRunSqlExecutor if dry_run else SqlExecutor
    executor_cls(urn, spark, tasks, type_.EventType.TRANSFORM, context).run()



# iOS mapping: tab-separated file whose first two columns become
# store_id / country_code; every row is tagged market_code='ios'.
d1 = (
    spark.read
    .csv("s3://b2c-prod-dca-store-estimates/store_back/dimension/IOS_COUNTRY_MAPPING", sep="\t")
    .withColumnRenamed("_c0", "store_id")
    .withColumnRenamed("_c1", "country_code")
    .withColumn("market_code", F.lit("ios"))
)
# Put a synthetic worldwide row (store_id 0 / 'WW') in front of the iOS rows.
# union() matches columns by position, not by name.
d1 = spark.createDataFrame(
    [(0, 'WW', 'Worldwide', 'ios')],
    schema=["store_id", "country_code", "_c2", "market_code"]
).union(d1)

# Android mapping: same shape, tagged market_code='android'.
d2 = (
    spark.read
    .csv("s3://b2c-prod-dca-store-estimates/store_back/dimension/ANDROID_COUNTRY_MAPPING", sep="\t")
    .withColumnRenamed("_c0", "store_id")
    .withColumnRenamed("_c1", "country_code")
    .withColumn("market_code", F.lit("android"))
)

# Combine both platforms, drop rows without a country code, cache the result,
# then expose store_id under the name country_code_store_id.
country_code_df = (
    d1.union(d2)
    .where("country_code is not null")
    .cache()
    .withColumnRenamed("store_id", "country_code_store_id")
)
print('country mapping table')
country_code_df.show(2)
country_code_df.createOrReplaceTempView("country_code_mapping")

# Category mapping table, registered under the (misspelled) view name that
# sql_text_category queries.
category_mapping_table = spark.read.parquet("s3://b2c-prod-data-pipeline-qa/aa.store/store_cateogry_mapping")
category_mapping_table.createOrReplaceTempView("category_mapping_deminsion_service")


# Spark SQL text for the per-category estimate pipeline: a sequence of
# semicolon-separated `WITH <name> AS ( ... )` statements, in order:
#   filter_top_n_raw_data        - self-joins daily_data (category_id is taken from the
#                                  d2 side), keeps rows whose rank on both sides is
#                                  <= 4000 for ios store_id 0 / android store_id 1000
#                                  and <= 1000 for every other store_id, restricts feed
#                                  to 0, 1, 2, 100, 101, 102 and sums est per
#                                  id / store_id / category_id / platform_id / platform /
#                                  vertical / feed.
#   replace_metric               - maps (feed, platform) to a metric name
#                                  (free_app_download / paid_app_download / revenue).
#   replace_metric_device_code   - maps (feed, platform) to a device_code
#                                  (ios-phone / ios-tablet / android-all).
#   group_by_metric_1 / _2       - max(est) per id / store_id / metric / device_code /
#                                  platform / category_id for the ios device codes
#                                  (store_id not in 3, 4, 5, 6) and for android-all
#                                  (store_id not in 1003, 1005, 1006, 1007).
#   group_by_metric              - UNION ALL of the two.
#   pivot_metric_raw             - pivots metric into the three metric columns.
#   category_mapping_raw         - FULL OUTER JOIN of category_mapping_deminsion_service
#                                  (apple-store -> 'ios', google-play -> 'android') with
#                                  pivot_metric_raw on legacy_category_id and platform.
#   country_code_mapping         - android_country_mapping UNION ALL ios_country_mapping
#                                  plus four literal rows (VE, WW, CZ, ES).
#   country_category_mapping_raw - INNER JOIN of the two previous results on store_id
#                                  and market_code = platform, dropping country_name 'Global'.
# NOTE(review): daily_data, android_country_mapping and ios_country_mapping are not
# created in the code shown around this definition — presumably they are registered
# elsewhere (e.g. from the "source" list passed to run()); confirm.
# NOTE(review): the statement named country_code_mapping reuses the name of the temp
# view registered from country_code_df — confirm which definition is meant to win.
sql_text_category = '''
        WITH filter_top_n_raw_data AS 
        ( 
               SELECT * 
               FROM   ( 
                               SELECT   id, 
                                        Sum(est) AS est, 
                                        category_id, 
                                        store_id, 
                                        platform_id, 
                                        feed, 
                                        vertical, 
                                        platform 
                               FROM     ( 
                                                        SELECT DISTINCT d1.id, 
                                                                        d1.est, 
                                                                        d1.store_id, 
                                                                        d1.date, 
                                                                        d1.feed, 
                                                                        d1.vertical, 
                                                                        d1.platform_id, 
                                                                        d1.platform, 
                                                                        d2.category_id 
                                                        FROM            daily_data AS d1 
                                                        JOIN            daily_data AS d2 
                                                        ON              d1.id = d2.id 
                                                        AND             d1.store_id = d2.store_id 
                                                        AND             d1.feed = d2.feed 
                                                        AND             d1.vertical = d2.vertical 
                                                        AND             d1.platform_id = d2.platform_id 
                                                        AND             d1.platform = d2.platform 
                                                        WHERE           ( 
                                                                                        d1.rank <= 4000 
                                                                        AND             d2.rank<=4000 
                                                                        AND             d1.store_id == 0 
                                                                        AND             d1.platform = 'ios' )
                                                        OR              ( 
                                                                                        d1.rank <= 1000 
                                                                        AND             d2.rank<=1000 
                                                                        AND             d1.store_id != 0 
                                                                        AND             d1.platform = 'ios' )
                                                        OR              ( 
                                                                                        d1.rank <= 4000 
                                                                        AND             d2.rank<=4000 
                                                                        AND             d1.store_id == 1000
                                                                        AND             d1.platform = 'android' )
                                                        OR              ( 
                                                                                        d1.rank <= 1000 
                                                                        AND             d2.rank<=1000 
                                                                        AND             d1.store_id != 1000
                                                                        AND             d1.platform = 'android' ) ) AS t
                               WHERE    feed IN ( 0, 
                                                 1, 
                                                 2, 
                                                 101, 
                                                 100, 
                                                 102 ) 
                               GROUP BY id, 
                                        store_id, 
                                        category_id, 
                                        platform_id, 
                                        platform, 
                                        vertical, 
                                        feed ) ); 
        with replace_metric AS 
        ( 
               SELECT * , 
                      CASE 
                             WHEN feed='0' 
                             AND    platform='ios' THEN 'free_app_download' 
                             WHEN feed='1' 
                             AND    platform='ios' THEN 'paid_app_download' 
                             WHEN feed='2' 
                             AND    platform='ios' THEN 'revenue' 
                             WHEN feed='101' 
                             AND    platform='ios' THEN 'free_app_download' 
                             WHEN feed='100' 
                             AND    platform='ios' THEN 'paid_app_download' 
                             WHEN feed='102' 
                             AND    platform='ios' THEN 'revenue' 
                             WHEN feed='0' 
                             AND    platform='android' THEN 'free_app_download' 
                             WHEN feed='1' 
                             AND    platform='android' THEN 'paid_app_download' 
                             WHEN feed='2' 
                             AND    platform='android' THEN 'revenue' 
                      END AS metric 
               FROM   filter_top_n_raw_data); 
        with replace_metric_device_code AS 
        ( 
               SELECT * , 
                      CASE 
                             WHEN feed='0' 
                             AND    platform='ios' THEN 'ios-phone' 
                             WHEN feed='1' 
                             AND    platform='ios' THEN 'ios-phone' 
                             WHEN feed='2' 
                             AND    platform='ios' THEN 'ios-phone' 
                             WHEN feed='101' 
                             AND    platform='ios' THEN 'ios-tablet' 
                             WHEN feed='100' 
                             AND    platform='ios' THEN 'ios-tablet' 
                             WHEN feed='102' 
                             AND    platform='ios' THEN 'ios-tablet' 
                             WHEN feed='0' 
                             AND    platform='android' THEN 'android-all' 
                             WHEN feed='1' 
                             AND    platform='android' THEN 'android-all' 
                             WHEN feed='2' 
                             AND    platform='android' THEN 'android-all' 
                      END AS device_code 
               FROM   replace_metric); 
        with group_by_metric_1 AS 
        ( 
                 SELECT   max(est) AS est, 
                          id, 
                          metric, 
                          device_code, 
                          store_id, 
                          platform, 
                          category_id 
                 FROM     replace_metric_device_code 
                 WHERE    store_id NOT IN (3,4,5,6) 
                 AND      device_code  IN ('ios-phone' , 
                                           'ios-tablet' ) 
                 AND      feed NOT IN (1000, 
                                       1001, 
                                       1002) 
                 GROUP BY id, 
                          store_id, 
                          metric, 
                          device_code, 
                          platform, 
                          category_id ); 
        with group_by_metric_2 AS 
        ( 
                 SELECT   max(est) AS est, 
                          id, 
                          metric, 
                          device_code, 
                          store_id, 
                          platform, 
                          category_id 
                 FROM     replace_metric_device_code 
                 WHERE    store_id NOT IN ( 1003, 
                                           1005, 
                                           1006,1007) 
                 AND      device_code='android-all' 
                 AND      feed NOT IN (1000, 
                                       1001, 
                                       1002) 
                 GROUP BY id, 
                          store_id, 
                          metric, 
                          device_code, 
                          platform, 
                          category_id ); 
        with group_by_metric AS 
        ( 
               SELECT * 
               FROM   group_by_metric_1 
               UNION ALL 
               SELECT * 
               FROM   group_by_metric_2 ); 
        -- pivot metric column 
        with pivot_metric_raw AS 
        ( 
                        SELECT DISTINCT id AS app_id, 
                                        store_id, 
                                        platform, 
                                        device_code, 
                                        free_app_download, 
                                        revenue, 
                                        paid_app_download, 
                                        category_id AS category_id_pivot 
                        FROM            group_by_metric PIVOT ( max(est) FOR metric IN ('free_app_download',
                                                                                        'revenue', 
                                                                                        'paid_app_download') ) );
        -- map raw with category 
        with category_mapping_raw AS 
        ( 
                        SELECT          * 
                        FROM            ( 
                                               SELECT *, 
                                                      'ios' AS mapping_platform 
                                               FROM   category_mapping_deminsion_service 
                                               WHERE  market_code='apple-store' 
                                               UNION ALL 
                                               SELECT *, 
                                                      'android' AS mapping_platform 
                                               FROM   category_mapping_deminsion_service 
                                               WHERE  market_code='google-play' ) AS mapping 
                        FULL OUTER JOIN pivot_metric_raw 
                        ON              mapping.legacy_category_id=pivot_metric_raw.category_id_pivot 
                        AND             mapping.mapping_platform=pivot_metric_raw.platform ); 
        -- union all platform with country_code mapping
        WITH country_code_mapping AS 
        ( 
               SELECT *, 
                      'android' AS market_code 
               FROM   android_country_mapping 
               UNION ALL 
               SELECT *, 
                      'ios' market_code 
               FROM   ios_country_mapping 
               UNION ALL 
               SELECT 143502, 
                      'VE', 
                      'VESA', 
                      'ios' 
               UNION ALL 
               SELECT 0, 
                      'WW', 
                      'worldwide', 
                      'ios' 
               UNION ALL 
               SELECT 36, 
                      'CZ', 
                      'CZ', 
                      'android' 
               UNION ALL 
               SELECT 5, 
                      'ES', 
                      'ES', 
                      'android' );
            WITH country_category_mapping_raw AS 
                ( 
                   SELECT     app_id, 
                              country_code, 
                              device_code, 
                              free_app_download, 
                              paid_app_download, 
                              revenue, 
                              category_id 
                   FROM       country_code_mapping 
                   INNER JOIN category_mapping_raw 
                   ON         country_code_mapping.store_id=category_mapping_raw.store_id 
                   AND        country_code_mapping.market_code=category_mapping_raw.platform 
                   WHERE      country_name!='Global' );
           '''


# Spark SQL text for the per-app estimate pipeline: nine semicolon-separated
# `WITH <name> AS ( ... )` statements, in order:
#   filter_top_n_raw_data      - self-joins daily_data, keeps rows whose rank on both
#                                sides is <= 4000 for ios store_id 0 / android
#                                store_id 1000 and <= 1000 for every other store_id,
#                                restricts feed to 0, 1, 2, 100, 101, 102 and sums est
#                                per id / store_id / platform_id / vertical / feed / platform.
#   replace_metric             - maps (feed, platform) to a metric name.
#   replace_metric_device_code - maps (feed, platform) to a device_code.
#   group_by_metric_1 / _2     - max(est) per id / store_id / metric / device_code /
#                                platform for the ios and android device codes.
#   group_by_metric            - UNION ALL of the two.
#   pivot_metric_raw           - pivots metric into the three metric columns.
#   country_code_mapping       - android_country_mapping UNION ALL ios_country_mapping
#                                plus four literal rows (VE, WW, CZ, ES).
#   country_est_mapping_raw    - INNER JOIN of pivot_metric_raw with the country
#                                mapping, dropping country_name 'Global'.
# The text previously contained this whole pipeline twice (a pasted duplicate that
# redefined every statement identically); only one copy is kept.
# NOTE(review): daily_data, android_country_mapping and ios_country_mapping are not
# created in the code shown around this definition — presumably registered elsewhere.
sql_text_est = '''
WITH filter_top_n_raw_data AS 
( 
                SELECT DISTINCT id, 
                                Sum(est) AS est, 
                                store_id, 
                                platform_id, 
                                feed, 
                                vertical, 
                                platform 
                FROM            ( 
                                                SELECT DISTINCT d1.id, 
                                                                d1.est, 
                                                                d1.store_id, 
                                                                d1.date, 
                                                                d1.feed, 
                                                                d1.vertical, 
                                                                d1.platform_id, 
                                                                d1.platform 
                                                FROM            daily_data AS d1 
                                                JOIN            daily_data AS d2 
                                                ON              d1.id = d2.id 
                                                AND             d1.store_id = d2.store_id 
                                                AND             d1.feed = d2.feed 
                                                AND             d1.vertical = d2.vertical 
                                                AND             d1.platform_id = d2.platform_id 
                                                WHERE           ( 
                                                                                d1.rank <= 4000 
                                                                AND             d2.rank<=4000 
                                                                AND             d1.store_id == 0 
                                                                AND             d1.platform = 'ios' )
                                                OR              ( 
                                                                                d1.rank <= 1000 
                                                                AND             d2.rank<=1000 
                                                                AND             d1.store_id != 0 
                                                                AND             d1.platform = 'ios' )
                                                OR              ( 
                                                                                d1.rank <= 4000 
                                                                AND             d2.rank<=4000 
                                                                AND             d1.store_id == 1000
                                                                AND             d1.platform = 'android' )
                                                OR              ( 
                                                                                d1.rank <= 1000 
                                                                AND             d2.rank<=1000 
                                                                AND             d1.store_id != 1000
                                                                AND             d1.platform = 'android' ) ) AS t
                WHERE           feed IN ( 0, 
                                         1, 
                                         2, 
                                         101, 
                                         100, 
                                         102 ) 
                GROUP BY        id, 
                                store_id, 
                                platform_id, 
                                vertical, 
                                feed, 
                                platform); 
with replace_metric AS 
( 
       SELECT * , 
              CASE 
                     WHEN feed='0' 
                     AND    platform='ios' THEN 'free_app_download' 
                     WHEN feed='1' 
                     AND    platform='ios' THEN 'paid_app_download' 
                     WHEN feed='2' 
                     AND    platform='ios' THEN 'revenue' 
                     WHEN feed='101' 
                     AND    platform='ios' THEN 'free_app_download' 
                     WHEN feed='100' 
                     AND    platform='ios' THEN 'paid_app_download' 
                     WHEN feed='102' 
                     AND    platform='ios' THEN 'revenue' 
                     WHEN feed='0' 
                     AND    platform='android' THEN 'free_app_download' 
                     WHEN feed='1' 
                     AND    platform='android' THEN 'paid_app_download' 
                     WHEN feed='2' 
                     AND    platform='android' THEN 'revenue' 
              END AS metric 
       FROM   filter_top_n_raw_data);
       WITH replace_metric_device_code AS 
( 
       SELECT * , 
              CASE 
                     WHEN feed='0' 
                     AND    platform='ios' THEN 'ios-phone' 
                     WHEN feed='1' 
                     AND    platform='ios' THEN 'ios-phone' 
                     WHEN feed='2' 
                     AND    platform='ios' THEN 'ios-phone' 
                     WHEN feed='101' 
                     AND    platform='ios' THEN 'ios-tablet' 
                     WHEN feed='100' 
                     AND    platform='ios' THEN 'ios-tablet' 
                     WHEN feed='102' 
                     AND    platform='ios' THEN 'ios-tablet' 
                     WHEN feed='0' 
                     AND    platform='android' THEN 'android-all' 
                     WHEN feed='1' 
                     AND    platform='android' THEN 'android-all' 
                     WHEN feed='2' 
                     AND    platform='android' THEN 'android-all' 
              END AS device_code 
       FROM   replace_metric); 
WITH group_by_metric_1 AS 
( 
         SELECT   max(est) AS est, 
                  id, 
                  metric, 
                  device_code, 
                  store_id, 
                  platform 
         FROM     replace_metric_device_code 
         WHERE    store_id NOT IN (3,4,5,6) 
         AND      device_code  IN ('ios-phone' , 
                                   'ios-tablet' ) 
         AND      feed NOT IN (1000, 
                               1001, 
                               1002) 
         GROUP BY id, 
                  store_id, 
                  metric, 
                  device_code, 
                  platform );
                  WITH group_by_metric_2 AS 
( 
         SELECT   Max(est) AS est, 
                  id, 
                  metric, 
                  device_code, 
                  store_id, 
                  platform 
         FROM     replace_metric_device_code 
         WHERE    store_id NOT IN ( 1003, 
                                   1005, 
                                   1006,1007) 
         AND      device_code='android-all' 
         AND      feed NOT IN (1000, 
                               1001, 
                               1002) 
         GROUP BY id, 
                  store_id, 
                  metric, 
                  device_code, 
                  platform ); 
WITH group_by_metric AS 
( 
       SELECT * 
       FROM   group_by_metric_1 
       UNION ALL 
       SELECT * 
       FROM   group_by_metric_2 ); 
-- pivot metric column

WITH pivot_metric_raw AS 
( 
                SELECT DISTINCT id AS app_id, 
                                store_id, 
                                platform, 
                                device_code, 
                                free_app_download, 
                                revenue, 
                                paid_app_download 
                FROM            group_by_metric PIVOT ( Max(est) FOR metric IN ('free_app_download',
                                                                                'revenue', 
                                                                                'paid_app_download') ) );
-- union all platform with country_code mapping 
with country_code_mapping AS 
( 
       SELECT *, 
              'android' AS market_code 
       FROM   android_country_mapping 
       UNION ALL 
       SELECT *, 
              'ios' market_code 
       FROM   ios_country_mapping 
       UNION ALL 
       SELECT 143502, 
              'VE', 
              'VESA', 
              'ios' 
       UNION ALL 
       SELECT 0, 
              'WW', 
              'worldwide', 
              'ios' 
       UNION ALL 
       SELECT 36, 
              'CZ', 
              'CZ', 
              'android' 
       UNION ALL 
       SELECT 5, 
              'ES', 
              'ES', 
              'android' ); 
-- map raw with country_code

WITH country_est_mapping_raw AS 
( 
           SELECT     app_id, 
                      country_code, 
                      device_code, 
                      free_app_download, 
                      paid_app_download, 
                      revenue 
           FROM       country_code_mapping 
           INNER JOIN pivot_metric_raw 
           ON         country_code_mapping.store_id=pivot_metric_raw.store_id 
           AND        country_code_mapping.market_code=pivot_metric_raw.platform 
           WHERE      country_name!='Global' );
'''



def get_date_list(start_date, end_date, freq="D"):
    """Return the dates from start_date to end_date as 'YYYY-MM-DD' strings.

    The result is a flat list with one string per timestamp produced by
    ``pandas.date_range(start=start_date, end=end_date, freq=freq)``; it is
    empty when the range contains no timestamp (e.g. end before start).

    :param start_date: first date of the range, anything ``pandas.date_range`` accepts.
    :param end_date: last date of the range (included when it falls on the frequency).
    :param freq: pandas frequency alias, for example
        D: calendar day frequency
        M: month end frequency
        MS: month start frequency
        A, Y: year end frequency
        AS, YS: year start frequency
    :return: list of str formatted as '%Y-%m-%d'.
    """
    # Kept as a local import so the function stays usable without a notebook-level import.
    import pandas as pd

    return [stamp.strftime('%Y-%m-%d')
            for stamp in pd.date_range(start=start_date, end=end_date, freq=freq)]


def test_daily_trancate_category_data(spark, test_data):
    """
    Diff the raw daily store estimates against the category-load table for one date.

    The raw estimates for ``test_data`` are registered as the temp view
    ``daily_data`` and the matching rows of the ``store.app-est-category-load.v3``
    delta table as ``daily_unified_category``.  ``sql_text_category`` is then
    executed through ``run`` and ``country_category_mapping_raw`` (presumably
    produced by that SQL - it is not defined in this function) is compared with
    ``daily_unified_category`` using ``EXCEPT ALL`` in both directions.

    :param spark: Spark session used for every read and query
    :param test_data: date to check, as a 'YYYY-MM-DD' string
    :return: tuple ``(diff_df1, diff_df2)``: rows only found on the raw side,
             rows only found on the unified side
    """
    from pyspark.sql import types as T
    from pyspark.sql import functions as F

    # Column layout of the legacy tab-separated estimate files.
    csv_schema = T.StructType([
        T.StructField(column_name, column_type, True)
        for column_name, column_type in (
            ("store_id", T.IntegerType()),
            ("date", T.DateType()),
            ("platform_id", T.IntegerType()),
            ("vertical", T.IntegerType()),
            ("feed", T.IntegerType()),
            ("id", T.LongType()),
            ("est", T.IntegerType()),
            ("category_id", T.IntegerType()),
            ("rank", T.IntegerType()),
        )
    ])

    legacy_base_path = "s3://b2c-prod-dca-store-estimates/store_est/v_final/DAY/"
    legacy_path_templates = {
        "ios": "s3://b2c-prod-dca-store-estimates/store_est/v_final/DAY/{%s}/ios/sbe_est_app/*/",
        "android": "s3://b2c-prod-dca-store-estimates/store_est/v_final/DAY/{%s}/android/sbe_est_app/*/",
    }
    legacy_columns = ['id', 'store_id', 'category_id', 'platform_id', 'vertical', 'rank', 'feed',
                      'est', 'date', 'platform']

    def _read_legacy_csv(platform):
        # Read one platform's CSV dump for the tested date and tag it with its platform name.
        reader = spark.read.option("basePath", legacy_base_path).schema(csv_schema)
        frame = reader.csv(legacy_path_templates[platform] % test_data, sep="\t")
        return frame.withColumn("platform", F.lit(platform)).select(*legacy_columns).cache()

    def _country_mapping_source(view_name, location):
        # Source description of a tab-separated store_id -> country mapping file.
        return {
            "data_encoding": "csv",
            "compression": "gzip",
            "name": view_name,
            "data_schema": [
                {"name": "store_id", "type": "int", "nullable": False},
                {"name": "country_code", "type": "string", "nullable": False},
                {"name": "country_name", "type": "string", "nullable": False}
            ],
            "csv_options": {
                'header': True,
                'sep': '\t',
                'quote': '',
                'encoding': 'utf-8',
                'escape': ''
            },
            "path": [location],
        }

    print(test_data)
    month_indicator = test_data

    if month_indicator < '2012-01-01':
        # Before 2012 only the iOS CSV dump is read.
        df_1 = _read_legacy_csv("ios")
    elif '2012-01-01' <= month_indicator < '2019-07-14':
        # CSV dumps for both platforms.
        df_ios = _read_legacy_csv("ios")
        df_android = _read_legacy_csv("android")
        df_1 = df_ios.union(df_android)
    else:
        # From 2019-07-14 on the estimates are read from the parquet layout.
        parquet_reader = spark.read.option(
            "basePath",
            "s3://b2c-prod-dca-store-estimates/store_estv2/APP_ESTIMATES_FINAL/version=2.0.0/range_type=DAY/")
        df_1 = parquet_reader.parquet(
            "s3://b2c-prod-dca-store-estimates/store_estv2/APP_ESTIMATES_FINAL/version=2.0.0/range_type=DAY/date={%s}/platform=*/*/" % test_data
        ).cache()

    df_1.createOrReplaceTempView("daily_data")

    unified_table = spark.read.format("delta").load(
        "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-load.v3/fact/granularity=daily/")
    daily_est_load = unified_table.where(
        "granularity='daily' and date='{}'".format(test_data)).cache()
    daily_est_load.createOrReplaceTempView("daily_unified_category")

    # Dimension sources handed to run() together with the category SQL.
    ingest_msg = {
        "namespace": "aa.store.market-size.v1",
        "job_type": "routine",
        "options": {},
        "source": [
            {
                "data_encoding": "parquet",
                "compression": "gzip",
                "name": "category_mapping_deminsion_service",
                "path": ["s3://b2c-prod-data-pipeline-qa/aa.store/store_cateogry_mapping"],
            },
            _country_mapping_source(
                "ios_country_mapping",
                "s3://b2c-prod-dca-store-estimates/store_back/dimension/IOS_COUNTRY_MAPPING"),
            _country_mapping_source(
                "android_country_mapping",
                "s3://b2c-prod-dca-store-estimates/store_back/dimension/ANDROID_COUNTRY_MAPPING"),
        ]
    }

    run(spark, ingest_msg, sql_text_category)

    # Rows present in the raw mapping but missing from the unified table.
    diff_df1 = spark.sql('''
                            SELECT app_id, country_code, category_id, device_code, free_app_download, paid_app_download, revenue
                            FROM  country_category_mapping_raw 
                            EXCEPT ALL 
                            SELECT app_id, 
                                   country_code, 
                                   category_id, 
                                   device_code, 
                                   est_free_app_download AS free_app_download, 
                                   est_paid_app_download AS paid_app_download, 
                                   est_revenue           AS revenue 
                            FROM   daily_unified_category
                            ''')
    # Rows present in the unified table but missing from the raw mapping.
    diff_df2 = spark.sql('''
                            SELECT app_id, 
                               country_code, 
                               category_id, 
                               device_code, 
                               est_free_app_download AS free_app_download, 
                               est_paid_app_download AS paid_app_download, 
                               est_revenue           AS revenue 
                            FROM   daily_unified_category 
                            EXCEPT ALL 
                            SELECT app_id, country_code, category_id, device_code, free_app_download, paid_app_download, revenue 
                            FROM   country_category_mapping_raw
                            ''')

    print(diff_df1.take(3))
    print(diff_df2.take(3))
    return diff_df1, diff_df2


def test_daily_truncate_est_data(spark, test_data):
    """
    Diff the raw daily store estimates against the app-est dna-log table for one date.

    The raw estimates are registered as the temp view ``daily_data`` and the
    rows of the ``store.app-est-dna-log.v1`` delta table for ``test_data`` as
    ``daily_unified_est``.  ``sql_text_est`` is then executed through ``run``
    and ``country_est_mapping_raw`` is compared with ``daily_unified_est``
    using ``EXCEPT ALL`` in both directions.

    :param spark: Spark session used for every read and query
    :param test_data: date to check, as a 'YYYY-MM-DD' string
    :return: tuple ``(diff_df1, diff_df2)``: rows only found on the raw side,
             rows only found on the unified side
    """
    from pyspark.sql import types as T
    from pyspark.sql import functions as F

    # Column layout of the legacy tab-separated estimate files.
    csv_schema = T.StructType(
        [
            T.StructField("store_id", T.IntegerType(), True),
            T.StructField("date", T.DateType(), True),
            T.StructField("platform_id", T.IntegerType(), True),
            T.StructField("vertical", T.IntegerType(), True),
            T.StructField("feed", T.IntegerType(), True),
            T.StructField("id", T.LongType(), True),
            T.StructField("est", T.IntegerType(), True),
            T.StructField("category_id", T.IntegerType(), True),
            T.StructField("rank", T.IntegerType(), True)
        ]
    )

    legacy_base_path = "s3://b2c-prod-dca-store-estimates/store_est/v_final/DAY/"
    legacy_columns = ['id', 'store_id', 'category_id', 'platform_id', 'vertical', 'rank', 'feed',
                      'est', 'date', 'platform']

    def _read_legacy_csv(platform, date_glob):
        # ``date_glob`` is placed inside ``{...}``: a single date or a
        # comma-separated list of dates (glob alternation).
        path = "s3://b2c-prod-dca-store-estimates/store_est/v_final/DAY/{%s}/%s/sbe_est_app/*/" % (
            date_glob, platform)
        frame = spark.read.option("basePath", legacy_base_path).schema(csv_schema).csv(path, sep="\t")
        return frame.withColumn("platform", F.lit(platform)).select(*legacy_columns).cache()

    def _country_mapping_source(view_name, location):
        # Source description of a tab-separated store_id -> country mapping file.
        return {
            "data_encoding": "csv",
            "compression": "gzip",
            "name": view_name,
            "data_schema": [
                {"name": "store_id", "type": "int", "nullable": False},
                {"name": "country_code", "type": "string", "nullable": False},
                {"name": "country_name", "type": "string", "nullable": False}
            ],
            "csv_options": {
                'header': True,
                'sep': '\t',
                'quote': '',
                'encoding': 'utf-8',
                'escape': ''
            },
            "path": [location],
        }

    print(test_data)
    month_indicator = test_data

    # NOTE(review): dates other than '2010-07-31' that are <= '2010-08-01' match
    # neither CSV branch and fall through to the parquet reader - confirm that
    # this is intended.
    if month_indicator == '2010-07-31':
        # Only iOS CSV, covering the whole range '2010-07-04' to '2010-07-31'.
        # The dates must be comma-joined: formatting the list itself with %s
        # would embed its Python repr ("['2010-07-04', ...]") in the path.
        temp_date_range = get_date_list('2010-07-04', '2010-07-31', freq='D')
        df_1 = _read_legacy_csv("ios", ",".join(temp_date_range))
    elif month_indicator > '2010-08-01' and month_indicator < '2019-07-14':
        # CSV dumps for both platforms.
        df_ios = _read_legacy_csv("ios", test_data)
        df_android = _read_legacy_csv("android", test_data)
        df_1 = df_ios.union(df_android)
    else:
        # Everything else is read from the parquet layout.
        df_1 = spark.read.option("basePath",
                                 "s3://b2c-prod-dca-store-estimates/store_estv2/APP_ESTIMATES_FINAL/version=2.0.0/range_type=DAY/").parquet(
            "s3://b2c-prod-dca-store-estimates/store_estv2/APP_ESTIMATES_FINAL/version=2.0.0/range_type=DAY/date={%s}/platform=*/*/" % (
                test_data)).cache()

    df_1.createOrReplaceTempView("daily_data")

    daily_est_load = spark.read.format("delta").load(
        "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-dna-log.v1/fact/").where(
        "granularity='daily' and date='{}'".format(test_data)).cache()
    daily_est_load.createOrReplaceTempView("daily_unified_est")

    # Dimension sources handed to run() together with the est SQL.
    ingest_msg = {
        "namespace": "aa.store.market-size.v1",
        "job_type": "routine",
        "options": {},
        "source": [
            _country_mapping_source(
                "ios_country_mapping",
                "s3://b2c-prod-dca-store-estimates/store_back/dimension/IOS_COUNTRY_MAPPING"),
            _country_mapping_source(
                "android_country_mapping",
                "s3://b2c-prod-dca-store-estimates/store_back/dimension/ANDROID_COUNTRY_MAPPING"),
        ]
    }

    run(spark, ingest_msg, sql_text_est)

    # Rows present in the raw mapping but missing from the unified table.
    diff_df1 = spark.sql(
        "select * from country_est_mapping_raw except all select app_id, country_code, device_code, est_free_app_download as free_app_download, est_paid_app_download as paid_app_download, est_revenue as revenue from daily_unified_est ")
    # Rows present in the unified table but missing from the raw mapping.
    diff_df2 = spark.sql(
        "select app_id, country_code, device_code, est_free_app_download as free_app_download, est_paid_app_download as paid_app_download, est_revenue as revenue from daily_unified_est  except all select * from country_est_mapping_raw")

    return diff_df1, diff_df2


# SQL passed to run() by test_daily_unified_est_category_data.  It counts raw
# rows per store/category/device and compares those counts with the counts of
# the unified rank/store data; mismatches end up in miss_data_rank and
# miss_data_store.
# Every `WITH <name> AS (...)` statement is terminated by ';' (the last two
# previously were not, so miss_data_rank ran straight into compared_store_data).
# NOTE(review): assumes the project's SQL parser splits statements on ';' - confirm.
# NOTE(review): the text reads from country_code_mapping (with a
# country_code_store_id column) and category_mapping_deminsion_service, neither
# of which is defined here; they presumably have to be registered beforehand.
sql_text_unified_est_category='''
-- mapping feed as metrc in raw
WITH feed_metric AS (
select *, 'free_app_download' as metric, "ios-phone" as device_code from rank_raw where  feed='0' and platform='ios'
UNION ALL
select *, 'paid_app_download' as metric, "ios-phone" as device_code from rank_raw where  feed='1' and platform='ios'
UNION ALL
select *, 'revenue' as metric , "ios-phone" as device_code from rank_raw where  feed='2' and platform='ios'
UNION ALL
select *, 'free_app_download' as metric, "ios-tablet" as device_code from rank_raw where  feed='101' and platform='ios'
UNION ALL
select *, 'paid_app_download' as metric, "ios-tablet" as device_code from rank_raw  where  feed='100' and platform='ios'
UNION ALL
select *, 'revenue' as metric, "ios-tablet" as device_code from rank_raw  where  feed='102' and platform='ios'
UNION ALL
select *, 'free_app_download' as metric, "ios-all" as device_code from rank_raw  where  feed='1000' and platform='ios'
UNION ALL
select *, 'paid_app_download' as metric, "ios-all" as device_code from rank_raw  where  feed='1001' and platform='ios'
UNION ALL
select *, 'revenue' as metric, "ios-all" as device_code from rank_raw  where  feed='1002' and platform='ios'
UNION ALL
select *, 'free_app_download' as metric , "android-all" as device_code from rank_raw   where  feed='0' and platform='android'
UNION ALL
select *, 'paid_app_download' as metric, "android-all" as device_code from rank_raw  where  feed='1' and platform='android'
UNION ALL
select *, 'revenue' as metric,  "android-all" as device_code from rank_raw  where  feed='2' and platform='android'
);


-- select tested column from raw data
WITH metric_raw_data AS (
SELECT id, category_id as raw_category_id,rank,store_id as raw_store_id , metric,device_code,date , platform from feed_metric where store_id not in (3,4,5,6, 1002,1003, 1005,1004, 1006,1007)
);


-- group by and count data in raw data
WITH group_by_raw AS (
SELECT count(id) AS total_count , raw_category_id, raw_store_id, metric,device_code,date,platform from metric_raw_data where raw_store_id not in (3,4,5,6, 1002,1003, 1004, 1005, 1006,1007) group by raw_category_id, raw_store_id, metric,device_code,date, platform
);


-- pivot metric column
WITH pivot_metric_rank_raw AS (

SELECT 
free_app_download,revenue, paid_app_download, raw_category_id,raw_store_id,device_code, platform,date
FROM
      group_by_raw
 PIVOT (
    max(total_count) 
	FOR metric IN ('free_app_download','revenue', 'paid_app_download')
  )
);




-- select tested column from raw data
WITH metric_raw_store_data AS (
SELECT distinct id, est, store_id as raw_store_id , metric,device_code, date , platform from feed_metric where store_id not in (3,4,5,6, 1002,1003, 1005,1004, 1006,1007)

);


-- group by and count data in raw data
WITH group_by_store_raw AS (
SELECT count(est) AS total_count ,raw_store_id, metric,device_code,date,platform from metric_raw_store_data where raw_store_id not in (3,4,5,6,1002,1003, 1004, 1005, 1006,1007) group by raw_store_id, metric,device_code,date, platform
);


-- pivot metric column
WITH pivot_metric_store_raw AS (

SELECT 
free_app_download,revenue, paid_app_download,raw_store_id,device_code, platform, date
FROM
      group_by_store_raw
 PIVOT (
    max(total_count) 
	FOR metric IN ('free_app_download','revenue', 'paid_app_download')
  )
);



-- map raw with category
WITH category_mapping_raw AS (

SELECT * from ( select *, 'ios' as mapping_platform from category_mapping_deminsion_service where market_code='apple-store' 
UNION ALL select *, 'android' as mapping_platform from category_mapping_deminsion_service where market_code='google-play'
 ) as mapping right join pivot_metric_rank_raw on mapping.legacy_category_id=pivot_metric_rank_raw.raw_category_id and 
mapping.mapping_platform=pivot_metric_rank_raw.platform
);


-- map raw with rank country_code
WITH country_category_mapping_rank_raw AS (
select date, raw_store_id, country_code,device_code,category_id,free_app_download,paid_app_download,revenue from country_code_mapping right join category_mapping_raw on country_code_mapping.country_code_store_id=category_mapping_raw.raw_store_id and country_code_mapping.market_code=category_mapping_raw.platform
);



-- map raw with store country_code
WITH country_mapping_store_raw AS (
select date, raw_store_id, country_code,device_code,free_app_download,paid_app_download,revenue from country_code_mapping right join pivot_metric_store_raw on country_code_mapping.country_code_store_id=pivot_metric_store_raw.raw_store_id and country_code_mapping.market_code=pivot_metric_store_raw.platform
);


-- group by unified data
WITH unified_group_data AS (
select count(app_id) as unified_count_app_id, count(free_app_download) as unified_count_free_app_download, count(paid_app_download) as unified_count_paid_app_download, count(revenue) as unified_count_revenue,
  country_code as unified_country_code, device_code as unified_device_code, category_id as unified_category_id from ( select distinct  app_id, free_app_download_rank as free_app_download , paid_app_download_rank as paid_app_download, revenue_rank as revenue, country_code, device_code, category_id from  rank_unified where data_stage='final' ) as unified
group by  category_id,  country_code,  device_code );


-- group by unified data
WITH unified_group_data_store AS (
select count(app_id) as unified_count_app_id, count(free_app_download) as unified_count_free_app_download, count(paid_app_download) as unified_count_paid_app_download, count(revenue) as unified_count_revenue,
  country_code as unified_country_code, device_code as unified_device_code from ( select distinct  app_id, free_app_download, paid_app_download, revenue, country_code, device_code from  store_unified where data_stage='final' ) as unified
group by   country_code,  device_code );




-- compare raw vs unified data
WITH compared_data_rank AS (
    SELECT * from country_category_mapping_rank_raw left join unified_group_data on unified_group_data.unified_country_code==country_category_mapping_rank_raw.country_code and unified_group_data.unified_category_id==country_category_mapping_rank_raw.category_id and unified_group_data.unified_device_code==country_category_mapping_rank_raw.device_code
);

WITH miss_data_rank AS (
select * from compared_data_rank where unified_count_paid_app_download!=paid_app_download or unified_count_free_app_download != free_app_download  or unified_count_revenue != revenue or unified_count_app_id is null
);



-- compare raw vs unified data store
WITH compared_store_data AS (
    SELECT * from country_mapping_store_raw left join unified_group_data_store on unified_group_data_store.unified_country_code==country_mapping_store_raw.country_code and unified_group_data_store.unified_device_code==country_mapping_store_raw.device_code
);


WITH miss_data_store AS (
select * from compared_store_data where free_app_download!=unified_count_free_app_download or paid_app_download!=unified_count_paid_app_download or revenue!=unified_count_revenue or unified_count_app_id is null
);

'''

def test_daily_unified_est_category_data(spark, test_date):
    """
    Run ``sql_text_unified_est_category`` for one date and return the mismatch views.

    Three parquet sources are passed to ``run`` under the names ``rank_raw``,
    ``rank_unified`` and ``store_unified``; afterwards the views
    ``miss_data_rank`` and ``miss_data_store`` are read back.

    :param spark: Spark session used to run the SQL
    :param test_date: date used to fill the ``date=`` part of every source path
    :return: tuple ``(miss_category, miss_est)`` of DataFrames
    """
    # df_1 = spark.read.option("basePath",
    #                          "s3://b2c-prod-dca-store-estimates/store_estv2/APP_ESTIMATES_FINAL/version=2.0.0/range_type=DAY/").parquet(
    #     "s3://b2c-prod-dca-store-estimates/store_estv2/APP_ESTIMATES_FINAL/version=2.0.0/range_type=DAY/date={%s}/platform=*/*/" % (
    #         test_date)).cache()

    # df_1.createOrReplaceTempView("daily_data")

    def _parquet_source(view_name, location):
        # Source description of a gzip parquet data set exposed as ``view_name``.
        return {
            "name": view_name,
            "data_encoding": "parquet",
            "compression": "gzip",
            "path": [location],
        }

    rank_raw_path = "s3://b2c-prod-dca-store-estimates/store_estv2/APP_ESTIMATES_FINAL/version=2.0.0/range_type=DAY/date={}/".format(
        test_date)
    rank_unified_path = "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category.v4/fact/granularity=daily/date={}".format(
        test_date)
    store_unified_path = "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v4/fact/granularity=daily/date={}".format(
        test_date)

    ingest_msg = {
        "namespace": "aa.store.market-size.v1",
        "job_type": "routine",
        "options": {},
        "source": [
            _parquet_source("rank_raw", rank_raw_path),
            _parquet_source("rank_unified", rank_unified_path),
            _parquet_source("store_unified", store_unified_path),
        ]
    }
    run(spark, ingest_msg, sql_text_unified_est_category)

    miss_category = spark.sql("select * from miss_data_rank")
    miss_est = spark.sql("select * from miss_data_store")
    return miss_category, miss_est


# Single address that receives every data-check report.
DEFAULT_RECIPIENTS = "fzhang@appannie.com"


def send_db_check_email(title, text_context, key=None):
    """
    Mail a check report to the default recipient.

    :param title: subject of the mail
    :param text_context: body of the mail
    :param key: accepted but not used by this function
    """
    recipients = [DEFAULT_RECIPIENTS]
    sender_address = 'dev-qa-data-quality@appannie.com'
    email.send(title, text_context, recipients, sender=sender_address)





class demoTest(PipelineTest):
    # Cron expression and day offset consumed by the PipelineTest base class.
    trigger_date_config = ("0 12 * * *", 8)


    # def test_daily_unified_est_category_data(self):
    #     trigger_datetime = datetime.datetime.strptime("2020-07-03", '%Y-%m-%d')
    #     self._get_check_date_from_routing_config(trigger_datetime).strftime(
    #         "%Y-%m-%d")

    #     df_category, df_est = test_daily_unified_est_category_data(self.spark, self.check_date)

    #     self.assertTrue(df_category.rdd.isEmpty(), "category raw is not equal unified")
    #     self.assertTrue(df_est.rdd.isEmpty(), "est raw is not equal to unified")

    def test_store_daily_category(self):
        # Both directions of the raw-vs-unified category diff must be empty.
        fixed_trigger = datetime.datetime.strptime("2020-06-01", '%Y-%m-%d')
        # NOTE(review): this value is computed but the comparison below runs
        # against self.check_date - confirm which date is intended.
        check_date_str_actual = self._get_check_date_from_routing_config(
            fixed_trigger
        ).strftime("%Y-%m-%d")

        only_in_raw, only_in_unified = test_daily_trancate_category_data(
            self.spark, self.check_date)

        self.assertTrue(only_in_raw.rdd.isEmpty(), "category raw is more than unified")
        self.assertTrue(only_in_unified.rdd.isEmpty(), "category unified is more than raw")

    # def test_store_daily_est(self):
    #     trigger_datetime = datetime.datetime.strptime("2020-06-01", '%Y-%m-%d')
    #     check_date_str_actual = self._get_check_date_from_routing_config(trigger_datetime).strftime(
    #         "%Y-%m-%d")

    #     df_1, df_2 = test_daily_truncate_est_data(self.spark, check_date_str_actual)

    #     self.assertTrue(df_1.rdd.isEmpty(), "est raw is more than unified")
    #     self.assertTrue(df_2.rdd.isEmpty(), "est unified is more than raw")


def send_message(spark):
    """
    Run the ``demoTest`` suite, write its HTML report to a log file and mail it.

    The mail subject ends with "Passed" when no result entry carries status
    code 1 or 2, otherwise with "Failed".

    :param spark: accepted but not used by this function
    """
    log_file = "/tmp/db_check.log"

    with open(log_file, "w") as report_out:
        suite = unittest.TestSuite()
        suite.addTests(unittest.TestLoader().loadTestsFromTestCase(demoTest))
        runner = HTMLTestRunner(
            stream=report_out,
            title='Transform Test Report',
            description='This db_check the report output by Tech Team.'
        )

        # Each result entry starts with a status code; 1 and 2 count as failures.
        outcomes = runner.run(suite).result
        failed_count = sum(1 for outcome in outcomes if outcome[0] in (1, 2))

    with open(log_file, 'r') as report_in:
        str_today = datetime.date.today().strftime("%Y-%m-%d")
        verdict = "Passed" if failed_count == 0 else "Failed"
        title = "store daily check - preload est/category - " + str_today + " - " + verdict
        send_db_check_email(title, report_in.read())


send_message(spark)


In [0]:


# Ad-hoc re-run of the category diff against the temp views left behind by
# test_daily_trancate_category_data.  The raw side is projected explicitly so
# that both operands of EXCEPT ALL expose the same seven columns in the same
# order, independent of the layout of country_category_mapping_raw.
diff_df1 = spark.sql('''
                        SELECT app_id, country_code, category_id, device_code, free_app_download, paid_app_download, revenue
                        FROM  country_category_mapping_raw 
                        EXCEPT ALL 
                        SELECT app_id, 
                               country_code, 
                               category_id, 
                               device_code, 
                               est_free_app_download AS free_app_download, 
                               est_paid_app_download AS paid_app_download, 
                               est_revenue           AS revenue 
                        FROM   daily_unified_category
                        ''')
diff_df2 = spark.sql('''
                        SELECT app_id, 
                           country_code, 
                           category_id, 
                           device_code, 
                           est_free_app_download AS free_app_download, 
                           est_paid_app_download AS paid_app_download, 
                           est_revenue           AS revenue 
                        FROM   daily_unified_category 
                        EXCEPT ALL 
                        SELECT app_id, country_code, category_id, device_code, free_app_download, paid_app_download, revenue 
                        FROM   country_category_mapping_raw
                        ''')

diff_df1.show()
diff_df2.show()


In [0]:

# Drill-down for one app / country / category: first the rows only present on
# the raw side, then the rows only present on the unified side.
raw_minus_unified_query = '''select app_id, country_code, 
                                   category_id, 
                                   device_code, free_app_download, paid_app_download,revenue
 from country_category_mapping_raw where app_id=281736535 and country_code='CA' and category_id=100001 except SELECT app_id, 
                                   country_code, 
                                   category_id, 
                                   device_code, 
                                   est_free_app_download AS free_app_download, 
                                   est_paid_app_download AS paid_app_download, 
                                   est_revenue           AS revenue 
                            FROM   daily_unified_category where app_id=281736535 and country_code='CA' and category_id=100001'''

unified_minus_raw_query = '''SELECT app_id, 
                                   country_code, 
                                   category_id, 
                                   device_code, 
                                   est_free_app_download AS free_app_download, 
                                   est_paid_app_download AS paid_app_download, 
                                   est_revenue           AS revenue 
                            FROM   daily_unified_category where app_id=281736535 and country_code='CA' and category_id=100001
                            except select app_id, country_code, 
                                   category_id, 
                                   device_code, free_app_download, paid_app_download,revenue
                            from country_category_mapping_raw where app_id=281736535 and country_code='CA' and category_id=100001'''

for _query in (raw_minus_unified_query, unified_minus_raw_query):
    spark.sql(_query).show()



In [0]:


# Number of distinct publisher ids found under the selected date partitions.
publisher_id_frame = spark.read.option(
    "basePath", "s3://b2c-prod-data-pipeline-qa/aa.store/publisher_id/"
).parquet("s3://b2c-prod-data-pipeline-qa/aa.store/publisher_id/date=2020-0[1,2,3]-01/")
print(publisher_id_frame.select("publisher_id").distinct().count())

# print spark.read.parquet("s3://b2c-prod-data-pipeline-qa/aa.store/publisher_id/date=2020-01-01/").select("publisher_id").distinct().count()
# spark.sql("select * from rank_unified").show()
# spark.read.parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-rank.v1/fact/granularity=daily/date=2020-06-29").createOrReplaceTempView("rank_unified_old")
# spark.sql("select * from rank_unified_old").show()



In [0]:

# Side-by-side look at the raw and unified counts for one country / device / category.
for _query in (
        "select * from country_category_mapping_rank_raw where country_code='AE' and device_code='ios-phone' and category_id=100000 ",
        "select * from unified_group_data where unified_country_code='AE' and unified_device_code='ios-phone' and unified_category_id=100000",
):
    spark.sql(_query).show()

In [0]:

# Side-by-side look at the raw and unified est rows of one app in one country.
for _query in (
        "select * from country_est_mapping_raw where app_id = 20600006432273 and country_code='CA'",
        "select * from daily_unified_est where app_id = 20600006432273 and country_code='CA'",
):
    spark.sql(_query).show()

In [0]:

# Show the dna-log rows of one app in one country for the chosen date.
test_date = '2020-07-05'
daily_est_load = (
    spark.read.format("delta")
    .load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-dna-log.v1/fact/")
    .where("granularity='daily' and date='{}'".format(test_date))
    .cache()
)
daily_est_load.createOrReplaceTempView("daily_test_est_truncate")
spark.sql(
    "select * from daily_test_est_truncate where app_id = 20600006432273 and country_code='CA'"
).show()


In [0]:

# Show the dna-log rows of one app in one country for the chosen date.
test_date = '2020-07-03'
daily_est_load = (
    spark.read.format("delta")
    .load("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-dna-log.v1/fact/")
    .where("granularity='daily' and date='{}'".format(test_date))
    .cache()
)
daily_est_load.createOrReplaceTempView("daily_test_est_truncate")
spark.sql(
    "select * from daily_test_est_truncate where app_id = 20600006432273 and country_code='CA'"
).show()

In [0]:

import unittest
import datetime
import croniter

from aadatapipelinecore.core.utils.commandline import env
from aadatapipelinecore.core.utils.encode import activate_system_utf8
from aadatapipelinecore.core.utils.spark import create_spark

check_date = '2020-07-01'
trigger_date_config = ("0 12 * * *", 5)



trigger_datetime = datetime.datetime.strptime("2020-07-02", '%Y-%m-%d')

print trigger_datetime

def _get_check_date_from_routing_config(trigger_datetime):
    """
    Compute the data date to check for a given trigger time.

    Reads the module-level ``trigger_date_config`` tuple ``(cron_expression, days_delta)``,
    finds the most recent scheduled run strictly before ``trigger_datetime`` according to
    the cron expression, and steps back ``days_delta`` days from it.

    Examples (days_delta = 1, triggered at 2019-10-27 08:00):
        "0 9 * * *" -> previous run 2019-10-26 09:00 -> result falls on 2019-10-25
        "0 7 * * *" -> previous run 2019-10-27 07:00 -> result falls on 2019-10-26

    Cron expression fields, in order:
        1  minute           0-59, or *
        2  hour             0-23, or * (all times UTC)
        3  day of month     1-31, or *
        4  month            1-12, or *
        5  day of week      0-7 (0 and 7 are both Sunday), or *

    :param trigger_datetime: the moment the check is triggered
    :type trigger_datetime: datetime.datetime
    :return: previous scheduled run minus ``days_delta`` days; the time-of-day part is
        kept, so callers format it with ``strftime("%Y-%m-%d")`` to get the date string
    :rtype: datetime.datetime
    """
    cron_expression, lag_days = trigger_date_config
    previous_run = croniter.croniter(cron_expression, trigger_datetime).get_prev(datetime.datetime)
    return previous_run - datetime.timedelta(days=lag_days)


print _get_check_date_from_routing_config(trigger_datetime)
# def test_store_daily_category(self):
#     trigger_datetime = datetime.datetime.strptime("2020-07-02", '%Y-%m-%d')
#     check_date_str_actual = self._get_check_date_from_routing_config(trigger_datetime).strftime(
#             "%Y-%m-%d")
# df1, df2 = test_daily_trancate_category_data(self.spark, self.check_date)



In [0]:


#!/usr/bin/env python
# Copyright (c) 2017 App Annie Inc. All rights reserved.
# pylint: disable=E1101,E0633,W9018,W9012,
# coding=utf-8
"""

Implement test cases here
"""
import unittest
import datetime
import croniter

from aadatapipelinecore.core.utils.commandline import env
from aadatapipelinecore.core.utils.encode import activate_system_utf8
from aadatapipelinecore.core.utils.spark import create_spark


class PipelineTest(unittest.TestCase):
    """
    Base class for scheduled pipeline checks.

    Subclasses set ``trigger_date_config`` to ``(cron_expression, days_delta)``.
    The constructor unpacks that tuple, so it must be set on the class before any
    test instance is created.
    """
    trigger_date_config = None
    trigger_datetime = None
    prev_etl_datetime = None
    only_check_in_24hr = False

    def __init__(self, methodName='runTest', trigger_datetime=None):
        """
        :param methodName: name of the test method to run (standard unittest argument)
        :param trigger_datetime: moment the check is triggered; defaults to the current UTC time
        """
        super(PipelineTest, self).__init__(methodName)
        if not trigger_datetime:
            trigger_datetime = datetime.datetime.utcnow()
        self.trigger_datetime = trigger_datetime
        check_datetime = self._get_check_date_from_routing_config(self.trigger_datetime)
        self.check_date_str = check_datetime.strftime("%Y-%m-%d")
        # Alias kept for compatibility with the email/report code, which reads `check_date`.
        self.check_date = self.check_date_str
        self.prev_etl_datetime = self._get_pre_etl_completed_date()

    def setUp(self):
        """Run the base setUp, then fail fast if the schedule-derived fields are missing."""
        super(PipelineTest, self).setUp()
        self._verify_config()

    @classmethod
    def setUpClass(cls):
        """Switch to UTF-8 I/O and create the Spark session shared by the test class."""
        super(PipelineTest, cls).setUpClass()
        activate_system_utf8()
        env(PYTHONIOENCODING='utf8')
        cls.spark = create_spark()
        cls.sc = cls.spark.sparkContext

    def _verify_config(self):
        """Assert, in order, that every schedule-derived attribute is set (not None)."""
        for attribute_name in (
                "trigger_date_config",
                "trigger_datetime",
                "prev_etl_datetime",
                "check_date_str",
                "check_date",
        ):
            self.assertIsNotNone(getattr(self, attribute_name))

    def _get_check_date_from_routing_config(self, trigger_datetime):
        """
        Compute the data date to check for a given trigger time.

        Takes the most recent scheduled run strictly before ``trigger_datetime``
        (per the cron expression in ``trigger_date_config``) and steps back
        ``days_delta`` days from it.

        Examples (days_delta = 1, triggered at 2019-10-27 08:00):
            "0 9 * * *" -> previous run 2019-10-26 09:00 -> result falls on 2019-10-25
            "0 7 * * *" -> previous run 2019-10-27 07:00 -> result falls on 2019-10-26

        Cron expression fields, in order:
            1  minute           0-59, or *
            2  hour             0-23, or * (all times UTC)
            3  day of month     1-31, or *
            4  month            1-12, or *
            5  day of week      0-7 (0 and 7 are both Sunday), or *

        :param trigger_datetime: the moment the check is triggered
        :type trigger_datetime: datetime.datetime
        :return: previous scheduled run minus ``days_delta`` days (time-of-day is kept;
            ``__init__`` formats it to "%Y-%m-%d")
        :rtype: datetime.datetime
        """
        cron_expression, lag_days = self.trigger_date_config
        previous_run = croniter.croniter(cron_expression, trigger_datetime).get_prev(datetime.datetime)
        return previous_run - datetime.timedelta(days=lag_days)

    def _get_pre_etl_completed_date(self):
        """Return the most recent scheduled run before ``self.trigger_datetime``."""
        cron_expression = self.trigger_date_config[0]
        return croniter.croniter(cron_expression, self.trigger_datetime).get_prev(datetime.datetime)



# Copyright (c) 2019 App Annie Inc. All rights reserved.
# URL: http://tungwaiyip.info/software/HTMLTestRunner.html

"""
CREATE HTML TEMPLATE TABLE
"""
import datetime
import StringIO
import sys
import unittest
from xml.sax import saxutils


class OutputRedirector(object):
    """
    File-like shim that forwards writes to a swappable target.

    The target lives in the public ``fp`` attribute and is looked up on every
    call, so reassigning ``fp`` redirects subsequent output immediately.
    """

    def __init__(self, fp):
        self.fp = fp

    def write(self, s):
        """Forward a single string to the current target."""
        target = self.fp
        target.write(s)

    def writelines(self, lines):
        """Forward an iterable of strings to the current target."""
        target = self.fp
        target.writelines(lines)

    def flush(self):
        """Flush the current target."""
        target = self.fp
        target.flush()


# Module-wide redirectors; their `fp` is repointed while a test is running.
stdout_redirector = OutputRedirector(sys.stdout)
stderr_redirector = OutputRedirector(sys.stderr)


# ----------------------------------------------------------------------
# Template

class Template_mixin(object):
    """
    %-style string templates and constants used to render the HTML test report.

    Each template is filled with ``template % dict(...)``; extra keys in the
    dict that the template does not mention are simply ignored.
    """
    # Result code -> label shown in the "Status" column.
    STATUS = {
        0: 'SUCCESS',
        1: 'FAILED',
        2: 'ERROR',
        3: 'SKIP',
    }

    DEFAULT_TITLE = 'DB Test Report'
    DEFAULT_DESCRIPTION = ''

    # ------------------------------------------------------------------------
    # HTML Template

    HTML_TMPL = r"""

%(heading)s
%(report)s
%(ending)s

"""
    # placeholders used by this template: heading, report, ending

    # ------------------------------------------------------------------------
    # Heading
    #

    # placeholders: title, parameters, description
    HEADING_TMPL = """<div>
<h1>%(title)s</h1>
%(parameters)s
<p>%(description)s</p>
</div>

"""

    # placeholders: name, value (one rendered line per report attribute)
    HEADING_ATTRIBUTE_TMPL = """<p><strong>%(name)s:</strong> %(value)s</p>
"""

    REPORT_TMPL = """
<table id='result_table' border='1' cellspacing='0' cellpadding='10'>
<colgroup>
<col align='left' />
<col align='right' />
<col align='right' />
<col align='right' />
</colgroup>
<tr>
    <th>Test Case</th>
    <th>Status</th>
    <th>Check Date</th>
    <th>Error Information</th>
</tr>
%(test_list)s
</table>
"""  # placeholders used by this template: test_list

    REPORT_CLASS_TMPL = r"""
<tr>
    <td colspan='5' align='center'>%(desc)s</td>
</tr>
"""  # placeholders used by this template: desc

    # Row for a test that produced captured output and/or an error message.
    REPORT_TEST_WITH_OUTPUT_TMPL = r"""
<tr>
    <td>%(desc)s</td>
    <td bgcolor=%(status_color)s>%(status)s</td>
    <td>%(check_date)s</td>
    <td colspan='2' align='left'>
        <pre>
        %(script)s
        </pre>
    </td>
</tr>
"""  # placeholders: desc, status_color, status, check_date, script

    # Row for a test with nothing to show in the last column.
    REPORT_TEST_NO_OUTPUT_TMPL = r"""
<tr>
    <td>%(desc)s</td>
    <td bgcolor=%(status_color)s  align='center'>%(status)s</td>
    <td align='center'>%(check_date)s</td>
    <td colspan='2' align='center'>
    </td>
</tr>
"""  # placeholders: desc, status_color, status, check_date

    REPORT_TEST_OUTPUT_TMPL = r"""
%(output)s
"""  # placeholders: output

    # ------------------------------------------------------------------------
    # ENDING
    #

    ENDING_TMPL = """<div id='ending'>&nbsp;</div>"""


# -------------------- The end of the Template class -------------------


TestResult = unittest.TestResult


class _TestResult(TestResult):
    """
    Result collector for HTMLTestRunner.

    While a test runs, ``sys.stdout`` and ``sys.stderr`` are swapped for the
    module-level redirectors, which write into a per-test buffer. Every outcome
    is appended to ``self.result`` as a 4-tuple::

        (code, test, captured_output, detail)

    where ``code`` is 0 success / 1 failure / 2 error / 3 skip and ``detail``
    is the formatted traceback (failure, error), the skip reason, or ``''``.
    """

    def __init__(self, verbosity=1):
        TestResult.__init__(self)
        self.outputBuffer = StringIO.StringIO()
        # Saved real streams; None means "no redirection currently active".
        self.stdout0 = None
        self.stderr0 = None
        self.success_count = 0
        self.failure_count = 0
        self.error_count = 0
        self.skipped_count = 0
        self.verbosity = verbosity
        self.result = []

    def startTest(self, test):
        """Begin capturing stdout/stderr for ``test`` into a fresh buffer."""
        TestResult.startTest(self, test)
        # A new buffer per test: reusing one buffer for the whole run made every
        # test's captured output include the output of all tests before it.
        self.outputBuffer = StringIO.StringIO()
        # One shared buffer for both stdout and stderr of this test.
        stdout_redirector.fp = self.outputBuffer
        stderr_redirector.fp = self.outputBuffer
        self.stdout0 = sys.stdout
        self.stderr0 = sys.stderr
        sys.stdout = stdout_redirector
        sys.stderr = stderr_redirector

    def complete_output(self):
        """
        Disconnect output redirection and return buffer.
        Safe to call multiple times.
        """
        if self.stdout0:
            sys.stdout = self.stdout0
            sys.stderr = self.stderr0
            self.stdout0 = None
            self.stderr0 = None
        return self.outputBuffer.getvalue()

    def stopTest(self, test):
        """Restore the real streams even if no add* hook ran for this test."""
        # Some unittest paths bypass addSuccess/addError/addFailure, so the
        # redirection is always undone here as well.
        self.complete_output()

    def _record(self, code, test, detail, verbose_mark, short_mark):
        """Store one outcome tuple and emit a progress marker on the real stderr."""
        captured = self.complete_output()  # also restores sys.stderr
        self.result.append((code, test, captured, detail))
        if self.verbosity > 1:
            sys.stderr.write(verbose_mark)
            sys.stderr.write(str(test))
            sys.stderr.write('\n')
        else:
            sys.stderr.write(short_mark)

    def addSuccess(self, test):
        self.success_count += 1
        TestResult.addSuccess(self, test)
        self._record(0, test, '', 'ok ', '.')

    def addError(self, test, err):
        self.error_count += 1
        TestResult.addError(self, test, err)
        # The base class has just appended (test, formatted_traceback).
        self._record(2, test, self.errors[-1][1], 'E  ', 'E')

    def addFailure(self, test, err):
        self.failure_count += 1
        TestResult.addFailure(self, test, err)
        # The base class has just appended (test, formatted_traceback).
        self._record(1, test, self.failures[-1][1], 'F  ', 'F')

    def addSkip(self, test, reason):
        self.skipped_count += 1
        TestResult.addSkip(self, test, reason)
        # The base class has just appended (test, reason).
        self._record(3, test, self.skipped[-1][1], 'skip  ', 'skip')


class HTMLTestRunner(Template_mixin):
    """
    Minimal unittest runner that renders the results as an HTML table.

    ``run`` executes a test case or suite against a ``_TestResult`` and writes
    the rendered report to ``stream`` as UTF-8 encoded bytes.
    """

    # Result code -> background colour of the status cell.
    _STATUS_COLORS = {0: "green", 1: "red", 2: "yellow", 3: "grey"}
    # Result codes listed first in the report (1 = failure, 2 = error).
    _FAILED_CODES = (1, 2)

    def __init__(self, stream=sys.stdout, verbosity=1, title=None, description=None):
        """
        :param stream: writable object that receives the encoded report
        :param verbosity: passed to ``_TestResult``; values > 1 print one line per test
        :param title: report title, ``DEFAULT_TITLE`` when None
        :param description: report description, ``DEFAULT_DESCRIPTION`` when None
        """
        self.stopTime = None
        self.stream = stream
        self.verbosity = verbosity
        self.title = self.DEFAULT_TITLE if title is None else title
        self.description = self.DEFAULT_DESCRIPTION if description is None else description
        self.startTime = datetime.datetime.now()

    def run(self, test):
        """Run the given test case or test suite and return its ``_TestResult``."""
        result = _TestResult(self.verbosity)
        test(result)
        self.stopTime = datetime.datetime.now()
        self.generateReport(result)
        sys.stderr.write('\nTime Elapsed: %s\n' % (self.stopTime - self.startTime))
        return result

    @staticmethod
    def push_failed_case_to_front(result_list):
        """
        Reorder ``result_list`` in place so failures and errors come first.

        The sort is stable, so both the failed group and the remaining group
        keep their execution order. The same list object is returned.
        """
        result_list.sort(
            key=lambda outcome: 0 if outcome[0] in HTMLTestRunner._FAILED_CODES else 1)
        return result_list

    @staticmethod
    def get_db_check_name(method_name):
        """
        Turn a test method name into a display name.

        ``test_store_daily`` -> ``"Store Daily "`` (the trailing space is kept).
        The ``test_`` prefix is only removed when it is actually present.
        """
        prefix = "test_"
        if method_name.startswith(prefix):
            method_name = method_name[len(prefix):]
        return "".join(word.capitalize() + " " for word in method_name.split('_'))

    def getReportAttributes(self, result):
        """
        Return report attributes as a list of (name, value).
        Override this to add custom attributes.
        """
        counters = (
            ('Pass %s', result.success_count),
            ('Failure %s', result.failure_count),
            ('Error %s', result.error_count),
            ('Skip %s', result.skipped_count),
        )
        # Only non-zero counters are mentioned in the status line.
        parts = [label % count for label, count in counters if count]
        return [
            ('Start Time', str(self.startTime)[:19]),
            ('Duration', str(self.stopTime - self.startTime)),
            ('Status', ' '.join(parts) if parts else 'none'),
        ]

    def generateReport(self, result):
        """Render heading, result table and ending, and write them to ``self.stream``."""
        report_attrs = self.getReportAttributes(result)
        output = self.HTML_TMPL % dict(
            title=saxutils.escape(self.title),
            generator='HTMLTestRunner',
            heading=self._generate_heading(report_attrs),
            report=self._generate_report(result),
            ending=self._generate_ending(),
        )
        self.stream.write(output.encode('utf8'))

    def _generate_heading(self, report_attrs):
        """Render the title block plus one line per (name, value) report attribute."""
        attribute_lines = [
            self.HEADING_ATTRIBUTE_TMPL % dict(
                name=saxutils.escape(name),
                value=saxutils.escape(value),
            )
            for name, value in report_attrs
        ]
        return self.HEADING_TMPL % dict(
            title=saxutils.escape(self.title),
            parameters=''.join(attribute_lines),
            description=saxutils.escape(self.description),
        )

    def _generate_report(self, result):
        """Render the result table, failed and errored cases first."""
        rows = []
        for code, test, output, detail in self.push_failed_case_to_front(result.result):
            self._generate_report_test(rows, code, test, output, detail)

        return self.REPORT_TMPL % dict(
            test_list=''.join(rows),
            count=str(result.success_count + result.failure_count + result.error_count),
            Pass=str(result.success_count),
            fail=str(result.failure_count),
            error=str(result.error_count),
            skip=str(result.skipped_count),
        )

    @staticmethod
    def _as_text(value):
        """Decode a byte string as latin-1; pass anything else through unchanged."""
        if isinstance(value, str):
            # latin-1 is used rather than 'string_escape', which would also
            # escape newlines and break the <pre> formatting.
            return value.decode('latin-1')
        return value

    def _generate_report_test(self, rows, n, t, o, e):
        """
        Append the rendered table row for one outcome to ``rows``.

        :param rows: list of already rendered rows, extended in place
        :param n: result code (0 success, 1 failure, 2 error, 3 skip)
        :param t: the test object recorded for this outcome
        :param o: captured stdout/stderr of the test
        :param e: traceback text, skip reason, or ''
        """
        has_output = bool(o or e)
        name = t.id().split('.')[-1]
        # Not every recorded object is a PipelineTest-style case with a
        # ``check_date`` (e.g. the placeholder unittest records for class-level
        # fixture errors); fall back to an empty cell instead of aborting.
        check_date = getattr(t, 'check_date', '')

        desc = HTMLTestRunner.get_db_check_name(name)
        tmpl = self.REPORT_TEST_WITH_OUTPUT_TMPL if has_output else self.REPORT_TEST_NO_OUTPUT_TMPL

        script = self.REPORT_TEST_OUTPUT_TMPL % dict(
            output=saxutils.escape(self._as_text(o) + self._as_text(e)),
        )

        rows.append(tmpl % dict(
            desc=desc,
            script=script,
            status=self.STATUS[n],
            status_color=self._STATUS_COLORS.get(n, "green"),
            check_date=check_date,
        ))

    def _generate_ending(self):
        """Return the static closing fragment of the report."""
        return self.ENDING_TMPL



In [0]:


# Load the 2020-03-01 daily slice of the app-est delta table, register it as
# the "daily_unified_est" temp view, and preview it.
_est_fact_path = "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-dna-log.v1/fact/"
daily_est_load = (
    spark.read.format("delta")
    .load(_est_fact_path)
    .where("granularity='daily' and date='2020-03-01'")
    .cache()
)
daily_est_load.createOrReplaceTempView("daily_unified_est")
spark.sql("select * from daily_unified_est").show()


In [0]:


import datetime
from pyspark.sql import types as T
from pyspark.sql import functions as F

from aadatapipelinecore.core.urn import Urn
from aadatapipelinecore.core.pipeline import type_
from applications.common.parser import SqlParser
from applications.common.executor import SqlExecutor
from applications.auto_pipeline.transform import _view
from aadatapipelinecore.core.utils.spark import eject_all_caches
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
spark.sparkContext.addPyFile("/home/hadoop/bdp/application/libs/python/dependencies.zip")

import aaplproxy



class DryRunSqlExecutor(SqlExecutor):
    """SqlExecutor variant whose ``_verify_tasks`` step does nothing."""

    def _verify_tasks(self):
        """Intentionally a no-op override."""
        return None

def run(spark, raw_data, sql_text, dry_run=True):
    """
    Register the message's sources as views, parse ``sql_text`` and execute it.

    :param spark: active SparkSession
    :param raw_data: ingest message dict; must contain "namespace", "source" and
        "options". It is NOT modified: the function works on a shallow copy, so
        the same message can be passed again (e.g. when a cell is re-run).
    :param sql_text: SQL script handed to ``_view`` and ``SqlParser``
    :param dry_run: when True (default) use ``DryRunSqlExecutor``, otherwise ``SqlExecutor``
    """
    # Previously "source"/"options" were popped out of the caller's dict, so a
    # second call with the same message raised KeyError.
    context = dict(raw_data)
    urn = Urn(namespace=context["namespace"])
    source_data_list = context.pop("source")
    # Flatten the options into the top level of the context.
    context.update(context.pop("options"))
    _view(spark, sql_text, None, source_data_list)
    tasks = SqlParser(spark, sql_text, context).parse()
    executor_cls = DryRunSqlExecutor if dry_run else SqlExecutor
    executor_cls(urn, spark, tasks, type_.EventType.TRANSFORM, context).run()

sql_text = """

    WITH filter_top_N_raw_data AS(
    SELECT
     distinct
      id,
      Sum(est) AS est,
      store_id,
      platform_id,
      feed,
      vertical,
      platform
    FROM
      (
        SELECT
          DISTINCT d1.id,
          d1.est,
          d1.store_id,
          d1.date,
          d1.feed,
          d1.vertical,
          d1.platform_id,
          d1.platform
        FROM
          daily_data AS d1
          JOIN daily_data AS d2 
          ON d1.id = d2.id
          AND d1.store_id = d2.store_id
          AND d1.feed = d2.feed
          AND d1.vertical = d2.vertical
          AND d1.platform_id = d2.platform_id
        WHERE (d1.rank <= 4000 and d2.rank<=4000 and d1.store_id == 0 and d1.platform = 'ios' ) 
            OR (d1.rank <= 1000 and d2.rank<=1000 and  d1.store_id != 0 and d1.platform = 'ios' )
            OR  (d1.rank <= 4000 and d2.rank<=4000 and d1.store_id == 1000 and d1.platform = 'android' ) 
            OR (d1.rank <= 1000 and d2.rank<=1000 and  d1.store_id != 1000 and d1.platform = 'android' )
      ) AS t
    WHERE
      feed IN (
        0,
        1,
        2,
        101,
        100,
        102
      )
    GROUP BY
      id,
      store_id,
      platform_id,
      vertical,
      feed,
      platform);

     WITH replace_metric AS (
     SELECT * ,
         case 
        when feed='0' and platform='ios' then 'free_app_download'
        when feed='1' and platform='ios' then 'paid_app_download'
        when feed='2' and platform='ios' then 'revenue' 
        when feed='101' and platform='ios' then 'free_app_download' 
        when feed='100' and platform='ios' then 'paid_app_download' 
        when feed='102' and platform='ios' then 'revenue' 
        when feed='0' and platform='android' then 'free_app_download' 
        when feed='1' and platform='android' then 'paid_app_download' 
        when feed='2' and platform='android' then 'revenue' 
        end as metric from filter_top_N_raw_data);


         WITH replace_metric_device_code AS (
        SELECT * ,
         case 
        when feed='0' and platform='ios' then 'ios-phone'
        when feed='1' and platform='ios' then 'ios-phone'
        when feed='2' and platform='ios' then 'ios-phone' 
        when feed='101' and platform='ios' then 'ios-tablet' 
        when feed='100' and platform='ios' then 'ios-tablet' 
        when feed='102' and platform='ios' then 'ios-tablet' 
        when feed='0' and platform='android' then 'android-all' 
        when feed='1' and platform='android' then 'android-all' 
        when feed='2' and platform='android' then 'android-all' 
        end as device_code from replace_metric);


    WITH group_by_metric_1 AS (
        SELECT max(est) as est, id, metric,device_code, store_id, platform from replace_metric_device_code where store_id not in (3,4,5,6) and device_code in ('ios-phone' ,'ios-tablet' ) and feed not in (1000, 1001, 1002) group by id, store_id, metric,device_code, platform
        );

    WITH group_by_metric_2 AS (
        SELECT max(est) as est, id, metric,device_code, store_id, platform from replace_metric_device_code where store_id not in ( 1003, 1005, 1006,1007) and device_code='android-all' and feed not in (1000, 1001, 1002) group by id, store_id, metric,device_code, platform
    );

    WITH group_by_metric AS(
        SELECT * FROM group_by_metric_1
        UNION ALL
        SELECT * FROM group_by_metric_2
        );

      -- pivot metric column
    WITH pivot_metric_raw AS (

    SELECT 
        distinct id as app_id, store_id, platform, device_code, free_app_download,revenue, paid_app_download
    FROM
          group_by_metric
     PIVOT (
        max(est) 
    	FOR metric IN ('free_app_download','revenue', 'paid_app_download')
      )
    );


    -- union all platform with country_code mapping

    WITH country_code_mapping AS (
    select *, 'android' as market_code from android_country_mapping 
    UNION ALL select *, 'ios' market_code from ios_country_mapping
    UNION ALL select 143502, 'VE', 'VESA', 'ios'
    UNION ALL select 0, 'WW', 'worldwide', 'ios'
    UNION ALL select 36, 'CZ', 'CZ', 'android'
    UNION ALL select 5, 'ES', 'ES', 'android'

    );



    -- map raw with country_code

    WITH country_category_mapping_raw AS (
    select app_id, country_code, device_code, free_app_download, paid_app_download, revenue 
     from country_code_mapping 
     inner join 
         pivot_metric_raw 
     on 
         country_code_mapping.store_id=pivot_metric_raw.store_id 
     and 
         country_code_mapping.market_code=pivot_metric_raw.platform
    where country_name!='Global'
    );


      """




"""
get date:  [ [month, [days]], [month, [days]], [month, [days]], ....... ]
"""
def get_date_list(start_date, end_date, freq="D"):
    """
    Return the dates from ``start_date`` to ``end_date`` (inclusive) as strings.

    :param start_date: first date of the range (anything ``pandas.date_range`` accepts)
    :param end_date: last date of the range
    :param freq: pandas frequency alias, e.g.
            D: calendar day frequency
            M: month end frequency
            MS: month start frequency
            A, Y: year end frequency
            AS, YS: year start frequency
    :return: flat list of "%Y-%m-%d" strings; empty when the range contains no dates
    """
    # Imported here because the notebook does not import pandas at the top.
    import pandas as pd

    return [
        stamp.strftime('%Y-%m-%d')
        for stamp in pd.date_range(start=start_date, end=end_date, freq=freq)
    ]





def _read_daily_csv_estimates(spark, csv_schema, date_str, platform):
    """Read one platform's raw daily CSV estimates for ``date_str``, tag it with ``platform`` and cache it."""
    from pyspark.sql import functions as F

    base_path = "s3://b2c-prod-dca-store-estimates/store_est/v_final/DAY/"
    data_path = base_path + "{%s}/%s/sbe_est_app/*/" % (date_str, platform)
    return (
        spark.read.option("basePath", base_path)
        .schema(csv_schema)
        .csv(data_path, sep="\t")
        .withColumn("platform", F.lit(platform))
        .select('id', 'store_id', 'category_id', 'platform_id', 'vertical', 'rank', 'feed',
                'est', 'date', 'platform')
        .cache()
    )


def _country_mapping_source(name, path):
    """Build the ingest "source" entry for one gzip, tab-separated country mapping file."""
    return {
        "data_encoding": "csv",
        "compression": "gzip",
        "name": name,
        "data_schema": [
            {"name": "store_id", "type": "int", "nullable": False},
            {"name": "country_code", "type": "string", "nullable": False},
            {"name": "country_name", "type": "string", "nullable": False}
        ],
        "csv_options": {
            'header': True,
            'sep': '\t',
            'quote': '',
            'encoding': 'utf-8',
            'escape': ''
        },
        "path": [path],
    }


def test_daily_pre_data(spark, test_data):
    """
    Compare raw daily store estimates with the unified table for one date.

    Registers the raw data as the ``daily_data`` view and the unified delta
    slice as ``daily_unified_est``, runs the module-level ``sql_text`` through
    ``run`` (which produces ``country_category_mapping_raw``), then diffs both
    directions with ``except all``.

    :param spark: active SparkSession
    :param test_data: date to check, as a "YYYY-MM-DD" string (compared lexically)
    :return: ``(raw_minus_unified, unified_minus_raw)`` persisted DataFrames; both
        are empty when the two sides match
    """
    from pyspark.sql import types as T

    # Column layout of the headerless raw CSV files.
    csv_schema = T.StructType(
        [
            T.StructField("store_id", T.IntegerType(), True),
            T.StructField("date", T.DateType(), True),
            T.StructField("platform_id", T.IntegerType(), True),
            T.StructField("vertical", T.IntegerType(), True),
            T.StructField("feed", T.IntegerType(), True),
            T.StructField("id", T.LongType(), True),
            T.StructField("est", T.IntegerType(), True),
            T.StructField("category_id", T.IntegerType(), True),
            T.StructField("rank", T.IntegerType(), True)
        ]
    )

    print(test_data)

    if test_data < '2012-01-01':
        # 1. Before 2012-01-01: CSV, only the iOS files are read.
        df_1 = _read_daily_csv_estimates(spark, csv_schema, test_data, "ios")
    elif test_data < '2019-07-14':
        # 2. 2012-01-01 up to (not including) 2019-07-14: CSV, iOS and Android.
        df_ios = _read_daily_csv_estimates(spark, csv_schema, test_data, "ios")
        df_android = _read_daily_csv_estimates(spark, csv_schema, test_data, "android")
        df_1 = df_ios.union(df_android)
    else:
        # 3. From 2019-07-14 onwards: parquet, all platforms.
        parquet_base = ("s3://b2c-prod-dca-store-estimates/store_estv2/APP_ESTIMATES_FINAL/"
                        "version=2.0.0/range_type=DAY/")
        df_1 = spark.read.option("basePath", parquet_base).parquet(
            parquet_base + "date={%s}/platform=*/*/" % (test_data)).cache()

    df_1.createOrReplaceTempView("daily_data")

    daily_est_load = spark.read.format("delta").load(
        "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-load.v3/fact/").where(
        "granularity='daily' and date='{}'".format(test_data)).cache()
    daily_est_load.createOrReplaceTempView("daily_unified_est")

    # Message describing the two country-mapping dimension files for `run`.
    ingest_msg = {
        "namespace": "aa.store.market-size.v1",
        "job_type": "routine",
        "options": {},
        "source": [
            _country_mapping_source(
                "ios_country_mapping",
                "s3://b2c-prod-dca-store-estimates/store_back/dimension/IOS_COUNTRY_MAPPING"),
            _country_mapping_source(
                "android_country_mapping",
                "s3://b2c-prod-dca-store-estimates/store_back/dimension/ANDROID_COUNTRY_MAPPING"),
        ]
    }

    run(spark, ingest_msg, sql_text)

    # Rows present in the raw-derived view but missing from the unified table ...
    diff_df1 = spark.sql(
        "select * from country_category_mapping_raw except all select app_id, country_code, device_code, est_free_app_download as free_app_download, est_paid_app_download as paid_app_download, est_revenue as revenue from daily_unified_est ").persist()
    # ... and the other direction.
    diff_df2 = spark.sql(
        "select app_id, country_code, device_code, est_free_app_download as free_app_download, est_paid_app_download as paid_app_download, est_revenue as revenue from daily_unified_est  except all select * from country_category_mapping_raw").persist()

    print(diff_df1.take(3))
    print(diff_df2.take(3))
    return diff_df1, diff_df2



DEFAULT_RECIPIENTS = "fzhang@appannie.com"


def send_db_check_email(title, text_context, key=None):
    """
    Mail the report to the default recipient.

    :param title: mail subject
    :param text_context: mail body (the rendered report)
    :param key: accepted but not used by this function
    """
    email.send(
        title,
        text_context,
        [DEFAULT_RECIPIENTS],
        sender='dev-qa-data-quality@appannie.com',
    )






import unittest



class demoTest(unittest.TestCase):
    """Checks that raw and unified daily estimates match for ``check_date``."""
    check_date = '2020-07-01'
    # NOTE: evaluated once, when the class body is executed (not per test).
    df1, df2 = test_daily_pre_data(spark, check_date)

    def test_store_daily_est_category(self):
        """Both difference frames must be empty."""
        raw_minus_unified = self.df1
        unified_minus_raw = self.df2
        self.assertTrue(raw_minus_unified.rdd.isEmpty(), "raw is more than unified")
        self.assertTrue(unified_minus_raw.rdd.isEmpty(), "unified is more than raw")



def send_message():
    """Run ``demoTest``, write the HTML report to disk and mail it.

    The report is rendered by ``HTMLTestRunner`` into ``/tmp/db_check.log``.
    The file is then read back and sent through ``send_db_check_email`` with a
    subject that ends in "Passed" when no test failed or errored and in
    "Failed" otherwise.
    """
    report_path = "/tmp/db_check.log"

    with open(report_path, "w") as report_out:
        cases = unittest.TestLoader().loadTestsFromTestCase(demoTest)
        suite = unittest.TestSuite()
        suite.addTests(cases)
        runner = HTMLTestRunner(
            stream=report_out,
            title='DB Test Report',
            description='This db_check the report output by Tech Team.'
        )

        outcomes = runner.run(suite).result
        # Result codes 1 (failure) and 2 (error) both count as failed.
        failed_count = sum(1 for outcome in outcomes if outcome[0] in (1, 2))

    with open(report_path, 'r') as report_in:
        str_today = datetime.date.today().strftime("%Y-%m-%d")
        verdict = "Passed" if failed_count == 0 else "Failed"

        title = "store daily check - unified est/category - " + str_today + " - "
        title += verdict
        send_db_check_email(title, report_in.read())


send_message()


In [0]:


# Scratch check: an empty list has length 0.
a = []
print(len(a))

In [0]:

import datetime
import StringIO
import sys
import unittest
from xml.sax import saxutils


class OutputRedirector(object):
    """File-like wrapper that forwards writes to a swappable target ``fp``.

    Every method delegates to the object currently stored in ``self.fp``, so
    the destination can be changed by assigning a new ``fp`` without replacing
    the wrapper itself (``_TestResult.startTest`` does exactly that).
    """

    def __init__(self, fp):
        # fp: the file-like object that currently receives the output.
        self.fp = fp

    def write(self, s):
        # Forward a single write to the current target.
        self.fp.write(s)

    def writelines(self, lines):
        # Forward a sequence of lines to the current target.
        self.fp.writelines(lines)

    def flush(self):
        # Flush the current target.
        self.fp.flush()


# Shared module-level redirectors. They initially point at the real streams;
# _TestResult.startTest re-targets their ``fp`` and installs them as
# sys.stdout / sys.stderr while a test runs.
stdout_redirector = OutputRedirector(sys.stdout)
stderr_redirector = OutputRedirector(sys.stderr)


# ----------------------------------------------------------------------
# Template

class Template_mixin(object):
    """%-format templates and status labels used to render the HTML report.

    Every template is filled with ``template % dict(...)``; keys passed in the
    dict but not referenced by a template are simply ignored by the formatting.
    """
    # Maps the numeric result code stored by _TestResult to its display label.
    STATUS = {
        0: 'SUCCESS',
        1: 'FAILED',
        2: 'ERROR',
        3: 'SKIP',
    }

    DEFAULT_TITLE = 'DB Test Report'
    DEFAULT_DESCRIPTION = ''

    # ------------------------------------------------------------------------
    # HTML Template

    HTML_TMPL = r"""

%(heading)s
%(report)s
%(ending)s

"""
    # variables referenced by the template: (heading, report, ending)

    # ------------------------------------------------------------------------
    # Heading
    #

    HEADING_TMPL = """<div>
<h1>%(title)s</h1>
%(parameters)s
<p>%(description)s</p>
</div>

"""
    # variables: (title, parameters, description)

    HEADING_ATTRIBUTE_TMPL = """<p><strong>%(name)s:</strong> %(value)s</p>
"""
    # variables: (name, value)

    REPORT_TMPL = """
<table id='result_table' border='1' cellspacing='0' cellpadding='10'>
<colgroup>
<col align='left' />
<col align='right' />
<col align='right' />
<col align='right' />
</colgroup>
<tr>
    <th>Test Case</th>
    <th>Status</th>
    <th>Check Date</th>
    <th>Error Information</th>
</tr>
%(test_list)s
</table>
"""  # variables referenced by the template: (test_list)

    REPORT_CLASS_TMPL = r"""
<tr>
    <td colspan='5' align='center'>%(desc)s</td>
</tr>
"""  # variables referenced by the template: (desc); not used by the HTMLTestRunner in this cell

    REPORT_TEST_WITH_OUTPUT_TMPL = r"""
<tr>
    <td>%(desc)s</td>
    <td bgcolor=%(status_color)s>%(status)s</td>
    <td>%(check_date)s</td>
    <td colspan='2' align='left'>
        <pre>
        %(script)s
        </pre>
    </td>
</tr>
"""  # variables: (desc, status_color, status, check_date, script)

    REPORT_TEST_NO_OUTPUT_TMPL = r"""
<tr>
    <td>%(desc)s</td>
    <td bgcolor=%(status_color)s  align='center'>%(status)s</td>
    <td align='center'>%(check_date)s</td>
    <td colspan='2' align='center'>
    </td>
</tr>
"""  # variables: (desc, status_color, status, check_date)

    REPORT_TEST_OUTPUT_TMPL = r"""
%(output)s
"""  # variables: (output)

    # ------------------------------------------------------------------------
    # ENDING
    #

    ENDING_TMPL = """<div id='ending'>&nbsp;</div>"""


# -------------------- The end of the Template class -------------------


TestResult = unittest.TestResult


class _TestResult(TestResult):
    """Pure record of test outcomes plus the output each test produced.

    Unlike ``unittest._TextTestResult`` it does no reporting of its own beyond
    a short progress mark on stderr; ``HTMLTestRunner`` renders ``self.result``.

    While a test runs, ``sys.stdout`` and ``sys.stderr`` are replaced by the
    module-level ``stdout_redirector`` / ``stderr_redirector`` so that anything
    the test prints is captured in ``self.outputBuffer``.
    """

    def __init__(self, verbosity=1):
        """
        :param verbosity: values above 1 print one progress line per test to
            stderr; otherwise a single short mark per test is written.
        """
        TestResult.__init__(self)
        self.outputBuffer = StringIO.StringIO()
        self.stdout0 = None
        self.stderr0 = None
        self.success_count = 0
        self.failure_count = 0
        self.error_count = 0
        self.skipped_count = 0
        self.verbosity = verbosity

        # result is a list of result in 4 tuple
        # (
        #   result code (0: success; 1: fail; 2: error; 3: skip),
        #   TestCase object,
        #   Test output (byte string),
        #   stack trace (or the skip reason for code 3),
        # )
        self.result = []

    def startTest(self, test):
        """Begin capturing stdout/stderr for ``test``."""
        TestResult.startTest(self, test)
        # Start every test with an empty buffer. Reusing the buffer created in
        # __init__ made each test's captured output also contain the output of
        # all tests that ran before it.
        self.outputBuffer = StringIO.StringIO()
        # just one buffer for both stdout and stderr
        stdout_redirector.fp = self.outputBuffer
        stderr_redirector.fp = self.outputBuffer
        self.stdout0 = sys.stdout
        self.stderr0 = sys.stderr
        sys.stdout = stdout_redirector
        sys.stderr = stderr_redirector

    def complete_output(self):
        """
        Disconnect output redirection and return buffer.
        Safe to call multiple times.
        """
        if self.stdout0:
            sys.stdout = self.stdout0
            sys.stderr = self.stderr0
            self.stdout0 = None
            self.stderr0 = None
        return self.outputBuffer.getvalue()

    def stopTest(self, test):
        # Usually one of addSuccess, addError or addFailure would have been called.
        # But there are some path in unittest that would bypass this.
        # We must disconnect stdout in stopTest(), which is guaranteed to be called.
        self.complete_output()

    def _record(self, code, test, detail, verbose_mark, short_mark):
        """Store one outcome tuple and write its progress mark to stderr.

        Redirection is undone first (via ``complete_output``) so the progress
        mark goes to the real stderr and not into the capture buffer.
        """
        output = self.complete_output()
        self.result.append((code, test, output, detail))
        if self.verbosity > 1:
            sys.stderr.write(verbose_mark)
            sys.stderr.write(str(test))
            sys.stderr.write('\n')
        else:
            sys.stderr.write(short_mark)

    def addSuccess(self, test):
        """Record a passing test (result code 0)."""
        self.success_count += 1
        TestResult.addSuccess(self, test)
        self._record(0, test, '', 'ok ', '.')

    def addError(self, test, err):
        """Record an unexpected exception (result code 2)."""
        self.error_count += 1
        TestResult.addError(self, test, err)
        # The base class has just appended (test, formatted traceback).
        _, _exc_str = self.errors[-1]
        self._record(2, test, _exc_str, 'E  ', 'E')

    def addFailure(self, test, err):
        """Record a failed assertion (result code 1)."""
        self.failure_count += 1
        TestResult.addFailure(self, test, err)
        _, _exc_str = self.failures[-1]
        self._record(1, test, _exc_str, 'F  ', 'F')

    def addSkip(self, test, reason):
        """Record a skipped test (result code 3); the detail is the skip reason."""
        self.skipped_count += 1
        TestResult.addSkip(self, test, reason)
        _, _exc_str = self.skipped[-1]
        self._record(3, test, _exc_str, 'skip  ', 'skip')


class HTMLTestRunner(Template_mixin):
    """Runs a unittest suite and writes an HTML summary table to ``stream``.

    One table row is produced per test, with failed and errored tests listed
    first. The "Check Date" column is taken from a ``check_date`` attribute on
    the test object when it has one.
    """

    def __init__(self, stream=sys.stdout, verbosity=1, title=None, description=None):
        """
        :param stream: writable object that receives the UTF-8 encoded report.
        :param verbosity: forwarded to ``_TestResult``.
        :param title: report title; ``DEFAULT_TITLE`` when None.
        :param description: report description; ``DEFAULT_DESCRIPTION`` when None.
        """
        self.stopTime = None
        self.stream = stream
        self.verbosity = verbosity
        if title is None:
            self.title = self.DEFAULT_TITLE
        else:
            self.title = title
        if description is None:
            self.description = self.DEFAULT_DESCRIPTION
        else:
            self.description = description

        # The clock starts at construction time, not when run() is called.
        self.startTime = datetime.datetime.now()

    def run(self, test):
        """Run the given test case or test suite and return the ``_TestResult``."""
        result = _TestResult(self.verbosity)
        test(result)
        self.stopTime = datetime.datetime.now()
        self.generateReport(result)
        # Same text the former ``print >> sys.stderr`` produced.
        sys.stderr.write('\nTime Elapsed: %s\n' % (self.stopTime - self.startTime))
        return result

    @staticmethod
    def push_failed_case_to_front(result_list):
        """Reorder ``result_list`` in place so failures/errors come first.

        The sort is stable, so failed rows keep their relative order and so do
        the remaining rows. The same list object is returned.
        """
        # False sorts before True, hence codes 1 (failure) and 2 (error) lead.
        result_list.sort(key=lambda case: case[0] not in (1, 2))
        return result_list

    @staticmethod
    def get_db_check_name(method_name):
        """Turn ``test_some_check`` into the display name ``"Some Check "``.

        The ``test_`` prefix is dropped only when present, so a name without
        it is no longer truncated by five characters. Each word is capitalised
        and followed by a space (the result therefore ends with a space).
        """
        prefix = "test_"
        if method_name.startswith(prefix):
            method_name = method_name[len(prefix):]
        return "".join(word.capitalize() + " " for word in method_name.split('_'))

    def getReportAttributes(self, result):
        """
        Return report attributes as a list of (name, value).
        Override this to add custom attributes.
        """
        startTime = str(self.startTime)[:19]
        duration = str(self.stopTime - self.startTime)
        status = []
        if result.success_count:
            status.append('Pass %s' % result.success_count)
        if result.failure_count:
            status.append('Failure %s' % result.failure_count)
        if result.error_count:
            status.append('Error %s' % result.error_count)
        if result.skipped_count:
            status.append('Skip %s' % result.skipped_count)
        if status:
            status = ' '.join(status)
        else:
            status = 'none'
        return [
            ('Start Time', startTime),
            ('Duration', duration),
            ('Status', status),
        ]

    def generateReport(self, result):
        """Render the full report for ``result`` and write it to ``self.stream``."""
        report_attrs = self.getReportAttributes(result)
        generator = 'HTMLTestRunner'
        heading = self._generate_heading(report_attrs)
        report = self._generate_report(result)
        ending = self._generate_ending()
        output = self.HTML_TMPL % dict(
            title=saxutils.escape(self.title),
            generator=generator,
            heading=heading,
            report=report,
            ending=ending,
        )
        self.stream.write(output.encode('utf8'))

    def _generate_heading(self, report_attrs):
        """Build the heading block: title, one line per attribute, description."""
        a_lines = []
        for name, value in report_attrs:
            line = self.HEADING_ATTRIBUTE_TMPL % dict(
                name=saxutils.escape(name),
                value=saxutils.escape(value),
            )
            a_lines.append(line)
        heading = self.HEADING_TMPL % dict(
            title=saxutils.escape(self.title),
            parameters=''.join(a_lines),
            description=saxutils.escape(self.description),
        )
        return heading

    def _generate_report(self, result):
        """Build the result table, failed/errored tests first."""
        rows = []
        case_list = self.push_failed_case_to_front(result.result)
        for case in case_list:
            n = case[0]  # status number eg. 0, 1, 2, 3
            t = case[1]  # test case object
            o = case[2]  # captured output
            e = case[3]  # error message

            self._generate_report_test(rows, n, t, o, e)

        report = self.REPORT_TMPL % dict(
            test_list=''.join(rows),
            count=str(result.success_count + result.failure_count + result.error_count),
            Pass=str(result.success_count),
            fail=str(result.failure_count),
            error=str(result.error_count),
            skip=str(result.skipped_count),
        )
        return report

    def _generate_report_test(self, rows, n, t, o, e):
        """Append the table row for one test to ``rows``.

        :param rows: list of HTML fragments being accumulated.
        :param n: result code (key of ``STATUS``).
        :param t: the test object; ``t.id()`` supplies the method name.
        :param o: output captured while the test ran.
        :param e: traceback text, skip reason, or ''.
        """
        has_output = bool(o or e)
        name = t.id().split('.')[-1]
        # Not every test object carries a check_date; fall back to an empty
        # cell instead of aborting the whole report with AttributeError.
        check_date = getattr(t, 'check_date', '')

        desc = HTMLTestRunner.get_db_check_name(name)
        tmpl = (self.REPORT_TEST_WITH_OUTPUT_TMPL if has_output else self.REPORT_TEST_NO_OUTPUT_TMPL)

        # Byte strings collected from stdout/stderr are decoded as latin-1,
        # which accepts any byte value. (``bytes`` is ``str`` on Python 2.)
        uo = o.decode('latin-1') if isinstance(o, bytes) else o
        ue = e.decode('latin-1') if isinstance(e, bytes) else e

        script = self.REPORT_TEST_OUTPUT_TMPL % dict(
            output=saxutils.escape(uo + ue),
        )

        # green unless failed (red), errored (yellow) or skipped (grey)
        status_color = {1: "red", 2: "yellow", 3: "grey"}.get(n, "green")

        row = tmpl % dict(
            desc=desc,
            script=script,
            status=self.STATUS[n],
            status_color=status_color,
            check_date=check_date,
        )
        rows.append(row)

    def _generate_ending(self):
        """Return the closing fragment of the report."""
        return self.ENDING_TMPL


In [0]:

import unittest
import psycopg2
import datetime
spark.sparkContext.addPyFile("/home/hadoop/bdp/application/libs/python/dependencies.zip")

import aaplproxy
from aadatapipelinecore.core.urn import Urn
from aaplproxy.da.local_sqlrunner import LocalSqlRunner
from aadatapipelinecore.core.utils.module import application_settings
from pyspark.sql import Row
from pyspark.sql.types import *
from pyspark.sql import functions as F
from aadatapipelinecore.core.fs import Conf

from aadatapipelinecore.core.monitoring.pipeline_monitor import running, fail, task_success
from aadatapipelinecore.core.urn import Urn
from aadatapipelinecore.core.utils.identifier import package_id
from aadatapipelinecore.core.utils.spark import canned_spark, stop
from aadatapipelinecore.core.utils import email

ios_feed = {1: "0,1,2,100,101,102"}
android_feed = {0: "0,1,2"}


def citus_row(date):
    """Fetch per-device estimate totals for one day from the Citus store DB.

    Runs a ``group by device_code`` aggregate over ``store.store_est_fact_v1``
    and wraps every result tuple in a ``Row``.

    :param date: the day to aggregate; the notebook passes 'YYYY-MM-DD' strings.
    :return: list of ``Row(device_code, sum_est_free_app_download,
        sum_est_paid_app_download, sum_est_revenue)``, one per device code.
    """
    import os  # local import: this cell's import list does not include os

    def query(dsn, sql, params=None):
        """Run one SELECT on a fresh connection and return all rows."""
        conn = psycopg2.connect(dsn)
        try:
            # Autocommit: a read-only query needs no explicit commit.
            conn.autocommit = True
            with conn.cursor() as cur:
                cur.execute(sql, params)
                return cur.fetchall()
        finally:
            # psycopg2's ``with connection`` only ends the transaction; the
            # connection has to be closed explicitly or it leaks per call.
            conn.close()

    def get_data_in_citus(date):
        # SECURITY / FIXME: connection settings, including the password, are
        # hard-coded as fallbacks so existing runs keep working. Supply them
        # through the CITUS_* environment variables, then rotate the password
        # and delete the literal from this notebook.
        citus_dsn_ = (
            "dbname='{db}' user='{user}' password='{password}' "
            "host='{host}' port='{port}'".format(
                db=os.environ.get("CITUS_DB", "aa_store_db"),
                user=os.environ.get("CITUS_USER", "citus_bdp_prod_app_int_qa"),
                host=os.environ.get("CITUS_HOST", "10.2.6.141"),
                password=os.environ.get("CITUS_PASSWORD", "wZw8cfBuuklIskVG"),
                port=os.environ.get("CITUS_PORT", 5432)
            )
        )
        # The date is bound as a query parameter instead of being formatted
        # into the statement text.
        sql = ("select device_code, cast(sum(est_free_app_download) as int) as est_free_app_download, "
               "cast(sum(est_paid_app_download) as int) as est_paid_app_download, "
               "cast(sum(est_revenue) as int) as est_revenue "
               "from store.store_est_fact_v1 where date=%s group by device_code")
        return query(citus_dsn_, sql, (date,))

    result = get_data_in_citus(date)
    return [Row(device_code=r[0], sum_est_free_app_download=r[1], sum_est_paid_app_download=r[2],
                sum_est_revenue=r[3]) for r in result]


def plproxy_row(date, device_feed_dict):
    """Fetch per-feed estimate totals for one day through PL/Proxy.

    :param date: day to aggregate, interpolated into the SQL text.
    :param device_feed_dict: single-entry dict; its key selects the
        ``aa.app_store_daily_estimate_<key>`` table and its value is the
        comma-separated feed id list (see ``ios_feed`` / ``android_feed``).
    :return: list of ``Row(feed_id, metric_sum)``.
    """
    check_urn = Urn(
        namespace='app-qa.db-check.v1',
        owner='app_qa'
    )

    def build_db_settings(urn, db_conf_name):
        """Collect the PG_<NAME>_* application settings into a settings dict."""
        key_template = "PG_" + db_conf_name.upper() + "_{property}"
        app_settings = application_settings(urn)

        def lookup(prop):
            return getattr(app_settings, key_template.format(property=prop))

        # Only the first configured (host, port) pair is used.
        host, port = lookup('HOSTS')[0]
        return {
            'HOST': host,
            'PORT': port,
            'NAME': lookup('NAME'),
            'PASSWORD': lookup('SECRET_KEY'),
            'USER': lookup('ACCESS_ID'),
            'NODE_NUM': int(lookup('NODE_NUM')),
            'CLUSTER': lookup('CLUSTER')
        }

    db_settings = build_db_settings(check_urn, "DAILY_EST")

    device_key = list(device_feed_dict)[0]
    # Store id excluded by the query: 1 when the key is 1, otherwise 1001.
    store_id = 1 if device_key == 1 else 1001

    query_template = '''select feed_id, Cast(sum(cnt) as int) as metric_sum from plproxy.execute_select_nestloop($$
    select feed_id, sum(estimate)  as cnt
    from (select distinct app_id, feed_id, store_id, estimate,category_id, device_id from aa.app_store_daily_estimate_{}
        where date = '{}'  and  feed_id in ({}) and store_id not in ({}) and isunique='t' ) as prod group by feed_id 
        $$) tbl (feed_id smallint, cnt bigint) group by feed_id '''
    sql = query_template.format(
        device_key, date, ','.join(device_feed_dict.values()), store_id
    )

    rows, _, _ = LocalSqlRunner(db_settings).select_return_columns(sql)
    return [Row(feed_id=record[0], metric_sum=record[1]) for record in rows]


def generate_plploxy_result(spark, r1, r2):
    schema = StructType([
        StructField("feed_id", StringType(), True),
        StructField("metric_sum", IntegerType(), True)])
    df1 = spark.createDataFrame(r1, schema)
    df1.createOrReplaceTempView("plploxy_ios")

    df2 = spark.createDataFrame(r2, schema)
    df2.createOrReplaceTempView("plploxy_android")

    spark.sql('''
    SELECT *,
    CASE 
    WHEN feed_id='101' THEN "sum_est_free_app_download"
    WHEN feed_id='100' THEN "sum_est_paid_app_download"
    WHEN feed_id='102' THEN "sum_est_revenue"
    WHEN feed_id='0' THEN "sum_est_free_app_download"
    WHEN feed_id='1' THEN "sum_est_paid_app_download"
    WHEN feed_id='2' THEN "sum_est_revenue"
    END AS metric from plploxy_ios
    ''').createOrReplaceTempView("plploxy_metric")

    spark.sql('''
    SELECT *,
    CASE 
    WHEN feed_id='0' THEN "sum_est_free_app_download"
    WHEN feed_id='1' THEN "sum_est_paid_app_download"
    WHEN feed_id='2' THEN "sum_est_revenue"
    END AS metric from plploxy_android
    ''').createOrReplaceTempView("plploxy_metric_android")

    spark.sql('''
    SELECT *,
    CASE 
    WHEN feed_id='101' THEN "ios-tablet"
    WHEN feed_id='100' THEN "ios-tablet"
    WHEN feed_id='102' THEN "ios-tablet"
    WHEN feed_id='0' THEN "ios-phone"
    WHEN feed_id='1' THEN "ios-phone"
    WHEN feed_id='2' THEN "ios-phone"
    END AS device_code from plploxy_metric
    ''').createOrReplaceTempView("plploxy_metric_device_code")

    spark.sql('''
    SELECT *,
    CASE 
    WHEN feed_id='0' THEN "android-all"
    WHEN feed_id='1' THEN "android-all"
    WHEN feed_id='2' THEN "android-all"
    END AS device_code from plploxy_metric_android
    ''').createOrReplaceTempView("plploxy_metric_device_code_android")

    spark.sql('''
    SELECT * FROM plploxy_metric_device_code_android
    UNION ALL
    SELECT * FROM plploxy_metric_device_code
    ''').createOrReplaceTempView("all_device")

    spark.sql('''
    SELECT 
        device_code, sum_est_free_app_download , sum_est_paid_app_download , sum_est_revenue
    FROM
          all_device
     PIVOT (
        max(metric_sum) 
        FOR metric IN ('sum_est_free_app_download','sum_est_paid_app_download', 'sum_est_revenue')
      )
    ''').createOrReplaceTempView("after_pivot")

    spark.sql('''
    SELECT device_code, sum_est_free_app_download FROM after_pivot
    WHERE sum_est_free_app_download is not null
    ''').createOrReplaceTempView("after_pivot_1")

    spark.sql('''
    SELECT device_code, sum_est_paid_app_download FROM after_pivot
    WHERE sum_est_paid_app_download is not null
    ''').createOrReplaceTempView("after_pivot_2")

    spark.sql('''
    SELECT device_code, sum_est_revenue FROM after_pivot
    WHERE sum_est_revenue is not null
    ''').createOrReplaceTempView("after_pivot_3")

    return spark.sql('''
    SELECT c.device_code, c.sum_est_free_app_download, c.sum_est_paid_app_download, d.sum_est_revenue
    FROM (
    SELECT a.device_code, a.sum_est_free_app_download, b.sum_est_paid_app_download FROM after_pivot_1 a
    JOIN after_pivot_2 b
    ON a.device_code=b.device_code
    ) AS c
    JOIN after_pivot_3 d
    on c.device_code=d.device_code
    order by device_code desc
    ''')


def generate_citus_result(spark, citus_data):
    """Load the Citus totals into Spark and return them ordered by device code.

    :param spark: SparkSession used to build the frame and run SQL.
    :param citus_data: rows shaped (device_code, sum_est_free_app_download,
        sum_est_paid_app_download, sum_est_revenue).
    :return: DataFrame over the "citus_data" temp view, device_code descending.
    """
    metric_columns = (
        "sum_est_free_app_download",
        "sum_est_paid_app_download",
        "sum_est_revenue",
    )
    fields = [StructField("device_code", StringType(), True)]
    fields.extend(StructField(column, IntegerType(), True) for column in metric_columns)

    citus_frame = spark.createDataFrame(citus_data, StructType(fields))
    citus_frame.createOrReplaceTempView("citus_data")
    return spark.sql("select * from citus_data order by device_code desc")


def test_store_prproxy_data(spark, d):
    """Compare PL/Proxy and Citus per-device totals for one day.

    Both sources are reduced to (device_code, free, paid, revenue) rows and
    diffed in each direction with ``except all``.

    :param spark: SparkSession used for the temp views and SQL.
    :param d: the day to check (the notebook passes 'YYYY-MM-DD' strings).
    :return: tuple ``(plproxy_only, citus_only)``; each is a list of at most 5
        Rows present in one source but not the other, tagged with
        ``result_from`` and ``date``. Two empty lists mean the sources agree.
    """
    print(d)
    r1 = plproxy_row(d, ios_feed)
    r2 = plproxy_row(d, android_feed)
    citus_reseult = citus_row(d)

    d1 = generate_plploxy_result(spark, r1, r2)
    d2 = generate_citus_result(spark, citus_reseult)

    d1.createOrReplaceTempView("plploxy_r")
    d2.createOrReplaceTempView("citus_r")
    plproxy_except_result = spark.sql("select * from plploxy_r except all select * from citus_r")
    citus_except_result = spark.sql("select * from citus_r except all select * from plploxy_r")

    # Tag each diff with its origin and the checked date so the rows stay
    # identifiable once they are pasted into the report mail.
    plproxy_except_result = plproxy_except_result.withColumn(
        "result_from", F.lit("proproxy")).withColumn("date", F.lit(str(d)))
    citus_except_result = citus_except_result.withColumn(
        "result_from", F.lit("citus")).withColumn("date", F.lit(str(d)))

    # The former ``result = plproxy_except_result.union(citus_except_result)``
    # was never read and has been dropped.
    return plproxy_except_result.take(5), citus_except_result.take(5)



DEFAULT_RECIPIENTS = "fzhang@appannie.com"


def send_db_check_email(title, text_context, key=None):
    """Send a DB-check result mail to the default recipient.

    :param title: subject line of the message.
    :param text_context: message body, handed to ``email.send`` unchanged.
    :param key: accepted for call compatibility; not used by this function.
    """
    recipients = [DEFAULT_RECIPIENTS]
    email.send(
        title,
        text_context,
        recipients,
        sender='dev-qa-data-quality@appannie.com',
    )



# Date window to verify. ``end`` is exclusive: only days start .. end-1 are checked.
# start = params.get('start_date')
# end = params.get('end_date')
start = '2020-06-01'
end = '2020-06-02'
real_date1 = datetime.date(*map(int, start.split('-')))
real_date2 = datetime.date(*map(int, end.split('-')))
date_range = real_date2 - real_date1

# One 'YYYY-MM-DD' string per day in the window, newest first.
dates = sorted(
    (str(real_date1 + datetime.timedelta(days)) for days in range(date_range.days)),
    reverse=True)

# Each entry is the (plproxy_only, citus_only) pair returned for one day.
result = [test_store_prproxy_data(spark, d) for d in dates]

title = "store daily check"
text_context = result
send_db_check_email(title, text_context)



In [0]:

# Compare two daily parquet snapshots (2020-07-03) in both directions.
# NOTE(review): the first path ends in "store.app-est-dna-log.1" with no "v"
# before the version, unlike the ".v2" path on the next line and the other
# dataset paths in this notebook — confirm it is not a typo.
df_routine = spark.read.parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-dna-log.1/fact/granularity=daily/date=2020-07-03/")
df_previously = spark.read.parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-dna-log.v2/fact/granularity=daily/date=2020-07-03/")

df_routine.createOrReplaceTempView("routine")
df_previously.createOrReplaceTempView("previously")

# Rows in "routine" (final stage, *_rank columns renamed to the plain metric
# names) that have no match in "previously".
spark.sql("select app_id, country_code, free_app_download_rank as free_app_download, paid_app_download_rank as paid_app_download, revenue_rank as revenue, revenue_iap_rank as revenue_iap, revenue_non_iap_rank as revenue_non_iap, device_code, category_id from routine where data_stage='final' except all select app_id, country_code, free_app_download, paid_app_download, revenue, revenue_iap, revenue_non_iap, device_code, category_id from previously ").show()

# The reverse direction: rows in "previously" with no match in "routine".
spark.sql("select app_id, country_code, free_app_download, paid_app_download, revenue, revenue_iap, revenue_non_iap, device_code, category_id from previously except all select app_id, country_code, free_app_download_rank as free_app_download, paid_app_download_rank as paid_app_download, revenue_rank as revenue, revenue_iap_rank as revenue_iap, revenue_non_iap_rank as revenue_non_iap, device_code, category_id from routine where data_stage='final' ").show()



In [0]:

# Compare store.app-est-category.v4 ("routine") with
# store.app-est-category-rank.v1 ("previously") for 2020-07-03, both directions.
df_routine = spark.read.parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category.v4/fact/granularity=daily/date=2020-07-03/")
df_previously = spark.read.parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-rank.v1/fact/granularity=daily/date=2020-07-03/")

df_routine.createOrReplaceTempView("routine")
df_previously.createOrReplaceTempView("previously")

# Rows in "routine" (final stage, *_rank columns renamed to the plain metric
# names) that have no match in "previously".
spark.sql("select app_id, country_code, free_app_download_rank as free_app_download, paid_app_download_rank as paid_app_download, revenue_rank as revenue, revenue_iap_rank as revenue_iap, revenue_non_iap_rank as revenue_non_iap, device_code, category_id from routine where data_stage='final' except all select app_id, country_code, free_app_download, paid_app_download, revenue, revenue_iap, revenue_non_iap, device_code, category_id from previously ").show()

# The reverse direction: rows in "previously" with no match in "routine".
spark.sql("select app_id, country_code, free_app_download, paid_app_download, revenue, revenue_iap, revenue_non_iap, device_code, category_id from previously except all select app_id, country_code, free_app_download_rank as free_app_download, paid_app_download_rank as paid_app_download, revenue_rank as revenue, revenue_iap_rank as revenue_iap, revenue_non_iap_rank as revenue_non_iap, device_code, category_id from routine where data_stage='final' ").show()



In [0]:

# Compare store.app-est.v4 ("routine") with store.app-est.v1 ("previously")
# for 2020-07-03, in both directions.
df_routine = spark.read.parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v4/fact/granularity=daily/date=2020-07-03/")
df_previously = spark.read.parquet("s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=daily/date=2020-07-03/")

df_routine.createOrReplaceTempView("routine")
df_previously.createOrReplaceTempView("previously")

# Final-stage rows in "routine" that have no match in "previously".
spark.sql("select app_id, country_code, free_app_download, paid_app_download, revenue, revenue_iap, revenue_non_iap, device_code from routine where data_stage='final' except all select app_id, country_code, free_app_download, paid_app_download, revenue, revenue_iap, revenue_non_iap, device_code from previously ").show()

# Rows in "previously" with no match among the final-stage rows of "routine".
# The data_stage filter is applied here as well, so both directions compare the
# same row set; without it, rows from other stages could cancel out genuine
# differences.
spark.sql("select app_id, country_code, free_app_download, paid_app_download, revenue, revenue_iap, revenue_non_iap, device_code from previously except all select app_id, country_code, free_app_download, paid_app_download, revenue, revenue_iap, revenue_non_iap, device_code from routine where data_stage='final' ").show()



In [0]:
%%sh

# List the entries under unified/ whose names start with "store.app-est-dna-".
aws s3 ls s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-dna-

In [0]:
-- unified_est: app-level estimates for namespace aa.store.app-est.v4,
-- keyed by (data_stage, country_code, device_code, date, granularity, app_id).
-- NOTE(review): app_id is int here but bigint in the other tables of this cell — confirm.
CREATE TABLE unified_est (
    data_stage text NOT NULL,
    country_code text NOT NULL,
    device_code text NOT NULL,
    granularity text NOT NULL,
    date date NOT NULL,
    app_id int NOT NULL,
    revenue_non_iap int,
    revenue_iap int ,
    free_app_download int ,
    paid_app_download int ,
    revenue int,
    PRIMARY KEY(data_stage,country_code,device_code,date,granularity,app_id)
)
PARTITION BY COLUMNS(granularity,date,data_stage,device_code)
WITH (
    NAMESPACE = 'aa.store.app-est.v4',
    COALESCE = 1,
    SAVEMODE = 'overwrite',
    DIMENSIONS = COLUMNS(data_stage,country_code,device_code,date,granularity,app_id),
    METRICS = COLUMNS(revenue_non_iap,revenue_iap,free_app_download,paid_app_download,revenue)
);
-- unified_category: per-category estimates and ranks for namespace
-- aa.store.app-est-category.v4; same key as unified_est plus category_id.
-- NOTE(review): the *_rank columns are declared but not listed under METRICS — confirm that is intended.
CREATE TABLE unified_category (
  data_stage text NOT NULL,
  country_code text NOT NULL,
  device_code text NOT NULL,
  granularity text NOT NULL,
  date date NOT NULL,
  category_id int NOT NULL,
  app_id bigint NOT NULL,
  revenue_non_iap int,
  revenue_non_iap_rank int,
  revenue_iap int ,
  revenue_iap_rank int,
  free_app_download int,
  free_app_download_rank int,
  paid_app_download int ,
  paid_app_download_rank int ,
  revenue int ,
  revenue_rank int,
  PRIMARY KEY(data_stage,country_code,device_code,date,granularity,category_id,app_id)
)
PARTITION BY COLUMNS(granularity,date,data_stage,device_code)
WITH (
  NAMESPACE = 'aa.store.app-est-category.v4',
  COALESCE = 2,
  SAVEMODE = 'overwrite',
  DIMENSIONS = COLUMNS(data_stage,country_code,device_code,date,granularity,category_id,app_id),
  METRICS = COLUMNS(revenue_non_iap,revenue_iap,free_app_download,paid_app_download,revenue)
);
-- unified_est_preload_dna_log: app-level est_* metrics with publisher / company /
-- parent company ids; no data_stage column, partitioned by granularity, date, device_code.
-- NOTE(review): NAMESPACE is aa.store.app-est-dna-log.v2, while the notes cell at the
-- top of this notebook lists aa.store.app-est-dna-log.v1 — confirm which version is current.
CREATE TABLE unified_est_preload_dna_log (
    granularity text NOT NULL,
    date date NOT NULL,
    country_code text NOT NULL,
    device_code text NOT NULL,
    app_id bigint NOT NULL,
    publisher_id bigint NOT NULL,
    company_id bigint NOT NULL,
    parent_company_id bigint NOT NULL,
    est_free_app_download int,
    est_paid_app_download int,
    est_revenue int,
    est_paid_download int,
    est_organic_download int,
    PRIMARY KEY(granularity,date,country_code,device_code,app_id,company_id, parent_company_id, publisher_id)
)
PARTITION BY COLUMNS(granularity,date,device_code)
WITH (
    NAMESPACE = 'aa.store.app-est-dna-log.v2',
    COALESCE = 1,
    SAVEMODE = 'overwrite',
    DIMENSIONS = COLUMNS(country_code,device_code,date,granularity,app_id, company_id, parent_company_id, publisher_id),
    METRICS = COLUMNS(est_free_app_download,est_paid_app_download,est_revenue,est_paid_download,est_organic_download)
);
-- unified_category_preload: per-category est_* metrics for namespace
-- aa.store.app-est-category-load.v4, keyed by
-- (country_code, device_code, date, granularity, app_id, category_id).
CREATE TABLE unified_category_preload (
    granularity text NOT NULL,
    date date NOT NULL,
    country_code text NOT NULL,
    device_code text NOT NULL,
    app_id bigint NOT NULL,
    category_id int NOT NULL,
    est_free_app_download int,
    est_paid_app_download int,
    est_revenue int,
    est_paid_download int,
    est_organic_download int,
  PRIMARY KEY(country_code,device_code,date,granularity,app_id,category_id)
)
PARTITION BY COLUMNS(granularity,date,device_code)
WITH (
  NAMESPACE = 'aa.store.app-est-category-load.v4',
  COALESCE = 1,
  SAVEMODE = 'overwrite',
  DIMENSIONS = COLUMNS(country_code,device_code,date,granularity,app_id,category_id),
  METRICS = COLUMNS(est_free_app_download,est_paid_app_download,est_revenue, est_paid_download, est_organic_download)
);




-- unified_download_attribution: one organic_download_share value per
-- (country_code, device_code, date, granularity, app_id) for namespace
-- aa.store.download-attribution.v4.
CREATE TABLE unified_download_attribution (
    country_code text NOT NULL,
    device_code text NOT NULL,
    granularity text NOT NULL,
    date date NOT NULL,
    app_id bigint NOT NULL,
    organic_download_share decimal(36,20) NOT NULL,
    PRIMARY KEY(country_code,device_code,date,granularity,app_id)
)
    PARTITION BY COLUMNS(granularity,date,device_code)
    WITH (
    NAMESPACE = 'aa.store.download-attribution.v4',
    COALESCE = 1,
    SAVEMODE = 'overwrite',
    DIMENSIONS = COLUMNS(country_code,device_code,date,granularity,app_id),
    METRICS = COLUMNS(organic_download_share)
);
-- est_table: app-level est_* metrics with company / parent company / publisher ids
-- for namespace aa.store.download-attribution-dna-log.v2.
-- NOTE(review): "coalesce=1" is lower-case here but "COALESCE = 1" in the tables
-- above — confirm the option name is case-insensitive.
CREATE TABLE est_table (
    country_code text NOT NULL,
    device_code text NOT NULL,
    granularity text NOT NULL,
    date date NOT NULL,
    app_id bigint NOT NULL,
    company_id bigint NOT NULL,
    parent_company_id bigint NOT NULL,
    publisher_id bigint NOT NULL,
    est_free_app_download int ,
    est_paid_app_download int ,
    est_revenue int,
    est_organic_download int,
    est_paid_download int,
    PRIMARY KEY(country_code, device_code, date, granularity, app_id, company_id, parent_company_id, publisher_id)
)
PARTITION BY COLUMNS(granularity, date, device_code)
WITH (
    NAMESPACE = 'aa.store.download-attribution-dna-log.v2',
    coalesce=1,
    SAVEMODE = 'overwrite',
    DIMENSIONS = COLUMNS(country_code, device_code, date, granularity, app_id, company_id, parent_company_id, publisher_id),
    METRICS = COLUMNS(est_free_app_download, est_paid_app_download, est_revenue, est_paid_download, est_organic_download)
);
-- cate_table: per-category est_* metrics for namespace
-- aa.store.download-attribution-category-load.v4, keyed by
-- (country_code, device_code, date, granularity, app_id, category_id).
CREATE TABLE cate_table (
    country_code text NOT NULL,
    device_code text NOT NULL,
    granularity text NOT NULL,
    date date NOT NULL,
    app_id bigint NOT NULL,
    category_id int NOT NULL,
    est_free_app_download int ,
    est_paid_app_download int ,
    est_revenue int,
    est_organic_download int,
    est_paid_download int,
    PRIMARY KEY(country_code, device_code, date, granularity, app_id, category_id)
)
PARTITION BY COLUMNS(granularity, date, device_code)
WITH (
    NAMESPACE = 'aa.store.download-attribution-category-load.v4',
    coalesce=1,
    SAVEMODE = 'overwrite',
    DIMENSIONS = COLUMNS(country_code, device_code, date, granularity, app_id, category_id),
    METRICS = COLUMNS(est_free_app_download, est_paid_app_download, est_revenue, est_paid_download, est_organic_download)
);