In [0]:


from pyspark.sql import types as T
from pyspark.sql import functions as F
from aadatapipelinecore.core.utils.spark import eject_all_caches
from aadatapipelinecore.core.urn import Urn
from aadatapipelinecore.core.pipeline import type_
from applications.common.parser import SqlParser
from applications.common.executor import SqlExecutor
from applications.auto_pipeline.transform import _view
import datetime


spark.sparkContext.addPyFile("/home/hadoop/bdp/application/libs/python/dependencies.zip")
import aaplproxy

class DryRunSqlExecutor(SqlExecutor):
    """SqlExecutor subclass whose ``_verify_tasks`` step does nothing."""

    def _verify_tasks(self):
        """No-op; presumably replaces the base-class task check (confirm)."""
        return None


def run(spark, raw_data, sql_text, dry_run=True):
    """Parse ``sql_text`` into tasks and run them through a SQL executor.

    Args:
        spark: Spark session handed to ``_view``, ``SqlParser`` and the executor.
        raw_data: Message dict. Must contain the keys ``"namespace"``,
            ``"source"`` and ``"options"`` (a dict). It is left unmodified.
        sql_text: SQL script passed to ``_view`` and ``SqlParser``.
        dry_run: When true (the default) ``DryRunSqlExecutor`` is used,
            otherwise the plain ``SqlExecutor``.

    Raises:
        KeyError: if one of the required keys is missing from ``raw_data``.
    """
    # Work on a shallow copy: popping "source"/"options" from the caller's
    # dict would make the same message unusable for a second call.
    context = dict(raw_data)
    urn = Urn(namespace=context["namespace"])
    source_data_list = context.pop("source")
    # Flatten the options into the top level of the context.
    context.update(context.pop("options"))
    _view(spark, sql_text, None, source_data_list)
    tasks = SqlParser(spark, sql_text, context).parse()
    executor_cls = DryRunSqlExecutor if dry_run else SqlExecutor
    executor_cls(urn, spark, tasks, type_.EventType.TRANSFORM, context).run()


# SQL script consumed by run(): it is handed to _view() and SqlParser.
# It is a chain of `WITH <name> AS (...)` statements. The driver code below
# reads `miss_data_rank` and `miss_data_store` through spark.sql(), so each
# name is presumably registered as a view by the parser/executor
# (confirm in SqlParser).
# Tables referenced but not defined here: rank_raw, rank_unified,
# store_unified, country_code_mapping, category_mapping_deminsion_service.
# Flow: label raw rows by feed -> count per group -> pivot the metric column
# -> join category / country mappings -> join with counts from the unified
# tables -> keep the rows whose counts differ or have no unified match.
# NOTE(review): the miss_data_rank and miss_data_store statements have no
# trailing ';' unlike the other statements -- confirm the parser accepts that.
sql_text = """

-- mapping feed as metrc in raw
WITH feed_metric AS (
select *, 'free_app_download' as metric, "ios-phone" as device_code from rank_raw where  feed='0' and platform='ios'
UNION ALL
select *, 'paid_app_download' as metric, "ios-phone" as device_code from rank_raw where  feed='1' and platform='ios'
UNION ALL
select *, 'revenue' as metric , "ios-phone" as device_code from rank_raw where  feed='2' and platform='ios'
UNION ALL
select *, 'free_app_download' as metric, "ios-tablet" as device_code from rank_raw where  feed='101' and platform='ios'
UNION ALL
select *, 'paid_app_download' as metric, "ios-tablet" as device_code from rank_raw  where  feed='100' and platform='ios'
UNION ALL
select *, 'revenue' as metric, "ios-tablet" as device_code from rank_raw  where  feed='102' and platform='ios'
UNION ALL
select *, 'free_app_download' as metric, "ios-all" as device_code from rank_raw  where  feed='1000' and platform='ios'
UNION ALL
select *, 'paid_app_download' as metric, "ios-all" as device_code from rank_raw  where  feed='1001' and platform='ios'
UNION ALL
select *, 'revenue' as metric, "ios-all" as device_code from rank_raw  where  feed='1002' and platform='ios'
UNION ALL
select *, 'free_app_download' as metric , "android-all" as device_code from rank_raw   where  feed='0' and platform='android'
UNION ALL
select *, 'paid_app_download' as metric, "android-all" as device_code from rank_raw  where  feed='1' and platform='android'
UNION ALL
select *, 'revenue' as metric,  "android-all" as device_code from rank_raw  where  feed='2' and platform='android'
);


-- select tested column from raw data
WITH metric_raw_data AS (
SELECT id, category_id as raw_category_id,rank,store_id as raw_store_id , metric,device_code,date , platform from feed_metric where store_id not in (3,4,5,6, 1002,1003, 1005,1004, 1006,1007)
);


-- group by and count data in raw data
WITH group_by_raw AS (
SELECT count(id) AS total_count , raw_category_id, raw_store_id, metric,device_code,date,platform from metric_raw_data where raw_store_id not in (3,4,5,6, 1002,1003, 1004, 1005, 1006,1007) group by raw_category_id, raw_store_id, metric,device_code,date, platform
);


-- pivot metric column
WITH pivot_metric_rank_raw AS (

SELECT 
free_app_download,revenue, paid_app_download, raw_category_id,raw_store_id,device_code, platform,date
FROM
      group_by_raw
 PIVOT (
    max(total_count) 
	FOR metric IN ('free_app_download','revenue', 'paid_app_download')
  )
);




-- select tested column from raw data
WITH metric_raw_store_data AS (
SELECT distinct id, est, store_id as raw_store_id , metric,device_code, date , platform from feed_metric where store_id not in (3,4,5,6, 1002,1003, 1005,1004, 1006,1007)

);


-- group by and count data in raw data
WITH group_by_store_raw AS (
SELECT count(est) AS total_count ,raw_store_id, metric,device_code,date,platform from metric_raw_store_data where raw_store_id not in (3,4,5,6,1002,1003, 1004, 1005, 1006,1007) group by raw_store_id, metric,device_code,date, platform
);


-- pivot metric column
WITH pivot_metric_store_raw AS (

SELECT 
free_app_download,revenue, paid_app_download,raw_store_id,device_code, platform, date
FROM
      group_by_store_raw
 PIVOT (
    max(total_count) 
	FOR metric IN ('free_app_download','revenue', 'paid_app_download')
  )
);



-- map raw with category
WITH category_mapping_raw AS (

SELECT * from ( select *, 'ios' as mapping_platform from category_mapping_deminsion_service where market_code='apple-store' 
UNION ALL select *, 'android' as mapping_platform from category_mapping_deminsion_service where market_code='google-play'
 ) as mapping right join pivot_metric_rank_raw on mapping.legacy_category_id=pivot_metric_rank_raw.raw_category_id and 
mapping.mapping_platform=pivot_metric_rank_raw.platform
);


-- map raw with rank country_code
WITH country_category_mapping_rank_raw AS (
select date, raw_store_id, country_code,device_code,category_id,free_app_download,paid_app_download,revenue from country_code_mapping right join category_mapping_raw on country_code_mapping.country_code_store_id=category_mapping_raw.raw_store_id and country_code_mapping.market_code=category_mapping_raw.platform
);



-- map raw with store country_code
WITH country_mapping_store_raw AS (
select date, raw_store_id, country_code,device_code,free_app_download,paid_app_download,revenue from country_code_mapping right join pivot_metric_store_raw on country_code_mapping.country_code_store_id=pivot_metric_store_raw.raw_store_id and country_code_mapping.market_code=pivot_metric_store_raw.platform
);


-- group by unified data
WITH unified_group_data AS (
select count(app_id) as unified_count_app_id, count(free_app_download) as unified_count_free_app_download, count(paid_app_download) as unified_count_paid_app_download, count(revenue) as unified_count_revenue,
  country_code as unified_country_code, device_code as unified_device_code, category_id as unified_category_id from ( select distinct  app_id, free_app_download, paid_app_download, revenue, country_code, device_code, category_id from  rank_unified ) as unified
group by  category_id,  country_code,  device_code );


-- group by unified data
WITH unified_group_data_store AS (
select count(app_id) as unified_count_app_id, count(free_app_download) as unified_count_free_app_download, count(paid_app_download) as unified_count_paid_app_download, count(revenue) as unified_count_revenue,
  country_code as unified_country_code, device_code as unified_device_code from ( select distinct  app_id, free_app_download, paid_app_download, revenue, country_code, device_code from  store_unified ) as unified
group by   country_code,  device_code );




-- compare raw vs unified data
WITH compared_data_rank AS (
    SELECT * from country_category_mapping_rank_raw left join unified_group_data on unified_group_data.unified_country_code==country_category_mapping_rank_raw.country_code and unified_group_data.unified_category_id==country_category_mapping_rank_raw.category_id and unified_group_data.unified_device_code==country_category_mapping_rank_raw.device_code
);

WITH miss_data_rank AS (
select * from compared_data_rank where unified_count_paid_app_download!=paid_app_download or unified_count_free_app_download != free_app_download  or unified_count_revenue != revenue or unified_count_app_id is null
)



-- compare raw vs unified data store
WITH compared_store_data AS (
    SELECT * from country_mapping_store_raw left join unified_group_data_store on unified_group_data_store.unified_country_code==country_mapping_store_raw.country_code and unified_group_data_store.unified_device_code==country_mapping_store_raw.device_code
);


WITH miss_data_store AS (
select * from compared_store_data where free_app_download!=unified_count_free_app_download or paid_app_download!=unified_count_paid_app_download or revenue!=unified_count_revenue or unified_count_app_id is null
)


"""


# Date window to check: one entry per day from `start` up to, but not
# including, `end`.
start = '2020-04-07'
end = '2020-05-28'
real_date1 = datetime.date(*map(int, start.split('-')))
real_date2 = datetime.date(*map(int, end.split('-')))
date_range = real_date2 - real_date1
dates = [real_date1 + datetime.timedelta(day_offset)
         for day_offset in range(date_range.days)]

# dates=['2020-03-01','2020-01-24',"2019-12-06","2020-01-02","2020-01-05","2020-01-10","2020-01-11","2020-01-13","2020-01-20"]

# iOS store_id -> country_code mapping (tab-separated, no header), tagged
# with market_code "ios".
d1 = spark.read.csv(
    "s3://b2c-prod-dca-store-estimates/store_back/dimension/IOS_COUNTRY_MAPPING",
    sep="\t")
d1 = d1.withColumnRenamed("_c0", "store_id")
d1 = d1.withColumnRenamed("_c1", "country_code")
d1 = d1.withColumn("market_code", F.lit("ios"))
# Prepend an explicit worldwide row (store_id 0) to the iOS mapping.
worldwide_row = spark.createDataFrame(
    [(0, 'WW', 'Worldwide', 'ios')],
    schema=["store_id", "country_code", "_c2", "market_code"])
d1 = worldwide_row.union(d1)

# Android mapping, same layout, tagged with market_code "android".
d2 = spark.read.csv(
    "s3://b2c-prod-dca-store-estimates/store_back/dimension/ANDROID_COUNTRY_MAPPING",
    sep="\t")
d2 = d2.withColumnRenamed("_c0", "store_id")
d2 = d2.withColumnRenamed("_c1", "country_code")
d2 = d2.withColumn("market_code", F.lit("android"))

# Combined mapping, cached before the rename, exposed to SQL as
# `country_code_mapping`.
cached_mapping = d1.union(d2).where("country_code is not null").cache()
country_code_df = cached_mapping.withColumnRenamed("store_id", "country_code_store_id")
print('country mapping table')
country_code_df.show(2)
country_code_df.createOrReplaceTempView("country_code_mapping")

# Category mapping, exposed to SQL as `category_mapping_deminsion_service`.
category_mapping_table = spark.read.parquet(
    "s3://b2c-prod-data-pipeline-qa/aa.store/store_cateogry_mapping")
category_mapping_table.createOrReplaceTempView("category_mapping_deminsion_service")

namespace = "aa.store.market-size.v1"
for test_date in dates:
    print(test_date)

    # Per-day input locations for the three sources named in sql_text.
    rank_raw_path = "s3://b2c-prod-dca-store-estimates/store_estv2/APP_ESTIMATES_FINAL/version=2.0.0/range_type=DAY/date={}/".format(
        test_date)
    rank_unified_path = "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-rank.v1/fact/granularity=daily/date={}".format(
        test_date)
    store_unified_path = "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=daily/date={}".format(
        test_date)

    ingest_msg = {
        "namespace": "aa.store.market-size.v1",
        "job_type": "routine",
        "options": {},
        "source": [
            {
                "name": "rank_raw",
                "data_encoding": "parquet",
                "compression": "gzip",
                "path": [rank_raw_path],
            },
            {
                "data_encoding": "parquet",
                "compression": "gzip",
                "name": "rank_unified",
                "path": [rank_unified_path],
            },
            {
                "data_encoding": "parquet",
                "compression": "gzip",
                "name": "store_unified",
                "path": [store_unified_path],
            },
        ],
    }
    run(spark, ingest_msg, sql_text)

    # Rank-level mismatches: show them, then append to the delta output.
    result = spark.sql("select * from miss_data_rank")
    result.show()
    rank_output_path = "s3://b2c-prod-data-pipeline-qa/aa.store/result_rank_data_v2_{}/".format(start)
    result.write.format("delta").save(rank_output_path,
                                      mode="append",
                                      partitionBy=["date"])

    # Store-level mismatches: same treatment.
    result_store = spark.sql("select * from miss_data_store")
    result_store.show()
    store_output_path = "s3://b2c-prod-data-pipeline-qa/aa.store/result_store_data_v2_{}/".format(start)
    result_store.write.format("delta").save(store_output_path,
                                            mode="append",
                                            partitionBy=["date"])

    eject_all_caches(spark)



In [0]:



from pyspark.sql import types as T
from pyspark.sql import functions as F

from aadatapipelinecore.core.urn import Urn
from aadatapipelinecore.core.pipeline import type_
from applications.common.parser import SqlParser
from applications.common.executor import SqlExecutor
from applications.auto_pipeline.transform import _view
import datetime
from aadatapipelinecore.core.utils.spark import eject_all_caches


spark.sparkContext.addPyFile("/home/hadoop/bdp/application/libs/python/dependencies.zip")
import aaplproxy

class DryRunSqlExecutor(SqlExecutor):
    """SqlExecutor subclass whose ``_verify_tasks`` step does nothing."""

    def _verify_tasks(self):
        """No-op; presumably replaces the base-class task check (confirm)."""
        return None


def run(spark, raw_data, sql_text, dry_run=True):
    """Parse ``sql_text`` into tasks and run them through a SQL executor.

    Args:
        spark: Spark session handed to ``_view``, ``SqlParser`` and the executor.
        raw_data: Message dict. Must contain the keys ``"namespace"``,
            ``"source"`` and ``"options"`` (a dict). It is left unmodified.
        sql_text: SQL script passed to ``_view`` and ``SqlParser``.
        dry_run: When true (the default) ``DryRunSqlExecutor`` is used,
            otherwise the plain ``SqlExecutor``.

    Raises:
        KeyError: if one of the required keys is missing from ``raw_data``.
    """
    # Work on a shallow copy: popping "source"/"options" from the caller's
    # dict would make the same message unusable for a second call.
    context = dict(raw_data)
    urn = Urn(namespace=context["namespace"])
    source_data_list = context.pop("source")
    # Flatten the options into the top level of the context.
    context.update(context.pop("options"))
    _view(spark, sql_text, None, source_data_list)
    tasks = SqlParser(spark, sql_text, context).parse()
    executor_cls = DryRunSqlExecutor if dry_run else SqlExecutor
    executor_cls(urn, spark, tasks, type_.EventType.TRANSFORM, context).run()


# SQL script consumed by run(): it is handed to _view() and SqlParser.
# It is a chain of `WITH <name> AS (...)` statements. The driver code below
# reads `miss_data_rank` and `miss_data_store` through spark.sql(), so each
# name is presumably registered as a view by the parser/executor
# (confirm in SqlParser).
# Tables referenced but not defined here: rank_raw, rank_unified,
# store_unified, country_code_mapping, category_mapping_deminsion_service.
# Flow: label raw rows by feed -> count per group -> pivot the metric column
# -> join category / country mappings -> join with counts from the unified
# tables -> keep the rows whose counts differ or have no unified match.
# NOTE(review): the miss_data_rank and miss_data_store statements have no
# trailing ';' unlike the other statements -- confirm the parser accepts that.
sql_text = """

-- mapping feed as metrc in raw
WITH feed_metric AS (
select *, 'free_app_download' as metric, "ios-phone" as device_code from rank_raw where  feed='0' and platform='ios'
UNION ALL
select *, 'paid_app_download' as metric, "ios-phone" as device_code from rank_raw where  feed='1' and platform='ios'
UNION ALL
select *, 'revenue' as metric , "ios-phone" as device_code from rank_raw where  feed='2' and platform='ios'
UNION ALL
select *, 'free_app_download' as metric, "ios-tablet" as device_code from rank_raw where  feed='101' and platform='ios'
UNION ALL
select *, 'paid_app_download' as metric, "ios-tablet" as device_code from rank_raw  where  feed='100' and platform='ios'
UNION ALL
select *, 'revenue' as metric, "ios-tablet" as device_code from rank_raw  where  feed='102' and platform='ios'
UNION ALL
select *, 'free_app_download' as metric, "ios-all" as device_code from rank_raw  where  feed='1000' and platform='ios'
UNION ALL
select *, 'paid_app_download' as metric, "ios-all" as device_code from rank_raw  where  feed='1001' and platform='ios'
UNION ALL
select *, 'revenue' as metric, "ios-all" as device_code from rank_raw  where  feed='1002' and platform='ios'
UNION ALL
select *, 'free_app_download' as metric , "android-all" as device_code from rank_raw   where  feed='0' and platform='android'
UNION ALL
select *, 'paid_app_download' as metric, "android-all" as device_code from rank_raw  where  feed='1' and platform='android'
UNION ALL
select *, 'revenue' as metric,  "android-all" as device_code from rank_raw  where  feed='2' and platform='android'
);


-- select tested column from raw data
WITH metric_raw_data AS (
SELECT id, category_id as raw_category_id,rank,store_id as raw_store_id , metric,device_code,date , platform from feed_metric where store_id not in (2,3,4,5,6,1002, 1003, 1005,1004, 1006,1007)
);


-- group by and count data in raw data
WITH group_by_raw AS (
SELECT count(id) AS total_count , raw_category_id, raw_store_id, metric,device_code,date,platform from metric_raw_data where raw_store_id not in (2,3,4,5,6, 1002, 1003, 1004, 1005, 1006,1007) group by raw_category_id, raw_store_id, metric,device_code,date, platform
);


-- pivot metric column
WITH pivot_metric_rank_raw AS (

SELECT 
free_app_download,revenue, paid_app_download, raw_category_id,raw_store_id,device_code, platform,date
FROM
      group_by_raw
 PIVOT (
    max(total_count) 
	FOR metric IN ('free_app_download','revenue', 'paid_app_download')
  )
);




-- select tested column from raw data
WITH metric_raw_store_data AS (
SELECT distinct id, est, store_id as raw_store_id , metric,device_code,date , platform from feed_metric where store_id not in (2, 3,4,5,6, 1002,1003, 1005,1004, 1006,1007)
);


-- group by and count data in raw data
WITH group_by_store_raw AS (
SELECT count(est) AS total_count ,raw_store_id, metric,device_code,date,platform from metric_raw_store_data where raw_store_id not in (2, 3,4,5,6, 1002, 1003, 1005, 1004, 1006,1007) group by raw_store_id, metric,device_code,date, platform
);


-- pivot metric column
WITH pivot_metric_store_raw AS (

SELECT 
free_app_download,revenue, paid_app_download,raw_store_id,device_code, platform, date
FROM
      group_by_store_raw
 PIVOT (
    max(total_count) 
	FOR metric IN ('free_app_download','revenue', 'paid_app_download')
  )
);



-- map raw with category
WITH category_mapping_raw AS (

SELECT * from ( select *, 'ios' as mapping_platform from category_mapping_deminsion_service where market_code='apple-store' 
UNION ALL select *, 'android' as mapping_platform from category_mapping_deminsion_service where market_code='google-play'
 ) as mapping right join pivot_metric_rank_raw on mapping.legacy_category_id=pivot_metric_rank_raw.raw_category_id and 
mapping.mapping_platform=pivot_metric_rank_raw.platform
);


-- map raw with rank country_code
WITH country_category_mapping_rank_raw AS (
select date, raw_store_id, country_code,device_code,category_id,free_app_download,paid_app_download,revenue from country_code_mapping right join category_mapping_raw on country_code_mapping.country_code_store_id=category_mapping_raw.raw_store_id and country_code_mapping.market_code=category_mapping_raw.platform
);



-- map raw with store country_code
WITH country_mapping_store_raw AS (
select date, raw_store_id, country_code,device_code,free_app_download,paid_app_download,revenue from country_code_mapping right join pivot_metric_store_raw on country_code_mapping.country_code_store_id=pivot_metric_store_raw.raw_store_id and country_code_mapping.market_code=pivot_metric_store_raw.platform
);


-- group by unified data
WITH unified_group_data AS (
select count(app_id) as unified_count_app_id, count(free_app_download) as unified_count_free_app_download, count(paid_app_download) as unified_count_paid_app_download, count(revenue) as unified_count_revenue,
  country_code as unified_country_code, device_code as unified_device_code, category_id as unified_category_id from ( select distinct  app_id, free_app_download, paid_app_download, revenue, country_code, device_code, category_id from  rank_unified ) as unified
group by  category_id,  country_code,  device_code );


-- group by unified data
WITH unified_group_data_store AS (
select count(app_id) as unified_count_app_id, count(free_app_download) as unified_count_free_app_download, count(paid_app_download) as unified_count_paid_app_download, count(revenue) as unified_count_revenue,
  country_code as unified_country_code, device_code as unified_device_code from ( select distinct  app_id, free_app_download, paid_app_download, revenue, country_code, device_code from  store_unified ) as unified
group by   country_code,  device_code );




-- compare raw vs unified data
WITH compared_data_rank AS (
    SELECT * from country_category_mapping_rank_raw left join unified_group_data on unified_group_data.unified_country_code==country_category_mapping_rank_raw.country_code and unified_group_data.unified_category_id==country_category_mapping_rank_raw.category_id and unified_group_data.unified_device_code==country_category_mapping_rank_raw.device_code
);

WITH miss_data_rank AS (
select * from compared_data_rank where unified_count_paid_app_download!=paid_app_download or unified_count_free_app_download != free_app_download  or unified_count_revenue != revenue or unified_count_app_id is null
)



-- compare raw vs unified data store
WITH compared_store_data AS (
    SELECT * from country_mapping_store_raw left join unified_group_data_store on unified_group_data_store.unified_country_code==country_mapping_store_raw.country_code and unified_group_data_store.unified_device_code==country_mapping_store_raw.device_code
);


WITH miss_data_store AS (
select * from compared_store_data where free_app_download!=unified_count_free_app_download or paid_app_download!=unified_count_paid_app_download or revenue!=unified_count_revenue or unified_count_app_id is null
)


"""



# Date window to check: one entry per day from `start` up to, but not
# including, `end`.
start = "2015-01-01"
end = "2015-01-11"
real_date1 = datetime.date(*[int(x) for x in start.split('-')])
real_date2 = datetime.date(*[int(x) for x in end.split('-')])
date_range = real_date2 - real_date1
dates = [real_date1 + datetime.timedelta(days) for days in range(date_range.days)]

# dates=["2013-06-03"]

# iOS store_id -> country_code mapping (tab-separated, no header), tagged
# with market_code "ios"; an explicit worldwide row (store_id 0) is prepended.
d1 = spark.read.csv("s3://b2c-prod-dca-store-estimates/store_back/dimension/IOS_COUNTRY_MAPPING",
                    sep="\t").withColumnRenamed("_c0", "store_id").withColumnRenamed("_c1",
                                                                                     "country_code").withColumn(
    "market_code", F.lit("ios"))
d1 = spark.createDataFrame([(0, 'WW', 'Worldwide', 'ios')],
                           schema=["store_id", "country_code", "_c2", "market_code"]).union(d1)

# Android mapping, same layout, tagged with market_code "android".
d2 = spark.read.csv("s3://b2c-prod-dca-store-estimates/store_back/dimension/ANDROID_COUNTRY_MAPPING",
                    sep="\t").withColumnRenamed("_c0", "store_id").withColumnRenamed("_c1",
                                                                                     "country_code").withColumn(
    "market_code", F.lit("android"))

# Combined mapping, exposed to SQL as `country_code_mapping`.
country_code_df = d1.union(d2).where("country_code is not null").cache()
country_code_df = country_code_df.withColumnRenamed("store_id", "country_code_store_id")
print('country mapping table')
country_code_df.show(2)
country_code_df.createOrReplaceTempView("country_code_mapping")

# Category mapping, exposed to SQL as `category_mapping_deminsion_service`.
category_mapping_table = spark.read.parquet(
    "s3://b2c-prod-data-pipeline-qa/aa.store/store_cateogry_mapping")
category_mapping_table.createOrReplaceTempView("category_mapping_deminsion_service")

namespace = "aa.store.market-size.v1"

# Column layout of the raw tab-separated estimate files. It is the same for
# every date and platform, so it is built once instead of once per iteration.
csv_schema = T.StructType(
    [
        T.StructField("store_id", T.IntegerType(), True),
        T.StructField("date", T.StringType(), True),
        T.StructField("platform_id", T.IntegerType(), True),
        T.StructField("vertical", T.IntegerType(), True),
        T.StructField("feed", T.IntegerType(), True),
        T.StructField("id", T.LongType(), True),
        T.StructField("est", T.IntegerType(), True),
        T.StructField("category_id", T.IntegerType(), True),
        T.StructField("rank", T.IntegerType(), True)
    ]
)

for test_date in dates:
    print(test_date)

    # Raw estimates for the day, one read per platform, each tagged with a
    # literal `platform` column; their union is the `rank_raw` view that
    # sql_text reads.
    raw1 = spark.read.option("basePath",
                             "s3://b2c-prod-dca-store-estimates/store_est/v_final/DAY/").schema(
        csv_schema).csv(
        "s3://b2c-prod-dca-store-estimates/store_est/v_final/DAY/{}/android/sbe_est_app/*/".format(
            test_date), sep="\t").withColumn("platform", F.lit("android")).cache()
    raw2 = spark.read.option("basePath",
                             "s3://b2c-prod-dca-store-estimates/store_est/v_final/DAY/").schema(
        csv_schema).csv(
        "s3://b2c-prod-dca-store-estimates/store_est/v_final/DAY/{}/ios/sbe_est_app/*/".format(test_date),
        sep="\t").withColumn("platform", F.lit("ios")).cache()

    df_raw = raw1.union(raw2).cache()
    df_raw.createOrReplaceTempView("rank_raw")

    # Only the unified tables are passed as sources; `rank_raw` is already
    # registered as a temp view above.
    ingest_msg = {
        "namespace": namespace,
        "job_type": "routine",
        "options": {},
        "source": [
            {
                "data_encoding": "parquet",
                "compression": "gzip",
                "name": "rank_unified",
                "path": [
                    "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est-category-rank.v1/fact/granularity=daily/date={}".format(
                        test_date)],
            }, {
                "data_encoding": "parquet",
                "compression": "gzip",
                "name": "store_unified",
                "path": [
                    "s3://b2c-prod-data-pipeline-unified-store-paid/unified/store.app-est.v1/fact/granularity=daily/date={}".format(
                        test_date)],
            }
        ]
    }
    run(spark, ingest_msg, sql_text)

    # Rank-level mismatches for this date.
    result = spark.sql("select * from miss_data_rank")
    result.show()
    # Persisting is currently disabled:
    # result.write.format("delta").save("s3://b2c-prod-data-pipeline-qa/aa.store/result_rank_data_v1_{}/".format(test_date),
    #                                   mode="append",
    #                                   partitionBy=["date"])

    # Store-level mismatches for this date. The original showed `result`
    # a second time here, so the store comparison was never displayed.
    result_store = spark.sql("select * from miss_data_store")
    result_store.show()

    # Persisting is currently disabled:
    # result_store.write.format("delta").save("s3://b2c-prod-data-pipeline-qa/aa.store/result_store_data_v1_{}/".format(test_date),
    #                                   mode="append",
    #                                   partitionBy=["date"])
    eject_all_caches(spark)
